Compare commits

..

4 Commits

Author SHA1 Message Date
cb29678285 Let interactive imply verification
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-06-04 15:34:48 +01:00
ab14a9d891 Document resolver and progress modes
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-06-04 15:30:31 +01:00
4dafcac9dc Add duplicate resolver and real progress
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-06-04 15:30:22 +01:00
72906ed4f3 Add terminal UI dependencies
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-06-04 15:30:12 +01:00
5 changed files with 910 additions and 70 deletions

309
Cargo.lock generated
View File

@@ -47,7 +47,7 @@ version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys",
"windows-sys 0.61.2",
]
[[package]]
@@ -58,7 +58,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys",
"windows-sys 0.61.2",
]
[[package]]
@@ -109,6 +109,12 @@ dependencies = [
"serde",
]
[[package]]
name = "bumpalo"
version = "3.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
[[package]]
name = "cc"
version = "1.2.63"
@@ -171,6 +177,31 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "console"
version = "0.15.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8"
dependencies = [
"encode_unicode",
"libc",
"once_cell",
"unicode-width",
"windows-sys 0.59.0",
]
[[package]]
name = "console"
version = "0.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
dependencies = [
"encode_unicode",
"libc",
"unicode-width",
"windows-sys 0.61.2",
]
[[package]]
name = "constant_time_eq"
version = "0.4.2"
@@ -211,6 +242,19 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "dialoguer"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
dependencies = [
"console 0.15.11",
"shell-words",
"tempfile",
"thiserror",
"zeroize",
]
[[package]]
name = "disk-checker"
version = "0.1.0"
@@ -218,7 +262,9 @@ dependencies = [
"anyhow",
"blake3",
"clap",
"dialoguer",
"ignore",
"indicatif",
"rayon",
"serde",
"serde_json",
@@ -231,6 +277,12 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
[[package]]
name = "encode_unicode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -244,7 +296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys",
"windows-sys 0.61.2",
]
[[package]]
@@ -265,6 +317,30 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "futures-core"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
[[package]]
name = "futures-task"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
[[package]]
name = "futures-util"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
dependencies = [
"futures-core",
"futures-task",
"pin-project-lite",
"slab",
]
[[package]]
name = "getrandom"
version = "0.4.2"
@@ -346,6 +422,19 @@ dependencies = [
"serde_core",
]
[[package]]
name = "indicatif"
version = "0.18.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
dependencies = [
"console 0.16.3",
"portable-atomic",
"unicode-width",
"unit-prefix",
"web-time",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.2"
@@ -358,6 +447,18 @@ version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "js-sys"
version = "0.3.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11"
dependencies = [
"cfg-if",
"futures-util",
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "leb128fmt"
version = "0.1.0"
@@ -400,6 +501,18 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "pin-project-lite"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "portable-atomic"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
[[package]]
name = "prettyplease"
version = "0.2.37"
@@ -481,9 +594,15 @@ dependencies = [
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
"windows-sys 0.61.2",
]
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "same-file"
version = "1.0.6"
@@ -542,12 +661,24 @@ dependencies = [
"zmij",
]
[[package]]
name = "shell-words"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc6fe69c597f9c37bfeeeeeb33da3530379845f10be461a66d16d03eca2ded77"
[[package]]
name = "shlex"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
[[package]]
name = "slab"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
[[package]]
name = "strsim"
version = "0.11.1"
@@ -575,7 +706,27 @@ dependencies = [
"getrandom",
"once_cell",
"rustix",
"windows-sys",
"windows-sys 0.61.2",
]
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
@@ -584,12 +735,24 @@ version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unit-prefix"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
[[package]]
name = "utf8parse"
version = "0.2.2"
@@ -624,6 +787,51 @@ dependencies = [
"wit-bindgen 0.51.0",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.122"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.122"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.122"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e"
dependencies = [
"bumpalo",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.122"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437"
dependencies = [
"unicode-ident",
]
[[package]]
name = "wasm-encoder"
version = "0.244.0"
@@ -658,13 +866,23 @@ dependencies = [
"semver",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi-util"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys",
"windows-sys 0.61.2",
]
[[package]]
@@ -673,6 +891,15 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-sys"
version = "0.61.2"
@@ -682,6 +909,70 @@ dependencies = [
"windows-link",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "wit-bindgen"
version = "0.51.0"
@@ -776,6 +1067,12 @@ dependencies = [
"wasmparser",
]
[[package]]
name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
[[package]]
name = "zmij"
version = "1.0.21"

View File

@@ -7,7 +7,9 @@ edition = "2024"
anyhow = "1"
blake3 = "1"
clap = { version = "4", features = ["derive"] }
dialoguer = "0.11"
ignore = "0.4"
indicatif = "0.18"
rayon = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"

View File

@@ -61,16 +61,50 @@ Verify possible duplicates with a full-file hash pass:
disk-checker ~/Downloads --verify-full
```
Review duplicate groups one by one and choose which path to keep:
```bash
disk-checker ~/Downloads --interactive
```
Interactive mode automatically full-verifies only the duplicate candidate groups before prompting. It is non-destructive: it writes a reviewed shell deletion plan instead of deleting files immediately.
```bash
disk-checker ~/Downloads --interactive --delete-plan review-delete.sh
```
Use the fastest triage mode for huge datasets by grouping same-size files without hashing:
```bash
disk-checker /mnt/storage --size-only --min-size 100MiB --threads 32
```
Limit traversal depth:
```bash
disk-checker /mnt/storage --max-depth 3
```
Limit scanning and hashing workers:
```bash
disk-checker ~/Downloads --threads 4
```
Disable progress output:
```bash
disk-checker ~/Downloads --no-progress
```
## Notes
- By default, duplicate results are **possible duplicates**: same file size plus same first `1MiB` BLAKE3 hash.
- This is intentionally fast because it avoids reading whole files unless you pass `--verify-full`.
- `--size-only` is even faster for triage, but it only means files have the same size; use it to narrow the search, not as proof.
- Symlinks are not followed by default to avoid surprises and cycles.
- Hard link groups are reported separately because they are multiple paths to the same inode, not extra disk copies.
- Hidden files and gitignored files are included; this is a disk scanner, not a source-code search tool.
- Fast mode does **not** read 30TB of file content. It reads metadata plus up to the hash window for same-size candidate files: for example, 30,000 candidate files at `1MiB` is about 30GiB of content reads.
- Fully verifying all 30TB in 10 minutes would require roughly 50GB/s sustained reads. `--verify-full` only fully reads candidate groups, but storage throughput is still the hard limit for exact verification.
- Progress output reflects real work and is written to stderr: traversal shows live counts of discovered entries because the total amount of traversal work is unknown in advance, while hashing shows a determinate byte-progress bar driven by actual reads. Progress is disabled automatically for `--json` and can be turned off manually with `--no-progress`.

View File

@@ -6,6 +6,7 @@ use std::sync::mpsc;
use std::thread;
use ignore::{WalkBuilder, WalkState};
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use serde::{Serialize, Serializer};
@@ -22,6 +23,10 @@ pub struct ScanConfig {
pub follow_links: bool,
pub verify_full: bool,
pub threads: Option<usize>,
pub size_only: bool,
pub min_size: u64,
pub max_depth: Option<usize>,
pub progress: bool,
}
#[derive(Debug, Clone, Serialize)]
@@ -30,6 +35,9 @@ pub struct ScanReport {
pub scanned_paths: Vec<PathBuf>,
pub hash_bytes: u64,
pub worker_threads: usize,
pub size_only: bool,
pub min_size: u64,
pub max_depth: Option<usize>,
pub followed_symlinks: bool,
pub full_verification: bool,
pub summary: ScanSummary,
@@ -139,6 +147,16 @@ enum HashOutcome {
Issue(ScanIssue),
}
/// Entries and running totals gathered while walking a single scan root.
///
/// `walk_root_parallel` builds one of these per root and `scan_paths` merges
/// its contents into the overall scan results.
#[derive(Debug, Default)]
struct ScanAccumulator {
/// Entries received as `ScannedEntry::File`.
files: Vec<FileEntry>,
/// Entries received as `ScannedEntry::Symlink`.
symlinks: Vec<SymlinkInfo>,
/// Entries received as `ScannedEntry::Special`.
special_entries: Vec<SpecialEntry>,
/// Issues received as `ScannedEntry::Issue`.
errors: Vec<ScanIssue>,
/// Number of `ScannedEntry::Directory` entries seen.
directories: usize,
/// Saturating sum of the sizes of every entry in `files`.
total_file_bytes: u64,
}
#[derive(Debug, Clone)]
enum ScannedEntry {
File(FileEntry),
@@ -198,12 +216,13 @@ pub fn parse_byte_count(input: &str) -> Result<u64, String> {
pub fn scan_paths(config: ScanConfig) -> ScanReport {
let hash_bytes = config.hash_bytes.max(1);
let worker_threads = worker_threads(config.threads);
let min_size = config.min_size;
let mut files = Vec::new();
let mut symlinks = Vec::new();
let mut special_entries = Vec::new();
let mut errors = Vec::new();
let mut directories = 0;
let mut total_file_bytes = 0;
let mut directories: usize = 0;
let mut total_file_bytes: u64 = 0;
for root in &config.paths {
let mut builder = WalkBuilder::new(root);
@@ -216,33 +235,21 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
.git_global(false)
.git_exclude(false)
.parents(false);
let (sender, receiver) = mpsc::channel();
builder.build_parallel().run(|| {
let sender = sender.clone();
let follow_links = config.follow_links;
Box::new(move |entry| {
for scanned_entry in classify_walk_entry(entry, follow_links) {
if sender.send(scanned_entry).is_err() {
return WalkState::Quit;
if let Some(max_depth) = config.max_depth {
builder.max_depth(Some(max_depth));
}
}
WalkState::Continue
})
});
drop(sender);
for scanned_entry in receiver {
collect_scanned_entry(
scanned_entry,
&mut files,
&mut symlinks,
&mut special_entries,
&mut errors,
&mut directories,
&mut total_file_bytes,
let accumulator = walk_root_parallel(
&mut builder,
config.follow_links,
walk_progress(config.progress, root),
);
}
files.extend(accumulator.files);
symlinks.extend(accumulator.symlinks);
special_entries.extend(accumulator.special_entries);
errors.extend(accumulator.errors);
directories += accumulator.directories;
total_file_bytes = total_file_bytes.saturating_add(accumulator.total_file_bytes);
}
files.sort_by(|left, right| left.path.cmp(&right.path));
@@ -250,17 +257,43 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
special_entries.sort_by(|left, right| left.path.cmp(&right.path));
let hard_links = find_hard_links(&files);
let same_size_candidates = same_size_candidates(&files);
let same_size_candidates = same_size_candidates(&files, min_size);
let same_size_candidate_files = same_size_candidates.len();
let partial_outcomes = hash_files(&same_size_candidates, hash_bytes, false);
let possible_duplicates = if config.size_only {
size_only_duplicate_groups(same_size_candidates.clone())
} else {
let partial_outcomes = hash_files(
&same_size_candidates,
hash_bytes,
false,
hash_progress(
config.progress,
&same_size_candidates,
hash_bytes,
false,
"Hashing file prefixes",
),
);
let mut partial_hashes = Vec::new();
collect_hash_outcomes(partial_outcomes, &mut partial_hashes, &mut errors);
let possible_duplicates = duplicate_groups(partial_hashes);
duplicate_groups(partial_hashes)
};
let verified_duplicates = if config.verify_full {
let full_candidates = files_from_duplicate_groups(&possible_duplicates);
let full_outcomes = hash_files(&full_candidates, hash_bytes, true);
let full_outcomes = hash_files(
&full_candidates,
hash_bytes,
true,
hash_progress(
config.progress,
&full_candidates,
hash_bytes,
true,
"Full verification hashing",
),
);
let mut full_hashes = Vec::new();
collect_hash_outcomes(full_outcomes, &mut full_hashes, &mut errors);
duplicate_groups(full_hashes)
@@ -291,6 +324,9 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
scanned_paths: config.paths,
hash_bytes,
worker_threads,
size_only: config.size_only,
min_size,
max_depth: config.max_depth,
followed_symlinks: config.follow_links,
full_verification: config.verify_full,
summary: ScanSummary {
@@ -325,6 +361,123 @@ fn worker_threads(configured_threads: Option<usize>) -> usize {
})
}
/// Walks one root with the parallel walker and gathers everything it finds.
///
/// Walker threads classify each entry and push the results through a channel;
/// a dedicated collector thread drains that channel into a `ScanAccumulator`
/// and keeps the optional progress spinner up to date. The accumulator is
/// returned once the walk has finished and the channel has been drained.
///
/// # Panics
///
/// Panics if the collector thread itself panicked.
fn walk_root_parallel(
    builder: &mut WalkBuilder,
    follow_links: bool,
    progress: Option<ProgressBar>,
) -> ScanAccumulator {
    let (entry_tx, entry_rx) = mpsc::channel();

    // Single consumer: owns the accumulator and the progress bar.
    let collector = thread::spawn(move || {
        let mut totals = ScanAccumulator::default();
        while let Ok(item) = entry_rx.recv() {
            collect_scanned_entry(item, &mut totals);
            update_walk_progress(progress.as_ref(), &totals, false);
        }
        // Every sender is gone, so the walk is complete: emit the final line.
        update_walk_progress(progress.as_ref(), &totals, true);
        totals
    });

    builder.build_parallel().run(|| {
        // Each walker thread gets its own handle to the channel.
        let worker_tx = entry_tx.clone();
        Box::new(move |entry| {
            // Stop forwarding (and stop walking) as soon as the receiver is gone.
            let delivered = classify_walk_entry(entry, follow_links)
                .into_iter()
                .all(|item| worker_tx.send(item).is_ok());
            if delivered {
                WalkState::Continue
            } else {
                WalkState::Quit
            }
        })
    });

    // Release the last sender so the collector's receive loop can terminate.
    drop(entry_tx);

    collector
        .join()
        .expect("scan result collector thread should not panic")
}
/// Builds the traversal spinner for `root`, or returns `None` when progress
/// output is disabled.
///
/// The spinner starts with an all-zero counter line naming the root being
/// scanned; later updates replace that message as entries are discovered.
///
/// # Panics
///
/// Panics only if the hard-coded spinner template is rejected, which would be
/// a programming error.
fn walk_progress(enabled: bool, root: &Path) -> Option<ProgressBar> {
    if !enabled {
        return None;
    }

    let progress = ProgressBar::new_spinner();
    progress.set_style(
        ProgressStyle::with_template("{spinner:.green} {msg}")
            .expect("valid traversal progress template"),
    );
    // List the same five counters (including special entries) that the live
    // and final traversal messages report, so the line does not change shape
    // on the first update.
    progress.set_message(format!(
        "Scanning {} — 0 files, 0 dirs, 0 symlinks, 0 special entries, 0 errors",
        root.display()
    ));
    Some(progress)
}
/// Refreshes the traversal spinner from the current accumulator state.
///
/// Does nothing when no progress bar is active. When `done` is true the bar is
/// finished with a final summary line. Otherwise the message is redrawn only
/// for the first recorded entry and whenever the total entry count is a
/// multiple of 100.
fn update_walk_progress(progress: Option<&ProgressBar>, accumulator: &ScanAccumulator, done: bool) {
    let Some(bar) = progress else {
        return;
    };

    let files = accumulator.files.len();
    let dirs = accumulator.directories;
    let symlinks = accumulator.symlinks.len();
    let special = accumulator.special_entries.len();
    let errors = accumulator.errors.len();

    if done {
        bar.finish_with_message(format!(
            "Scanned {} files, {} dirs, {} symlinks, {} special entries, {} errors",
            files, dirs, symlinks, special, errors
        ));
        return;
    }

    // Throttle redraws: skip everything except the first entry and every 100th.
    let seen = accumulator.interactions();
    if seen != 1 && !seen.is_multiple_of(100) {
        return;
    }

    bar.tick();
    bar.set_message(format!(
        "Scanning — {} files, {} dirs, {} symlinks, {} special entries, {} errors",
        files, dirs, symlinks, special, errors
    ));
}
/// Builds a determinate byte-progress bar for a hashing pass.
///
/// The bar length is the sum, over `files`, of the bytes counted for each
/// file: the whole file size when `full_file` is true, otherwise the size
/// capped at `hash_bytes`. Returns `None` when progress is disabled, when
/// there are no files, or when the total is zero bytes.
///
/// # Panics
///
/// Panics only if the hard-coded bar template is rejected, which would be a
/// programming error.
fn hash_progress(
    enabled: bool,
    files: &[FileEntry],
    hash_bytes: u64,
    full_file: bool,
    message: &'static str,
) -> Option<ProgressBar> {
    if !enabled || files.is_empty() {
        return None;
    }

    // Saturate instead of using `sum()`: very large apparent sizes (for
    // example sparse files) could otherwise overflow `u64`, which panics in
    // debug builds and wraps in release builds. Other byte totals in this
    // file are accumulated with `saturating_add` as well.
    let total_bytes = files
        .iter()
        .map(|file| {
            if full_file {
                file.size
            } else {
                file.size.min(hash_bytes)
            }
        })
        .fold(0u64, u64::saturating_add);
    if total_bytes == 0 {
        return None;
    }

    let progress = ProgressBar::new(total_bytes);
    progress.set_style(
        ProgressStyle::with_template(
            "{msg} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {binary_bytes}/{binary_total_bytes} {binary_bytes_per_sec}",
        )
        .expect("valid hashing progress template")
        .progress_chars("=>-"),
    );
    progress.set_message(message);
    Some(progress)
}
fn classify_walk_entry(
entry: Result<ignore::DirEntry, ignore::Error>,
follow_links: bool,
@@ -386,24 +539,16 @@ fn non_symlink_entry(path: PathBuf, metadata: &Metadata) -> ScannedEntry {
}
}
fn collect_scanned_entry(
entry: ScannedEntry,
files: &mut Vec<FileEntry>,
symlinks: &mut Vec<SymlinkInfo>,
special_entries: &mut Vec<SpecialEntry>,
errors: &mut Vec<ScanIssue>,
directories: &mut usize,
total_file_bytes: &mut u64,
) {
fn collect_scanned_entry(entry: ScannedEntry, accumulator: &mut ScanAccumulator) {
match entry {
ScannedEntry::File(file) => {
*total_file_bytes = total_file_bytes.saturating_add(file.size);
files.push(file);
accumulator.total_file_bytes = accumulator.total_file_bytes.saturating_add(file.size);
accumulator.files.push(file);
}
ScannedEntry::Directory => *directories += 1,
ScannedEntry::Symlink(symlink) => symlinks.push(symlink),
ScannedEntry::Special(special_entry) => special_entries.push(special_entry),
ScannedEntry::Issue(error) => errors.push(error),
ScannedEntry::Directory => accumulator.directories += 1,
ScannedEntry::Symlink(symlink) => accumulator.symlinks.push(symlink),
ScannedEntry::Special(special_entry) => accumulator.special_entries.push(special_entry),
ScannedEntry::Issue(error) => accumulator.errors.push(error),
}
}
@@ -465,10 +610,13 @@ fn find_hard_links(files: &[FileEntry]) -> Vec<HardLinkGroup> {
.collect()
}
fn same_size_candidates(files: &[FileEntry]) -> Vec<FileEntry> {
fn same_size_candidates(files: &[FileEntry], min_size: u64) -> Vec<FileEntry> {
let files = unique_file_id_entries(files);
let mut by_size: BTreeMap<u64, Vec<FileEntry>> = BTreeMap::new();
for file in files {
if file.size < min_size {
continue;
}
by_size.entry(file.size).or_default().push(file);
}
@@ -479,6 +627,25 @@ fn same_size_candidates(files: &[FileEntry]) -> Vec<FileEntry> {
.collect()
}
/// Groups files purely by size, without hashing any content.
///
/// Every size shared by more than one path becomes a `DuplicateGroup` whose
/// `hash` field holds the literal marker `"size-only"`. Groups come out in
/// ascending size order and the paths inside each group are sorted.
fn size_only_duplicate_groups(files: Vec<FileEntry>) -> Vec<DuplicateGroup> {
    let mut paths_by_size: BTreeMap<u64, Vec<PathBuf>> = BTreeMap::new();
    files.into_iter().for_each(|entry| {
        paths_by_size.entry(entry.size).or_default().push(entry.path);
    });

    paths_by_size
        .into_iter()
        // A size seen only once cannot be a duplicate.
        .filter(|(_, paths)| paths.len() > 1)
        .map(|(size, mut paths)| {
            paths.sort();
            DuplicateGroup {
                size,
                hash: String::from("size-only"),
                paths,
            }
        })
        .collect()
}
fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
let mut by_file_id: BTreeMap<(u64, u64), &FileEntry> = BTreeMap::new();
for file in files {
@@ -488,14 +655,20 @@ fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
by_file_id.into_values().cloned().collect()
}
fn hash_files(files: &[FileEntry], hash_bytes: u64, full_file: bool) -> Vec<HashOutcome> {
files
fn hash_files(
files: &[FileEntry],
hash_bytes: u64,
full_file: bool,
progress: Option<ProgressBar>,
) -> Vec<HashOutcome> {
let outcomes = files
.par_iter()
.map(|file| {
let file_progress = progress.clone();
let hash_result = if full_file {
hash_full_file(&file.path)
hash_full_file(&file.path, file_progress.as_ref())
} else {
hash_file_prefix(&file.path, hash_bytes)
hash_file_prefix(&file.path, hash_bytes, file_progress.as_ref())
};
match hash_result {
@@ -510,7 +683,13 @@ fn hash_files(files: &[FileEntry], hash_bytes: u64, full_file: bool) -> Vec<Hash
)),
}
})
.collect()
.collect();
if let Some(progress) = progress {
progress.finish_and_clear();
}
outcomes
}
fn collect_hash_outcomes(
@@ -558,7 +737,11 @@ fn files_from_duplicate_groups(groups: &[DuplicateGroup]) -> Vec<FileEntry> {
.collect()
}
fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
fn hash_file_prefix(
path: &Path,
hash_bytes: u64,
progress: Option<&ProgressBar>,
) -> io::Result<String> {
let file = File::open(path)?;
let mut reader = BufReader::new(file);
let mut hasher = blake3::Hasher::new();
@@ -571,6 +754,9 @@ fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
if bytes_read == 0 {
break;
}
if let Some(progress) = progress {
progress.inc(bytes_read as u64);
}
hasher.update(&buffer[..bytes_read]);
remaining -= bytes_read as u64;
}
@@ -578,7 +764,7 @@ fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
Ok(hasher.finalize().to_hex().to_string())
}
fn hash_full_file(path: &Path) -> io::Result<String> {
fn hash_full_file(path: &Path, progress: Option<&ProgressBar>) -> io::Result<String> {
let file = File::open(path)?;
let mut reader = BufReader::new(file);
let mut hasher = blake3::Hasher::new();
@@ -589,6 +775,9 @@ fn hash_full_file(path: &Path) -> io::Result<String> {
if bytes_read == 0 {
break;
}
if let Some(progress) = progress {
progress.inc(bytes_read as u64);
}
hasher.update(&buffer[..bytes_read]);
}
@@ -609,6 +798,28 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
)?;
writeln!(writer, "Hash window: {}", format_bytes(report.hash_bytes))?;
writeln!(writer, "Worker threads: {}", report.worker_threads)?;
writeln!(
writer,
"Duplicate mode: {}",
if report.size_only {
"size only"
} else {
"size + partial hash"
}
)?;
writeln!(
writer,
"Minimum duplicate size: {}",
format_bytes(report.min_size)
)?;
writeln!(
writer,
"Maximum depth: {}",
report
.max_depth
.map(|depth| depth.to_string())
.unwrap_or_else(|| "unlimited".to_string())
)?;
writeln!(
writer,
"Symlink traversal: {}",
@@ -640,7 +851,7 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
)?;
writeln!(
writer,
"Same-size files hashed: {}",
"Same-size duplicate candidates: {}",
report.summary.same_size_candidate_files
)?;
writeln!(
@@ -675,7 +886,11 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
write_duplicate_section(
&mut writer,
"Possible duplicates (same size + partial hash)",
if report.size_only {
"Possible duplicates (same size only)"
} else {
"Possible duplicates (same size + partial hash)"
},
&report.possible_duplicates,
)?;
@@ -898,6 +1113,16 @@ impl SpecialEntryKind {
}
}
impl ScanAccumulator {
    /// Total number of entries recorded so far: files, directories, symlinks,
    /// special entries and errors combined.
    fn interactions(&self) -> usize {
        let per_category = [
            self.files.len(),
            self.directories,
            self.symlinks.len(),
            self.special_entries.len(),
            self.errors.len(),
        ];
        per_category.iter().sum()
    }
}
#[cfg(test)]
mod tests {
use super::*;
@@ -929,6 +1154,10 @@ mod tests {
follow_links: false,
verify_full: false,
threads: None,
size_only: false,
min_size: 0,
max_depth: None,
progress: false,
});
assert_eq!(report.summary.files, 3);
@@ -953,12 +1182,75 @@ mod tests {
follow_links: false,
verify_full: true,
threads: None,
size_only: false,
min_size: 0,
max_depth: None,
progress: false,
});
assert_eq!(report.possible_duplicates.len(), 1);
assert!(report.verified_duplicates.is_empty());
}
/// Two files with equal length but different bytes must still be grouped when
/// `size_only` is set, and the group must carry the `"size-only"` marker.
#[test]
fn size_only_mode_groups_same_size_without_hashing_prefixes() {
    let scratch = TempDir::new().expect("temp dir");
    let first = scratch.path().join("first.bin");
    let second = scratch.path().join("second.bin");
    // Same length, different content: only a size comparison can match them.
    fs::write(&first, b"abcdef").expect("write first");
    fs::write(&second, b"uvwxyz").expect("write second");

    let config = ScanConfig {
        paths: vec![scratch.path().to_path_buf()],
        hash_bytes: DEFAULT_HASH_BYTES,
        follow_links: false,
        verify_full: false,
        threads: None,
        size_only: true,
        min_size: 0,
        max_depth: None,
        progress: false,
    };
    let report = scan_paths(config);

    assert_eq!(report.possible_duplicates.len(), 1);
    let group = &report.possible_duplicates[0];
    assert_eq!(group.hash, "size-only");
    assert!(group.paths.contains(&first));
    assert!(group.paths.contains(&second));
}
/// Identical files below `min_size` must be left out of the duplicate groups,
/// while identical files at or above the threshold are still reported.
#[test]
fn min_size_filters_duplicate_candidates_before_hashing() {
    let scratch = TempDir::new().expect("temp dir");
    let small_first = scratch.path().join("small-first.bin");
    let small_second = scratch.path().join("small-second.bin");
    let large_first = scratch.path().join("large-first.bin");
    let large_second = scratch.path().join("large-second.bin");

    // 3-byte pair (below the threshold) and 6-byte pair (above it).
    let fixtures = [
        (&small_first, &b"abc"[..], "write small first"),
        (&small_second, &b"abc"[..], "write small second"),
        (&large_first, &b"abcdef"[..], "write large first"),
        (&large_second, &b"abcdef"[..], "write large second"),
    ];
    for (path, contents, failure) in fixtures {
        fs::write(path, contents).expect(failure);
    }

    let config = ScanConfig {
        paths: vec![scratch.path().to_path_buf()],
        hash_bytes: DEFAULT_HASH_BYTES,
        follow_links: false,
        verify_full: false,
        threads: None,
        size_only: false,
        min_size: 4,
        max_depth: None,
        progress: false,
    };
    let report = scan_paths(config);

    assert_eq!(report.possible_duplicates.len(), 1);
    let group = &report.possible_duplicates[0];
    for kept in [&large_first, &large_second] {
        assert!(group.paths.contains(kept));
    }
    for filtered in [&small_first, &small_second] {
        assert!(!group.paths.contains(filtered));
    }
}
#[cfg(unix)]
#[test]
fn reports_symlinks_without_following_them() {
@@ -976,6 +1268,10 @@ mod tests {
follow_links: false,
verify_full: false,
threads: None,
size_only: false,
min_size: 0,
max_depth: None,
progress: false,
});
assert_eq!(report.summary.files, 1);
@@ -999,6 +1295,10 @@ mod tests {
follow_links: false,
verify_full: false,
threads: None,
size_only: false,
min_size: 0,
max_depth: None,
progress: false,
});
assert_eq!(report.summary.files, 2);
@@ -1024,6 +1324,10 @@ mod tests {
follow_links: false,
verify_full: false,
threads: None,
size_only: false,
min_size: 0,
max_depth: None,
progress: false,
});
let json = serde_json::to_string(&report).expect("serialize report with lossy path");
@@ -1037,6 +1341,9 @@ mod tests {
scanned_paths: vec![PathBuf::from(".")],
hash_bytes: DEFAULT_HASH_BYTES,
worker_threads: 1,
size_only: false,
min_size: 0,
max_depth: None,
followed_symlinks: false,
full_verification: false,
summary: ScanSummary {

View File

@@ -1,10 +1,12 @@
use std::io::{self, Write};
use std::path::PathBuf;
use std::fs::OpenOptions;
use std::io::{self, IsTerminal, Write};
use std::path::{Path, PathBuf};
use std::process::ExitCode;
use anyhow::Context;
use anyhow::{Context, bail};
use clap::Parser;
use disk_checker::{ScanConfig, parse_byte_count, scan_paths, write_human_report};
use dialoguer::{Confirm, Select, theme::ColorfulTheme};
use disk_checker::{DuplicateGroup, ScanConfig, parse_byte_count, scan_paths, write_human_report};
#[derive(Debug, Parser)]
#[command(
@@ -31,6 +33,18 @@ struct Cli {
#[arg(long)]
verify_full: bool,
/// Group duplicate candidates by size only. Fastest mode for huge triage; less precise.
#[arg(long)]
size_only: bool,
/// Ignore duplicate candidates smaller than this size. Accepts units like 100MiB or 1GB.
#[arg(long, default_value = "0", value_parser = parse_min_size)]
min_size: u64,
/// Maximum directory depth to scan. Depth 0 means only the provided path itself.
#[arg(long, value_parser = parse_nonzero_or_zero_usize)]
max_depth: Option<usize>,
/// Number of worker threads used for scanning and hashing. Defaults to CPU parallelism.
#[arg(long, value_parser = parse_thread_count)]
threads: Option<usize>,
@@ -38,6 +52,18 @@ struct Cli {
/// Print machine-readable JSON instead of the human summary.
#[arg(long)]
json: bool,
/// Disable progress output.
#[arg(long)]
no_progress: bool,
/// Interactively review duplicate groups and choose which path to keep.
#[arg(long)]
interactive: bool,
/// Shell script path for planned deletes when --interactive is used.
#[arg(long, default_value = "disk-checker-delete-plan.sh")]
delete_plan: PathBuf,
}
fn parse_thread_count(input: &str) -> Result<usize, String> {
@@ -51,8 +77,28 @@ fn parse_thread_count(input: &str) -> Result<usize, String> {
}
}
/// Value parser for `--max-depth`: accepts any non-negative integer, zero included.
///
/// Returns the parsed depth, or a message of the form
/// `invalid depth "<input>": <parse error>` when `input` is not a valid `usize`.
fn parse_nonzero_or_zero_usize(input: &str) -> Result<usize, String> {
    match input.parse::<usize>() {
        Ok(depth) => Ok(depth),
        Err(error) => Err(format!("invalid depth {input:?}: {error}")),
    }
}
/// Value parser for `--min-size`.
///
/// A bare `0` (surrounding whitespace ignored) is accepted directly as zero
/// bytes; any other input is handed, untrimmed, to `parse_byte_count`.
fn parse_min_size(input: &str) -> Result<u64, String> {
    match input.trim() {
        "0" => Ok(0),
        _ => parse_byte_count(input),
    }
}
fn main() -> anyhow::Result<ExitCode> {
let cli = Cli::parse();
if cli.interactive && cli.json {
bail!(
"--interactive cannot be combined with --json because prompts would contaminate JSON output"
);
}
let verify_full = cli.verify_full || cli.interactive;
if let Some(threads) = cli.threads {
rayon::ThreadPoolBuilder::new()
@@ -71,8 +117,12 @@ fn main() -> anyhow::Result<ExitCode> {
paths,
hash_bytes: cli.hash_bytes,
follow_links: cli.follow_links,
verify_full: cli.verify_full,
verify_full,
threads: cli.threads,
size_only: cli.size_only,
min_size: cli.min_size,
max_depth: cli.max_depth,
progress: !cli.no_progress && !cli.json && io::stderr().is_terminal(),
});
let stdout = io::stdout();
@@ -83,6 +133,11 @@ fn main() -> anyhow::Result<ExitCode> {
} else {
write_human_report(&mut out, &report).context("failed to write report")?;
}
drop(out);
if cli.interactive {
run_interactive_resolver(&report.verified_duplicates, true, &cli.delete_plan)?;
}
if report.summary.errors > 0 {
Ok(ExitCode::from(2))
@@ -90,3 +145,148 @@ fn main() -> anyhow::Result<ExitCode> {
Ok(ExitCode::SUCCESS)
}
}
fn run_interactive_resolver(
groups: &[DuplicateGroup],
verified: bool,
delete_plan: &PathBuf,
) -> anyhow::Result<()> {
if groups.is_empty() {
println!("No duplicate groups to resolve.");
return Ok(());
}
let theme = ColorfulTheme::default();
let mut planned_deletes = Vec::new();
let mut skipped = 0usize;
for (group_index, group) in groups.iter().enumerate() {
println!();
println!(
"Duplicate group {}/{}{} across {} files",
group_index + 1,
groups.len(),
disk_checker::format_bytes(group.size),
group.paths.len()
);
let mut choices = group
.paths
.iter()
.map(|path| path.display().to_string())
.collect::<Vec<_>>();
choices.push("Skip this group".to_string());
let selection = Select::with_theme(&theme)
.with_prompt("Choose the version to keep")
.items(&choices)
.default(0)
.interact()
.context("interactive selection failed")?;
if selection == group.paths.len() {
skipped += 1;
continue;
}
let keep_path = &group.paths[selection];
let delete_paths = group
.paths
.iter()
.filter(|path| *path != keep_path)
.cloned()
.collect::<Vec<_>>();
println!("Keeping: {}", keep_path.display());
for path in &delete_paths {
println!(" remove: {}", path.display());
}
let confirmed = Confirm::with_theme(&theme)
.with_prompt("Add these files to the deletion plan?")
.default(false)
.interact()
.context("interactive confirmation failed")?;
if !confirmed {
skipped += 1;
continue;
}
planned_deletes.extend(delete_paths);
}
if !planned_deletes.is_empty() {
write_delete_plan(delete_plan, &planned_deletes, verified)?;
println!(
"Wrote deletion plan for {} files: {}",
planned_deletes.len(),
delete_plan.display()
);
println!("Review it, then run: sh {}", delete_plan.display());
}
println!(
"Interactive resolver complete: {} planned, {} groups skipped.",
planned_deletes.len(),
skipped
);
Ok(())
}
fn write_delete_plan(
path: &PathBuf,
delete_paths: &[PathBuf],
verified: bool,
) -> anyhow::Result<()> {
let mut file = OpenOptions::new()
.write(true)
.create_new(true)
.open(path)
.with_context(|| format!("failed to create delete plan {}", path.display()))?;
writeln!(file, "#!/bin/sh")?;
writeln!(file, "set -eu")?;
writeln!(
file,
"# Review carefully before running. Generated by disk-checker."
)?;
if verified {
writeln!(
file,
"# Source groups were fully verified with --verify-full."
)?;
} else {
writeln!(
file,
"# WARNING: Source groups were possible duplicates only, not fully verified."
)?;
}
for delete_path in delete_paths {
writeln!(file, "rm -- {}", shell_quote(delete_path)?)?;
}
Ok(())
}
/// Wraps `path` in single quotes for use as one word in a POSIX shell script.
///
/// Each single quote inside the path is emitted as `'\''` (close the quoted
/// string, add an escaped quote, reopen the string).
///
/// # Errors
///
/// Fails when the path is not valid UTF-8.
fn shell_quote(path: &Path) -> anyhow::Result<String> {
    let raw = path.to_str().with_context(|| {
        format!(
            "delete plan cannot safely encode non-UTF-8 path: {}",
            path.display()
        )
    })?;

    let mut quoted = String::with_capacity(raw.len() + 2);
    quoted.push('\'');
    for ch in raw.chars() {
        if ch == '\'' {
            quoted.push_str("'\\''");
        } else {
            quoted.push(ch);
        }
    }
    quoted.push('\'');
    Ok(quoted)
}
#[cfg(test)]
mod tests {
    use super::shell_quote;
    use std::path::Path;

    /// A single quote inside the path must come out as the `'\''` sequence,
    /// with the whole value wrapped in single quotes.
    #[test]
    fn shell_quote_escapes_single_quotes() {
        let input = Path::new("/tmp/it's-here.txt");
        let quoted = shell_quote(input).expect("quote path");
        assert_eq!(quoted, "'/tmp/it'\\''s-here.txt'");
    }
}