Add duplicate resolver and real progress
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
417
src/lib.rs
417
src/lib.rs
@@ -6,6 +6,7 @@ use std::sync::mpsc;
|
||||
use std::thread;
|
||||
|
||||
use ignore::{WalkBuilder, WalkState};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use rayon::prelude::*;
|
||||
use serde::{Serialize, Serializer};
|
||||
|
||||
@@ -22,6 +23,10 @@ pub struct ScanConfig {
|
||||
pub follow_links: bool,
|
||||
pub verify_full: bool,
|
||||
pub threads: Option<usize>,
|
||||
pub size_only: bool,
|
||||
pub min_size: u64,
|
||||
pub max_depth: Option<usize>,
|
||||
pub progress: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
@@ -30,6 +35,9 @@ pub struct ScanReport {
|
||||
pub scanned_paths: Vec<PathBuf>,
|
||||
pub hash_bytes: u64,
|
||||
pub worker_threads: usize,
|
||||
pub size_only: bool,
|
||||
pub min_size: u64,
|
||||
pub max_depth: Option<usize>,
|
||||
pub followed_symlinks: bool,
|
||||
pub full_verification: bool,
|
||||
pub summary: ScanSummary,
|
||||
@@ -139,6 +147,16 @@ enum HashOutcome {
|
||||
Issue(ScanIssue),
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct ScanAccumulator {
|
||||
files: Vec<FileEntry>,
|
||||
symlinks: Vec<SymlinkInfo>,
|
||||
special_entries: Vec<SpecialEntry>,
|
||||
errors: Vec<ScanIssue>,
|
||||
directories: usize,
|
||||
total_file_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
enum ScannedEntry {
|
||||
File(FileEntry),
|
||||
@@ -198,12 +216,13 @@ pub fn parse_byte_count(input: &str) -> Result<u64, String> {
|
||||
pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
||||
let hash_bytes = config.hash_bytes.max(1);
|
||||
let worker_threads = worker_threads(config.threads);
|
||||
let min_size = config.min_size;
|
||||
let mut files = Vec::new();
|
||||
let mut symlinks = Vec::new();
|
||||
let mut special_entries = Vec::new();
|
||||
let mut errors = Vec::new();
|
||||
let mut directories = 0;
|
||||
let mut total_file_bytes = 0;
|
||||
let mut directories: usize = 0;
|
||||
let mut total_file_bytes: u64 = 0;
|
||||
|
||||
for root in &config.paths {
|
||||
let mut builder = WalkBuilder::new(root);
|
||||
@@ -216,33 +235,21 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
||||
.git_global(false)
|
||||
.git_exclude(false)
|
||||
.parents(false);
|
||||
|
||||
let (sender, receiver) = mpsc::channel();
|
||||
builder.build_parallel().run(|| {
|
||||
let sender = sender.clone();
|
||||
let follow_links = config.follow_links;
|
||||
Box::new(move |entry| {
|
||||
for scanned_entry in classify_walk_entry(entry, follow_links) {
|
||||
if sender.send(scanned_entry).is_err() {
|
||||
return WalkState::Quit;
|
||||
if let Some(max_depth) = config.max_depth {
|
||||
builder.max_depth(Some(max_depth));
|
||||
}
|
||||
}
|
||||
WalkState::Continue
|
||||
})
|
||||
});
|
||||
drop(sender);
|
||||
|
||||
for scanned_entry in receiver {
|
||||
collect_scanned_entry(
|
||||
scanned_entry,
|
||||
&mut files,
|
||||
&mut symlinks,
|
||||
&mut special_entries,
|
||||
&mut errors,
|
||||
&mut directories,
|
||||
&mut total_file_bytes,
|
||||
let accumulator = walk_root_parallel(
|
||||
&mut builder,
|
||||
config.follow_links,
|
||||
walk_progress(config.progress, root),
|
||||
);
|
||||
}
|
||||
files.extend(accumulator.files);
|
||||
symlinks.extend(accumulator.symlinks);
|
||||
special_entries.extend(accumulator.special_entries);
|
||||
errors.extend(accumulator.errors);
|
||||
directories += accumulator.directories;
|
||||
total_file_bytes = total_file_bytes.saturating_add(accumulator.total_file_bytes);
|
||||
}
|
||||
|
||||
files.sort_by(|left, right| left.path.cmp(&right.path));
|
||||
@@ -250,17 +257,43 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
||||
special_entries.sort_by(|left, right| left.path.cmp(&right.path));
|
||||
|
||||
let hard_links = find_hard_links(&files);
|
||||
let same_size_candidates = same_size_candidates(&files);
|
||||
let same_size_candidates = same_size_candidates(&files, min_size);
|
||||
let same_size_candidate_files = same_size_candidates.len();
|
||||
|
||||
let partial_outcomes = hash_files(&same_size_candidates, hash_bytes, false);
|
||||
let possible_duplicates = if config.size_only {
|
||||
size_only_duplicate_groups(same_size_candidates.clone())
|
||||
} else {
|
||||
let partial_outcomes = hash_files(
|
||||
&same_size_candidates,
|
||||
hash_bytes,
|
||||
false,
|
||||
hash_progress(
|
||||
config.progress,
|
||||
&same_size_candidates,
|
||||
hash_bytes,
|
||||
false,
|
||||
"Hashing file prefixes",
|
||||
),
|
||||
);
|
||||
let mut partial_hashes = Vec::new();
|
||||
collect_hash_outcomes(partial_outcomes, &mut partial_hashes, &mut errors);
|
||||
let possible_duplicates = duplicate_groups(partial_hashes);
|
||||
duplicate_groups(partial_hashes)
|
||||
};
|
||||
|
||||
let verified_duplicates = if config.verify_full {
|
||||
let full_candidates = files_from_duplicate_groups(&possible_duplicates);
|
||||
let full_outcomes = hash_files(&full_candidates, hash_bytes, true);
|
||||
let full_outcomes = hash_files(
|
||||
&full_candidates,
|
||||
hash_bytes,
|
||||
true,
|
||||
hash_progress(
|
||||
config.progress,
|
||||
&full_candidates,
|
||||
hash_bytes,
|
||||
true,
|
||||
"Full verification hashing",
|
||||
),
|
||||
);
|
||||
let mut full_hashes = Vec::new();
|
||||
collect_hash_outcomes(full_outcomes, &mut full_hashes, &mut errors);
|
||||
duplicate_groups(full_hashes)
|
||||
@@ -291,6 +324,9 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
||||
scanned_paths: config.paths,
|
||||
hash_bytes,
|
||||
worker_threads,
|
||||
size_only: config.size_only,
|
||||
min_size,
|
||||
max_depth: config.max_depth,
|
||||
followed_symlinks: config.follow_links,
|
||||
full_verification: config.verify_full,
|
||||
summary: ScanSummary {
|
||||
@@ -325,6 +361,123 @@ fn worker_threads(configured_threads: Option<usize>) -> usize {
|
||||
})
|
||||
}
|
||||
|
||||
fn walk_root_parallel(
|
||||
builder: &mut WalkBuilder,
|
||||
follow_links: bool,
|
||||
progress: Option<ProgressBar>,
|
||||
) -> ScanAccumulator {
|
||||
let (sender, receiver) = mpsc::channel();
|
||||
let collector = thread::spawn(move || {
|
||||
let mut accumulator = ScanAccumulator::default();
|
||||
for scanned_entry in receiver {
|
||||
collect_scanned_entry(scanned_entry, &mut accumulator);
|
||||
update_walk_progress(progress.as_ref(), &accumulator, false);
|
||||
}
|
||||
update_walk_progress(progress.as_ref(), &accumulator, true);
|
||||
accumulator
|
||||
});
|
||||
|
||||
builder.build_parallel().run(|| {
|
||||
let sender = sender.clone();
|
||||
Box::new(move |entry| {
|
||||
for scanned_entry in classify_walk_entry(entry, follow_links) {
|
||||
if sender.send(scanned_entry).is_err() {
|
||||
return WalkState::Quit;
|
||||
}
|
||||
}
|
||||
WalkState::Continue
|
||||
})
|
||||
});
|
||||
drop(sender);
|
||||
|
||||
collector
|
||||
.join()
|
||||
.expect("scan result collector thread should not panic")
|
||||
}
|
||||
|
||||
fn walk_progress(enabled: bool, root: &Path) -> Option<ProgressBar> {
|
||||
if !enabled {
|
||||
return None;
|
||||
}
|
||||
|
||||
let progress = ProgressBar::new_spinner();
|
||||
progress.set_style(
|
||||
ProgressStyle::with_template("{spinner:.green} {msg}")
|
||||
.expect("valid traversal progress template"),
|
||||
);
|
||||
progress.set_message(format!(
|
||||
"Scanning {} — 0 files, 0 dirs, 0 symlinks, 0 errors",
|
||||
root.display()
|
||||
));
|
||||
Some(progress)
|
||||
}
|
||||
|
||||
/// Refreshes the traversal spinner from the current accumulator state.
///
/// Does nothing when `progress` is `None`. With `done` set, the spinner is
/// finished with a final summary. Otherwise the message is only redrawn for
/// the first entry and then every 100th entry, so the collector is not slowed
/// down by formatting on every item.
fn update_walk_progress(progress: Option<&ProgressBar>, accumulator: &ScanAccumulator, done: bool) {
    let Some(bar) = progress else {
        return;
    };

    let file_count = accumulator.files.len();
    let dir_count = accumulator.directories;
    let symlink_count = accumulator.symlinks.len();
    let special_count = accumulator.special_entries.len();
    let error_count = accumulator.errors.len();

    if done {
        bar.finish_with_message(format!(
            "Scanned {} files, {} dirs, {} symlinks, {} special entries, {} errors",
            file_count, dir_count, symlink_count, special_count, error_count
        ));
        return;
    }

    let seen = accumulator.interactions();
    if seen != 1 && !seen.is_multiple_of(100) {
        return;
    }

    bar.tick();
    bar.set_message(format!(
        "Scanning — {} files, {} dirs, {} symlinks, {} special entries, {} errors",
        file_count, dir_count, symlink_count, special_count, error_count
    ));
}
|
||||
|
||||
fn hash_progress(
|
||||
enabled: bool,
|
||||
files: &[FileEntry],
|
||||
hash_bytes: u64,
|
||||
full_file: bool,
|
||||
message: &'static str,
|
||||
) -> Option<ProgressBar> {
|
||||
if !enabled || files.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let total_bytes = files
|
||||
.iter()
|
||||
.map(|file| {
|
||||
if full_file {
|
||||
file.size
|
||||
} else {
|
||||
file.size.min(hash_bytes)
|
||||
}
|
||||
})
|
||||
.sum::<u64>();
|
||||
|
||||
if total_bytes == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let progress = ProgressBar::new(total_bytes);
|
||||
progress.set_style(
|
||||
ProgressStyle::with_template(
|
||||
"{msg} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {binary_bytes}/{binary_total_bytes} {binary_bytes_per_sec}",
|
||||
)
|
||||
.expect("valid hashing progress template")
|
||||
.progress_chars("=>-"),
|
||||
);
|
||||
progress.set_message(message);
|
||||
Some(progress)
|
||||
}
|
||||
|
||||
fn classify_walk_entry(
|
||||
entry: Result<ignore::DirEntry, ignore::Error>,
|
||||
follow_links: bool,
|
||||
@@ -386,24 +539,16 @@ fn non_symlink_entry(path: PathBuf, metadata: &Metadata) -> ScannedEntry {
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_scanned_entry(
|
||||
entry: ScannedEntry,
|
||||
files: &mut Vec<FileEntry>,
|
||||
symlinks: &mut Vec<SymlinkInfo>,
|
||||
special_entries: &mut Vec<SpecialEntry>,
|
||||
errors: &mut Vec<ScanIssue>,
|
||||
directories: &mut usize,
|
||||
total_file_bytes: &mut u64,
|
||||
) {
|
||||
fn collect_scanned_entry(entry: ScannedEntry, accumulator: &mut ScanAccumulator) {
|
||||
match entry {
|
||||
ScannedEntry::File(file) => {
|
||||
*total_file_bytes = total_file_bytes.saturating_add(file.size);
|
||||
files.push(file);
|
||||
accumulator.total_file_bytes = accumulator.total_file_bytes.saturating_add(file.size);
|
||||
accumulator.files.push(file);
|
||||
}
|
||||
ScannedEntry::Directory => *directories += 1,
|
||||
ScannedEntry::Symlink(symlink) => symlinks.push(symlink),
|
||||
ScannedEntry::Special(special_entry) => special_entries.push(special_entry),
|
||||
ScannedEntry::Issue(error) => errors.push(error),
|
||||
ScannedEntry::Directory => accumulator.directories += 1,
|
||||
ScannedEntry::Symlink(symlink) => accumulator.symlinks.push(symlink),
|
||||
ScannedEntry::Special(special_entry) => accumulator.special_entries.push(special_entry),
|
||||
ScannedEntry::Issue(error) => accumulator.errors.push(error),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -465,10 +610,13 @@ fn find_hard_links(files: &[FileEntry]) -> Vec<HardLinkGroup> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn same_size_candidates(files: &[FileEntry]) -> Vec<FileEntry> {
|
||||
fn same_size_candidates(files: &[FileEntry], min_size: u64) -> Vec<FileEntry> {
|
||||
let files = unique_file_id_entries(files);
|
||||
let mut by_size: BTreeMap<u64, Vec<FileEntry>> = BTreeMap::new();
|
||||
for file in files {
|
||||
if file.size < min_size {
|
||||
continue;
|
||||
}
|
||||
by_size.entry(file.size).or_default().push(file);
|
||||
}
|
||||
|
||||
@@ -479,6 +627,25 @@ fn same_size_candidates(files: &[FileEntry]) -> Vec<FileEntry> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn size_only_duplicate_groups(files: Vec<FileEntry>) -> Vec<DuplicateGroup> {
|
||||
let mut by_size: BTreeMap<u64, Vec<PathBuf>> = BTreeMap::new();
|
||||
for file in files {
|
||||
by_size.entry(file.size).or_default().push(file.path);
|
||||
}
|
||||
|
||||
by_size
|
||||
.into_iter()
|
||||
.filter_map(|(size, mut paths)| {
|
||||
paths.sort();
|
||||
(paths.len() > 1).then_some(DuplicateGroup {
|
||||
size,
|
||||
hash: "size-only".to_string(),
|
||||
paths,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
|
||||
let mut by_file_id: BTreeMap<(u64, u64), &FileEntry> = BTreeMap::new();
|
||||
for file in files {
|
||||
@@ -488,14 +655,20 @@ fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
|
||||
by_file_id.into_values().cloned().collect()
|
||||
}
|
||||
|
||||
fn hash_files(files: &[FileEntry], hash_bytes: u64, full_file: bool) -> Vec<HashOutcome> {
|
||||
files
|
||||
fn hash_files(
|
||||
files: &[FileEntry],
|
||||
hash_bytes: u64,
|
||||
full_file: bool,
|
||||
progress: Option<ProgressBar>,
|
||||
) -> Vec<HashOutcome> {
|
||||
let outcomes = files
|
||||
.par_iter()
|
||||
.map(|file| {
|
||||
let file_progress = progress.clone();
|
||||
let hash_result = if full_file {
|
||||
hash_full_file(&file.path)
|
||||
hash_full_file(&file.path, file_progress.as_ref())
|
||||
} else {
|
||||
hash_file_prefix(&file.path, hash_bytes)
|
||||
hash_file_prefix(&file.path, hash_bytes, file_progress.as_ref())
|
||||
};
|
||||
|
||||
match hash_result {
|
||||
@@ -510,7 +683,13 @@ fn hash_files(files: &[FileEntry], hash_bytes: u64, full_file: bool) -> Vec<Hash
|
||||
)),
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.collect();
|
||||
|
||||
if let Some(progress) = progress {
|
||||
progress.finish_and_clear();
|
||||
}
|
||||
|
||||
outcomes
|
||||
}
|
||||
|
||||
fn collect_hash_outcomes(
|
||||
@@ -558,7 +737,11 @@ fn files_from_duplicate_groups(groups: &[DuplicateGroup]) -> Vec<FileEntry> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
|
||||
fn hash_file_prefix(
|
||||
path: &Path,
|
||||
hash_bytes: u64,
|
||||
progress: Option<&ProgressBar>,
|
||||
) -> io::Result<String> {
|
||||
let file = File::open(path)?;
|
||||
let mut reader = BufReader::new(file);
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
@@ -571,6 +754,9 @@ fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
|
||||
if bytes_read == 0 {
|
||||
break;
|
||||
}
|
||||
if let Some(progress) = progress {
|
||||
progress.inc(bytes_read as u64);
|
||||
}
|
||||
hasher.update(&buffer[..bytes_read]);
|
||||
remaining -= bytes_read as u64;
|
||||
}
|
||||
@@ -578,7 +764,7 @@ fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
|
||||
Ok(hasher.finalize().to_hex().to_string())
|
||||
}
|
||||
|
||||
fn hash_full_file(path: &Path) -> io::Result<String> {
|
||||
fn hash_full_file(path: &Path, progress: Option<&ProgressBar>) -> io::Result<String> {
|
||||
let file = File::open(path)?;
|
||||
let mut reader = BufReader::new(file);
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
@@ -589,6 +775,9 @@ fn hash_full_file(path: &Path) -> io::Result<String> {
|
||||
if bytes_read == 0 {
|
||||
break;
|
||||
}
|
||||
if let Some(progress) = progress {
|
||||
progress.inc(bytes_read as u64);
|
||||
}
|
||||
hasher.update(&buffer[..bytes_read]);
|
||||
}
|
||||
|
||||
@@ -609,6 +798,28 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
|
||||
)?;
|
||||
writeln!(writer, "Hash window: {}", format_bytes(report.hash_bytes))?;
|
||||
writeln!(writer, "Worker threads: {}", report.worker_threads)?;
|
||||
writeln!(
|
||||
writer,
|
||||
"Duplicate mode: {}",
|
||||
if report.size_only {
|
||||
"size only"
|
||||
} else {
|
||||
"size + partial hash"
|
||||
}
|
||||
)?;
|
||||
writeln!(
|
||||
writer,
|
||||
"Minimum duplicate size: {}",
|
||||
format_bytes(report.min_size)
|
||||
)?;
|
||||
writeln!(
|
||||
writer,
|
||||
"Maximum depth: {}",
|
||||
report
|
||||
.max_depth
|
||||
.map(|depth| depth.to_string())
|
||||
.unwrap_or_else(|| "unlimited".to_string())
|
||||
)?;
|
||||
writeln!(
|
||||
writer,
|
||||
"Symlink traversal: {}",
|
||||
@@ -640,7 +851,7 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
|
||||
)?;
|
||||
writeln!(
|
||||
writer,
|
||||
"Same-size files hashed: {}",
|
||||
"Same-size duplicate candidates: {}",
|
||||
report.summary.same_size_candidate_files
|
||||
)?;
|
||||
writeln!(
|
||||
@@ -675,7 +886,11 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
|
||||
|
||||
write_duplicate_section(
|
||||
&mut writer,
|
||||
"Possible duplicates (same size + partial hash)",
|
||||
if report.size_only {
|
||||
"Possible duplicates (same size only)"
|
||||
} else {
|
||||
"Possible duplicates (same size + partial hash)"
|
||||
},
|
||||
&report.possible_duplicates,
|
||||
)?;
|
||||
|
||||
@@ -898,6 +1113,16 @@ impl SpecialEntryKind {
|
||||
}
|
||||
}
|
||||
|
||||
impl ScanAccumulator {
|
||||
fn interactions(&self) -> usize {
|
||||
self.files.len()
|
||||
+ self.directories
|
||||
+ self.symlinks.len()
|
||||
+ self.special_entries.len()
|
||||
+ self.errors.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -929,6 +1154,10 @@ mod tests {
|
||||
follow_links: false,
|
||||
verify_full: false,
|
||||
threads: None,
|
||||
size_only: false,
|
||||
min_size: 0,
|
||||
max_depth: None,
|
||||
progress: false,
|
||||
});
|
||||
|
||||
assert_eq!(report.summary.files, 3);
|
||||
@@ -953,12 +1182,75 @@ mod tests {
|
||||
follow_links: false,
|
||||
verify_full: true,
|
||||
threads: None,
|
||||
size_only: false,
|
||||
min_size: 0,
|
||||
max_depth: None,
|
||||
progress: false,
|
||||
});
|
||||
|
||||
assert_eq!(report.possible_duplicates.len(), 1);
|
||||
assert!(report.verified_duplicates.is_empty());
|
||||
}
|
||||
|
||||
#[test]
fn size_only_mode_groups_same_size_without_hashing_prefixes() {
    // Two files with equal length but different bytes: only a size-based
    // comparison would ever put them in the same group.
    let temp = TempDir::new().expect("temp dir");
    let root = temp.path();
    let first = root.join("first.bin");
    let second = root.join("second.bin");
    fs::write(&first, b"abcdef").expect("write first");
    fs::write(&second, b"uvwxyz").expect("write second");

    let config = ScanConfig {
        paths: vec![root.to_path_buf()],
        hash_bytes: DEFAULT_HASH_BYTES,
        follow_links: false,
        verify_full: false,
        threads: None,
        size_only: true,
        min_size: 0,
        max_depth: None,
        progress: false,
    };
    let report = scan_paths(config);

    let groups = &report.possible_duplicates;
    assert_eq!(groups.len(), 1);
    assert_eq!(groups[0].hash, "size-only");
    for expected in [&first, &second] {
        assert!(groups[0].paths.contains(expected));
    }
}
|
||||
|
||||
#[test]
fn min_size_filters_duplicate_candidates_before_hashing() {
    // Two identical 3-byte files and two identical 6-byte files; with a
    // 4-byte minimum only the larger pair may be reported.
    let temp = TempDir::new().expect("temp dir");
    let root = temp.path();
    let small_first = root.join("small-first.bin");
    let small_second = root.join("small-second.bin");
    let large_first = root.join("large-first.bin");
    let large_second = root.join("large-second.bin");

    for (target, contents, failure) in [
        (&small_first, &b"abc"[..], "write small first"),
        (&small_second, &b"abc"[..], "write small second"),
        (&large_first, &b"abcdef"[..], "write large first"),
        (&large_second, &b"abcdef"[..], "write large second"),
    ] {
        fs::write(target, contents).expect(failure);
    }

    let config = ScanConfig {
        paths: vec![root.to_path_buf()],
        hash_bytes: DEFAULT_HASH_BYTES,
        follow_links: false,
        verify_full: false,
        threads: None,
        size_only: false,
        min_size: 4,
        max_depth: None,
        progress: false,
    };
    let report = scan_paths(config);

    assert_eq!(report.possible_duplicates.len(), 1);
    let reported = &report.possible_duplicates[0].paths;
    assert!(reported.contains(&large_first));
    assert!(reported.contains(&large_second));
    assert!(!reported.contains(&small_first));
    assert!(!reported.contains(&small_second));
}
|
||||
|
||||
#[cfg(unix)]
|
||||
#[test]
|
||||
fn reports_symlinks_without_following_them() {
|
||||
@@ -976,6 +1268,10 @@ mod tests {
|
||||
follow_links: false,
|
||||
verify_full: false,
|
||||
threads: None,
|
||||
size_only: false,
|
||||
min_size: 0,
|
||||
max_depth: None,
|
||||
progress: false,
|
||||
});
|
||||
|
||||
assert_eq!(report.summary.files, 1);
|
||||
@@ -999,6 +1295,10 @@ mod tests {
|
||||
follow_links: false,
|
||||
verify_full: false,
|
||||
threads: None,
|
||||
size_only: false,
|
||||
min_size: 0,
|
||||
max_depth: None,
|
||||
progress: false,
|
||||
});
|
||||
|
||||
assert_eq!(report.summary.files, 2);
|
||||
@@ -1024,6 +1324,10 @@ mod tests {
|
||||
follow_links: false,
|
||||
verify_full: false,
|
||||
threads: None,
|
||||
size_only: false,
|
||||
min_size: 0,
|
||||
max_depth: None,
|
||||
progress: false,
|
||||
});
|
||||
|
||||
let json = serde_json::to_string(&report).expect("serialize report with lossy path");
|
||||
@@ -1037,6 +1341,9 @@ mod tests {
|
||||
scanned_paths: vec![PathBuf::from(".")],
|
||||
hash_bytes: DEFAULT_HASH_BYTES,
|
||||
worker_threads: 1,
|
||||
size_only: false,
|
||||
min_size: 0,
|
||||
max_depth: None,
|
||||
followed_symlinks: false,
|
||||
full_verification: false,
|
||||
summary: ScanSummary {
|
||||
|
||||
217
src/main.rs
217
src/main.rs
@@ -1,10 +1,12 @@
|
||||
use std::io::{self, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{self, IsTerminal, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::ExitCode;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{Context, bail};
|
||||
use clap::Parser;
|
||||
use disk_checker::{ScanConfig, parse_byte_count, scan_paths, write_human_report};
|
||||
use dialoguer::{Confirm, Select, theme::ColorfulTheme};
|
||||
use disk_checker::{DuplicateGroup, ScanConfig, parse_byte_count, scan_paths, write_human_report};
|
||||
|
||||
#[derive(Debug, Parser)]
|
||||
#[command(
|
||||
@@ -31,6 +33,18 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
verify_full: bool,
|
||||
|
||||
/// Group duplicate candidates by size only. Fastest mode for huge triage; less precise.
|
||||
#[arg(long)]
|
||||
size_only: bool,
|
||||
|
||||
/// Ignore duplicate candidates smaller than this size. Accepts units like 100MiB or 1GB.
|
||||
#[arg(long, default_value = "0", value_parser = parse_min_size)]
|
||||
min_size: u64,
|
||||
|
||||
/// Maximum directory depth to scan. Depth 0 means only the provided path itself.
|
||||
#[arg(long, value_parser = parse_nonzero_or_zero_usize)]
|
||||
max_depth: Option<usize>,
|
||||
|
||||
/// Number of worker threads used for scanning and hashing. Defaults to CPU parallelism.
|
||||
#[arg(long, value_parser = parse_thread_count)]
|
||||
threads: Option<usize>,
|
||||
@@ -38,6 +52,18 @@ struct Cli {
|
||||
/// Print machine-readable JSON instead of the human summary.
|
||||
#[arg(long)]
|
||||
json: bool,
|
||||
|
||||
/// Disable progress output.
|
||||
#[arg(long)]
|
||||
no_progress: bool,
|
||||
|
||||
/// Interactively review duplicate groups and choose which path to keep.
|
||||
#[arg(long)]
|
||||
interactive: bool,
|
||||
|
||||
/// Shell script path for planned deletes when --interactive is used.
|
||||
#[arg(long, default_value = "disk-checker-delete-plan.sh")]
|
||||
delete_plan: PathBuf,
|
||||
}
|
||||
|
||||
fn parse_thread_count(input: &str) -> Result<usize, String> {
|
||||
@@ -51,8 +77,32 @@ fn parse_thread_count(input: &str) -> Result<usize, String> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a `--max-depth` value: any `usize`, zero included.
///
/// On failure the error text quotes the rejected input together with the
/// underlying integer-parse error.
fn parse_nonzero_or_zero_usize(input: &str) -> Result<usize, String> {
    match input.parse::<usize>() {
        Ok(depth) => Ok(depth),
        Err(error) => Err(format!("invalid depth {input:?}: {error}")),
    }
}
|
||||
|
||||
fn parse_min_size(input: &str) -> Result<u64, String> {
|
||||
if input.trim() == "0" {
|
||||
Ok(0)
|
||||
} else {
|
||||
parse_byte_count(input)
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<ExitCode> {
|
||||
let cli = Cli::parse();
|
||||
if cli.interactive && !cli.verify_full {
|
||||
bail!(
|
||||
"--interactive requires --verify-full so keep/delete plans are based on fully verified duplicates"
|
||||
);
|
||||
}
|
||||
if cli.interactive && cli.json {
|
||||
bail!(
|
||||
"--interactive cannot be combined with --json because prompts would contaminate JSON output"
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(threads) = cli.threads {
|
||||
rayon::ThreadPoolBuilder::new()
|
||||
@@ -73,6 +123,10 @@ fn main() -> anyhow::Result<ExitCode> {
|
||||
follow_links: cli.follow_links,
|
||||
verify_full: cli.verify_full,
|
||||
threads: cli.threads,
|
||||
size_only: cli.size_only,
|
||||
min_size: cli.min_size,
|
||||
max_depth: cli.max_depth,
|
||||
progress: !cli.no_progress && !cli.json && io::stderr().is_terminal(),
|
||||
});
|
||||
|
||||
let stdout = io::stdout();
|
||||
@@ -83,6 +137,16 @@ fn main() -> anyhow::Result<ExitCode> {
|
||||
} else {
|
||||
write_human_report(&mut out, &report).context("failed to write report")?;
|
||||
}
|
||||
drop(out);
|
||||
|
||||
if cli.interactive {
|
||||
let groups = if cli.verify_full {
|
||||
&report.verified_duplicates
|
||||
} else {
|
||||
&report.possible_duplicates
|
||||
};
|
||||
run_interactive_resolver(groups, cli.verify_full, &cli.delete_plan)?;
|
||||
}
|
||||
|
||||
if report.summary.errors > 0 {
|
||||
Ok(ExitCode::from(2))
|
||||
@@ -90,3 +154,148 @@ fn main() -> anyhow::Result<ExitCode> {
|
||||
Ok(ExitCode::SUCCESS)
|
||||
}
|
||||
}
|
||||
|
||||
fn run_interactive_resolver(
|
||||
groups: &[DuplicateGroup],
|
||||
verified: bool,
|
||||
delete_plan: &PathBuf,
|
||||
) -> anyhow::Result<()> {
|
||||
if groups.is_empty() {
|
||||
println!("No duplicate groups to resolve.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let theme = ColorfulTheme::default();
|
||||
let mut planned_deletes = Vec::new();
|
||||
let mut skipped = 0usize;
|
||||
|
||||
for (group_index, group) in groups.iter().enumerate() {
|
||||
println!();
|
||||
println!(
|
||||
"Duplicate group {}/{} — {} across {} files",
|
||||
group_index + 1,
|
||||
groups.len(),
|
||||
disk_checker::format_bytes(group.size),
|
||||
group.paths.len()
|
||||
);
|
||||
|
||||
let mut choices = group
|
||||
.paths
|
||||
.iter()
|
||||
.map(|path| path.display().to_string())
|
||||
.collect::<Vec<_>>();
|
||||
choices.push("Skip this group".to_string());
|
||||
|
||||
let selection = Select::with_theme(&theme)
|
||||
.with_prompt("Choose the version to keep")
|
||||
.items(&choices)
|
||||
.default(0)
|
||||
.interact()
|
||||
.context("interactive selection failed")?;
|
||||
|
||||
if selection == group.paths.len() {
|
||||
skipped += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let keep_path = &group.paths[selection];
|
||||
let delete_paths = group
|
||||
.paths
|
||||
.iter()
|
||||
.filter(|path| *path != keep_path)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("Keeping: {}", keep_path.display());
|
||||
for path in &delete_paths {
|
||||
println!(" remove: {}", path.display());
|
||||
}
|
||||
|
||||
let confirmed = Confirm::with_theme(&theme)
|
||||
.with_prompt("Add these files to the deletion plan?")
|
||||
.default(false)
|
||||
.interact()
|
||||
.context("interactive confirmation failed")?;
|
||||
|
||||
if !confirmed {
|
||||
skipped += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
planned_deletes.extend(delete_paths);
|
||||
}
|
||||
|
||||
if !planned_deletes.is_empty() {
|
||||
write_delete_plan(delete_plan, &planned_deletes, verified)?;
|
||||
println!(
|
||||
"Wrote deletion plan for {} files: {}",
|
||||
planned_deletes.len(),
|
||||
delete_plan.display()
|
||||
);
|
||||
println!("Review it, then run: sh {}", delete_plan.display());
|
||||
}
|
||||
|
||||
println!(
|
||||
"Interactive resolver complete: {} planned, {} groups skipped.",
|
||||
planned_deletes.len(),
|
||||
skipped
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_delete_plan(
|
||||
path: &PathBuf,
|
||||
delete_paths: &[PathBuf],
|
||||
verified: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut file = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(path)
|
||||
.with_context(|| format!("failed to create delete plan {}", path.display()))?;
|
||||
writeln!(file, "#!/bin/sh")?;
|
||||
writeln!(file, "set -eu")?;
|
||||
writeln!(
|
||||
file,
|
||||
"# Review carefully before running. Generated by disk-checker."
|
||||
)?;
|
||||
if verified {
|
||||
writeln!(
|
||||
file,
|
||||
"# Source groups were fully verified with --verify-full."
|
||||
)?;
|
||||
} else {
|
||||
writeln!(
|
||||
file,
|
||||
"# WARNING: Source groups were possible duplicates only, not fully verified."
|
||||
)?;
|
||||
}
|
||||
for delete_path in delete_paths {
|
||||
writeln!(file, "rm -- {}", shell_quote(delete_path)?)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn shell_quote(path: &Path) -> anyhow::Result<String> {
|
||||
let value = path.to_str().with_context(|| {
|
||||
format!(
|
||||
"delete plan cannot safely encode non-UTF-8 path: {}",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(format!("'{}'", value.replace('\'', "'\\''")))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::shell_quote;
    use std::path::Path;

    /// An embedded single quote must come out as `'\''` inside the quoted word.
    #[test]
    fn shell_quote_escapes_single_quotes() {
        let input = Path::new("/tmp/it's-here.txt");
        let quoted = shell_quote(input).expect("quote path");
        assert_eq!(quoted, "'/tmp/it'\\''s-here.txt'");
    }
}
|
||||
|
||||
Reference in New Issue
Block a user