Add duplicate resolver and real progress
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
417
src/lib.rs
417
src/lib.rs
@@ -6,6 +6,7 @@ use std::sync::mpsc;
|
|||||||
use std::thread;
|
use std::thread;
|
||||||
|
|
||||||
use ignore::{WalkBuilder, WalkState};
|
use ignore::{WalkBuilder, WalkState};
|
||||||
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use serde::{Serialize, Serializer};
|
use serde::{Serialize, Serializer};
|
||||||
|
|
||||||
@@ -22,6 +23,10 @@ pub struct ScanConfig {
|
|||||||
pub follow_links: bool,
|
pub follow_links: bool,
|
||||||
pub verify_full: bool,
|
pub verify_full: bool,
|
||||||
pub threads: Option<usize>,
|
pub threads: Option<usize>,
|
||||||
|
pub size_only: bool,
|
||||||
|
pub min_size: u64,
|
||||||
|
pub max_depth: Option<usize>,
|
||||||
|
pub progress: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
@@ -30,6 +35,9 @@ pub struct ScanReport {
|
|||||||
pub scanned_paths: Vec<PathBuf>,
|
pub scanned_paths: Vec<PathBuf>,
|
||||||
pub hash_bytes: u64,
|
pub hash_bytes: u64,
|
||||||
pub worker_threads: usize,
|
pub worker_threads: usize,
|
||||||
|
pub size_only: bool,
|
||||||
|
pub min_size: u64,
|
||||||
|
pub max_depth: Option<usize>,
|
||||||
pub followed_symlinks: bool,
|
pub followed_symlinks: bool,
|
||||||
pub full_verification: bool,
|
pub full_verification: bool,
|
||||||
pub summary: ScanSummary,
|
pub summary: ScanSummary,
|
||||||
@@ -139,6 +147,16 @@ enum HashOutcome {
|
|||||||
Issue(ScanIssue),
|
Issue(ScanIssue),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
struct ScanAccumulator {
|
||||||
|
files: Vec<FileEntry>,
|
||||||
|
symlinks: Vec<SymlinkInfo>,
|
||||||
|
special_entries: Vec<SpecialEntry>,
|
||||||
|
errors: Vec<ScanIssue>,
|
||||||
|
directories: usize,
|
||||||
|
total_file_bytes: u64,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
enum ScannedEntry {
|
enum ScannedEntry {
|
||||||
File(FileEntry),
|
File(FileEntry),
|
||||||
@@ -198,12 +216,13 @@ pub fn parse_byte_count(input: &str) -> Result<u64, String> {
|
|||||||
pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
||||||
let hash_bytes = config.hash_bytes.max(1);
|
let hash_bytes = config.hash_bytes.max(1);
|
||||||
let worker_threads = worker_threads(config.threads);
|
let worker_threads = worker_threads(config.threads);
|
||||||
|
let min_size = config.min_size;
|
||||||
let mut files = Vec::new();
|
let mut files = Vec::new();
|
||||||
let mut symlinks = Vec::new();
|
let mut symlinks = Vec::new();
|
||||||
let mut special_entries = Vec::new();
|
let mut special_entries = Vec::new();
|
||||||
let mut errors = Vec::new();
|
let mut errors = Vec::new();
|
||||||
let mut directories = 0;
|
let mut directories: usize = 0;
|
||||||
let mut total_file_bytes = 0;
|
let mut total_file_bytes: u64 = 0;
|
||||||
|
|
||||||
for root in &config.paths {
|
for root in &config.paths {
|
||||||
let mut builder = WalkBuilder::new(root);
|
let mut builder = WalkBuilder::new(root);
|
||||||
@@ -216,33 +235,21 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
|||||||
.git_global(false)
|
.git_global(false)
|
||||||
.git_exclude(false)
|
.git_exclude(false)
|
||||||
.parents(false);
|
.parents(false);
|
||||||
|
if let Some(max_depth) = config.max_depth {
|
||||||
let (sender, receiver) = mpsc::channel();
|
builder.max_depth(Some(max_depth));
|
||||||
builder.build_parallel().run(|| {
|
|
||||||
let sender = sender.clone();
|
|
||||||
let follow_links = config.follow_links;
|
|
||||||
Box::new(move |entry| {
|
|
||||||
for scanned_entry in classify_walk_entry(entry, follow_links) {
|
|
||||||
if sender.send(scanned_entry).is_err() {
|
|
||||||
return WalkState::Quit;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
WalkState::Continue
|
|
||||||
})
|
|
||||||
});
|
|
||||||
drop(sender);
|
|
||||||
|
|
||||||
for scanned_entry in receiver {
|
let accumulator = walk_root_parallel(
|
||||||
collect_scanned_entry(
|
&mut builder,
|
||||||
scanned_entry,
|
config.follow_links,
|
||||||
&mut files,
|
walk_progress(config.progress, root),
|
||||||
&mut symlinks,
|
|
||||||
&mut special_entries,
|
|
||||||
&mut errors,
|
|
||||||
&mut directories,
|
|
||||||
&mut total_file_bytes,
|
|
||||||
);
|
);
|
||||||
}
|
files.extend(accumulator.files);
|
||||||
|
symlinks.extend(accumulator.symlinks);
|
||||||
|
special_entries.extend(accumulator.special_entries);
|
||||||
|
errors.extend(accumulator.errors);
|
||||||
|
directories += accumulator.directories;
|
||||||
|
total_file_bytes = total_file_bytes.saturating_add(accumulator.total_file_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
files.sort_by(|left, right| left.path.cmp(&right.path));
|
files.sort_by(|left, right| left.path.cmp(&right.path));
|
||||||
@@ -250,17 +257,43 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
|||||||
special_entries.sort_by(|left, right| left.path.cmp(&right.path));
|
special_entries.sort_by(|left, right| left.path.cmp(&right.path));
|
||||||
|
|
||||||
let hard_links = find_hard_links(&files);
|
let hard_links = find_hard_links(&files);
|
||||||
let same_size_candidates = same_size_candidates(&files);
|
let same_size_candidates = same_size_candidates(&files, min_size);
|
||||||
let same_size_candidate_files = same_size_candidates.len();
|
let same_size_candidate_files = same_size_candidates.len();
|
||||||
|
|
||||||
let partial_outcomes = hash_files(&same_size_candidates, hash_bytes, false);
|
let possible_duplicates = if config.size_only {
|
||||||
|
size_only_duplicate_groups(same_size_candidates.clone())
|
||||||
|
} else {
|
||||||
|
let partial_outcomes = hash_files(
|
||||||
|
&same_size_candidates,
|
||||||
|
hash_bytes,
|
||||||
|
false,
|
||||||
|
hash_progress(
|
||||||
|
config.progress,
|
||||||
|
&same_size_candidates,
|
||||||
|
hash_bytes,
|
||||||
|
false,
|
||||||
|
"Hashing file prefixes",
|
||||||
|
),
|
||||||
|
);
|
||||||
let mut partial_hashes = Vec::new();
|
let mut partial_hashes = Vec::new();
|
||||||
collect_hash_outcomes(partial_outcomes, &mut partial_hashes, &mut errors);
|
collect_hash_outcomes(partial_outcomes, &mut partial_hashes, &mut errors);
|
||||||
let possible_duplicates = duplicate_groups(partial_hashes);
|
duplicate_groups(partial_hashes)
|
||||||
|
};
|
||||||
|
|
||||||
let verified_duplicates = if config.verify_full {
|
let verified_duplicates = if config.verify_full {
|
||||||
let full_candidates = files_from_duplicate_groups(&possible_duplicates);
|
let full_candidates = files_from_duplicate_groups(&possible_duplicates);
|
||||||
let full_outcomes = hash_files(&full_candidates, hash_bytes, true);
|
let full_outcomes = hash_files(
|
||||||
|
&full_candidates,
|
||||||
|
hash_bytes,
|
||||||
|
true,
|
||||||
|
hash_progress(
|
||||||
|
config.progress,
|
||||||
|
&full_candidates,
|
||||||
|
hash_bytes,
|
||||||
|
true,
|
||||||
|
"Full verification hashing",
|
||||||
|
),
|
||||||
|
);
|
||||||
let mut full_hashes = Vec::new();
|
let mut full_hashes = Vec::new();
|
||||||
collect_hash_outcomes(full_outcomes, &mut full_hashes, &mut errors);
|
collect_hash_outcomes(full_outcomes, &mut full_hashes, &mut errors);
|
||||||
duplicate_groups(full_hashes)
|
duplicate_groups(full_hashes)
|
||||||
@@ -291,6 +324,9 @@ pub fn scan_paths(config: ScanConfig) -> ScanReport {
|
|||||||
scanned_paths: config.paths,
|
scanned_paths: config.paths,
|
||||||
hash_bytes,
|
hash_bytes,
|
||||||
worker_threads,
|
worker_threads,
|
||||||
|
size_only: config.size_only,
|
||||||
|
min_size,
|
||||||
|
max_depth: config.max_depth,
|
||||||
followed_symlinks: config.follow_links,
|
followed_symlinks: config.follow_links,
|
||||||
full_verification: config.verify_full,
|
full_verification: config.verify_full,
|
||||||
summary: ScanSummary {
|
summary: ScanSummary {
|
||||||
@@ -325,6 +361,123 @@ fn worker_threads(configured_threads: Option<usize>) -> usize {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn walk_root_parallel(
|
||||||
|
builder: &mut WalkBuilder,
|
||||||
|
follow_links: bool,
|
||||||
|
progress: Option<ProgressBar>,
|
||||||
|
) -> ScanAccumulator {
|
||||||
|
let (sender, receiver) = mpsc::channel();
|
||||||
|
let collector = thread::spawn(move || {
|
||||||
|
let mut accumulator = ScanAccumulator::default();
|
||||||
|
for scanned_entry in receiver {
|
||||||
|
collect_scanned_entry(scanned_entry, &mut accumulator);
|
||||||
|
update_walk_progress(progress.as_ref(), &accumulator, false);
|
||||||
|
}
|
||||||
|
update_walk_progress(progress.as_ref(), &accumulator, true);
|
||||||
|
accumulator
|
||||||
|
});
|
||||||
|
|
||||||
|
builder.build_parallel().run(|| {
|
||||||
|
let sender = sender.clone();
|
||||||
|
Box::new(move |entry| {
|
||||||
|
for scanned_entry in classify_walk_entry(entry, follow_links) {
|
||||||
|
if sender.send(scanned_entry).is_err() {
|
||||||
|
return WalkState::Quit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WalkState::Continue
|
||||||
|
})
|
||||||
|
});
|
||||||
|
drop(sender);
|
||||||
|
|
||||||
|
collector
|
||||||
|
.join()
|
||||||
|
.expect("scan result collector thread should not panic")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn walk_progress(enabled: bool, root: &Path) -> Option<ProgressBar> {
|
||||||
|
if !enabled {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let progress = ProgressBar::new_spinner();
|
||||||
|
progress.set_style(
|
||||||
|
ProgressStyle::with_template("{spinner:.green} {msg}")
|
||||||
|
.expect("valid traversal progress template"),
|
||||||
|
);
|
||||||
|
progress.set_message(format!(
|
||||||
|
"Scanning {} — 0 files, 0 dirs, 0 symlinks, 0 errors",
|
||||||
|
root.display()
|
||||||
|
));
|
||||||
|
Some(progress)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Refreshes the traversal spinner from the accumulator's current counters.
///
/// Does nothing when `progress` is `None`. When `done` is true the spinner is
/// finished with a final summary message. Otherwise the message is redrawn
/// only on the first collected entry and whenever the total entry count is a
/// multiple of 100.
fn update_walk_progress(progress: Option<&ProgressBar>, accumulator: &ScanAccumulator, done: bool) {
    if let Some(bar) = progress {
        let seen = accumulator.interactions();
        let file_count = accumulator.files.len();
        let dir_count = accumulator.directories;
        let symlink_count = accumulator.symlinks.len();
        let special_count = accumulator.special_entries.len();
        let error_count = accumulator.errors.len();

        if done {
            bar.finish_with_message(format!(
                "Scanned {} files, {} dirs, {} symlinks, {} special entries, {} errors",
                file_count, dir_count, symlink_count, special_count, error_count
            ));
            return;
        }

        // Throttle redraws: skip everything except the first entry and every
        // hundredth one.
        if seen != 1 && !seen.is_multiple_of(100) {
            return;
        }

        bar.tick();
        bar.set_message(format!(
            "Scanning — {} files, {} dirs, {} symlinks, {} special entries, {} errors",
            file_count, dir_count, symlink_count, special_count, error_count
        ));
    }
}
|
||||||
|
|
||||||
|
fn hash_progress(
|
||||||
|
enabled: bool,
|
||||||
|
files: &[FileEntry],
|
||||||
|
hash_bytes: u64,
|
||||||
|
full_file: bool,
|
||||||
|
message: &'static str,
|
||||||
|
) -> Option<ProgressBar> {
|
||||||
|
if !enabled || files.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_bytes = files
|
||||||
|
.iter()
|
||||||
|
.map(|file| {
|
||||||
|
if full_file {
|
||||||
|
file.size
|
||||||
|
} else {
|
||||||
|
file.size.min(hash_bytes)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.sum::<u64>();
|
||||||
|
|
||||||
|
if total_bytes == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let progress = ProgressBar::new(total_bytes);
|
||||||
|
progress.set_style(
|
||||||
|
ProgressStyle::with_template(
|
||||||
|
"{msg} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {binary_bytes}/{binary_total_bytes} {binary_bytes_per_sec}",
|
||||||
|
)
|
||||||
|
.expect("valid hashing progress template")
|
||||||
|
.progress_chars("=>-"),
|
||||||
|
);
|
||||||
|
progress.set_message(message);
|
||||||
|
Some(progress)
|
||||||
|
}
|
||||||
|
|
||||||
fn classify_walk_entry(
|
fn classify_walk_entry(
|
||||||
entry: Result<ignore::DirEntry, ignore::Error>,
|
entry: Result<ignore::DirEntry, ignore::Error>,
|
||||||
follow_links: bool,
|
follow_links: bool,
|
||||||
@@ -386,24 +539,16 @@ fn non_symlink_entry(path: PathBuf, metadata: &Metadata) -> ScannedEntry {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_scanned_entry(
|
fn collect_scanned_entry(entry: ScannedEntry, accumulator: &mut ScanAccumulator) {
|
||||||
entry: ScannedEntry,
|
|
||||||
files: &mut Vec<FileEntry>,
|
|
||||||
symlinks: &mut Vec<SymlinkInfo>,
|
|
||||||
special_entries: &mut Vec<SpecialEntry>,
|
|
||||||
errors: &mut Vec<ScanIssue>,
|
|
||||||
directories: &mut usize,
|
|
||||||
total_file_bytes: &mut u64,
|
|
||||||
) {
|
|
||||||
match entry {
|
match entry {
|
||||||
ScannedEntry::File(file) => {
|
ScannedEntry::File(file) => {
|
||||||
*total_file_bytes = total_file_bytes.saturating_add(file.size);
|
accumulator.total_file_bytes = accumulator.total_file_bytes.saturating_add(file.size);
|
||||||
files.push(file);
|
accumulator.files.push(file);
|
||||||
}
|
}
|
||||||
ScannedEntry::Directory => *directories += 1,
|
ScannedEntry::Directory => accumulator.directories += 1,
|
||||||
ScannedEntry::Symlink(symlink) => symlinks.push(symlink),
|
ScannedEntry::Symlink(symlink) => accumulator.symlinks.push(symlink),
|
||||||
ScannedEntry::Special(special_entry) => special_entries.push(special_entry),
|
ScannedEntry::Special(special_entry) => accumulator.special_entries.push(special_entry),
|
||||||
ScannedEntry::Issue(error) => errors.push(error),
|
ScannedEntry::Issue(error) => accumulator.errors.push(error),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -465,10 +610,13 @@ fn find_hard_links(files: &[FileEntry]) -> Vec<HardLinkGroup> {
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn same_size_candidates(files: &[FileEntry]) -> Vec<FileEntry> {
|
fn same_size_candidates(files: &[FileEntry], min_size: u64) -> Vec<FileEntry> {
|
||||||
let files = unique_file_id_entries(files);
|
let files = unique_file_id_entries(files);
|
||||||
let mut by_size: BTreeMap<u64, Vec<FileEntry>> = BTreeMap::new();
|
let mut by_size: BTreeMap<u64, Vec<FileEntry>> = BTreeMap::new();
|
||||||
for file in files {
|
for file in files {
|
||||||
|
if file.size < min_size {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
by_size.entry(file.size).or_default().push(file);
|
by_size.entry(file.size).or_default().push(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -479,6 +627,25 @@ fn same_size_candidates(files: &[FileEntry]) -> Vec<FileEntry> {
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn size_only_duplicate_groups(files: Vec<FileEntry>) -> Vec<DuplicateGroup> {
|
||||||
|
let mut by_size: BTreeMap<u64, Vec<PathBuf>> = BTreeMap::new();
|
||||||
|
for file in files {
|
||||||
|
by_size.entry(file.size).or_default().push(file.path);
|
||||||
|
}
|
||||||
|
|
||||||
|
by_size
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|(size, mut paths)| {
|
||||||
|
paths.sort();
|
||||||
|
(paths.len() > 1).then_some(DuplicateGroup {
|
||||||
|
size,
|
||||||
|
hash: "size-only".to_string(),
|
||||||
|
paths,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
|
fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
|
||||||
let mut by_file_id: BTreeMap<(u64, u64), &FileEntry> = BTreeMap::new();
|
let mut by_file_id: BTreeMap<(u64, u64), &FileEntry> = BTreeMap::new();
|
||||||
for file in files {
|
for file in files {
|
||||||
@@ -488,14 +655,20 @@ fn unique_file_id_entries(files: &[FileEntry]) -> Vec<FileEntry> {
|
|||||||
by_file_id.into_values().cloned().collect()
|
by_file_id.into_values().cloned().collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hash_files(files: &[FileEntry], hash_bytes: u64, full_file: bool) -> Vec<HashOutcome> {
|
fn hash_files(
|
||||||
files
|
files: &[FileEntry],
|
||||||
|
hash_bytes: u64,
|
||||||
|
full_file: bool,
|
||||||
|
progress: Option<ProgressBar>,
|
||||||
|
) -> Vec<HashOutcome> {
|
||||||
|
let outcomes = files
|
||||||
.par_iter()
|
.par_iter()
|
||||||
.map(|file| {
|
.map(|file| {
|
||||||
|
let file_progress = progress.clone();
|
||||||
let hash_result = if full_file {
|
let hash_result = if full_file {
|
||||||
hash_full_file(&file.path)
|
hash_full_file(&file.path, file_progress.as_ref())
|
||||||
} else {
|
} else {
|
||||||
hash_file_prefix(&file.path, hash_bytes)
|
hash_file_prefix(&file.path, hash_bytes, file_progress.as_ref())
|
||||||
};
|
};
|
||||||
|
|
||||||
match hash_result {
|
match hash_result {
|
||||||
@@ -510,7 +683,13 @@ fn hash_files(files: &[FileEntry], hash_bytes: u64, full_file: bool) -> Vec<Hash
|
|||||||
)),
|
)),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect()
|
.collect();
|
||||||
|
|
||||||
|
if let Some(progress) = progress {
|
||||||
|
progress.finish_and_clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
outcomes
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_hash_outcomes(
|
fn collect_hash_outcomes(
|
||||||
@@ -558,7 +737,11 @@ fn files_from_duplicate_groups(groups: &[DuplicateGroup]) -> Vec<FileEntry> {
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
|
fn hash_file_prefix(
|
||||||
|
path: &Path,
|
||||||
|
hash_bytes: u64,
|
||||||
|
progress: Option<&ProgressBar>,
|
||||||
|
) -> io::Result<String> {
|
||||||
let file = File::open(path)?;
|
let file = File::open(path)?;
|
||||||
let mut reader = BufReader::new(file);
|
let mut reader = BufReader::new(file);
|
||||||
let mut hasher = blake3::Hasher::new();
|
let mut hasher = blake3::Hasher::new();
|
||||||
@@ -571,6 +754,9 @@ fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
|
|||||||
if bytes_read == 0 {
|
if bytes_read == 0 {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if let Some(progress) = progress {
|
||||||
|
progress.inc(bytes_read as u64);
|
||||||
|
}
|
||||||
hasher.update(&buffer[..bytes_read]);
|
hasher.update(&buffer[..bytes_read]);
|
||||||
remaining -= bytes_read as u64;
|
remaining -= bytes_read as u64;
|
||||||
}
|
}
|
||||||
@@ -578,7 +764,7 @@ fn hash_file_prefix(path: &Path, hash_bytes: u64) -> io::Result<String> {
|
|||||||
Ok(hasher.finalize().to_hex().to_string())
|
Ok(hasher.finalize().to_hex().to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hash_full_file(path: &Path) -> io::Result<String> {
|
fn hash_full_file(path: &Path, progress: Option<&ProgressBar>) -> io::Result<String> {
|
||||||
let file = File::open(path)?;
|
let file = File::open(path)?;
|
||||||
let mut reader = BufReader::new(file);
|
let mut reader = BufReader::new(file);
|
||||||
let mut hasher = blake3::Hasher::new();
|
let mut hasher = blake3::Hasher::new();
|
||||||
@@ -589,6 +775,9 @@ fn hash_full_file(path: &Path) -> io::Result<String> {
|
|||||||
if bytes_read == 0 {
|
if bytes_read == 0 {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if let Some(progress) = progress {
|
||||||
|
progress.inc(bytes_read as u64);
|
||||||
|
}
|
||||||
hasher.update(&buffer[..bytes_read]);
|
hasher.update(&buffer[..bytes_read]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -609,6 +798,28 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
|
|||||||
)?;
|
)?;
|
||||||
writeln!(writer, "Hash window: {}", format_bytes(report.hash_bytes))?;
|
writeln!(writer, "Hash window: {}", format_bytes(report.hash_bytes))?;
|
||||||
writeln!(writer, "Worker threads: {}", report.worker_threads)?;
|
writeln!(writer, "Worker threads: {}", report.worker_threads)?;
|
||||||
|
writeln!(
|
||||||
|
writer,
|
||||||
|
"Duplicate mode: {}",
|
||||||
|
if report.size_only {
|
||||||
|
"size only"
|
||||||
|
} else {
|
||||||
|
"size + partial hash"
|
||||||
|
}
|
||||||
|
)?;
|
||||||
|
writeln!(
|
||||||
|
writer,
|
||||||
|
"Minimum duplicate size: {}",
|
||||||
|
format_bytes(report.min_size)
|
||||||
|
)?;
|
||||||
|
writeln!(
|
||||||
|
writer,
|
||||||
|
"Maximum depth: {}",
|
||||||
|
report
|
||||||
|
.max_depth
|
||||||
|
.map(|depth| depth.to_string())
|
||||||
|
.unwrap_or_else(|| "unlimited".to_string())
|
||||||
|
)?;
|
||||||
writeln!(
|
writeln!(
|
||||||
writer,
|
writer,
|
||||||
"Symlink traversal: {}",
|
"Symlink traversal: {}",
|
||||||
@@ -640,7 +851,7 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
|
|||||||
)?;
|
)?;
|
||||||
writeln!(
|
writeln!(
|
||||||
writer,
|
writer,
|
||||||
"Same-size files hashed: {}",
|
"Same-size duplicate candidates: {}",
|
||||||
report.summary.same_size_candidate_files
|
report.summary.same_size_candidate_files
|
||||||
)?;
|
)?;
|
||||||
writeln!(
|
writeln!(
|
||||||
@@ -675,7 +886,11 @@ pub fn write_human_report(mut writer: impl Write, report: &ScanReport) -> io::Re
|
|||||||
|
|
||||||
write_duplicate_section(
|
write_duplicate_section(
|
||||||
&mut writer,
|
&mut writer,
|
||||||
"Possible duplicates (same size + partial hash)",
|
if report.size_only {
|
||||||
|
"Possible duplicates (same size only)"
|
||||||
|
} else {
|
||||||
|
"Possible duplicates (same size + partial hash)"
|
||||||
|
},
|
||||||
&report.possible_duplicates,
|
&report.possible_duplicates,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
@@ -898,6 +1113,16 @@ impl SpecialEntryKind {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl ScanAccumulator {
|
||||||
|
fn interactions(&self) -> usize {
|
||||||
|
self.files.len()
|
||||||
|
+ self.directories
|
||||||
|
+ self.symlinks.len()
|
||||||
|
+ self.special_entries.len()
|
||||||
|
+ self.errors.len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -929,6 +1154,10 @@ mod tests {
|
|||||||
follow_links: false,
|
follow_links: false,
|
||||||
verify_full: false,
|
verify_full: false,
|
||||||
threads: None,
|
threads: None,
|
||||||
|
size_only: false,
|
||||||
|
min_size: 0,
|
||||||
|
max_depth: None,
|
||||||
|
progress: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert_eq!(report.summary.files, 3);
|
assert_eq!(report.summary.files, 3);
|
||||||
@@ -953,12 +1182,75 @@ mod tests {
|
|||||||
follow_links: false,
|
follow_links: false,
|
||||||
verify_full: true,
|
verify_full: true,
|
||||||
threads: None,
|
threads: None,
|
||||||
|
size_only: false,
|
||||||
|
min_size: 0,
|
||||||
|
max_depth: None,
|
||||||
|
progress: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert_eq!(report.possible_duplicates.len(), 1);
|
assert_eq!(report.possible_duplicates.len(), 1);
|
||||||
assert!(report.verified_duplicates.is_empty());
|
assert!(report.verified_duplicates.is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Two files with equal length but different contents must be grouped
/// together when size-only mode is on, and the group is tagged `"size-only"`.
#[test]
fn size_only_mode_groups_same_size_without_hashing_prefixes() {
    let workspace = TempDir::new().expect("temp dir");
    let first = workspace.path().join("first.bin");
    let second = workspace.path().join("second.bin");

    // Same length, different bytes: only a size comparison can match them.
    fs::write(&first, b"abcdef").expect("write first");
    fs::write(&second, b"uvwxyz").expect("write second");

    let config = ScanConfig {
        paths: vec![workspace.path().to_path_buf()],
        hash_bytes: DEFAULT_HASH_BYTES,
        follow_links: false,
        verify_full: false,
        threads: None,
        size_only: true,
        min_size: 0,
        max_depth: None,
        progress: false,
    };
    let report = scan_paths(config);

    assert_eq!(report.possible_duplicates.len(), 1);
    let group = &report.possible_duplicates[0];
    assert_eq!(group.hash, "size-only");
    assert!(group.paths.contains(&first));
    assert!(group.paths.contains(&second));
}
|
||||||
|
|
||||||
|
/// Identical files below `min_size` must not be reported as duplicates,
/// while identical files at or above it still are.
#[test]
fn min_size_filters_duplicate_candidates_before_hashing() {
    let workspace = TempDir::new().expect("temp dir");
    let small_first = workspace.path().join("small-first.bin");
    let small_second = workspace.path().join("small-second.bin");
    let large_first = workspace.path().join("large-first.bin");
    let large_second = workspace.path().join("large-second.bin");

    // The 3-byte pair sits below the 4-byte threshold; the 6-byte pair is above it.
    fs::write(&small_first, b"abc").expect("write small first");
    fs::write(&small_second, b"abc").expect("write small second");
    fs::write(&large_first, b"abcdef").expect("write large first");
    fs::write(&large_second, b"abcdef").expect("write large second");

    let config = ScanConfig {
        paths: vec![workspace.path().to_path_buf()],
        hash_bytes: DEFAULT_HASH_BYTES,
        follow_links: false,
        verify_full: false,
        threads: None,
        size_only: false,
        min_size: 4,
        max_depth: None,
        progress: false,
    };
    let report = scan_paths(config);

    assert_eq!(report.possible_duplicates.len(), 1);
    let group = &report.possible_duplicates[0];
    assert!(group.paths.contains(&large_first));
    assert!(group.paths.contains(&large_second));
    assert!(!group.paths.contains(&small_first));
    assert!(!group.paths.contains(&small_second));
}
|
||||||
|
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
#[test]
|
#[test]
|
||||||
fn reports_symlinks_without_following_them() {
|
fn reports_symlinks_without_following_them() {
|
||||||
@@ -976,6 +1268,10 @@ mod tests {
|
|||||||
follow_links: false,
|
follow_links: false,
|
||||||
verify_full: false,
|
verify_full: false,
|
||||||
threads: None,
|
threads: None,
|
||||||
|
size_only: false,
|
||||||
|
min_size: 0,
|
||||||
|
max_depth: None,
|
||||||
|
progress: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert_eq!(report.summary.files, 1);
|
assert_eq!(report.summary.files, 1);
|
||||||
@@ -999,6 +1295,10 @@ mod tests {
|
|||||||
follow_links: false,
|
follow_links: false,
|
||||||
verify_full: false,
|
verify_full: false,
|
||||||
threads: None,
|
threads: None,
|
||||||
|
size_only: false,
|
||||||
|
min_size: 0,
|
||||||
|
max_depth: None,
|
||||||
|
progress: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert_eq!(report.summary.files, 2);
|
assert_eq!(report.summary.files, 2);
|
||||||
@@ -1024,6 +1324,10 @@ mod tests {
|
|||||||
follow_links: false,
|
follow_links: false,
|
||||||
verify_full: false,
|
verify_full: false,
|
||||||
threads: None,
|
threads: None,
|
||||||
|
size_only: false,
|
||||||
|
min_size: 0,
|
||||||
|
max_depth: None,
|
||||||
|
progress: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
let json = serde_json::to_string(&report).expect("serialize report with lossy path");
|
let json = serde_json::to_string(&report).expect("serialize report with lossy path");
|
||||||
@@ -1037,6 +1341,9 @@ mod tests {
|
|||||||
scanned_paths: vec![PathBuf::from(".")],
|
scanned_paths: vec![PathBuf::from(".")],
|
||||||
hash_bytes: DEFAULT_HASH_BYTES,
|
hash_bytes: DEFAULT_HASH_BYTES,
|
||||||
worker_threads: 1,
|
worker_threads: 1,
|
||||||
|
size_only: false,
|
||||||
|
min_size: 0,
|
||||||
|
max_depth: None,
|
||||||
followed_symlinks: false,
|
followed_symlinks: false,
|
||||||
full_verification: false,
|
full_verification: false,
|
||||||
summary: ScanSummary {
|
summary: ScanSummary {
|
||||||
|
|||||||
217
src/main.rs
217
src/main.rs
@@ -1,10 +1,12 @@
|
|||||||
use std::io::{self, Write};
|
use std::fs::OpenOptions;
|
||||||
use std::path::PathBuf;
|
use std::io::{self, IsTerminal, Write};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::ExitCode;
|
use std::process::ExitCode;
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::{Context, bail};
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use disk_checker::{ScanConfig, parse_byte_count, scan_paths, write_human_report};
|
use dialoguer::{Confirm, Select, theme::ColorfulTheme};
|
||||||
|
use disk_checker::{DuplicateGroup, ScanConfig, parse_byte_count, scan_paths, write_human_report};
|
||||||
|
|
||||||
#[derive(Debug, Parser)]
|
#[derive(Debug, Parser)]
|
||||||
#[command(
|
#[command(
|
||||||
@@ -31,6 +33,18 @@ struct Cli {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
verify_full: bool,
|
verify_full: bool,
|
||||||
|
|
||||||
|
/// Group duplicate candidates by size only. Fastest mode for huge triage; less precise.
|
||||||
|
#[arg(long)]
|
||||||
|
size_only: bool,
|
||||||
|
|
||||||
|
/// Ignore duplicate candidates smaller than this size. Accepts units like 100MiB or 1GB.
|
||||||
|
#[arg(long, default_value = "0", value_parser = parse_min_size)]
|
||||||
|
min_size: u64,
|
||||||
|
|
||||||
|
/// Maximum directory depth to scan. Depth 0 means only the provided path itself.
|
||||||
|
#[arg(long, value_parser = parse_nonzero_or_zero_usize)]
|
||||||
|
max_depth: Option<usize>,
|
||||||
|
|
||||||
/// Number of worker threads used for scanning and hashing. Defaults to CPU parallelism.
|
/// Number of worker threads used for scanning and hashing. Defaults to CPU parallelism.
|
||||||
#[arg(long, value_parser = parse_thread_count)]
|
#[arg(long, value_parser = parse_thread_count)]
|
||||||
threads: Option<usize>,
|
threads: Option<usize>,
|
||||||
@@ -38,6 +52,18 @@ struct Cli {
|
|||||||
/// Print machine-readable JSON instead of the human summary.
|
/// Print machine-readable JSON instead of the human summary.
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
json: bool,
|
json: bool,
|
||||||
|
|
||||||
|
/// Disable progress output.
|
||||||
|
#[arg(long)]
|
||||||
|
no_progress: bool,
|
||||||
|
|
||||||
|
/// Interactively review duplicate groups and choose which path to keep.
|
||||||
|
#[arg(long)]
|
||||||
|
interactive: bool,
|
||||||
|
|
||||||
|
/// Shell script path for planned deletes when --interactive is used.
|
||||||
|
#[arg(long, default_value = "disk-checker-delete-plan.sh")]
|
||||||
|
delete_plan: PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_thread_count(input: &str) -> Result<usize, String> {
|
fn parse_thread_count(input: &str) -> Result<usize, String> {
|
||||||
@@ -51,8 +77,32 @@ fn parse_thread_count(input: &str) -> Result<usize, String> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parses a `--max-depth` value as a `usize`; zero is accepted.
///
/// The input is parsed as-is (no trimming). On failure the error string
/// quotes the offending input and includes the underlying parse error.
fn parse_nonzero_or_zero_usize(input: &str) -> Result<usize, String> {
    match input.parse::<usize>() {
        Ok(depth) => Ok(depth),
        Err(error) => Err(format!("invalid depth {input:?}: {error}")),
    }
}
|
||||||
|
|
||||||
|
fn parse_min_size(input: &str) -> Result<u64, String> {
|
||||||
|
if input.trim() == "0" {
|
||||||
|
Ok(0)
|
||||||
|
} else {
|
||||||
|
parse_byte_count(input)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn main() -> anyhow::Result<ExitCode> {
|
fn main() -> anyhow::Result<ExitCode> {
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
|
if cli.interactive && !cli.verify_full {
|
||||||
|
bail!(
|
||||||
|
"--interactive requires --verify-full so keep/delete plans are based on fully verified duplicates"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if cli.interactive && cli.json {
|
||||||
|
bail!(
|
||||||
|
"--interactive cannot be combined with --json because prompts would contaminate JSON output"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(threads) = cli.threads {
|
if let Some(threads) = cli.threads {
|
||||||
rayon::ThreadPoolBuilder::new()
|
rayon::ThreadPoolBuilder::new()
|
||||||
@@ -73,6 +123,10 @@ fn main() -> anyhow::Result<ExitCode> {
|
|||||||
follow_links: cli.follow_links,
|
follow_links: cli.follow_links,
|
||||||
verify_full: cli.verify_full,
|
verify_full: cli.verify_full,
|
||||||
threads: cli.threads,
|
threads: cli.threads,
|
||||||
|
size_only: cli.size_only,
|
||||||
|
min_size: cli.min_size,
|
||||||
|
max_depth: cli.max_depth,
|
||||||
|
progress: !cli.no_progress && !cli.json && io::stderr().is_terminal(),
|
||||||
});
|
});
|
||||||
|
|
||||||
let stdout = io::stdout();
|
let stdout = io::stdout();
|
||||||
@@ -83,6 +137,16 @@ fn main() -> anyhow::Result<ExitCode> {
|
|||||||
} else {
|
} else {
|
||||||
write_human_report(&mut out, &report).context("failed to write report")?;
|
write_human_report(&mut out, &report).context("failed to write report")?;
|
||||||
}
|
}
|
||||||
|
drop(out);
|
||||||
|
|
||||||
|
if cli.interactive {
|
||||||
|
let groups = if cli.verify_full {
|
||||||
|
&report.verified_duplicates
|
||||||
|
} else {
|
||||||
|
&report.possible_duplicates
|
||||||
|
};
|
||||||
|
run_interactive_resolver(groups, cli.verify_full, &cli.delete_plan)?;
|
||||||
|
}
|
||||||
|
|
||||||
if report.summary.errors > 0 {
|
if report.summary.errors > 0 {
|
||||||
Ok(ExitCode::from(2))
|
Ok(ExitCode::from(2))
|
||||||
@@ -90,3 +154,148 @@ fn main() -> anyhow::Result<ExitCode> {
|
|||||||
Ok(ExitCode::SUCCESS)
|
Ok(ExitCode::SUCCESS)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn run_interactive_resolver(
|
||||||
|
groups: &[DuplicateGroup],
|
||||||
|
verified: bool,
|
||||||
|
delete_plan: &PathBuf,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
if groups.is_empty() {
|
||||||
|
println!("No duplicate groups to resolve.");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let theme = ColorfulTheme::default();
|
||||||
|
let mut planned_deletes = Vec::new();
|
||||||
|
let mut skipped = 0usize;
|
||||||
|
|
||||||
|
for (group_index, group) in groups.iter().enumerate() {
|
||||||
|
println!();
|
||||||
|
println!(
|
||||||
|
"Duplicate group {}/{} — {} across {} files",
|
||||||
|
group_index + 1,
|
||||||
|
groups.len(),
|
||||||
|
disk_checker::format_bytes(group.size),
|
||||||
|
group.paths.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut choices = group
|
||||||
|
.paths
|
||||||
|
.iter()
|
||||||
|
.map(|path| path.display().to_string())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
choices.push("Skip this group".to_string());
|
||||||
|
|
||||||
|
let selection = Select::with_theme(&theme)
|
||||||
|
.with_prompt("Choose the version to keep")
|
||||||
|
.items(&choices)
|
||||||
|
.default(0)
|
||||||
|
.interact()
|
||||||
|
.context("interactive selection failed")?;
|
||||||
|
|
||||||
|
if selection == group.paths.len() {
|
||||||
|
skipped += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let keep_path = &group.paths[selection];
|
||||||
|
let delete_paths = group
|
||||||
|
.paths
|
||||||
|
.iter()
|
||||||
|
.filter(|path| *path != keep_path)
|
||||||
|
.cloned()
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
println!("Keeping: {}", keep_path.display());
|
||||||
|
for path in &delete_paths {
|
||||||
|
println!(" remove: {}", path.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
let confirmed = Confirm::with_theme(&theme)
|
||||||
|
.with_prompt("Add these files to the deletion plan?")
|
||||||
|
.default(false)
|
||||||
|
.interact()
|
||||||
|
.context("interactive confirmation failed")?;
|
||||||
|
|
||||||
|
if !confirmed {
|
||||||
|
skipped += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
planned_deletes.extend(delete_paths);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !planned_deletes.is_empty() {
|
||||||
|
write_delete_plan(delete_plan, &planned_deletes, verified)?;
|
||||||
|
println!(
|
||||||
|
"Wrote deletion plan for {} files: {}",
|
||||||
|
planned_deletes.len(),
|
||||||
|
delete_plan.display()
|
||||||
|
);
|
||||||
|
println!("Review it, then run: sh {}", delete_plan.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"Interactive resolver complete: {} planned, {} groups skipped.",
|
||||||
|
planned_deletes.len(),
|
||||||
|
skipped
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_delete_plan(
|
||||||
|
path: &PathBuf,
|
||||||
|
delete_paths: &[PathBuf],
|
||||||
|
verified: bool,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let mut file = OpenOptions::new()
|
||||||
|
.write(true)
|
||||||
|
.create_new(true)
|
||||||
|
.open(path)
|
||||||
|
.with_context(|| format!("failed to create delete plan {}", path.display()))?;
|
||||||
|
writeln!(file, "#!/bin/sh")?;
|
||||||
|
writeln!(file, "set -eu")?;
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"# Review carefully before running. Generated by disk-checker."
|
||||||
|
)?;
|
||||||
|
if verified {
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"# Source groups were fully verified with --verify-full."
|
||||||
|
)?;
|
||||||
|
} else {
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"# WARNING: Source groups were possible duplicates only, not fully verified."
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
for delete_path in delete_paths {
|
||||||
|
writeln!(file, "rm -- {}", shell_quote(delete_path)?)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn shell_quote(path: &Path) -> anyhow::Result<String> {
|
||||||
|
let value = path.to_str().with_context(|| {
|
||||||
|
format!(
|
||||||
|
"delete plan cannot safely encode non-UTF-8 path: {}",
|
||||||
|
path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
Ok(format!("'{}'", value.replace('\'', "'\\''")))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::shell_quote;
    use std::path::Path;

    /// An embedded single quote must come out as `'\''` inside the quoted word.
    #[test]
    fn shell_quote_escapes_single_quotes() {
        let input = Path::new("/tmp/it's-here.txt");
        let quoted = shell_quote(input).expect("quote path");
        assert_eq!(quoted, "'/tmp/it'\\''s-here.txt'");
    }
}
|
||||||
|
|||||||
Reference in New Issue
Block a user