Files
disk-checker/src/main.rs
2026-06-04 15:34:48 +01:00

293 lines
8.4 KiB
Rust

use std::fs::OpenOptions;
use std::io::{self, IsTerminal, Write};
use std::path::{Path, PathBuf};
use std::process::ExitCode;
use anyhow::{Context, bail};
use clap::Parser;
use dialoguer::{Confirm, Select, theme::ColorfulTheme};
use disk_checker::{DuplicateGroup, ScanConfig, parse_byte_count, scan_paths, write_human_report};
// Command-line interface for disk-checker, parsed with clap's derive API.
//
// NOTE(maintainers): the `///` comments on the fields below are picked up by
// clap and shown in `--help`, so they are user-facing text. Plain `//`
// comments such as this one are for readers of the source only.
#[derive(Debug, Parser)]
#[command(
author,
version,
about = "Fast folder scanner for sizes, symlinks, hard links, and possible duplicate files",
long_about = "disk-checker scans one or more folders, groups files by size, hashes only the first N bytes of same-size candidates, and reports possible duplicates plus symlinks, hard links, special files, and scan errors."
)]
struct Cli {
// Positional arguments; `main` substitutes "." when none are given.
/// Folder or file paths to scan. Defaults to the current directory.
#[arg(value_name = "PATH")]
paths: Vec<PathBuf>,
// Parsed by `parse_byte_count` from the disk_checker library crate.
/// Number of bytes to hash from each same-size candidate file.
/// Accepts plain bytes or units like 512KiB, 1MiB, 2MB.
#[arg(long, default_value = "1MiB", value_parser = parse_byte_count)]
hash_bytes: u64,
/// Follow symlinks while scanning. Symlinks are still reported separately.
#[arg(long)]
follow_links: bool,
// `main` also turns this on whenever --interactive is given.
/// Fully hash possible duplicate groups after the fast partial-hash pass.
#[arg(long)]
verify_full: bool,
/// Group duplicate candidates by size only. Fastest mode for huge triage; less precise.
#[arg(long)]
size_only: bool,
// `parse_min_size` accepts a literal "0" itself and hands everything else to
// `parse_byte_count`.
/// Ignore duplicate candidates smaller than this size. Accepts units like 100MiB or 1GB.
#[arg(long, default_value = "0", value_parser = parse_min_size)]
min_size: u64,
// `None` when the flag is absent; the parser accepts any `usize`, including 0.
/// Maximum directory depth to scan. Depth 0 means only the provided path itself.
#[arg(long, value_parser = parse_nonzero_or_zero_usize)]
max_depth: Option<usize>,
// `parse_thread_count` rejects 0. When set, `main` also sizes rayon's global
// pool with this value.
/// Number of worker threads used for scanning and hashing. Defaults to CPU parallelism.
#[arg(long, value_parser = parse_thread_count)]
threads: Option<usize>,
// Rejected by `main` when combined with --interactive.
/// Print machine-readable JSON instead of the human summary.
#[arg(long)]
json: bool,
// Progress is additionally suppressed by --json and when stderr is not a terminal.
/// Disable progress output.
#[arg(long)]
no_progress: bool,
/// Interactively review duplicate groups and choose which path to keep.
#[arg(long)]
interactive: bool,
// The script is created with `create_new`, so an existing file is never overwritten.
/// Shell script path for planned deletes when --interactive is used.
#[arg(long, default_value = "disk-checker-delete-plan.sh")]
delete_plan: PathBuf,
}
/// clap value parser for `--threads`.
///
/// Accepts any string that parses as a `usize` greater than zero.
///
/// # Errors
/// Returns a human-readable message when `input` is not a valid `usize` or
/// when it parses to zero.
fn parse_thread_count(input: &str) -> Result<usize, String> {
    match input.parse::<usize>() {
        // A pool with zero workers is never meaningful, so reject it explicitly.
        Ok(0) => Err("thread count must be greater than zero".to_string()),
        Ok(count) => Ok(count),
        Err(error) => Err(format!("invalid thread count {input:?}: {error}")),
    }
}
/// clap value parser for `--max-depth`.
///
/// Any value that parses as a `usize` is accepted, zero included.
///
/// # Errors
/// Returns a human-readable message when `input` is not a valid `usize`.
fn parse_nonzero_or_zero_usize(input: &str) -> Result<usize, String> {
    match input.parse::<usize>() {
        Ok(depth) => Ok(depth),
        Err(error) => Err(format!("invalid depth {input:?}: {error}")),
    }
}
/// clap value parser for `--min-size`.
///
/// A literal `0` (ignoring surrounding whitespace) is handled here and yields
/// zero; every other input is passed unchanged to `parse_byte_count`.
///
/// # Errors
/// Propagates whatever error `parse_byte_count` reports for non-`0` input.
fn parse_min_size(input: &str) -> Result<u64, String> {
    match input.trim() {
        "0" => Ok(0),
        // Hand over the original, untrimmed text, exactly as received.
        _ => parse_byte_count(input),
    }
}
/// Program entry point: parse arguments, scan, print the report, and optionally
/// run the interactive duplicate resolver.
///
/// Returns exit code 2 when the scan summary records at least one error and
/// `ExitCode::SUCCESS` otherwise. Conflicting flags, thread-pool setup
/// failures, report write failures, and resolver failures are returned as `Err`.
fn main() -> anyhow::Result<ExitCode> {
    let cli = Cli::parse();

    // Reject the flag combination up front; the message explains why.
    if cli.json && cli.interactive {
        bail!(
            "--interactive cannot be combined with --json because prompts would contaminate JSON output"
        );
    }

    // Interactive review always requests the full-hash verification pass.
    let verify_full = cli.interactive || cli.verify_full;

    // Only configure rayon's global pool when an explicit size was requested.
    if let Some(worker_count) = cli.threads {
        rayon::ThreadPoolBuilder::new()
            .num_threads(worker_count)
            .build_global()
            .context("failed to initialize hashing thread pool")?;
    }

    // Progress is shown only for human output on an interactive stderr.
    let show_progress = !cli.no_progress && !cli.json && io::stderr().is_terminal();

    // With no positional arguments, scan the current directory.
    let scan_roots = if cli.paths.is_empty() {
        vec![PathBuf::from(".")]
    } else {
        cli.paths
    };

    let report = scan_paths(ScanConfig {
        paths: scan_roots,
        hash_bytes: cli.hash_bytes,
        follow_links: cli.follow_links,
        verify_full,
        threads: cli.threads,
        size_only: cli.size_only,
        min_size: cli.min_size,
        max_depth: cli.max_depth,
        progress: show_progress,
    });

    // Hold the stdout lock only while the report is written, so it is released
    // before the resolver starts printing.
    {
        let mut out = io::stdout().lock();
        if !cli.json {
            write_human_report(&mut out, &report).context("failed to write report")?;
        } else {
            serde_json::to_writer_pretty(&mut out, &report)
                .context("failed to write JSON report")?;
            writeln!(out).context("failed to finish JSON report")?;
        }
    }

    if cli.interactive {
        run_interactive_resolver(&report.verified_duplicates, true, &cli.delete_plan)?;
    }

    let exit_code = if report.summary.errors > 0 {
        ExitCode::from(2)
    } else {
        ExitCode::SUCCESS
    };
    Ok(exit_code)
}
fn run_interactive_resolver(
groups: &[DuplicateGroup],
verified: bool,
delete_plan: &PathBuf,
) -> anyhow::Result<()> {
if groups.is_empty() {
println!("No duplicate groups to resolve.");
return Ok(());
}
let theme = ColorfulTheme::default();
let mut planned_deletes = Vec::new();
let mut skipped = 0usize;
for (group_index, group) in groups.iter().enumerate() {
println!();
println!(
"Duplicate group {}/{}{} across {} files",
group_index + 1,
groups.len(),
disk_checker::format_bytes(group.size),
group.paths.len()
);
let mut choices = group
.paths
.iter()
.map(|path| path.display().to_string())
.collect::<Vec<_>>();
choices.push("Skip this group".to_string());
let selection = Select::with_theme(&theme)
.with_prompt("Choose the version to keep")
.items(&choices)
.default(0)
.interact()
.context("interactive selection failed")?;
if selection == group.paths.len() {
skipped += 1;
continue;
}
let keep_path = &group.paths[selection];
let delete_paths = group
.paths
.iter()
.filter(|path| *path != keep_path)
.cloned()
.collect::<Vec<_>>();
println!("Keeping: {}", keep_path.display());
for path in &delete_paths {
println!(" remove: {}", path.display());
}
let confirmed = Confirm::with_theme(&theme)
.with_prompt("Add these files to the deletion plan?")
.default(false)
.interact()
.context("interactive confirmation failed")?;
if !confirmed {
skipped += 1;
continue;
}
planned_deletes.extend(delete_paths);
}
if !planned_deletes.is_empty() {
write_delete_plan(delete_plan, &planned_deletes, verified)?;
println!(
"Wrote deletion plan for {} files: {}",
planned_deletes.len(),
delete_plan.display()
);
println!("Review it, then run: sh {}", delete_plan.display());
}
println!(
"Interactive resolver complete: {} planned, {} groups skipped.",
planned_deletes.len(),
skipped
);
Ok(())
}
fn write_delete_plan(
path: &PathBuf,
delete_paths: &[PathBuf],
verified: bool,
) -> anyhow::Result<()> {
let mut file = OpenOptions::new()
.write(true)
.create_new(true)
.open(path)
.with_context(|| format!("failed to create delete plan {}", path.display()))?;
writeln!(file, "#!/bin/sh")?;
writeln!(file, "set -eu")?;
writeln!(
file,
"# Review carefully before running. Generated by disk-checker."
)?;
if verified {
writeln!(
file,
"# Source groups were fully verified with --verify-full."
)?;
} else {
writeln!(
file,
"# WARNING: Source groups were possible duplicates only, not fully verified."
)?;
}
for delete_path in delete_paths {
writeln!(file, "rm -- {}", shell_quote(delete_path)?)?;
}
Ok(())
}
/// Wraps `path` in single quotes so a POSIX shell reads it as one literal word.
///
/// Each embedded single quote is emitted as `'\''`: close the quoted string,
/// add an escaped quote, and reopen the quoted string.
///
/// # Errors
/// Fails when `path` is not valid UTF-8, because it cannot be written into the
/// script as text.
fn shell_quote(path: &Path) -> anyhow::Result<String> {
    let raw = path.to_str().with_context(|| {
        format!(
            "delete plan cannot safely encode non-UTF-8 path: {}",
            path.display()
        )
    })?;

    // Two extra bytes for the surrounding quotes; escapes grow it on demand.
    let mut quoted = String::with_capacity(raw.len() + 2);
    quoted.push('\'');
    for ch in raw.chars() {
        if ch == '\'' {
            quoted.push_str("'\\''");
        } else {
            quoted.push(ch);
        }
    }
    quoted.push('\'');
    Ok(quoted)
}
#[cfg(test)]
mod tests {
    use super::shell_quote;
    use std::path::Path;

    /// An embedded single quote must come out as the `'\''` escape sequence,
    /// with the whole path wrapped in single quotes.
    #[test]
    fn shell_quote_escapes_single_quotes() {
        let input = Path::new("/tmp/it's-here.txt");
        let quoted = shell_quote(input).expect("quote path");
        assert_eq!(quoted, "'/tmp/it'\\''s-here.txt'");
    }
}