From 0fa5e7beaf46b49a7bd0fa91ed90c577dbe71dee Mon Sep 17 00:00:00 2001 From: Juga Paazmaya Date: Thu, 28 Nov 2024 08:20:50 +0200 Subject: [PATCH] Run cargo fmt and start creating unit tests --- Cargo.lock | 157 +++++++++++++++++++++++- Cargo.toml | 3 + src/main.rs | 348 +++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 459 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2f1359..4092bdd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,6 +49,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + [[package]] name = "blake3" version = "0.3.8" @@ -58,7 +64,7 @@ dependencies = [ "arrayref", "arrayvec", "cc", - "cfg-if", + "cfg-if 0.1.10", "constant_time_eq", "crypto-mac", "digest", @@ -76,6 +82,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "clap" version = "2.34.0" @@ -84,7 +96,7 @@ checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ "ansi_term", "atty", - "bitflags", + "bitflags 1.3.2", "strsim", "textwrap", "unicode-width", @@ -116,6 +128,22 @@ dependencies = [ "generic-array", ] +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "fastrand" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" + [[package]] name = "fddf" version = "1.7.0" @@ -127,6 +155,7 @@ dependencies = [ "regex", "scoped_threadpool", "structopt", + "tempfile", "unbytify", "walkdir", ] @@ -185,9 +214,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.126" +version = "0.2.166" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" +checksum = "c2ccc108bbc0b1331bd061864e7cd823c0cab660bbe6970e66e2c0614decde36" + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "memchr" @@ -205,6 +240,12 @@ dependencies = [ "libc", ] +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -264,6 +305,19 @@ version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" +[[package]] +name = "rustix" +version = "0.38.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + [[package]] name = "same-file" version = "1.0.6" @@ -326,6 +380,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +dependencies = [ + "cfg-if 1.0.0", + "fastrand", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -421,3 +488,85 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index 32580d4..ec701ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,3 +21,6 @@ fnv = "1.0.7" unbytify = "0.2.0" regex = "1.5.6" glob = "0.3.0" + +[dev-dependencies] +tempfile = "3.2.0" diff --git a/src/main.rs b/src/main.rs index c148b5c..88925e0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,16 +1,16 @@ +use blake3::Hasher; +use fnv::{FnvHashMap as HashMap, FnvHashSet as HashSet}; +use glob::Pattern; +use regex::Regex; +use std::collections::hash_map::Entry; use std::fs::{File, Metadata}; -use std::io::{self, Read, Write, Seek, SeekFrom}; +use std::io::{self, Read, Seek, SeekFrom, Write}; +#[cfg(unix)] +use std::os::unix::fs::MetadataExt; use std::path::PathBuf; -use std::sync::mpsc::{Sender, channel}; -use std::collections::hash_map::Entry; +use std::sync::mpsc::{channel, Sender}; use structopt::StructOpt; -use fnv::{FnvHashMap as HashMap, FnvHashSet as HashSet}; -use blake3::Hasher; use walkdir::{DirEntry, WalkDir}; -#[cfg(unix)] -use std::os::unix::fs::MetadataExt; -use regex::Regex; -use glob::Pattern; fn err(path: &PathBuf, err: io::Error) { eprintln!("Error processing file {}: {}", path.display(), err); @@ -71,8 +71,14 @@ struct SlowCandidate { impl Candidate for FastCandidate { fn read_block(&mut self) -> Result { match self.file.read(&mut self.buf) { - Ok(n) => { self.n = n; Ok(n) }, - Err(e) => { err(&self.path, e); Err(()) } + Ok(n) => { + self.n = n; + Ok(n) + } + Err(e) => { + err(&self.path, e); + Err(()) + } } } @@ -88,12 +94,18 @@ impl Candidate for FastCandidate { impl Candidate for SlowCandidate { fn read_block(&mut self) -> Result { match File::open(&self.path).and_then(|mut f| { - f.seek(SeekFrom::Start(self.pos as u64)).and_then(|_| { - f.read(&mut self.buf) - }) + f.seek(SeekFrom::Start(self.pos as u64)) + .and_then(|_| f.read(&mut self.buf)) }) { - Ok(n) => { self.n = n; self.pos += n; Ok(n) }, - Err(e) => { err(&self.path, e); Err(()) } + Ok(n) => { + self.n = n; + self.pos += n; + Ok(n) + } + Err(e) => { + err(&self.path, e); + Err(()) + } } } @@ -131,13 +143,16 @@ fn compare_files_inner(fsize: u64, mut todo: Vec, tx: &DupeSend // If we're at EOF, all remaining are dupes. Ok(0) => break 'outer, // If an error occurs, do not process this file further. - Err(_) => { todo.remove(i); } - _ => () + Err(_) => { + todo.remove(i); + } + _ => (), } } } // We are finished and have more than one file in the candidate list. - tx.send((fsize, todo.into_iter().map(Candidate::into_path).collect())).unwrap(); + tx.send((fsize, todo.into_iter().map(Candidate::into_path).collect())) + .unwrap(); } fn compare_files(verbose: bool, fsize: u64, paths: Vec, tx: DupeSender) { @@ -149,23 +164,38 @@ fn compare_files(verbose: bool, fsize: u64, paths: Vec, tx: DupeSender) // If there are too many candidates, we cannot process them opening all // files at the same time. if paths.len() < 100 { - let todo = paths.into_iter().filter_map(|p| { - match File::open(&p) { - Ok(f) => Some(FastCandidate { path: p, file: f, buf: [0u8; BLOCKSIZE], n: 0 }), - Err(e) => { err(&p, e); None } - } - }).collect(); + let todo = paths + .into_iter() + .filter_map(|p| match File::open(&p) { + Ok(f) => Some(FastCandidate { + path: p, + file: f, + buf: [0u8; BLOCKSIZE], + n: 0, + }), + Err(e) => { + err(&p, e); + None + } + }) + .collect(); compare_files_inner(fsize, todo, &tx); } else { - let todo = paths.into_iter().map(|p| { - SlowCandidate { path: p, pos: 0, buf: [0u8; BLOCKSIZE], n: 0 } - }).collect(); + let todo = paths + .into_iter() + .map(|p| SlowCandidate { + path: p, + pos: 0, + buf: [0u8; BLOCKSIZE], + n: 0, + }) + .collect(); compare_files_inner(fsize, todo, &tx); } } #[derive(StructOpt)] -#[structopt(about="A parallel duplicate file finder.")] +#[structopt(about = "A parallel duplicate file finder.")] struct Args { #[structopt(short="m", default_value="1", parse(try_from_str=unbytify::unbytify), help="Minimum file size to consider")] @@ -173,36 +203,62 @@ struct Args { #[structopt(short="M", parse(try_from_str=unbytify::unbytify), help="Maximum file size to consider")] maxsize: Option, - #[structopt(short="H", help="Exclude Unix hidden files (names starting with dot)")] + #[structopt( + short = "H", + help = "Exclude Unix hidden files (names starting with dot)" + )] nohidden: bool, - #[structopt(short="S", help="Don't scan recursively in directories?")] + #[structopt(short = "S", help = "Don't scan recursively in directories?")] nonrecursive: bool, - #[structopt(short="t", help="Report a grand total of duplicates?")] + #[structopt(short = "t", help = "Report a grand total of duplicates?")] grandtotal: bool, - #[structopt(short="s", help="Report dupes on a single line?")] + #[structopt(short = "s", help = "Report dupes on a single line?")] singleline: bool, - #[structopt(short="v", help="Verbose operation?")] + #[structopt(short = "v", help = "Verbose operation?")] verbose: bool, - #[structopt(short="0", help="With -s, separate dupes with NUL, replace newline with two NULs")] + #[structopt( + short = "0", + help = "With -s, separate dupes with NUL, replace newline with two NULs" + )] nul: bool, - #[structopt(short="f", help="Check only filenames matching this pattern", group="patterns")] + #[structopt( + short = "f", + help = "Check only filenames matching this pattern", + group = "patterns" + )] pattern: Option, - #[structopt(short="F", help="Check only filenames matching this regexp", group="patterns")] + #[structopt( + short = "F", + help = "Check only filenames matching this regexp", + group = "patterns" + )] regexp: Option, - #[structopt(help="Root directory or directories to search")] + #[structopt(help = "Root directory or directories to search")] roots: Vec, } fn is_hidden_file(entry: &DirEntry) -> bool { - entry.file_name() + entry + .file_name() .to_str() .map(|s| s.starts_with(".")) .unwrap_or(false) } fn main() { - let Args { minsize, maxsize, verbose, singleline, grandtotal, nohidden, - nonrecursive, nul, pattern, regexp, roots } = Args::from_args(); + let Args { + minsize, + maxsize, + verbose, + singleline, + grandtotal, + nohidden, + nonrecursive, + nul, + pattern, + regexp, + roots, + } = Args::from_args(); let maxsize = maxsize.unwrap_or(u64::max_value()); enum Select { @@ -256,13 +312,16 @@ fn main() { let hashref = &mut hashes; scope.execute(move || { for (size, path, hash) in rx.iter() { - hashref.entry((size, hash)).or_insert_with(Vec::new).push(path); + hashref + .entry((size, hash)) + .or_insert_with(Vec::new) + .push(path); } }); enum Found { One(PathBuf), - Multiple + Multiple, } // Processing a single file entry, with the "sizes" hashmap collecting @@ -290,7 +349,11 @@ fn main() { // The main thread just walks and filters the directory tree. Symlinks // are uninteresting and ignored. - let roots = if roots.is_empty() { vec![".".into()] } else { roots }; + let roots = if roots.is_empty() { + vec![".".into()] + } else { + roots + }; for root in roots { let walkdir = if nonrecursive { WalkDir::new(root).max_depth(1).follow_links(false) @@ -306,7 +369,9 @@ fn main() { let fsize = meta.len(); if fsize >= minsize && fsize <= maxsize { if check_inode(&mut inodes, &meta) { - if !hidden_excluded(&dir_entry) && matches_pattern(&dir_entry) { + if !hidden_excluded(&dir_entry) + && matches_pattern(&dir_entry) + { process(fsize, dir_entry); } } @@ -394,3 +459,196 @@ fn main() { println!(" {:.1} {} of space taken by duplicates", val, suffix); } } + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::{self, File}; + use std::io::Write; + use std::path::PathBuf; + use tempfile::tempdir; + + #[test] + fn test_hash_file_inner() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test_file"); + let mut file = File::create(&file_path).unwrap(); + writeln!(file, "This is a test file").unwrap(); + file.flush().unwrap(); + + let hash = hash_file_inner(&file_path).unwrap(); + assert_eq!(hash.len(), 32, "Blake3 hash should be 32 bytes long."); + } + + #[test] + fn test_candidate_trait_fast_candidate() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test_file"); + let mut file = File::create(&file_path).unwrap(); + writeln!(file, "Fast candidate test").unwrap(); + file.flush().unwrap(); + + let file = File::open(&file_path).unwrap(); + let mut fast_candidate = FastCandidate { + path: file_path.clone(), + file, + buf: [0u8; BLOCKSIZE], + n: 0, + }; + + assert!(fast_candidate.read_block().is_ok()); + assert_eq!( + fast_candidate.buf[..fast_candidate.n], + b"Fast candidate test\n"[..], + "Read data should match file content." + ); + } + + #[test] + fn test_candidate_trait_slow_candidate() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test_file"); + let mut file = File::create(&file_path).unwrap(); + writeln!(file, "Slow candidate test").unwrap(); + file.flush().unwrap(); + + let mut slow_candidate = SlowCandidate { + path: file_path.clone(), + pos: 0, + buf: [0u8; BLOCKSIZE], + n: 0, + }; + + assert!(slow_candidate.read_block().is_ok()); + assert_eq!( + slow_candidate.buf[..slow_candidate.n], + b"Slow candidate test\n"[..], + "Read data should match file content." + ); + } + + #[test] + fn test_is_hidden_file() { + let temp_dir = tempdir().unwrap(); + let hidden_file_path = temp_dir.path().join(".hidden_file"); + File::create(&hidden_file_path).unwrap(); + + let entry = walkdir::WalkDir::new(temp_dir.path()) + .into_iter() + .filter_map(Result::ok) + .find(|e| e.file_name().to_str().unwrap() == ".hidden_file") + .unwrap(); + + assert!(is_hidden_file(&entry), "Hidden file should be detected."); + } + + #[test] + fn test_non_hidden_file() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("visible_file"); + File::create(&file_path).unwrap(); + + let entry = walkdir::WalkDir::new(temp_dir.path()) + .into_iter() + .filter_map(Result::ok) + .find(|e| e.file_name().to_str().unwrap() == "visible_file") + .unwrap(); + + assert!( + !is_hidden_file(&entry), + "Visible file should not be marked as hidden." + ); + } + + #[test] + fn test_compare_files_inner_no_difference() { + let temp_dir = tempdir().unwrap(); + let file_path1 = temp_dir.path().join("file1"); + let file_path2 = temp_dir.path().join("file2"); + + // Write identical content + fs::write(&file_path1, b"Identical content").unwrap(); + fs::write(&file_path2, b"Identical content").unwrap(); + + let mut fast_candidate1 = FastCandidate { + path: file_path1.clone(), + file: File::open(&file_path1).unwrap(), + buf: [0u8; BLOCKSIZE], + n: 0, + }; + + let mut fast_candidate2 = FastCandidate { + path: file_path2.clone(), + file: File::open(&file_path2).unwrap(), + buf: [0u8; BLOCKSIZE], + n: 0, + }; + + let mut candidates = vec![fast_candidate1, fast_candidate2]; + let (tx, rx) = channel(); + + compare_files_inner(16, candidates, &tx); + + let result = rx.recv().unwrap(); + assert_eq!( + result.1.len(), + 2, + "Both files should be considered duplicates." + ); + assert!(result.1.contains(&file_path1)); + assert!(result.1.contains(&file_path2)); + } + + #[test] + fn test_compare_files_inner_with_difference() { + let temp_dir = tempdir().unwrap(); + let file_path1 = temp_dir.path().join("file1"); + let file_path2 = temp_dir.path().join("file2"); + + // Write different content + fs::write(&file_path1, b"File 1 content").unwrap(); + fs::write(&file_path2, b"Different content").unwrap(); + + let mut fast_candidate1 = FastCandidate { + path: file_path1.clone(), + file: File::open(&file_path1).unwrap(), + buf: [0u8; BLOCKSIZE], + n: 0, + }; + + let mut fast_candidate2 = FastCandidate { + path: file_path2.clone(), + file: File::open(&file_path2).unwrap(), + buf: [0u8; BLOCKSIZE], + n: 0, + }; + + let mut candidates = vec![fast_candidate1, fast_candidate2]; + let (tx, rx) = channel(); + + compare_files_inner(16, candidates, &tx); + + assert!( + rx.try_recv().is_err(), + "Files with different content should not be considered duplicates." + ); + } + + #[test] + fn test_hash_file_inner_success() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test.txt"); + let mut file = File::create(&file_path).unwrap(); + writeln!(file, "Test content").unwrap(); + + let hash = hash_file_inner(&file_path).unwrap(); + assert!(!hash.is_empty()); + } + + #[test] + fn test_hash_file_inner_error() { + let invalid_path = PathBuf::from("/invalid/path/test.txt"); + let result = hash_file_inner(&invalid_path); + assert!(result.is_err()); + } +}