diff --git a/Cargo.lock b/Cargo.lock index 31214ae..8de1e5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -123,6 +123,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "dyn-clone" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21e50f3adc76d6a43f5ed73b698a87d0760ca74617f60f7c3b879003536fdd28" + [[package]] name = "fnv" version = "1.0.7" @@ -240,13 +246,22 @@ checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" [[package]] name = "os_str_bytes" -version = "6.0.0" +version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" +checksum = "029d8d0b2f198229de29dca79676f2738ff952edf3fde542eb8bf94d8c21b435" dependencies = [ "memchr", ] +[[package]] +name = "patricia_tree" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c4b8ef84caee22395fa083b7d8ee9351e71cdf69a46c832528acdcac402117" +dependencies = [ + "bitflags", +] + [[package]] name = "ppv-lite86" version = "0.2.15" @@ -379,7 +394,10 @@ dependencies = [ "atty", "clap", "colored", + "dyn-clone", "ignore", + "os_str_bytes", + "patricia_tree", "regex", "tempfile", ] diff --git a/Cargo.toml b/Cargo.toml index a9c7a78..15b25ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,8 +18,11 @@ anyhow = "1.0.32" atty = "0.2.14" clap = { version = "3.0.7", features = ["derive"] } colored = "2.0" +dyn-clone = "1.0.5" ignore = "0.4" Inflector = "0.11" +os_str_bytes = "6.0.1" +patricia_tree = "0.3.1" regex = "1.5.1" diff --git a/src/app.rs b/src/app.rs index e27e757..8c6ef3a 100644 --- a/src/app.rs +++ b/src/app.rs @@ -68,7 +68,7 @@ struct Options { parse(from_os_str), help = "The source path. Defaults to the working directory" )] - path: Option, + paths: Vec, #[clap( long = "--no-regex", @@ -176,7 +176,7 @@ pub fn run() -> Result<()> { ignored, ignored_file_types, no_regex, - path, + mut paths, pattern, replacement, selected_file_types, @@ -217,11 +217,13 @@ pub fn run() -> Result<()> { ignored_file_types, }; - let path = path.unwrap_or_else(|| Path::new(".").to_path_buf()); - if path == PathBuf::from("-") { + if paths.is_empty() { + paths.push(Path::new(".").to_path_buf()); + } + if paths.len() == 1 && paths.first().unwrap() == &PathBuf::from("-") { run_on_stdin(query) } else { - run_on_directory(console, path, settings, query) + run_on_directory(console, paths, settings, query) } } @@ -241,12 +243,16 @@ fn run_on_stdin(query: Query) -> Result<()> { fn run_on_directory( console: Console, - path: PathBuf, + paths: Vec, settings: Settings, query: Query, ) -> Result<()> { let dry_run = settings.dry_run; - let mut directory_patcher = DirectoryPatcher::new(&console, &path, &settings); + let mut directory_patcher = DirectoryPatcher::new( + &console, + Box::new(paths.iter().map(|p| -> &Path { &*p })), + &settings, + ); directory_patcher.run(&query)?; let stats = directory_patcher.stats(); if stats.total_replacements() == 0 { diff --git a/src/directory_patcher.rs b/src/directory_patcher.rs index f88b1d5..1c80990 100644 --- a/src/directory_patcher.rs +++ b/src/directory_patcher.rs @@ -1,5 +1,10 @@ -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; +use dyn_clone::DynClone; +use ignore::WalkState; +use std::fmt::Debug; +use std::fs; use std::path::Path; +use std::sync::{Arc, Mutex}; use crate::console::Console; use crate::file_patcher::FilePatcher; @@ -7,6 +12,10 @@ use crate::query::Query; use crate::settings::Settings; use crate::stats::Stats; +use self::path_deduplicator::PathDeduplicator; + +mod path_deduplicator; + #[derive(Debug)] /// Used to run replacement query on every text file present in a given path /// ```rust @@ -29,22 +38,32 @@ use crate::stats::Stats; // Note: keep the dry_run: true in the doc test above or the integration test // will fail ... pub struct DirectoryPatcher<'a> { - path: &'a Path, + paths: Box + 'a>, settings: &'a Settings, console: &'a Console, stats: Stats, } +pub trait PathsIter<'a> +where + Self: Debug + DynClone + Iterator + Send, +{ +} + +dyn_clone::clone_trait_object!(<'a> PathsIter<'a>); + +impl<'a, T> PathsIter<'a> for T where Self: Debug + DynClone + Iterator + Send + 'a {} + impl<'a> DirectoryPatcher<'a> { pub fn new( console: &'a Console, - path: &'a Path, + paths: Box + 'a>, settings: &'a Settings, ) -> DirectoryPatcher<'a> { let stats = Stats::default(); DirectoryPatcher { console, - path, + paths, settings, stats, } @@ -53,41 +72,74 @@ impl<'a> DirectoryPatcher<'a> { /// Run the given query on the selected files in self.path pub fn run(&mut self, query: &Query) -> Result<()> { let walker = self.build_walker()?; - for entry in walker { - let entry = entry.with_context(|| "Could not read directory entry")?; - if let Some(file_type) = entry.file_type() { - if file_type.is_file() { - self.patch_file(entry.path(), query)?; + let mut error_happened = Arc::new(Mutex::new(false)); + walker.run(|| { + let error_happened = error_happened.clone(); + let console = self.console; + let stats = &self.stats; + let settings = &self.settings; + Box::new(move |entry| -> WalkState { + let res = (|| -> Result<()> { + let entry = entry.with_context(|| "Could not read directory entry")?; + if let Some(file_type) = entry.file_type() { + if file_type.is_file() { + Self::patch_file(console, stats, settings, entry.path(), query)?; + } + } + Ok(()) + })(); + + match res { + Ok(()) => WalkState::Continue, + Err(e) => { + *error_happened.lock().unwrap() = true; + console.print_error(&format!("{:?}", e)); + WalkState::Quit + } } - } + }) + }); + let error_happened = *Arc::get_mut(&mut error_happened) + .expect("no references to error flag expected after dir walking") + .get_mut() + .unwrap(); + if error_happened { + bail!("one or more directory walking operations failed, see above for more details") + } else { + Ok(()) } - Ok(()) } pub fn stats(self) -> Stats { self.stats } - pub(crate) fn patch_file(&mut self, entry: &Path, query: &Query) -> Result<()> { - let file_patcher = FilePatcher::new(self.console, entry, query)?; + pub(crate) fn patch_file( + console: &Console, + stats: &Stats, + settings: &Settings, + entry: &Path, + query: &Query, + ) -> Result<()> { + let file_patcher = FilePatcher::new(console, entry, query)?; let file_patcher = match file_patcher { None => return Ok(()), Some(f) => f, }; let num_replacements = file_patcher.num_replacements(); if num_replacements != 0 { - self.console.print_message("\n"); + console.print_message("\n"); } let num_lines = file_patcher.num_lines(); - self.stats.update(num_lines, num_replacements); - if self.settings.dry_run { + stats.update(num_lines, num_replacements); + if settings.dry_run { return Ok(()); } file_patcher.run()?; Ok(()) } - fn build_walker(&self) -> Result { + fn build_walker(&self) -> Result { let mut types_builder = ignore::types::TypesBuilder::new(); types_builder.add_defaults(); let mut count: u32 = 0; @@ -116,7 +168,19 @@ impl<'a> DirectoryPatcher<'a> { } } let types_matcher = types_builder.build()?; - let mut walk_builder = ignore::WalkBuilder::new(&self.path); + + let mut paths = self.paths.clone(); + + let mut walk_builder = ignore::WalkBuilder::new( + paths + .next() + .expect("internal error: expected at least one path"), + ); + + for path in paths { + walk_builder.add(path); + } + walk_builder.types(types_matcher); // Note: the walk_builder configures the "ignore" settings of the Walker, // hence the negations @@ -126,6 +190,54 @@ impl<'a> DirectoryPatcher<'a> { if self.settings.hidden { walk_builder.hidden(false); } - Ok(walk_builder.build()) + + let path_deduplicator = Arc::new(Mutex::new(PathDeduplicator::new())); + walk_builder.filter_entry(move |dir_entry| { + fs::canonicalize(dir_entry.path()).map_or(false, |abs_path_buf| { + let was_not_seen_before = + path_deduplicator.lock().unwrap().insert_path(&abs_path_buf); + was_not_seen_before + }) + }); + + // NOTE(erichdongubler): `walkdir` parallel API lets us do fancy things like skipping + // duplicate canonicalized entries, which we absolutely need with specification of multiple + // paths for walking. However, we only use a single thread here for now because there's + // a few issues to tackle with enabling multiple threads: + // + // - Blocker: Console printing is only synchronized for individual write operations -- + // which is not good enough when we have entire blocks of output that need to stay + // together. + // - Minor: Design thinking should be given to the following issues: + // - Errors for any reason in old logic halted the entire search-and-replace operation. + // However, we can no [longer guarantee that there won't be a few straggler + // operations][stragglers] before `walkdir` quits its iteration flow when we encounter + // an error. + // + // [stragglers]: https://docs.rs/ignore/latest/ignore/enum.WalkState.html#variant.Quit + // + // This is probably fine to just note as expected behavior. However, we also have + // an opportunity for user-defined behavior for this! + // + // Idea: expose a `--on-replace-error=(report-and-continue|ignore|stop)` flag. + // - How to expose the number of threads used to a user? + // - Idea: Stick to 1 by default for now, to keep behavior until a new breaking version. + // - Idea: allow specifying non-zero number, or "max", ex. `--num-threads=(<1..n>|max)`, + // `-j` for short like with many other *nix tools. + // - How to print results -- the first alternatives that come to mind are: + // - Approach 1: Print replacement reports per file one-at-a-time, queueing them once + // the entire file is processed. + // - Approach 2: Print replacement reports per file one-at-a-time, queueing individual + // line replacements as they are processed. + // - Pro: Slightly more responsive UI? + // - Con: Once a file is picked for reporting, even finished reports can't print + // until the picked one is done. This could have some pathological UX compared to + // approach 1. + // - Approach 3: Print replacements reports after all of them have been received. + // - Almost certainly not what we want -- the most memory and waiting to show the + // user something. + walk_builder.threads(1); + + Ok(walk_builder.build_parallel()) } } diff --git a/src/directory_patcher/path_deduplicator.rs b/src/directory_patcher/path_deduplicator.rs new file mode 100644 index 0000000..5cc8f08 --- /dev/null +++ b/src/directory_patcher/path_deduplicator.rs @@ -0,0 +1,22 @@ +use std::path::Path; + +use os_str_bytes::RawOsStr; +use patricia_tree::PatriciaSet; + +#[derive(Debug, Default)] +pub struct PathDeduplicator { + set: PatriciaSet, +} + +impl PathDeduplicator { + pub fn new() -> Self { + Self::default() + } + + // Returns `true` if the given `path` was called for this instance before. + pub fn insert_path(&mut self, path: &Path) -> bool { + let Self { set } = self; + let raw = RawOsStr::new(path.as_os_str()); + set.insert(raw.as_raw_bytes()) + } +} diff --git a/src/stats.rs b/src/stats.rs index 04b31a0..ba14487 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -1,36 +1,40 @@ +use std::sync::atomic::{self, AtomicUsize}; + use inflector::string::pluralize::to_plural; #[derive(Default, Debug)] /// Statistics about a run of DirectoryPatcher pub struct Stats { - matching_files: usize, - matching_lines: usize, - total_replacements: usize, + matching_files: AtomicUsize, + matching_lines: AtomicUsize, + total_replacements: AtomicUsize, } impl Stats { - pub(crate) fn update(&mut self, lines: usize, replacements: usize) { + pub(crate) fn update(&self, lines: usize, replacements: usize) { if replacements == 0 { return; } - self.matching_files += 1; - self.matching_lines += lines; - self.total_replacements += replacements; + self.matching_files.fetch_add(1, atomic::Ordering::SeqCst); + self.matching_lines + .fetch_add(lines, atomic::Ordering::SeqCst); + self.total_replacements + .fetch_add(replacements, atomic::Ordering::SeqCst); } /// Number of matching files pub fn matching_files(&self) -> usize { - self.matching_files + self.matching_files.load(atomic::Ordering::SeqCst) } /// Total number of lines that were replaced pub fn matching_lines(&self) -> usize { - self.matching_lines + self.matching_lines.load(atomic::Ordering::SeqCst) } /// Total number of lines that were replaced pub fn total_replacements(&self) -> usize { - self.total_replacements + self.total_replacements.load(atomic::Ordering::SeqCst) } } @@ -44,12 +48,15 @@ fn pluralize(input: &str, num: usize) -> String { impl std::fmt::Display for Stats { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let file_string = pluralize("file", self.matching_files); - let replacements_string = pluralize("replacement", self.total_replacements); + let file_string = pluralize("file", self.matching_files()); + let replacements_string = pluralize("replacement", self.total_replacements()); write!( f, "{} {} on {} matching {}", - self.total_replacements, replacements_string, self.matching_files, file_string + self.total_replacements(), + replacements_string, + self.matching_files(), + file_string ) } } @@ -61,17 +68,17 @@ mod tests { #[test] fn test_stats_to_string() { let stats = Stats { - matching_files: 2, - total_replacements: 4, - matching_lines: 1, + matching_files: AtomicUsize::new(2), + total_replacements: AtomicUsize::new(4), + matching_lines: AtomicUsize::new(1), }; let actual = stats.to_string(); assert_eq!(actual, "4 replacements on 2 matching files"); let stats = Stats { - matching_files: 1, - total_replacements: 2, - matching_lines: 1, + matching_files: AtomicUsize::new(1), + total_replacements: AtomicUsize::new(2), + matching_lines: AtomicUsize::new(1), }; let actual = stats.to_string(); assert_eq!(actual, "2 replacements on 1 matching file"); diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 8027675..0f37d04 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -45,9 +45,13 @@ fn assert_not_replaced(path: &Path) { assert!(contents.contains("old")); } -fn run_ruplacer(data_path: &Path, settings: Settings) -> Result { +fn run_ruplacer(data_path: &[&Path], settings: Settings) -> Result { let console = Console::new(); - let mut directory_patcher = DirectoryPatcher::new(&console, data_path, &settings); + let mut directory_patcher = DirectoryPatcher::new( + &console, + Box::new(data_path.into_iter().cloned()), + &settings, + ); directory_patcher.run(&Query::substring("old", "new"))?; Ok(directory_patcher.stats()) } @@ -65,7 +69,7 @@ fn test_replace_old_by_new() { let data_path = setup_test(&tmp_dir); let settings = Settings::default(); - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); let top_txt_path = data_path.join("top.txt"); assert_replaced(&top_txt_path); @@ -80,7 +84,7 @@ fn test_stats() { let data_path = setup_test(&tmp_dir); let settings = Settings::default(); - let stats = run_ruplacer(&data_path, settings).unwrap(); + let stats = run_ruplacer(&[&data_path], settings).unwrap(); assert!(stats.matching_files() > 1); assert!(stats.total_replacements() > 1); } @@ -94,7 +98,7 @@ fn test_dry_run() { dry_run: true, ..Default::default() }; - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); let top_txt_path = data_path.join("top.txt"); assert_not_replaced(&top_txt_path); @@ -106,7 +110,7 @@ fn test_skip_hidden_and_ignored_by_default() { let data_path = setup_test(&tmp_dir); let settings = Settings::default(); - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); let hidden_path = data_path.join(".hidden.txt"); assert_not_replaced(&hidden_path); @@ -124,7 +128,7 @@ fn test_can_replace_hidden_files() { hidden: true, ..Default::default() }; - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); let hidden_path = data_path.join(".hidden.txt"); assert_replaced(&hidden_path); @@ -139,7 +143,7 @@ fn test_can_replace_ignored_files() { ignored: true, ..Default::default() }; - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); let ignored_path = data_path.join("ignore.txt"); assert_replaced(&ignored_path); @@ -153,7 +157,7 @@ fn test_skip_non_utf8_files() { fs::write(bin_path, b"caf\xef\n").unwrap(); let settings = Settings::default(); - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); } fn add_python_file(data_path: &Path) -> PathBuf { @@ -172,7 +176,7 @@ fn test_select_file_types() { selected_file_types: vec!["py".to_string()], ..Default::default() }; - let stats = run_ruplacer(&data_path, settings).unwrap(); + let stats = run_ruplacer(&[&data_path], settings).unwrap(); assert_eq!(stats.matching_files(), 1); } @@ -187,7 +191,7 @@ fn test_select_file_types_by_glob_pattern_1() { selected_file_types: vec!["*.py".to_string()], ..Default::default() }; - let stats = run_ruplacer(&data_path, settings).unwrap(); + let stats = run_ruplacer(&[&data_path], settings).unwrap(); assert_eq!(stats.matching_files(), 1); } @@ -202,7 +206,7 @@ fn test_select_file_types_by_glob_pattern_2() { selected_file_types: vec!["f*.py".to_string()], ..Default::default() }; - let stats = run_ruplacer(&data_path, settings).unwrap(); + let stats = run_ruplacer(&[&data_path], settings).unwrap(); assert_eq!(stats.matching_files(), 1); } @@ -216,7 +220,7 @@ fn test_select_file_types_by_incorrect_glob_pattern() { selected_file_types: vec!["[*.py".to_string()], ..Default::default() }; - let err = run_ruplacer(&data_path, settings).unwrap_err(); + let err = run_ruplacer(&[&data_path], settings).unwrap_err(); assert!(err.to_string().contains("error parsing glob")); } @@ -229,7 +233,7 @@ fn test_ignore_file_types() { ignored_file_types: vec!["py".to_string()], ..Default::default() }; - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); assert_not_replaced(&py_path); } @@ -243,7 +247,7 @@ fn test_ignore_file_types_by_glob_pattern_1() { ignored_file_types: vec!["*.py".to_string()], ..Default::default() }; - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); assert_not_replaced(&py_path); } @@ -257,7 +261,7 @@ fn test_ignore_file_types_by_glob_pattern_2() { ignored_file_types: vec!["f*.py".to_string()], ..Default::default() }; - run_ruplacer(&data_path, settings).unwrap(); + run_ruplacer(&[&data_path], settings).unwrap(); assert_not_replaced(&py_path); } @@ -270,6 +274,6 @@ fn test_ignore_file_types_by_incorrect_glob_pattern() { ignored_file_types: vec!["[.py".to_string()], ..Default::default() }; - let err = run_ruplacer(&data_path, settings).unwrap_err(); + let err = run_ruplacer(&[&data_path], settings).unwrap_err(); assert!(err.to_string().contains("unrecognized file type")); }