diff --git a/CHANGELOG.md b/CHANGELOG.md index 455addd..284e27d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +- feat: much faster (3x) implementation for common scenarios + ## [1.2.0] - 2024-01-01 - feat: new option --json to format output as JSON array diff --git a/Cargo.lock b/Cargo.lock index d02fb4b..42c53e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,13 +40,12 @@ dependencies = [ [[package]] name = "bstr" -version = "1.1.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45ea9b00a7b3f2988e9a65ad3917e62123c38dba709b666506207be96d1790b" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "once_cell", - "regex-automata 0.1.10", + "regex-automata", "serde", ] @@ -70,15 +69,9 @@ checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" - -[[package]] -name = "once_cell" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "pico-args" @@ -121,16 +114,10 @@ checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.5", + "regex-automata", "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-automata" version = "0.4.5" @@ -166,6 +153,8 @@ 
version = "1.2.0" dependencies = [ "anyhow", "assert_cmd", + "bstr", + "memchr", "pico-args", "predicates", "regex", diff --git a/Cargo.toml b/Cargo.toml index 05281f4..d83dada 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,8 @@ categories = ["command-line-utilities"] [dependencies] anyhow = "1.0.79" +bstr = "1.9.0" +memchr = "2.7.1" pico-args = { version = "0.5.0", features = ["short-space-opt", "combined-flags", "eq-separator"] } regex = { version = "1.10", default-features = false, features = ["std", "unicode-bool", "unicode-perl", "unicode-gencat"], optional = true } diff --git a/src/bin/tuc.rs b/src/bin/tuc.rs index 83b7092..a38e58e 100644 --- a/src/bin/tuc.rs +++ b/src/bin/tuc.rs @@ -1,10 +1,12 @@ use anyhow::Result; +use std::convert::TryFrom; use std::io::Write; use std::str::FromStr; use tuc::bounds::{BoundOrFiller, BoundsType, UserBoundsList}; use tuc::cut_bytes::read_and_cut_bytes; use tuc::cut_lines::read_and_cut_lines; use tuc::cut_str::read_and_cut_str; +use tuc::fast_lane::{read_and_cut_text_as_bytes, FastOpt}; use tuc::options::{Opt, EOL}; #[cfg(feature = "regex")] @@ -257,13 +259,15 @@ fn parse_args() -> Result { fn main() -> Result<()> { let opt: Opt = parse_args()?; - let mut stdin = std::io::BufReader::new(std::io::stdin().lock()); - let mut stdout = std::io::BufWriter::new(std::io::stdout().lock()); + let mut stdin = std::io::BufReader::with_capacity(64 * 1024, std::io::stdin().lock()); + let mut stdout = std::io::BufWriter::with_capacity(64 * 1024, std::io::stdout().lock()); if opt.bounds_type == BoundsType::Bytes { read_and_cut_bytes(&mut stdin, &mut stdout, &opt)?; } else if opt.bounds_type == BoundsType::Lines { read_and_cut_lines(&mut stdin, &mut stdout, &opt)?; + } else if let Ok(fast_opt) = FastOpt::try_from(&opt) { + read_and_cut_text_as_bytes(&mut stdin, &mut stdout, &fast_opt)?; } else { read_and_cut_str(&mut stdin, &mut stdout, opt)?; } diff --git a/src/bounds.rs b/src/bounds.rs index b190314..6258889 100644 --- 
a/src/bounds.rs +++ b/src/bounds.rs @@ -2,10 +2,10 @@ use anyhow::{bail, Result}; use std::cmp::Ordering; use std::convert::TryInto; use std::fmt; -use std::ops::Range; +use std::ops::{Deref, Range}; use std::str::FromStr; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, PartialEq)] pub enum BoundsType { Bytes, Characters, @@ -13,7 +13,7 @@ pub enum BoundsType { Lines, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub enum BoundOrFiller { Bound(UserBounds), Filler(String), @@ -98,6 +98,14 @@ pub fn parse_bounds_list(s: &str) -> Result> { #[derive(Debug)] pub struct UserBoundsList(pub Vec); +impl Deref for UserBoundsList { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + impl FromStr for UserBoundsList { type Err = anyhow::Error; fn from_str(s: &str) -> Result { @@ -106,7 +114,10 @@ impl FromStr for UserBoundsList { } impl UserBoundsList { - fn is_sortable(&self) -> bool { + /// Detect whether the list can be sorted. + /// It can be sorted only if every bound + /// has the same sign (all positive or all negative). + pub fn is_sortable(&self) -> bool { let mut has_positive_idx = false; let mut has_negative_idx = false; self.get_userbounds_only().for_each(|b| { @@ -168,6 +179,10 @@ impl UserBoundsList { }) } + /// Check if the bounds in the list match the following conditions: + /// - they are in ascending order + /// - they use solely positive indices + /// - they don't overlap (but they can be adjacent, e.g. 
1:2,2,3)
     pub fn is_forward_only(&self) -> bool {
         self.is_sortable() && self.is_sorted() && !self.has_negative_indices()
     }
@@ -220,6 +235,23 @@ impl fmt::Display for Side {
     }
 }
 
+impl PartialOrd for Side {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        match (self, other) {
+            (Side::Some(s), Side::Some(o)) => {
+                if s.signum() * o.signum() <= 0 { // signum(): `s * o` could overflow i32
+                    // We can't compare two sides with different sign
+                    return None;
+                }
+                Some(s.cmp(o))
+            }
+            (Side::Continue, Side::Some(_)) => Some(Ordering::Greater),
+            (Side::Some(_), Side::Continue) => Some(Ordering::Less),
+            (Side::Continue, Side::Continue) => Some(Ordering::Equal),
+        }
+    }
+}
+
 #[derive(Debug, Eq, Clone)]
 pub struct UserBounds {
     pub l: Side,
@@ -287,12 +319,15 @@ impl UserBounds {
         UserBounds { l, r }
     }
     /**
-     * Check if an index is between the bounds.
+     * Check if a field is between the bounds.
      *
      * It errors out if the index has different sign than the bounds
      * (we can't verify if e.g. -1 idx is between 3:5 without knowing the number
      * of matching bounds).
+     *
+     * Fields are 1-indexed.
      */
+    #[inline(always)]
     pub fn matches(&self, idx: i32) -> Result<bool> {
         match (self.l, self.r) {
             (Side::Some(left), _) if (left * idx).is_negative() => {
@@ -317,6 +352,68 @@ impl UserBounds {
         }
     }
 
+    /// Transform UserBounds into std::ops::Range
+    ///
+    /// UserBounds is 1-indexed and inclusive on both sides, while
+    /// the resulting range is 0-indexed and exclusive on the right side.
+    ///
+    /// `parts_length` is necessary to calculate Side::Continue on
+    /// the right side, or any negative indexes.
+    ///
+    /// e.g.
+ /// + /// ```rust + /// # use tuc::bounds::UserBounds; + /// # use std::ops::Range; + /// # use tuc::bounds::Side; + /// + /// assert_eq!( + /// (UserBounds { l: Side::Some(1), r: Side::Some(2) }).try_into_range(5).unwrap(), + /// Range { start: 0, end: 2} // 2, not 1, because it's exclusive + /// ); + /// + /// assert_eq!( + /// (UserBounds { l: Side::Some(1), r: Side::Continue }).try_into_range(5).unwrap(), + /// Range { start: 0, end: 5} + /// ); + /// ``` + pub fn try_into_range(&self, parts_length: usize) -> Result> { + let start: usize = match self.l { + Side::Continue => 0, + Side::Some(v) => { + if v.unsigned_abs() as usize > parts_length { + bail!("Out of bounds: {}", v); + } + if v < 0 { + parts_length - v.unsigned_abs() as usize + } else { + v as usize - 1 + } + } + }; + + let end: usize = match self.r { + Side::Continue => parts_length, + Side::Some(v) => { + if v.unsigned_abs() as usize > parts_length { + bail!("Out of bounds: {}", v); + } + if v < 0 { + parts_length - v.unsigned_abs() as usize + 1 + } else { + v as usize + } + } + }; + + if end <= start { + // `end` must always be 1 or more greater than start + bail!("Field left value cannot be greater than right value"); + } + + Ok(Range { start, end }) + } + /** * Transform a ranged bound into a list of one or more * 1 slot bound @@ -340,40 +437,20 @@ impl UserBounds { }; for i in start..=end { - bounds.push(UserBounds { - l: Side::Some(i), - r: Side::Some(i), - }) + bounds.push(UserBounds::new(Side::Some(i), Side::Some(i))) } bounds } } -impl Ord for UserBounds { - /* - * Compare UserBounds. Note that comparison gives wrong results if - * bounds happen to have a mix of positive/negative indexes (you cannot - * reliably compare -1 with 3 without kwowing how many parts are there). - * Check with UserBounds.is_sortable before comparing. 
-     */
-    fn cmp(&self, other: &Self) -> Ordering {
-        if self == other {
-            return Ordering::Equal;
-        }
-
-        match (self.l, self.r, other.l, other.r) {
-            (_, Side::Some(s_r), Side::Some(o_l), _) if (s_r * o_l).is_positive() && s_r <= o_l => {
-                Ordering::Less
-            }
-            _ => Ordering::Greater,
-        }
-    }
-}
-
 impl PartialOrd for UserBounds {
+    /// Compare UserBounds. Note that you cannot reliably compare
+    /// bounds with a mix of positive/negative indices (you cannot
+    /// compare `-1` with `3` without knowing how many parts are there).
+    /// Check with UserBounds.is_sortable before comparing.
     fn partial_cmp(&self, other: &Self) -> Option {
-        Some(self.cmp(other))
+        self.r.partial_cmp(&other.l)
     }
 }
 
@@ -389,43 +466,6 @@ impl Default for UserBounds {
     }
 }
 
-pub fn bounds_to_std_range(parts_length: usize, bounds: &UserBounds) -> Result> {
-    let start: usize = match bounds.l {
-        Side::Continue => 0,
-        Side::Some(v) => {
-            if v.unsigned_abs() as usize > parts_length {
-                bail!("Out of bounds: {}", v);
-            }
-            if v < 0 {
-                parts_length - v.unsigned_abs() as usize
-            } else {
-                v as usize - 1
-            }
-        }
-    };
-
-    let end: usize = match bounds.r {
-        Side::Continue => parts_length,
-        Side::Some(v) => {
-            if v.unsigned_abs() as usize > parts_length {
-                bail!("Out of bounds: {}", v);
-            }
-            if v < 0 {
-                parts_length - v.unsigned_abs() as usize + 1
-            } else {
-                v as usize
-            }
-        }
-    };
-
-    if end <= start {
-        // `end` must always be 1 or more greater than start
-        bail!("Field left value cannot be greater than right value");
-    }
-
-    Ok(Range { start, end })
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/cut_bytes.rs b/src/cut_bytes.rs
index 4ed594c..a80f20d 100644
--- a/src/cut_bytes.rs
+++ b/src/cut_bytes.rs
@@ -1,7 +1,7 @@
 use anyhow::Result;
 use std::io::{Read, Write};
 
-use crate::bounds::{bounds_to_std_range, BoundOrFiller};
+use crate::bounds::BoundOrFiller;
 use crate::options::Opt;
 use crate::read_utils::read_bytes_to_end;
 
@@ -13,7 +13,7 @@ fn cut_bytes(data: &[u8], opt: &Opt,
stdout: &mut W) -> Result<()> { opt.bounds.0.iter().try_for_each(|bof| -> Result<()> { let output = match bof { BoundOrFiller::Bound(b) => { - let r = bounds_to_std_range(data.len(), b)?; + let r = b.try_into_range(data.len())?; &data[r.start..r.end] } BoundOrFiller::Filler(f) => f.as_bytes(), diff --git a/src/cut_lines.rs b/src/cut_lines.rs index 6a38fa6..4a303c7 100644 --- a/src/cut_lines.rs +++ b/src/cut_lines.rs @@ -26,15 +26,15 @@ fn cut_lines_forward_only( // Print the matching fields. Fields are ordered but can still be // duplicated, e.g. 1-2,2,3 , so we may have to print the same // line multiple times - while bounds_idx < opt.bounds.0.len() { - let bof = opt.bounds.0.get(bounds_idx).unwrap(); + while bounds_idx < opt.bounds.len() { + let bof = opt.bounds.get(bounds_idx).unwrap(); let b = match bof { BoundOrFiller::Filler(f) => { stdout.write_all(f.as_bytes())?; bounds_idx += 1; - if opt.join && bounds_idx != opt.bounds.0.len() { + if opt.join && bounds_idx != opt.bounds.len() { stdout.write_all(&[opt.eol as u8])?; } @@ -57,7 +57,7 @@ fn cut_lines_forward_only( add_newline_next = false; // if opt.join and it was not the last matching bound - if opt.join && bounds_idx != opt.bounds.0.len() { + if opt.join && bounds_idx != opt.bounds.len() { stdout.write_all(&[opt.eol as u8])?; } @@ -68,14 +68,14 @@ fn cut_lines_forward_only( break; // nothing matched, let's go to the next line } - if bounds_idx == opt.bounds.0.len() { + if bounds_idx == opt.bounds.len() { // no need to read the rest, we don't have other bounds to test break; } } // Output is finished. Did we output every bound? 
- if let Some(BoundOrFiller::Bound(b)) = opt.bounds.0.get(bounds_idx) { + if let Some(BoundOrFiller::Bound(b)) = opt.bounds.get(bounds_idx) { if b.r != Side::Continue { // not good, we still have bounds to print but the input is exhausted bail!("Out of bounds: {}", b); diff --git a/src/cut_str.rs b/src/cut_str.rs index 3357ad7..3867d4a 100644 --- a/src/cut_str.rs +++ b/src/cut_str.rs @@ -2,9 +2,7 @@ use anyhow::{bail, Result}; use std::io::{BufRead, Write}; use std::ops::Range; -use crate::bounds::{ - bounds_to_std_range, BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList, -}; +use crate::bounds::{BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList}; use crate::json::escape_json; use crate::options::{Opt, Trim}; use crate::read_utils::read_line_with_eol; @@ -27,13 +25,19 @@ fn complement_std_range(parts_length: usize, r: &Range) -> Vec>, line: &str, delimiter: &str, greedy: bool) { +/// Split a string into parts and fill a buffer with ranges +/// that match those parts. 
+/// +/// - `buffer` - vector that will be filled with ranges +/// - `line` - the string to split +/// - `delimiter` - what to search to split the string +/// - `greedy` - whether to consider consecutive delimiters as one or not +fn fill_with_fields_locations( + buffer: &mut Vec>, + line: &str, + delimiter: &str, + greedy: bool, +) { buffer.clear(); if line.is_empty() { @@ -193,7 +197,7 @@ pub fn cut_str( line: &str, opt: &Opt, stdout: &mut W, - bounds_as_ranges: &mut Vec>, + fields: &mut Vec>, compressed_line_buf: &mut String, eol: &[u8], ) -> Result<()> { @@ -260,7 +264,7 @@ pub fn cut_str( if should_build_ranges_using_regex { #[cfg(feature = "regex")] build_ranges_vec_from_regex( - bounds_as_ranges, + fields, line, if opt.greedy_delimiter { &opt.regex_bag.as_ref().unwrap().greedy @@ -269,18 +273,20 @@ pub fn cut_str( }, ); } else { - build_ranges_vec(bounds_as_ranges, line, delimiter, opt.greedy_delimiter); + fill_with_fields_locations(fields, line, delimiter, opt.greedy_delimiter); } - if opt.bounds_type == BoundsType::Characters && bounds_as_ranges.len() > 2 { + if opt.bounds_type == BoundsType::Characters && fields.len() > 2 { // Unless the line is empty (which should have already been handled), // then the empty-string delimiter generated ranges alongside each // character, plus one at each boundary, e.g. _f_o_o_. We drop them. - bounds_as_ranges.pop(); - bounds_as_ranges.drain(..1); + fields.pop(); + fields.drain(..1); } - if opt.only_delimited && bounds_as_ranges.len() == 1 { + let num_fields = fields.len(); + + if opt.only_delimited && num_fields == 1 { // If there's only 1 field it means that there were no delimiters // and when used alogside `only_delimited` we must skip the line return Ok(()); @@ -302,7 +308,7 @@ pub fn cut_str( // rare usage). 
// Start by checking if we actually need to rewrite the bounds - if bounds.0.iter().any(|b| { + if bounds.iter().any(|b| { matches!( b, BoundOrFiller::Bound(UserBounds { @@ -312,18 +318,17 @@ pub fn cut_str( ) }) { // Yep, there at least a range bound. Let's do it - _bounds = bounds.unpack(bounds_as_ranges.len()); + _bounds = bounds.unpack(num_fields); bounds = &_bounds; } } - match bounds_as_ranges.len() { - 1 if bounds.0.len() == 1 => { + match num_fields { + 1 if bounds.len() == 1 => { write_maybe_as_json!(stdout, line, opt.json); } _ => { bounds - .0 .iter() .enumerate() .try_for_each(|(i, bof)| -> Result<()> { @@ -335,10 +340,10 @@ pub fn cut_str( BoundOrFiller::Bound(b) => b, }; - let mut r_array = vec![bounds_to_std_range(bounds_as_ranges.len(), b)?]; + let mut r_array = vec![b.try_into_range(num_fields)?]; if opt.complement { - r_array = complement_std_range(bounds_as_ranges.len(), &r_array[0]); + r_array = complement_std_range(num_fields, &r_array[0]); } if opt.json { @@ -356,14 +361,14 @@ pub fn cut_str( let n_ranges = r_array.len(); for (idx_r, r) in r_iter.enumerate() { - let idx_start = bounds_as_ranges[r.start].start; - let idx_end = bounds_as_ranges[r.end - 1].end; + let idx_start = fields[r.start].start; + let idx_end = fields[r.end - 1].end; let output = &line[idx_start..idx_end]; let field_to_print = maybe_replace_delimiter(output, opt); write_maybe_as_json!(stdout, field_to_print, opt.json); - if opt.join && !(i == bounds.0.len() - 1 && idx_r == n_ranges - 1) { + if opt.join && !(i == bounds.len() - 1 && idx_r == n_ranges - 1) { stdout.write_all( opt.replace_delimiter .as_ref() @@ -474,29 +479,29 @@ mod tests { // non greedy v_range.clear(); - build_ranges_vec(&mut v_range, "", "-", false); + fill_with_fields_locations(&mut v_range, "", "-", false); assert_eq!(v_range, vec![] as Vec>); v_range.clear(); - build_ranges_vec(&mut v_range, "a", "-", false); + fill_with_fields_locations(&mut v_range, "a", "-", false); assert_eq!(v_range, vec![Range { 
start: 0, end: 1 }]); v_range.clear(); - build_ranges_vec(&mut v_range, "-", "-", true); + fill_with_fields_locations(&mut v_range, "-", "-", true); assert_eq!( v_range, vec![Range { start: 0, end: 0 }, Range { start: 1, end: 1 }] ); v_range.clear(); - build_ranges_vec(&mut v_range, "a-b", "-", false); + fill_with_fields_locations(&mut v_range, "a-b", "-", false); assert_eq!( v_range, vec![Range { start: 0, end: 1 }, Range { start: 2, end: 3 }] ); v_range.clear(); - build_ranges_vec(&mut v_range, "-a-", "-", false); + fill_with_fields_locations(&mut v_range, "-a-", "-", false); assert_eq!( v_range, vec![ @@ -507,7 +512,7 @@ mod tests { ); v_range.clear(); - build_ranges_vec(&mut v_range, "a--", "-", false); + fill_with_fields_locations(&mut v_range, "a--", "-", false); assert_eq!( v_range, vec![ @@ -520,22 +525,22 @@ mod tests { // greedy v_range.clear(); - build_ranges_vec(&mut v_range, "", "-", true); + fill_with_fields_locations(&mut v_range, "", "-", true); assert_eq!(v_range, empty_vec); v_range.clear(); - build_ranges_vec(&mut v_range, "a", "-", true); + fill_with_fields_locations(&mut v_range, "a", "-", true); assert_eq!(v_range, vec![Range { start: 0, end: 1 }]); v_range.clear(); - build_ranges_vec(&mut v_range, "-", "-", true); + fill_with_fields_locations(&mut v_range, "-", "-", true); assert_eq!( v_range, vec![Range { start: 0, end: 0 }, Range { start: 1, end: 1 }] ); v_range.clear(); - build_ranges_vec(&mut v_range, "-a--b", "-", true); + fill_with_fields_locations(&mut v_range, "-a--b", "-", true); assert_eq!( v_range, vec![ @@ -546,7 +551,7 @@ mod tests { ); v_range.clear(); - build_ranges_vec(&mut v_range, "-a--", "-", true); + fill_with_fields_locations(&mut v_range, "-a--", "-", true); assert_eq!( v_range, vec![ @@ -579,24 +584,38 @@ mod tests { #[test] fn cut_str_echo_non_delimited_strings() { let opt = make_fields_opt(); - let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); let eol = &[EOL::Newline as u8]; let line = "foo"; + // 
non-empty line missing the delimiter + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); assert_eq!(output, b"foo\n".as_slice()); + + // empty line + let line = ""; + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); + cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); + assert_eq!(output, b"\n".as_slice()); } #[test] fn cut_str_skip_non_delimited_strings_when_requested() { let mut opt = make_fields_opt(); - let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); let eol = &[EOL::Newline as u8]; opt.only_delimited = true; + + // non-empty line missing the delimiter let line = "foo"; + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); + cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); + assert_eq!(output, b"".as_slice()); + // empty line + let line = ""; + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); assert_eq!(output, b"".as_slice()); } diff --git a/src/fast_lane.rs b/src/fast_lane.rs new file mode 100644 index 0000000..fb53939 --- /dev/null +++ b/src/fast_lane.rs @@ -0,0 +1,588 @@ +use crate::bounds::{BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList}; +use crate::options::{Opt, Trim, EOL}; +use anyhow::{bail, Result}; +use bstr::ByteSlice; +use std::convert::TryFrom; +use std::io::{self, BufRead}; +use std::ops::Deref; +use std::str::FromStr; +use std::{io::Write, ops::Range}; + +use bstr::io::BufReadExt; + +fn trim<'a>(buffer: &'a [u8], trim_kind: &Trim, delimiter: u8) -> &'a [u8] { + match trim_kind { + Trim::Both => buffer + .trim_start_with(|x| x == delimiter as char) + .trim_end_with(|x| x == delimiter as char), + Trim::Left => buffer.trim_start_with(|x| x == delimiter as char), + Trim::Right => buffer.trim_end_with(|x| x == delimiter as char), + } +} + 
+fn cut_str_fast_lane( + initial_buffer: &[u8], + opt: &FastOpt, + stdout: &mut W, + fields: &mut Vec>, + last_interesting_field: Side, +) -> Result<()> { + let mut buffer = initial_buffer; + + if opt.trim.is_some() { + buffer = trim(buffer, opt.trim.as_ref().unwrap(), opt.delimiter) + } + + if buffer.is_empty() { + if !opt.only_delimited { + stdout.write_all(&[opt.eol.into()])?; + } + return Ok(()); + } + + let bounds = &opt.bounds; + + let mut prev_field_start = 0; + + let mut curr_field = 0; + + fields.clear(); + + for i in memchr::memchr_iter(opt.delimiter, buffer) { + curr_field += 1; + + let (start, end) = (prev_field_start, i); // end exclusive + prev_field_start = i + 1; + + fields.push(Range { start, end }); + + if Side::Some(curr_field) == last_interesting_field { + // We have no use for any other fields in this line + break; + } + } + + if curr_field == 0 && opt.only_delimited { + // The delimiter was not found + return Ok(()); + } + + // After the last loop ended, everything remaining is the field + // after the last delimiter (we want it), or "useless" fields after the + // last one that the user is interested in (and we can ignore them). 
+    if Side::Some(curr_field) != last_interesting_field {
+        fields.push(Range {
+            start: prev_field_start,
+            end: buffer.len(),
+        });
+    }
+
+    let num_fields = fields.len();
+
+    match num_fields {
+        1 if bounds.len() == 1 && fields[0].end == buffer.len() => {
+            stdout.write_all(buffer)?;
+        }
+        _ => {
+            bounds
+                .iter()
+                .enumerate()
+                .try_for_each(|(bounds_idx, bof)| -> Result<()> {
+                    let b = match bof {
+                        BoundOrFiller::Filler(f) => {
+                            stdout.write_all(f.as_bytes())?;
+                            return Ok(());
+                        }
+                        BoundOrFiller::Bound(b) => b,
+                    };
+
+                    let is_last = bounds_idx == bounds.len() - 1;
+
+                    output_parts(buffer, b, fields, stdout, is_last, opt)
+                })?;
+        }
+    }
+
+    stdout.write_all(&[opt.eol.into()])?;
+
+    Ok(())
+}
+
+#[inline(always)]
+fn output_parts<W: Write>(
+    line: &[u8],
+    // which parts to print
+    b: &UserBounds,
+    // where to find the parts inside `line`
+    fields: &[Range<usize>],
+    stdout: &mut W,
+    is_last: bool,
+    opt: &FastOpt,
+) -> Result<()> {
+    let r = b.try_into_range(fields.len())?;
+
+    let idx_start = fields[r.start].start;
+    let idx_end = fields[r.end - 1].end;
+    let output = &line[idx_start..idx_end];
+
+    let field_to_print = output;
+    stdout.write_all(field_to_print)?;
+
+    if opt.join && !(is_last) {
+        stdout.write_all(&[opt.delimiter])?;
+    }
+
+    Ok(())
+}
+
+#[derive(Debug)]
+pub struct FastOpt {
+    delimiter: u8,
+    join: bool,
+    eol: EOL,
+    bounds: ForwardBounds,
+    only_delimited: bool,
+    trim: Option<Trim>,
+}
+
+impl Default for FastOpt {
+    fn default() -> Self {
+        Self {
+            delimiter: b'\t',
+            join: false,
+            eol: EOL::Newline,
+            bounds: ForwardBounds::try_from(&UserBoundsList::from_str("1:").unwrap()).unwrap(),
+            only_delimited: false,
+            trim: None,
+        }
+    }
+}
+
+impl TryFrom<&Opt> for FastOpt {
+    type Error = &'static str;
+
+    fn try_from(value: &Opt) -> Result<Self, Self::Error> {
+        if value.delimiter.as_bytes().len() != 1 { // the memchr scan needs exactly one delimiter byte
+            return Err("Delimiter must be 1 byte wide for FastOpt");
+        }
+
+        if value.complement
+            || value.greedy_delimiter
+            || value.compress_delimiter
+            || value.json
+            || value.bounds_type != BoundsType::Fields
+            || value.replace_delimiter.is_some()
+            || value.regex_bag.is_some()
+        {
+            return Err(
+                "FastOpt supports solely forward fields, join and single-character delimiters",
+            );
+        }
+
+        if let Ok(forward_bounds) = ForwardBounds::try_from(&value.bounds) {
+            Ok(FastOpt {
+                delimiter: value.delimiter.as_bytes().first().unwrap().to_owned(),
+                join: value.join,
+                eol: value.eol,
+                bounds: forward_bounds,
+                only_delimited: value.only_delimited,
+                trim: value.trim,
+            })
+        } else {
+            Err("Bounds cannot be converted to ForwardBounds")
+        }
+    }
+}
+
+#[derive(Debug)]
+struct ForwardBounds {
+    pub list: UserBoundsList,
+    // Optimization that we can use to stop searching for fields
+    // It's available only when every bound use positive indexes.
+    // When conditions do not apply, Side::Continue is used.
+    last_interesting_field: Side,
+}
+
+impl TryFrom<&UserBoundsList> for ForwardBounds {
+    type Error = anyhow::Error;
+
+    fn try_from(value: &UserBoundsList) -> Result<Self, Self::Error> {
+        if value.is_empty() {
+            bail!("Cannot create ForwardBounds from an empty UserBoundsList");
+        } else {
+            let value: UserBoundsList = UserBoundsList(value.iter().cloned().collect());
+
+            let mut rightmost_bound: Option<Side> = None;
+            if value.is_sortable() {
+                value.iter().for_each(|bof| {
+                    if let BoundOrFiller::Bound(b) = bof {
+                        if rightmost_bound.is_none() || b.r > rightmost_bound.unwrap() {
+                            rightmost_bound = Some(b.r);
+                        }
+                    }
+                });
+            }
+
+            Ok(ForwardBounds {
+                list: value,
+                last_interesting_field: rightmost_bound.unwrap_or(Side::Continue),
+            })
+        }
+    }
+}
+
+impl Deref for ForwardBounds {
+    type Target = UserBoundsList;
+
+    fn deref(&self) -> &Self::Target {
+        &self.list
+    }
+}
+
+impl ForwardBounds {
+    fn get_last_bound(&self) -> Side {
+        self.last_interesting_field
+    }
+}
+
+impl FromStr for ForwardBounds {
+    type Err = anyhow::Error;
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let bounds_list = UserBoundsList::from_str(s)?;
+        ForwardBounds::try_from(&bounds_list)
+    }
+}
+
+pub
fn read_and_cut_text_as_bytes( + stdin: &mut R, + stdout: &mut W, + opt: &FastOpt, +) -> Result<()> { + let mut fields: Vec> = Vec::with_capacity(16); + + let last_interesting_field = opt.bounds.get_last_bound(); + + match opt.eol { + EOL::Newline => stdin.for_byte_line(|line| { + cut_str_fast_lane(line, opt, stdout, &mut fields, last_interesting_field) + // XXX Should map properly the error + .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) + .and(Ok(true)) + })?, + EOL::Zero => stdin.for_byte_record(opt.eol.into(), |line| { + cut_str_fast_lane(line, opt, stdout, &mut fields, last_interesting_field) + // XXX Should map properly the error + .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) + .and(Ok(true)) + })?, + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use crate::options::Trim; + + use super::*; + + fn make_fields_opt() -> FastOpt { + FastOpt { + delimiter: b'-', + ..FastOpt::default() + } + } + + #[test] + fn test_read_and_cut_str_echo_non_delimited_strings() { + // read_and_cut_str is difficult to test, let's verify at least + // that it reads the input and appears to call cut_str + + let opt = make_fields_opt(); + let mut input = b"foo".as_slice(); + let mut output = Vec::new(); + read_and_cut_text_as_bytes(&mut input, &mut output, &opt).unwrap(); + assert_eq!(output, b"foo\n".as_slice()); + } + + fn make_cut_str_buffers() -> (Vec, Vec>) { + let output = Vec::new(); + let fields = Vec::new(); + (output, fields) + } + + #[test] + fn cut_str_echo_non_delimited_strings() { + let opt = make_fields_opt(); + + // non-empty line missing the delimiter + let line = b"foo"; + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"foo\n".as_slice()); + + // empty line + let line = b""; + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, 
+ &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"\n".as_slice()); + } + + #[test] + fn cut_str_skip_non_delimited_strings_when_requested() { + let mut opt = make_fields_opt(); + + opt.only_delimited = true; + + // non-empty line missing the delimiter + let line = b"foo"; + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"".as_slice()); + + // empty line + let line = b""; + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"".as_slice()); + } + + #[test] + fn cut_str_it_cut_a_field() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("1").unwrap(); + + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"a\n".as_slice()); + } + + #[test] + fn cut_str_it_cut_with_negative_indices() { + let mut opt = make_fields_opt(); + + let line = b"a-b-c"; + + // just one negative index + opt.bounds = ForwardBounds::from_str("-1").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"c\n".as_slice()); + + // multiple negative indices, in forward order + opt.bounds = ForwardBounds::from_str("-2,-1").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"bc\n".as_slice()); + + // multiple negative indices, in non-forward order + opt.bounds = 
ForwardBounds::from_str("-1,-2").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"cb\n".as_slice()); + + // mix positive and negative indices + // (this is particularly useful to verify that we don't screw + // up optimizations on last field to check) + opt.bounds = ForwardBounds::from_str("-1,1").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"ca\n".as_slice()); + } + + #[test] + fn cut_str_it_cut_consecutive_delimiters() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("1,3").unwrap(); + + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"ac\n".as_slice()); + } + + #[test] + fn cut_str_it_supports_zero_terminated_lines() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + opt.eol = EOL::Zero; + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("2").unwrap(); + + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"b\0".as_slice()); + } + + #[test] + fn cut_str_it_join_fields() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("1,3").unwrap(); + opt.join = true; + + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"a-c\n".as_slice()); + } + + #[test] + fn cut_str_it_format_fields() { + let mut opt = make_fields_opt(); + let (mut output, 
mut fields) = make_cut_str_buffers(); + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("{1} < {3} > {2}").unwrap(); + + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"a < c > b\n".as_slice()); + } + + #[test] + fn cut_str_it_trim_fields() { + let mut opt = make_fields_opt(); + let line = b"--a--b--c--"; + + // check Trim::Both + opt.trim = Some(Trim::Both); + opt.bounds = ForwardBounds::from_str("1,3,-1").unwrap(); + + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"abc\n".as_slice()); + + // check Trim::Left + opt.trim = Some(Trim::Left); + opt.bounds = ForwardBounds::from_str("1,3,-3").unwrap(); + + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"abc\n".as_slice()); + + // check Trim::Right + opt.trim = Some(Trim::Right); + opt.bounds = ForwardBounds::from_str("3,5,-1").unwrap(); + + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"abc\n".as_slice()); + } +} diff --git a/src/lib.rs b/src/lib.rs index 15edbd3..ce9df89 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ pub mod bounds; pub mod cut_bytes; pub mod cut_lines; pub mod cut_str; +pub mod fast_lane; mod json; pub mod options; mod read_utils; diff --git a/src/options.rs b/src/options.rs index 0f630e9..d4eefbf 100644 --- a/src/options.rs +++ b/src/options.rs @@ -19,6 +19,15 @@ pub enum EOL { Newline = 10, } +impl From for u8 { + fn from(value: EOL) -> Self { + match value { + EOL::Zero => b'\0', + EOL::Newline => b'\n', + } + } +} + #[derive(Debug)] pub struct Opt { pub 
delimiter: String,