From 785586445a7df900506a58951331f54f515b27d8 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Mon, 29 Jan 2024 21:46:46 -0800 Subject: [PATCH 01/22] Refactor bounds_to_std_range to try_into_range --- src/bounds.rs | 95 +++++++++++++++++++++++++++++------------------- src/cut_bytes.rs | 4 +- src/cut_str.rs | 30 +++++++-------- 3 files changed, 74 insertions(+), 55 deletions(-) diff --git a/src/bounds.rs b/src/bounds.rs index b190314..d62d6cf 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -317,6 +317,64 @@ impl UserBounds { } } + /// Transform UserBounds into std::opt::Range + /// + /// UserBounds is 1-indexed and inclusive on both sides, while + /// the resulting range is 0-indexed and exclusive on the right side. + /// + /// `parts_length` is necessary to calculate Side::Continue on + /// the right side, or any negative indexes. + /// + /// e.g. + /// + /// ```rust + /// assert_eq!( + /// (UserBounds { l: 1, r: 2 }).try_into_range(5), + /// Ok(Range { start: 0, end: 2}) // 2, not 1, because it's exclusive + /// ); + /// + /// assert_eq!( + /// (UserBounds { l: 1, r: Side::Continue }).try_into_range(5), + /// Ok(Range { start: 0, end: 5}) + /// ); + /// ``` + pub fn try_into_range(&self, parts_length: usize) -> Result> { + let start: usize = match self.l { + Side::Continue => 0, + Side::Some(v) => { + if v.unsigned_abs() as usize > parts_length { + bail!("Out of bounds: {}", v); + } + if v < 0 { + parts_length - v.unsigned_abs() as usize + } else { + v as usize - 1 + } + } + }; + + let end: usize = match self.r { + Side::Continue => parts_length, + Side::Some(v) => { + if v.unsigned_abs() as usize > parts_length { + bail!("Out of bounds: {}", v); + } + if v < 0 { + parts_length - v.unsigned_abs() as usize + 1 + } else { + v as usize + } + } + }; + + if end <= start { + // `end` must always be 1 or more greater than start + bail!("Field left value cannot be greater than right value"); + } + + Ok(Range { start, end }) + } + /** * Transform a ranged 
bound into a list of one or more * 1 slot bound @@ -389,43 +447,6 @@ impl Default for UserBounds { } } -pub fn bounds_to_std_range(parts_length: usize, bounds: &UserBounds) -> Result> { - let start: usize = match bounds.l { - Side::Continue => 0, - Side::Some(v) => { - if v.unsigned_abs() as usize > parts_length { - bail!("Out of bounds: {}", v); - } - if v < 0 { - parts_length - v.unsigned_abs() as usize - } else { - v as usize - 1 - } - } - }; - - let end: usize = match bounds.r { - Side::Continue => parts_length, - Side::Some(v) => { - if v.unsigned_abs() as usize > parts_length { - bail!("Out of bounds: {}", v); - } - if v < 0 { - parts_length - v.unsigned_abs() as usize + 1 - } else { - v as usize - } - } - }; - - if end <= start { - // `end` must always be 1 or more greater than start - bail!("Field left value cannot be greater than right value"); - } - - Ok(Range { start, end }) -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/cut_bytes.rs b/src/cut_bytes.rs index 4ed594c..a80f20d 100644 --- a/src/cut_bytes.rs +++ b/src/cut_bytes.rs @@ -1,7 +1,7 @@ use anyhow::Result; use std::io::{Read, Write}; -use crate::bounds::{bounds_to_std_range, BoundOrFiller}; +use crate::bounds::BoundOrFiller; use crate::options::Opt; use crate::read_utils::read_bytes_to_end; @@ -13,7 +13,7 @@ fn cut_bytes(data: &[u8], opt: &Opt, stdout: &mut W) -> Result<()> { opt.bounds.0.iter().try_for_each(|bof| -> Result<()> { let output = match bof { BoundOrFiller::Bound(b) => { - let r = bounds_to_std_range(data.len(), b)?; + let r = b.try_into_range(data.len())?; &data[r.start..r.end] } BoundOrFiller::Filler(f) => f.as_bytes(), diff --git a/src/cut_str.rs b/src/cut_str.rs index 3357ad7..daf6050 100644 --- a/src/cut_str.rs +++ b/src/cut_str.rs @@ -2,9 +2,7 @@ use anyhow::{bail, Result}; use std::io::{BufRead, Write}; use std::ops::Range; -use crate::bounds::{ - bounds_to_std_range, BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList, -}; +use 
crate::bounds::{BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList}; use crate::json::escape_json; use crate::options::{Opt, Trim}; use crate::read_utils::read_line_with_eol; @@ -193,7 +191,7 @@ pub fn cut_str( line: &str, opt: &Opt, stdout: &mut W, - bounds_as_ranges: &mut Vec>, + fields: &mut Vec>, compressed_line_buf: &mut String, eol: &[u8], ) -> Result<()> { @@ -260,7 +258,7 @@ pub fn cut_str( if should_build_ranges_using_regex { #[cfg(feature = "regex")] build_ranges_vec_from_regex( - bounds_as_ranges, + fields, line, if opt.greedy_delimiter { &opt.regex_bag.as_ref().unwrap().greedy @@ -269,18 +267,18 @@ pub fn cut_str( }, ); } else { - build_ranges_vec(bounds_as_ranges, line, delimiter, opt.greedy_delimiter); + build_ranges_vec(fields, line, delimiter, opt.greedy_delimiter); } - if opt.bounds_type == BoundsType::Characters && bounds_as_ranges.len() > 2 { + if opt.bounds_type == BoundsType::Characters && fields.len() > 2 { // Unless the line is empty (which should have already been handled), // then the empty-string delimiter generated ranges alongside each // character, plus one at each boundary, e.g. _f_o_o_. We drop them. - bounds_as_ranges.pop(); - bounds_as_ranges.drain(..1); + fields.pop(); + fields.drain(..1); } - if opt.only_delimited && bounds_as_ranges.len() == 1 { + if opt.only_delimited && fields.len() == 1 { // If there's only 1 field it means that there were no delimiters // and when used alogside `only_delimited` we must skip the line return Ok(()); @@ -312,12 +310,12 @@ pub fn cut_str( ) }) { // Yep, there at least a range bound. 
Let's do it - _bounds = bounds.unpack(bounds_as_ranges.len()); + _bounds = bounds.unpack(fields.len()); bounds = &_bounds; } } - match bounds_as_ranges.len() { + match fields.len() { 1 if bounds.0.len() == 1 => { write_maybe_as_json!(stdout, line, opt.json); } @@ -335,10 +333,10 @@ pub fn cut_str( BoundOrFiller::Bound(b) => b, }; - let mut r_array = vec![bounds_to_std_range(bounds_as_ranges.len(), b)?]; + let mut r_array = vec![b.try_into_range(fields.len())?]; if opt.complement { - r_array = complement_std_range(bounds_as_ranges.len(), &r_array[0]); + r_array = complement_std_range(fields.len(), &r_array[0]); } if opt.json { @@ -356,8 +354,8 @@ pub fn cut_str( let n_ranges = r_array.len(); for (idx_r, r) in r_iter.enumerate() { - let idx_start = bounds_as_ranges[r.start].start; - let idx_end = bounds_as_ranges[r.end - 1].end; + let idx_start = fields[r.start].start; + let idx_end = fields[r.end - 1].end; let output = &line[idx_start..idx_end]; let field_to_print = maybe_replace_delimiter(output, opt); From c82a07893c30efba4e10e787b0e04ef0b4c5dac1 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Mon, 29 Jan 2024 21:47:52 -0800 Subject: [PATCH 02/22] Use num_fields instead of bounds_as_ranges.len for clarity --- src/cut_str.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cut_str.rs b/src/cut_str.rs index daf6050..d6fefa5 100644 --- a/src/cut_str.rs +++ b/src/cut_str.rs @@ -278,7 +278,9 @@ pub fn cut_str( fields.drain(..1); } - if opt.only_delimited && fields.len() == 1 { + let num_fields = fields.len(); + + if opt.only_delimited && num_fields == 1 { // If there's only 1 field it means that there were no delimiters // and when used alogside `only_delimited` we must skip the line return Ok(()); @@ -310,12 +312,12 @@ pub fn cut_str( ) }) { // Yep, there at least a range bound. 
Let's do it - _bounds = bounds.unpack(fields.len()); + _bounds = bounds.unpack(num_fields); bounds = &_bounds; } } - match fields.len() { + match num_fields { 1 if bounds.0.len() == 1 => { write_maybe_as_json!(stdout, line, opt.json); } @@ -333,10 +335,10 @@ pub fn cut_str( BoundOrFiller::Bound(b) => b, }; - let mut r_array = vec![b.try_into_range(fields.len())?]; + let mut r_array = vec![b.try_into_range(num_fields)?]; if opt.complement { - r_array = complement_std_range(fields.len(), &r_array[0]); + r_array = complement_std_range(num_fields, &r_array[0]); } if opt.json { From e46e4730e45ff807a2284b3a6ebac229416bce78 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Mon, 29 Jan 2024 22:12:12 -0800 Subject: [PATCH 03/22] Refactor build_ranges_vec into fill_with_fields_locations --- src/cut_str.rs | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/cut_str.rs b/src/cut_str.rs index d6fefa5..6d341d8 100644 --- a/src/cut_str.rs +++ b/src/cut_str.rs @@ -25,13 +25,19 @@ fn complement_std_range(parts_length: usize, r: &Range) -> Vec>, line: &str, delimiter: &str, greedy: bool) { +/// Split a string into parts and fill a buffer with ranges +/// that match those parts. 
+/// +/// - `buffer` - vector that will be filled with ranges +/// - `line` - the string to split +/// - `delimiter` - what to search to split the string +/// - `greedy` - whether to consider consecutive delimiters as one or not +fn fill_with_fields_locations( + buffer: &mut Vec>, + line: &str, + delimiter: &str, + greedy: bool, +) { buffer.clear(); if line.is_empty() { @@ -267,7 +273,7 @@ pub fn cut_str( }, ); } else { - build_ranges_vec(fields, line, delimiter, opt.greedy_delimiter); + fill_with_fields_locations(fields, line, delimiter, opt.greedy_delimiter); } if opt.bounds_type == BoundsType::Characters && fields.len() > 2 { @@ -474,29 +480,29 @@ mod tests { // non greedy v_range.clear(); - build_ranges_vec(&mut v_range, "", "-", false); + fill_with_fields_locations(&mut v_range, "", "-", false); assert_eq!(v_range, vec![] as Vec>); v_range.clear(); - build_ranges_vec(&mut v_range, "a", "-", false); + fill_with_fields_locations(&mut v_range, "a", "-", false); assert_eq!(v_range, vec![Range { start: 0, end: 1 }]); v_range.clear(); - build_ranges_vec(&mut v_range, "-", "-", true); + fill_with_fields_locations(&mut v_range, "-", "-", true); assert_eq!( v_range, vec![Range { start: 0, end: 0 }, Range { start: 1, end: 1 }] ); v_range.clear(); - build_ranges_vec(&mut v_range, "a-b", "-", false); + fill_with_fields_locations(&mut v_range, "a-b", "-", false); assert_eq!( v_range, vec![Range { start: 0, end: 1 }, Range { start: 2, end: 3 }] ); v_range.clear(); - build_ranges_vec(&mut v_range, "-a-", "-", false); + fill_with_fields_locations(&mut v_range, "-a-", "-", false); assert_eq!( v_range, vec![ @@ -507,7 +513,7 @@ mod tests { ); v_range.clear(); - build_ranges_vec(&mut v_range, "a--", "-", false); + fill_with_fields_locations(&mut v_range, "a--", "-", false); assert_eq!( v_range, vec![ @@ -520,22 +526,22 @@ mod tests { // greedy v_range.clear(); - build_ranges_vec(&mut v_range, "", "-", true); + fill_with_fields_locations(&mut v_range, "", "-", true); 
assert_eq!(v_range, empty_vec); v_range.clear(); - build_ranges_vec(&mut v_range, "a", "-", true); + fill_with_fields_locations(&mut v_range, "a", "-", true); assert_eq!(v_range, vec![Range { start: 0, end: 1 }]); v_range.clear(); - build_ranges_vec(&mut v_range, "-", "-", true); + fill_with_fields_locations(&mut v_range, "-", "-", true); assert_eq!( v_range, vec![Range { start: 0, end: 0 }, Range { start: 1, end: 1 }] ); v_range.clear(); - build_ranges_vec(&mut v_range, "-a--b", "-", true); + fill_with_fields_locations(&mut v_range, "-a--b", "-", true); assert_eq!( v_range, vec![ @@ -546,7 +552,7 @@ mod tests { ); v_range.clear(); - build_ranges_vec(&mut v_range, "-a--", "-", true); + fill_with_fields_locations(&mut v_range, "-a--", "-", true); assert_eq!( v_range, vec![ From 98f8a2ff2d9ed3c287c8b7d7bd07c250fefb7709 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sat, 3 Feb 2024 01:06:01 -0800 Subject: [PATCH 04/22] Initial version of fast lane --- src/bin/tuc.rs | 4 + src/fast_lane.rs | 229 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 3 files changed, 234 insertions(+) create mode 100644 src/fast_lane.rs diff --git a/src/bin/tuc.rs b/src/bin/tuc.rs index 83b7092..a82b132 100644 --- a/src/bin/tuc.rs +++ b/src/bin/tuc.rs @@ -1,10 +1,12 @@ use anyhow::Result; +use std::convert::{TryFrom, TryInto}; use std::io::Write; use std::str::FromStr; use tuc::bounds::{BoundOrFiller, BoundsType, UserBoundsList}; use tuc::cut_bytes::read_and_cut_bytes; use tuc::cut_lines::read_and_cut_lines; use tuc::cut_str::read_and_cut_str; +use tuc::fast_lane::{read_and_cut_text_as_bytes, FastOpt}; use tuc::options::{Opt, EOL}; #[cfg(feature = "regex")] @@ -264,6 +266,8 @@ fn main() -> Result<()> { read_and_cut_bytes(&mut stdin, &mut stdout, &opt)?; } else if opt.bounds_type == BoundsType::Lines { read_and_cut_lines(&mut stdin, &mut stdout, &opt)?; + } else if let Ok(fast_opt) = FastOpt::try_from(&opt) { + read_and_cut_text_as_bytes(&mut stdin, &mut stdout, 
&fast_opt)?; } else { read_and_cut_str(&mut stdin, &mut stdout, opt)?; } diff --git a/src/fast_lane.rs b/src/fast_lane.rs new file mode 100644 index 0000000..d9d6bdc --- /dev/null +++ b/src/fast_lane.rs @@ -0,0 +1,229 @@ +use crate::bounds::{BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList}; +use crate::options::{Opt, EOL}; +use anyhow::Result; +use std::convert::TryFrom; +use std::io::{self, BufRead}; +use std::{io::Write, ops::Range}; + +use bstr::io::BufReadExt; + +fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> Result<()> { + if buffer.is_empty() { + return Ok(()); + } + + let bounds = &opt.bounds; + assert!(!bounds.0.is_empty()); + // if we're here there must be at least one bound to check + let last_interesting_field = bounds.0.last().unwrap().end; + + let mut prev_field_start = 0; + + let mut fields: Vec> = Vec::new(); + + let mut curr_field = 0; + + fields.clear(); + + for i in memchr::memchr_iter(opt.delimiter, buffer) { + curr_field += 1; + + let (start, end) = (prev_field_start, i); // end exclusive + prev_field_start = i + 1; + + fields.push(Range { start, end }); + + if curr_field == last_interesting_field { + // we have no use for this field or any of the following ones + break; + } + } + + if curr_field == 0 && opt.only_delimited { + // The delimiter was not found + return Ok(()); + } + + if curr_field != last_interesting_field { + fields.push(Range { + start: prev_field_start, + end: buffer.len(), + }); + } + + let num_fields = fields.len(); + + match num_fields { + 1 if bounds.0.len() == 1 => { + stdout.write_all(buffer)?; + } + _ => { + bounds + .0 + .iter() + .enumerate() + .try_for_each(|(bounds_idx, b)| -> Result<()> { + // let b = match bof { + // BoundOrFiller::Filler(f) => { + // stdout.write_all(f.as_bytes())?; + // return Ok(()); + // } + // BoundOrFiller::Bound(b) => b, + // }; + + //let mut r_array = vec![b.try_into_range(num_fields)?]; + + let is_last = bounds_idx == bounds.0.len() - 1; + + 
output_parts(buffer, b, &fields, stdout, is_last, opt) + })?; + } + } + + stdout.write_all(&[b'\n'])?; + + Ok(()) +} + +#[inline] +fn output_parts( + line: &[u8], + // which parts to print + r: &Range, + // where to find the parts inside `line` + fields: &[Range], + stdout: &mut W, + is_last: bool, + opt: &FastOpt, +) -> Result<()> { + // dbg!(&line.to_str_lossy(), &r, &fields, is_last); + + let idx_start = fields[r.start].start; + let idx_end = fields[r.end - 1].end; + let output = &line[idx_start..idx_end]; + + // let field_to_print = maybe_replace_delimiter(output, opt); + let field_to_print = output; + stdout.write_all(field_to_print)?; + + if opt.join && !(is_last) { + // stdout.write_all( + // opt.replace_delimiter + // .as_ref() + // .unwrap_or(&opt.delimiter) + // .as_bytes(), + // )?; + stdout.write_all(&[opt.delimiter])?; + } + + Ok(()) +} + +pub struct FastOpt { + delimiter: u8, + join: bool, + eol: u8, + bounds: ForwardBounds, + only_delimited: bool, +} + +impl TryFrom<&Opt> for FastOpt { + type Error = &'static str; + + fn try_from(value: &Opt) -> Result { + if !value.delimiter.as_bytes().len() == 1 { + return Err("Delimiter must be 1 byte wide for FastOpt"); + } + + if value.complement + || value.greedy_delimiter + || value.compress_delimiter + || value.json + || value.bounds_type != BoundsType::Fields + || value.replace_delimiter.is_some() + || value.trim.is_some() + || value.regex_bag.is_some() + || matches!(value.eol, EOL::Zero) + { + return Err( + "FastOpt supports solely forward fields, join and single-character delimiters", + ); + } + + if let Ok(forward_bounds) = ForwardBounds::try_from(&value.bounds) { + Ok(FastOpt { + delimiter: value.delimiter.as_bytes().first().unwrap().to_owned(), + join: value.join, + eol: b'\n', + bounds: forward_bounds, + only_delimited: value.only_delimited, + }) + } else { + Err("Bounds cannot be converted to ForwardBounds") + } + } +} + +impl From<&UserBounds> for Range { + fn from(value: &UserBounds) -> Self { + // 
XXX this will explode in our face at the first negative value + // XXX we should have a try into and more checks in place + // (also, values must be sequential, but that should be covered by UserBounds + // ... if we will still pass by it) + + let (l, r): (usize, usize) = match (value.l, value.r) { + // (Side::Some(l), Side::Some(r)) => (l as usize, (r - l) as usize), + // (Side::Some(l), Side::Continue) => (l as usize, usize::MAX - (l as usize)), + (Side::Some(l), Side::Some(r)) => ((l - 1) as usize, r as usize), + (Side::Some(l), Side::Continue) => ((l - 1) as usize, usize::MAX), + (Side::Continue, Side::Some(r)) => (0, r as usize), + (Side::Continue, Side::Continue) => (0, usize::MAX), + }; + + // FastRange { + // l, + // r_sub_l: r - l, + // buff_start: 0, + // buff_end: 0, + // } + Range { start: l, end: r } + } +} + +#[derive(Debug)] +pub struct ForwardBounds(Vec>); + +impl TryFrom<&UserBoundsList> for ForwardBounds { + type Error = &'static str; + + fn try_from(value: &UserBoundsList) -> Result { + if value.is_forward_only() { + let mut v: Vec> = Vec::with_capacity(value.0.len()); + for maybe_bounds in value.0.iter() { + // XXX for now let's drop the fillers + // XXX TODO + + if let BoundOrFiller::Bound(bounds) = maybe_bounds { + v.push(bounds.into()); + } + } + Ok(ForwardBounds(v)) + } else { + Err("The provided UserBoundsList is not forward only") + } + } +} + +pub fn read_and_cut_text_as_bytes( + stdin: &mut R, + stdout: &mut W, + opt: &FastOpt, +) -> Result<()> { + stdin.for_byte_line(|line| { + cut_str_fast_line(line, opt, stdout) + .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) + .and(Ok(true)) + })?; + + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index 15edbd3..ce9df89 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ pub mod bounds; pub mod cut_bytes; pub mod cut_lines; pub mod cut_str; +pub mod fast_lane; mod json; pub mod options; mod read_utils; From 892ce2cdb06583c19b8fde44e57e5e7a8f1f68e7 Mon Sep 17 00:00:00 2001 
From: Riccardo Attilio Galli Date: Sat, 3 Feb 2024 01:06:19 -0800 Subject: [PATCH 05/22] Add new dependencies --- Cargo.lock | 27 ++++++++------------------- Cargo.toml | 2 ++ src/fast_lane.rs | 30 +++--------------------------- 3 files changed, 13 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d02fb4b..42c53e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,13 +40,12 @@ dependencies = [ [[package]] name = "bstr" -version = "1.1.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45ea9b00a7b3f2988e9a65ad3917e62123c38dba709b666506207be96d1790b" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "once_cell", - "regex-automata 0.1.10", + "regex-automata", "serde", ] @@ -70,15 +69,9 @@ checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" - -[[package]] -name = "once_cell" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "pico-args" @@ -121,16 +114,10 @@ checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.5", + "regex-automata", "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-automata" version = "0.4.5" @@ -166,6 +153,8 @@ version = "1.2.0" dependencies = [ "anyhow", "assert_cmd", + "bstr", + "memchr", 
"pico-args", "predicates", "regex", diff --git a/Cargo.toml b/Cargo.toml index 05281f4..d83dada 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,8 @@ categories = ["command-line-utilities"] [dependencies] anyhow = "1.0.79" +bstr = "1.9.0" +memchr = "2.7.1" pico-args = { version = "0.5.0", features = ["short-space-opt", "combined-flags", "eq-separator"] } regex = { version = "1.10", default-features = false, features = ["std", "unicode-bool", "unicode-perl", "unicode-gencat"], optional = true } diff --git a/src/fast_lane.rs b/src/fast_lane.rs index d9d6bdc..48f3ce8 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -63,16 +63,6 @@ fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> .iter() .enumerate() .try_for_each(|(bounds_idx, b)| -> Result<()> { - // let b = match bof { - // BoundOrFiller::Filler(f) => { - // stdout.write_all(f.as_bytes())?; - // return Ok(()); - // } - // BoundOrFiller::Bound(b) => b, - // }; - - //let mut r_array = vec![b.try_into_range(num_fields)?]; - let is_last = bounds_idx == bounds.0.len() - 1; output_parts(buffer, b, &fields, stdout, is_last, opt) @@ -96,8 +86,6 @@ fn output_parts( is_last: bool, opt: &FastOpt, ) -> Result<()> { - // dbg!(&line.to_str_lossy(), &r, &fields, is_last); - let idx_start = fields[r.start].start; let idx_end = fields[r.end - 1].end; let output = &line[idx_start..idx_end]; @@ -107,12 +95,6 @@ fn output_parts( stdout.write_all(field_to_print)?; if opt.join && !(is_last) { - // stdout.write_all( - // opt.replace_delimiter - // .as_ref() - // .unwrap_or(&opt.delimiter) - // .as_bytes(), - // )?; stdout.write_all(&[opt.delimiter])?; } @@ -172,20 +154,12 @@ impl From<&UserBounds> for Range { // ... 
if we will still pass by it) let (l, r): (usize, usize) = match (value.l, value.r) { - // (Side::Some(l), Side::Some(r)) => (l as usize, (r - l) as usize), - // (Side::Some(l), Side::Continue) => (l as usize, usize::MAX - (l as usize)), (Side::Some(l), Side::Some(r)) => ((l - 1) as usize, r as usize), (Side::Some(l), Side::Continue) => ((l - 1) as usize, usize::MAX), (Side::Continue, Side::Some(r)) => (0, r as usize), (Side::Continue, Side::Continue) => (0, usize::MAX), }; - // FastRange { - // l, - // r_sub_l: r - l, - // buff_start: 0, - // buff_end: 0, - // } Range { start: l, end: r } } } @@ -220,7 +194,9 @@ pub fn read_and_cut_text_as_bytes( opt: &FastOpt, ) -> Result<()> { stdin.for_byte_line(|line| { - cut_str_fast_line(line, opt, stdout) + let mut fields: Vec> = Vec::with_capacity(16); + cut_str_fast_line(line, opt, stdout, &mut fields) + // XXX Should map properly the error .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) .and(Ok(true)) })?; From 28ce7d50f66ef017be0c5fd15caccb6072c2aef4 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Mon, 5 Feb 2024 00:35:54 -0800 Subject: [PATCH 06/22] Fix usage of right Continue in fast_lane --- src/fast_lane.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 48f3ce8..4f31373 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -87,7 +87,12 @@ fn output_parts( opt: &FastOpt, ) -> Result<()> { let idx_start = fields[r.start].start; - let idx_end = fields[r.end - 1].end; + let idx_end = fields[if r.end == usize::MAX { + fields.len() + } else { + r.end + } - 1] + .end; let output = &line[idx_start..idx_end]; // let field_to_print = maybe_replace_delimiter(output, opt); From 91d19659df3c4a4a897013753c264b63f5ad077e Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Tue, 6 Feb 2024 18:54:36 -0800 Subject: [PATCH 07/22] Implement Deref for UserBoundsList --- src/bounds.rs | 18 ++++++++++++++---- src/cut_lines.rs | 12 
++++++------ src/cut_str.rs | 7 +++---- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/bounds.rs b/src/bounds.rs index d62d6cf..c99e5b4 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -2,10 +2,10 @@ use anyhow::{bail, Result}; use std::cmp::Ordering; use std::convert::TryInto; use std::fmt; -use std::ops::Range; +use std::ops::{Deref, Range}; use std::str::FromStr; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, PartialEq)] pub enum BoundsType { Bytes, Characters, @@ -13,7 +13,7 @@ pub enum BoundsType { Lines, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub enum BoundOrFiller { Bound(UserBounds), Filler(String), @@ -98,6 +98,14 @@ pub fn parse_bounds_list(s: &str) -> Result> { #[derive(Debug)] pub struct UserBoundsList(pub Vec); +impl Deref for UserBoundsList { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + impl FromStr for UserBoundsList { type Err = anyhow::Error; fn from_str(s: &str) -> Result { @@ -287,11 +295,13 @@ impl UserBounds { UserBounds { l, r } } /** - * Check if an index is between the bounds. + * Check if a field is between the bounds. * * It errors out if the index has different sign than the bounds * (we can't verify if e.g. -1 idx is between 3:5 without knowing the number * of matching bounds). + * + * Fields are 1-indexed. */ pub fn matches(&self, idx: i32) -> Result { match (self.l, self.r) { diff --git a/src/cut_lines.rs b/src/cut_lines.rs index 6a38fa6..4a303c7 100644 --- a/src/cut_lines.rs +++ b/src/cut_lines.rs @@ -26,15 +26,15 @@ fn cut_lines_forward_only( // Print the matching fields. Fields are ordered but can still be // duplicated, e.g. 
1-2,2,3 , so we may have to print the same // line multiple times - while bounds_idx < opt.bounds.0.len() { - let bof = opt.bounds.0.get(bounds_idx).unwrap(); + while bounds_idx < opt.bounds.len() { + let bof = opt.bounds.get(bounds_idx).unwrap(); let b = match bof { BoundOrFiller::Filler(f) => { stdout.write_all(f.as_bytes())?; bounds_idx += 1; - if opt.join && bounds_idx != opt.bounds.0.len() { + if opt.join && bounds_idx != opt.bounds.len() { stdout.write_all(&[opt.eol as u8])?; } @@ -57,7 +57,7 @@ fn cut_lines_forward_only( add_newline_next = false; // if opt.join and it was not the last matching bound - if opt.join && bounds_idx != opt.bounds.0.len() { + if opt.join && bounds_idx != opt.bounds.len() { stdout.write_all(&[opt.eol as u8])?; } @@ -68,14 +68,14 @@ fn cut_lines_forward_only( break; // nothing matched, let's go to the next line } - if bounds_idx == opt.bounds.0.len() { + if bounds_idx == opt.bounds.len() { // no need to read the rest, we don't have other bounds to test break; } } // Output is finished. Did we output every bound? - if let Some(BoundOrFiller::Bound(b)) = opt.bounds.0.get(bounds_idx) { + if let Some(BoundOrFiller::Bound(b)) = opt.bounds.get(bounds_idx) { if b.r != Side::Continue { // not good, we still have bounds to print but the input is exhausted bail!("Out of bounds: {}", b); diff --git a/src/cut_str.rs b/src/cut_str.rs index 6d341d8..d5a3051 100644 --- a/src/cut_str.rs +++ b/src/cut_str.rs @@ -308,7 +308,7 @@ pub fn cut_str( // rare usage). 
// Start by checking if we actually need to rewrite the bounds - if bounds.0.iter().any(|b| { + if bounds.iter().any(|b| { matches!( b, BoundOrFiller::Bound(UserBounds { @@ -324,12 +324,11 @@ pub fn cut_str( } match num_fields { - 1 if bounds.0.len() == 1 => { + 1 if bounds.len() == 1 => { write_maybe_as_json!(stdout, line, opt.json); } _ => { bounds - .0 .iter() .enumerate() .try_for_each(|(i, bof)| -> Result<()> { @@ -369,7 +368,7 @@ pub fn cut_str( let field_to_print = maybe_replace_delimiter(output, opt); write_maybe_as_json!(stdout, field_to_print, opt.json); - if opt.join && !(i == bounds.0.len() - 1 && idx_r == n_ranges - 1) { + if opt.join && !(i == bounds.len() - 1 && idx_r == n_ranges - 1) { stdout.write_all( opt.replace_delimiter .as_ref() From 63724f86c61318ec1c08bf3415f325eb8a24bbb9 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Tue, 6 Feb 2024 18:57:32 -0800 Subject: [PATCH 08/22] Update doctests in bounds.rs --- src/bounds.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/bounds.rs b/src/bounds.rs index c99e5b4..b880044 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -338,14 +338,18 @@ impl UserBounds { /// e.g. 
/// /// ```rust + /// # use tuc::bounds::UserBounds; + /// # use std::ops::Range; + /// # use tuc::bounds::Side; + /// /// assert_eq!( - /// (UserBounds { l: 1, r: 2 }).try_into_range(5), - /// Ok(Range { start: 0, end: 2}) // 2, not 1, because it's exclusive + /// (UserBounds { l: Side::Some(1), r: Side::Some(2) }).try_into_range(5).unwrap(), + /// Range { start: 0, end: 2} // 2, not 1, because it's exclusive /// ); /// /// assert_eq!( - /// (UserBounds { l: 1, r: Side::Continue }).try_into_range(5), - /// Ok(Range { start: 0, end: 5}) + /// (UserBounds { l: Side::Some(1), r: Side::Continue }).try_into_range(5).unwrap(), + /// Range { start: 0, end: 5} /// ); /// ``` pub fn try_into_range(&self, parts_length: usize) -> Result> { From edfee6d6a504b67e9145d40c20f58b47ff37f828 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Tue, 6 Feb 2024 19:00:23 -0800 Subject: [PATCH 09/22] Implement field formatting for fast lane --- src/fast_lane.rs | 114 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 4f31373..e737975 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -3,23 +3,27 @@ use crate::options::{Opt, EOL}; use anyhow::Result; use std::convert::TryFrom; use std::io::{self, BufRead}; +use std::ops::Deref; use std::{io::Write, ops::Range}; use bstr::io::BufReadExt; -fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> Result<()> { +fn cut_str_fast_line( + buffer: &[u8], + opt: &FastOpt, + stdout: &mut W, + fields: &mut Vec>, +) -> Result<()> { if buffer.is_empty() { return Ok(()); } let bounds = &opt.bounds; - assert!(!bounds.0.is_empty()); - // if we're here there must be at least one bound to check - let last_interesting_field = bounds.0.last().unwrap().end; - let mut prev_field_start = 0; + // ForwardBounds guarantees that there is at least one field to check + let last_interesting_field = bounds.get_last_bound().r; - let mut fields: 
Vec> = Vec::new(); + let mut prev_field_start = 0; let mut curr_field = 0; @@ -33,7 +37,7 @@ fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> fields.push(Range { start, end }); - if curr_field == last_interesting_field { + if Side::Some(curr_field) == last_interesting_field { // we have no use for this field or any of the following ones break; } @@ -44,7 +48,10 @@ fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> return Ok(()); } - if curr_field != last_interesting_field { + // After the last loop ended, everything remaining is the field + // after the last delimiter (we want it), or "useless" fields after the + // last one that the user is interested in (and we can ignore them). + if Side::Some(curr_field) != last_interesting_field { fields.push(Range { start: prev_field_start, end: buffer.len(), @@ -54,18 +61,25 @@ fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> let num_fields = fields.len(); match num_fields { - 1 if bounds.0.len() == 1 => { + 1 if bounds.len() == 1 => { stdout.write_all(buffer)?; } _ => { bounds - .0 .iter() .enumerate() - .try_for_each(|(bounds_idx, b)| -> Result<()> { - let is_last = bounds_idx == bounds.0.len() - 1; - - output_parts(buffer, b, &fields, stdout, is_last, opt) + .try_for_each(|(bounds_idx, bof)| -> Result<()> { + let b = match bof { + BoundOrFiller::Filler(f) => { + stdout.write_all(f.as_bytes())?; + return Ok(()); + } + BoundOrFiller::Bound(b) => b, + }; + + let is_last = bounds_idx == bounds.len() - 1; + + output_parts(buffer, b, fields, stdout, is_last, opt) })?; } } @@ -79,20 +93,17 @@ fn cut_str_fast_line(buffer: &[u8], opt: &FastOpt, stdout: &mut W) -> fn output_parts( line: &[u8], // which parts to print - r: &Range, + b: &UserBounds, // where to find the parts inside `line` fields: &[Range], stdout: &mut W, is_last: bool, opt: &FastOpt, ) -> Result<()> { + let r = b.try_into_range(fields.len())?; + let idx_start = fields[r.start].start; - let idx_end = 
fields[if r.end == usize::MAX { - fields.len() - } else { - r.end - } - 1] - .end; + let idx_end = fields[r.end - 1].end; let output = &line[idx_start..idx_end]; // let field_to_print = maybe_replace_delimiter(output, opt); @@ -106,6 +117,7 @@ fn output_parts( Ok(()) } +#[derive(Debug)] pub struct FastOpt { delimiter: u8, join: bool, @@ -159,40 +171,80 @@ impl From<&UserBounds> for Range { // ... if we will still pass by it) let (l, r): (usize, usize) = match (value.l, value.r) { + // (Side::Some(l), Side::Some(r)) => (l as usize, (r - l) as usize), + // (Side::Some(l), Side::Continue) => (l as usize, usize::MAX - (l as usize)), (Side::Some(l), Side::Some(r)) => ((l - 1) as usize, r as usize), (Side::Some(l), Side::Continue) => ((l - 1) as usize, usize::MAX), (Side::Continue, Side::Some(r)) => (0, r as usize), (Side::Continue, Side::Continue) => (0, usize::MAX), }; + // FastRange { + // l, + // r_sub_l: r - l, + // buff_start: 0, + // buff_end: 0, + // } Range { start: l, end: r } } } #[derive(Debug)] -pub struct ForwardBounds(Vec>); +struct ForwardBounds { + pub list: UserBoundsList, + last_bound_idx: usize, +} impl TryFrom<&UserBoundsList> for ForwardBounds { type Error = &'static str; fn try_from(value: &UserBoundsList) -> Result { - if value.is_forward_only() { - let mut v: Vec> = Vec::with_capacity(value.0.len()); - for maybe_bounds in value.0.iter() { - // XXX for now let's drop the fillers - // XXX TODO - - if let BoundOrFiller::Bound(bounds) = maybe_bounds { - v.push(bounds.into()); + if value.0.is_empty() { + Err("Cannot create ForwardBounds from an empty UserBoundsList") + } else if value.is_forward_only() { + let value: UserBoundsList = UserBoundsList(value.iter().cloned().collect()); + let mut maybe_last_bound: Option = None; + value.iter().enumerate().rev().any(|(idx, bof)| { + if matches!(bof, BoundOrFiller::Bound(_)) { + maybe_last_bound = Some(idx); + true + } else { + false } + }); + + if let Some(last_bound_idx) = maybe_last_bound { + 
Ok(ForwardBounds { + list: value, + last_bound_idx, + }) + } else { + Err("Cannot create ForwardBounds from UserBoundsList without bounds") } - Ok(ForwardBounds(v)) } else { Err("The provided UserBoundsList is not forward only") } } } +impl Deref for ForwardBounds { + type Target = UserBoundsList; + + fn deref(&self) -> &Self::Target { + &self.list + } +} + +impl ForwardBounds { + fn get_last_bound(&self) -> &UserBounds { + if let Some(BoundOrFiller::Bound(b)) = self.list.get(self.last_bound_idx) { + b + } else { + panic!("Invariant error: last_bound_idx failed to match a bound") + } + } +} + pub fn read_and_cut_text_as_bytes( stdin: &mut R, stdout: &mut W, From 972514c38bf11bc6389c8dea40137c68f2fe5383 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Tue, 6 Feb 2024 20:22:31 -0800 Subject: [PATCH 10/22] Support zero eol in fast lane --- src/bin/tuc.rs | 2 +- src/fast_lane.rs | 31 +++++++++++++++++++------------ src/options.rs | 9 +++++++++ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/bin/tuc.rs b/src/bin/tuc.rs index a82b132..e528df6 100644 --- a/src/bin/tuc.rs +++ b/src/bin/tuc.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use std::convert::{TryFrom, TryInto}; +use std::convert::TryFrom; use std::io::Write; use std::str::FromStr; use tuc::bounds::{BoundOrFiller, BoundsType, UserBoundsList}; diff --git a/src/fast_lane.rs b/src/fast_lane.rs index e737975..303ac9f 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -84,7 +84,7 @@ fn cut_str_fast_line( } } - stdout.write_all(&[b'\n'])?; + stdout.write_all(&[opt.eol.into()])?; Ok(()) } @@ -121,7 +121,7 @@ fn output_parts( pub struct FastOpt { delimiter: u8, join: bool, - eol: u8, + eol: EOL, bounds: ForwardBounds, only_delimited: bool, } @@ -142,7 +142,6 @@ impl TryFrom<&Opt> for FastOpt { || value.replace_delimiter.is_some() || value.trim.is_some() || value.regex_bag.is_some() - || matches!(value.eol, EOL::Zero) { return Err( "FastOpt supports solely forward fields, join and 
single-character delimiters", @@ -153,7 +152,7 @@ impl TryFrom<&Opt> for FastOpt { Ok(FastOpt { delimiter: value.delimiter.as_bytes().first().unwrap().to_owned(), join: value.join, - eol: b'\n', + eol: value.eol, bounds: forward_bounds, only_delimited: value.only_delimited, }) @@ -199,7 +198,7 @@ impl TryFrom<&UserBoundsList> for ForwardBounds { type Error = &'static str; fn try_from(value: &UserBoundsList) -> Result { - if value.0.is_empty() { + if value.is_empty() { Err("Cannot create ForwardBounds from an empty UserBoundsList") } else if value.is_forward_only() { let value: UserBoundsList = UserBoundsList(value.iter().cloned().collect()); @@ -250,13 +249,21 @@ pub fn read_and_cut_text_as_bytes( stdout: &mut W, opt: &FastOpt, ) -> Result<()> { - stdin.for_byte_line(|line| { - let mut fields: Vec> = Vec::with_capacity(16); - cut_str_fast_line(line, opt, stdout, &mut fields) - // XXX Should map properly the error - .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) - .and(Ok(true)) - })?; + let mut fields: Vec> = Vec::with_capacity(16); + match opt.eol { + EOL::Newline => stdin.for_byte_line(|line| { + cut_str_fast_line(line, opt, stdout, &mut fields) + // XXX Should map properly the error + .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) + .and(Ok(true)) + })?, + EOL::Zero => stdin.for_byte_record(opt.eol.into(), |line| { + cut_str_fast_line(line, opt, stdout, &mut fields) + // XXX Should map properly the error + .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) + .and(Ok(true)) + })?, + } Ok(()) } diff --git a/src/options.rs b/src/options.rs index 0f630e9..d4eefbf 100644 --- a/src/options.rs +++ b/src/options.rs @@ -19,6 +19,15 @@ pub enum EOL { Newline = 10, } +impl From for u8 { + fn from(value: EOL) -> Self { + match value { + EOL::Zero => b'\0', + EOL::Newline => b'\n', + } + } +} + #[derive(Debug)] pub struct Opt { pub delimiter: String, From 201f90765764cf85f3c924a100b213ab91dc8fd0 Mon Sep 17 00:00:00 2001 
From: Riccardo Attilio Galli Date: Tue, 6 Feb 2024 22:06:26 -0800 Subject: [PATCH 11/22] Increase buffer size of input/output --- src/bin/tuc.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bin/tuc.rs b/src/bin/tuc.rs index e528df6..a38e58e 100644 --- a/src/bin/tuc.rs +++ b/src/bin/tuc.rs @@ -259,8 +259,8 @@ fn parse_args() -> Result { fn main() -> Result<()> { let opt: Opt = parse_args()?; - let mut stdin = std::io::BufReader::new(std::io::stdin().lock()); - let mut stdout = std::io::BufWriter::new(std::io::stdout().lock()); + let mut stdin = std::io::BufReader::with_capacity(64 * 1024, std::io::stdin().lock()); + let mut stdout = std::io::BufWriter::with_capacity(64 * 1024, std::io::stdout().lock()); if opt.bounds_type == BoundsType::Bytes { read_and_cut_bytes(&mut stdin, &mut stdout, &opt)?; From 0b90d54a0666d83a63641442e053301358f38e0d Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sat, 10 Feb 2024 21:50:28 -0800 Subject: [PATCH 12/22] Document more functions --- src/bounds.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/bounds.rs b/src/bounds.rs index b880044..24c3480 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -114,6 +114,9 @@ impl FromStr for UserBoundsList { } impl UserBoundsList { + /// Detect whether the list can be sorted. + /// It can be sorted only if every bound + /// has the same sign (all positive or all negative). fn is_sortable(&self) -> bool { let mut has_positive_idx = false; let mut has_negative_idx = false; @@ -176,6 +179,10 @@ impl UserBoundsList { }) } + /// Check if the bounds in the list match the following conditions: + /// - they are in ascending order + /// - they use solely positive indices + /// - they don't overlap (but they can be adjacent, e.g. 
1:2,2,3) pub fn is_forward_only(&self) -> bool { self.is_sortable() && self.is_sorted() && !self.has_negative_indices() } From bb4ef6d79996cdbefe490f50232429bf173995fe Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sat, 10 Feb 2024 21:56:02 -0800 Subject: [PATCH 13/22] Pre-compute last_interesting_field --- src/fast_lane.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 303ac9f..2dd5276 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -13,6 +13,7 @@ fn cut_str_fast_line( opt: &FastOpt, stdout: &mut W, fields: &mut Vec>, + last_interesting_field: Side, ) -> Result<()> { if buffer.is_empty() { return Ok(()); @@ -20,9 +21,6 @@ fn cut_str_fast_line( let bounds = &opt.bounds; - // ForwardBounds guarantees that there is at least one field to check - let last_interesting_field = bounds.get_last_bound().r; - let mut prev_field_start = 0; let mut curr_field = 0; @@ -38,7 +36,7 @@ fn cut_str_fast_line( fields.push(Range { start, end }); if Side::Some(curr_field) == last_interesting_field { - // we have no use for this field or any of the following ones + // We have no use for any other fields in this line break; } } @@ -89,7 +87,7 @@ fn cut_str_fast_line( Ok(()) } -#[inline] +#[inline(always)] fn output_parts( line: &[u8], // which parts to print @@ -250,15 +248,19 @@ pub fn read_and_cut_text_as_bytes( opt: &FastOpt, ) -> Result<()> { let mut fields: Vec> = Vec::with_capacity(16); + + // ForwardBounds guarantees that there is at least one field to check + let last_interesting_field = opt.bounds.get_last_bound().r; + match opt.eol { EOL::Newline => stdin.for_byte_line(|line| { - cut_str_fast_line(line, opt, stdout, &mut fields) + cut_str_fast_line(line, opt, stdout, &mut fields, last_interesting_field) // XXX Should map properly the error .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) .and(Ok(true)) })?, EOL::Zero => stdin.for_byte_record(opt.eol.into(), |line| 
{ - cut_str_fast_line(line, opt, stdout, &mut fields) + cut_str_fast_line(line, opt, stdout, &mut fields, last_interesting_field) // XXX Should map properly the error .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) .and(Ok(true)) From ddc27519d31eae06d43cc6dfc2557b04a1229d40 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sat, 10 Feb 2024 22:14:30 -0800 Subject: [PATCH 14/22] Remove unused code --- src/fast_lane.rs | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 2dd5276..72e737b 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -160,32 +160,6 @@ impl TryFrom<&Opt> for FastOpt { } } -impl From<&UserBounds> for Range { - fn from(value: &UserBounds) -> Self { - // XXX this will explode in our face at the first negative value - // XXX we should have a try into and more checks in place - // (also, values must be sequential, but that should be covered by UserBounds - // ... if we will still pass by it) - - let (l, r): (usize, usize) = match (value.l, value.r) { - // (Side::Some(l), Side::Some(r)) => (l as usize, (r - l) as usize), - // (Side::Some(l), Side::Continue) => (l as usize, usize::MAX - (l as usize)), - (Side::Some(l), Side::Some(r)) => ((l - 1) as usize, r as usize), - (Side::Some(l), Side::Continue) => ((l - 1) as usize, usize::MAX), - (Side::Continue, Side::Some(r)) => (0, r as usize), - (Side::Continue, Side::Continue) => (0, usize::MAX), - }; - - // FastRange { - // l, - // r_sub_l: r - l, - // buff_start: 0, - // buff_end: 0, - // } - Range { start: l, end: r } - } -} - #[derive(Debug)] struct ForwardBounds { pub list: UserBoundsList, From 3ef2f7ec4d4f687da8b70d1b824862cc4e14b3bc Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 00:32:04 -0800 Subject: [PATCH 15/22] Add trim support to fast_lane --- src/fast_lane.rs | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff 
--git a/src/fast_lane.rs b/src/fast_lane.rs index 72e737b..d2e98e5 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -1,20 +1,38 @@ use crate::bounds::{BoundOrFiller, BoundsType, Side, UserBounds, UserBoundsList}; -use crate::options::{Opt, EOL}; -use anyhow::Result; +use crate::options::{Opt, Trim, EOL}; +use anyhow::{bail, Result}; +use bstr::ByteSlice; use std::convert::TryFrom; use std::io::{self, BufRead}; use std::ops::Deref; +use std::str::FromStr; use std::{io::Write, ops::Range}; use bstr::io::BufReadExt; -fn cut_str_fast_line( - buffer: &[u8], +fn trim<'a>(buffer: &'a [u8], trim_kind: &Trim, delimiter: u8) -> &'a [u8] { + match trim_kind { + Trim::Both => buffer + .trim_start_with(|x| x == delimiter as char) + .trim_end_with(|x| x == delimiter as char), + Trim::Left => buffer.trim_start_with(|x| x == delimiter as char), + Trim::Right => buffer.trim_end_with(|x| x == delimiter as char), + } +} + +fn cut_str_fast_lane( + initial_buffer: &[u8], opt: &FastOpt, stdout: &mut W, fields: &mut Vec>, last_interesting_field: Side, ) -> Result<()> { + let mut buffer = initial_buffer; + + if opt.trim.is_some() { + buffer = trim(buffer, opt.trim.as_ref().unwrap(), opt.delimiter) + } + if buffer.is_empty() { return Ok(()); } @@ -122,6 +140,7 @@ pub struct FastOpt { eol: EOL, bounds: ForwardBounds, only_delimited: bool, + trim: Option, } impl TryFrom<&Opt> for FastOpt { @@ -138,7 +157,6 @@ impl TryFrom<&Opt> for FastOpt { || value.json || value.bounds_type != BoundsType::Fields || value.replace_delimiter.is_some() - || value.trim.is_some() || value.regex_bag.is_some() { return Err( @@ -153,6 +171,7 @@ impl TryFrom<&Opt> for FastOpt { eol: value.eol, bounds: forward_bounds, only_delimited: value.only_delimited, + trim: value.trim, }) } else { Err("Bounds cannot be converted to ForwardBounds") @@ -167,12 +186,12 @@ struct ForwardBounds { } impl TryFrom<&UserBoundsList> for ForwardBounds { - type Error = &'static str; + type Error = anyhow::Error; fn try_from(value: 
&UserBoundsList) -> Result { if value.is_empty() { - Err("Cannot create ForwardBounds from an empty UserBoundsList") - } else if value.is_forward_only() { + bail!("Cannot create ForwardBounds from an empty UserBoundsList"); + } else { let value: UserBoundsList = UserBoundsList(value.iter().cloned().collect()); let mut maybe_last_bound: Option = None; value.iter().enumerate().rev().any(|(idx, bof)| { @@ -190,10 +209,8 @@ impl TryFrom<&UserBoundsList> for ForwardBounds { last_bound_idx, }) } else { - Err("Cannot create ForwardBounds from UserBoundsList without bounds") + bail!("Cannot create ForwardBounds from UserBoundsList without bounds"); } - } else { - Err("The provided UserBoundsList is not forward only") } } } @@ -211,7 +228,7 @@ impl ForwardBounds { if let Some(BoundOrFiller::Bound(b)) = self.list.get(self.last_bound_idx) { b } else { - panic!("Invariant error: last_bound_idx failed to match a bound") + panic!("Invariant error: last_bound_idx failed to match a bound. The constructor should have verified that") } } } @@ -228,13 +245,13 @@ pub fn read_and_cut_text_as_bytes( match opt.eol { EOL::Newline => stdin.for_byte_line(|line| { - cut_str_fast_line(line, opt, stdout, &mut fields, last_interesting_field) + cut_str_fast_lane(line, opt, stdout, &mut fields, last_interesting_field) // XXX Should map properly the error .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) .and(Ok(true)) })?, EOL::Zero => stdin.for_byte_record(opt.eol.into(), |line| { - cut_str_fast_line(line, opt, stdout, &mut fields, last_interesting_field) + cut_str_fast_lane(line, opt, stdout, &mut fields, last_interesting_field) // XXX Should map properly the error .map_err(|x| io::Error::new(io::ErrorKind::Other, x.to_string())) .and(Ok(true)) From 288db8050372c2a94ce4b315f3acbb33a9d8dd6a Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 00:33:12 -0800 Subject: [PATCH 16/22] Add tests to fast lane --- src/fast_lane.rs | 180 
+++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/src/fast_lane.rs b/src/fast_lane.rs index d2e98e5..999eb1b 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -143,6 +143,19 @@ pub struct FastOpt { trim: Option, } +impl Default for FastOpt { + fn default() -> Self { + Self { + delimiter: b'\t', + join: false, + eol: EOL::Newline, + bounds: ForwardBounds::try_from(&UserBoundsList::from_str("1:").unwrap()).unwrap(), + only_delimited: false, + trim: None, + } + } +} + impl TryFrom<&Opt> for FastOpt { type Error = &'static str; @@ -233,6 +246,14 @@ impl ForwardBounds { } } +impl FromStr for ForwardBounds { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + let bounds_list = UserBoundsList::from_str(s)?; + ForwardBounds::try_from(&bounds_list) + } +} + pub fn read_and_cut_text_as_bytes( stdin: &mut R, stdout: &mut W, @@ -260,3 +281,162 @@ pub fn read_and_cut_text_as_bytes( Ok(()) } + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use crate::options::Trim; + + use super::*; + + fn make_fields_opt() -> FastOpt { + FastOpt { + delimiter: b'-', + ..FastOpt::default() + } + } + + #[test] + fn test_read_and_cut_str_echo_non_delimited_strings() { + // read_and_cut_str is difficult to test, let's verify at least + // that it reads the input and appears to call cut_str + + let opt = make_fields_opt(); + let mut input = b"foo".as_slice(); + let mut output = Vec::new(); + read_and_cut_text_as_bytes(&mut input, &mut output, &opt).unwrap(); + assert_eq!(output, b"foo\n".as_slice()); + } + + fn make_cut_str_buffers() -> (Vec, Vec>) { + let output = Vec::new(); + let fields = Vec::new(); + (output, fields) + } + + #[test] + fn cut_str_echo_non_delimited_strings() { + let opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + + let line = b"foo"; + + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); 
+ assert_eq!(output, b"foo\n".as_slice()); + } + + #[test] + fn cut_str_skip_non_delimited_strings_when_requested() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + + opt.only_delimited = true; + let line = b"foo"; + + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"".as_slice()); + } + + #[test] + fn cut_str_it_cut_a_field() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("1").unwrap(); + + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"a\n".as_slice()); + } + + #[test] + fn cut_str_it_cut_consecutive_delimiters() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("1,3").unwrap(); + + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"ac\n".as_slice()); + } + + #[test] + fn cut_str_it_supports_zero_terminated_lines() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + opt.eol = EOL::Zero; + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("2").unwrap(); + + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"b\0".as_slice()); + } + + #[test] + fn cut_str_it_join_fields() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("1,3").unwrap(); + opt.join = true; + + 
cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"a-c\n".as_slice()); + } + + #[test] + fn cut_str_it_format_fields() { + let mut opt = make_fields_opt(); + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + + let line = b"a-b-c"; + opt.bounds = ForwardBounds::from_str("{1} < {3} > {2}").unwrap(); + + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"a < c > b\n".as_slice()); + } + + #[test] + fn cut_str_it_trim_fields() { + let mut opt = make_fields_opt(); + let line = b"--a--b--c--"; + + // check Trim::Both + opt.trim = Some(Trim::Both); + opt.bounds = ForwardBounds::from_str("1,3,-1").unwrap(); + + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"abc\n".as_slice()); + + // check Trim::Left + opt.trim = Some(Trim::Left); + opt.bounds = ForwardBounds::from_str("1,3,-3").unwrap(); + + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"abc\n".as_slice()); + + // check Trim::Right + opt.trim = Some(Trim::Right); + opt.bounds = ForwardBounds::from_str("3,5,-1").unwrap(); + + let (mut output, mut fields) = make_cut_str_buffers(); + let last_interesting_field = Side::Continue; + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"abc\n".as_slice()); + } +} From 76a694533af2e27a1d5aae8b906c3f5da811c651 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 13:20:50 -0800 Subject: [PATCH 17/22] Fix only_delimited support in fast lane --- src/cut_str.rs | 18 ++++++++++++++++-- 
src/fast_lane.rs | 23 +++++++++++++++++++---- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/cut_str.rs b/src/cut_str.rs index d5a3051..3867d4a 100644 --- a/src/cut_str.rs +++ b/src/cut_str.rs @@ -584,24 +584,38 @@ mod tests { #[test] fn cut_str_echo_non_delimited_strings() { let opt = make_fields_opt(); - let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); let eol = &[EOL::Newline as u8]; let line = "foo"; + // non-empty line missing the delimiter + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); assert_eq!(output, b"foo\n".as_slice()); + + // empty line + let line = ""; + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); + cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); + assert_eq!(output, b"\n".as_slice()); } #[test] fn cut_str_skip_non_delimited_strings_when_requested() { let mut opt = make_fields_opt(); - let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); let eol = &[EOL::Newline as u8]; opt.only_delimited = true; + + // non-empty line missing the delimiter let line = "foo"; + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); + cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); + assert_eq!(output, b"".as_slice()); + // empty line + let line = ""; + let (mut output, mut buffer1, mut buffer2) = make_cut_str_buffers(); cut_str(line, &opt, &mut output, &mut buffer1, &mut buffer2, eol).unwrap(); assert_eq!(output, b"".as_slice()); } diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 999eb1b..70cd785 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -34,6 +34,9 @@ fn cut_str_fast_lane( } if buffer.is_empty() { + if !opt.only_delimited { + stdout.write_all(&[opt.eol.into()])?; + } return Ok(()); } @@ -122,7 +125,6 @@ fn output_parts( let idx_end = fields[r.end - 1].end; let output = &line[idx_start..idx_end]; - // 
let field_to_print = maybe_replace_delimiter(output, opt); let field_to_print = output; stdout.write_all(field_to_print)?; @@ -318,24 +320,37 @@ mod tests { #[test] fn cut_str_echo_non_delimited_strings() { let opt = make_fields_opt(); - let (mut output, mut fields) = make_cut_str_buffers(); let last_interesting_field = Side::Continue; + // non-empty line missing the delimiter let line = b"foo"; - + let (mut output, mut fields) = make_cut_str_buffers(); cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); assert_eq!(output, b"foo\n".as_slice()); + + // empty line + let line = b""; + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"\n".as_slice()); } #[test] fn cut_str_skip_non_delimited_strings_when_requested() { let mut opt = make_fields_opt(); - let (mut output, mut fields) = make_cut_str_buffers(); let last_interesting_field = Side::Continue; opt.only_delimited = true; + + // non-empty line missing the delimiter let line = b"foo"; + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + assert_eq!(output, b"".as_slice()); + // empty line + let line = b""; + let (mut output, mut fields) = make_cut_str_buffers(); cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); assert_eq!(output, b"".as_slice()); } From b1d7b34b353c22fca57652df93260901b7d433db Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 15:28:56 -0800 Subject: [PATCH 18/22] Implement PartialOrd for Side --- src/bounds.rs | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/bounds.rs b/src/bounds.rs index 24c3480..75b1e7f 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -235,6 +235,23 @@ impl fmt::Display for Side { } } +impl 
PartialOrd for Side { + fn partial_cmp(&self, other: &Self) -> Option { + match (self, other) { + (Side::Some(s), Side::Some(o)) => { + if !(s * o).is_positive() { + // We can't compare two sides with different sign + return None; + } + Some(s.cmp(o)) + } + (Side::Continue, Side::Some(_)) => Some(Ordering::Greater), + (Side::Some(_), Side::Continue) => Some(Ordering::Less), + (Side::Continue, Side::Continue) => Some(Ordering::Equal), + } + } +} + #[derive(Debug, Eq, Clone)] pub struct UserBounds { pub l: Side, @@ -419,40 +436,20 @@ impl UserBounds { }; for i in start..=end { - bounds.push(UserBounds { - l: Side::Some(i), - r: Side::Some(i), - }) + bounds.push(UserBounds::new(Side::Some(i), Side::Some(i))) } bounds } } -impl Ord for UserBounds { - /* - * Compare UserBounds. Note that comparison gives wrong results if - * bounds happen to have a mix of positive/negative indexes (you cannot - * reliably compare -1 with 3 without kwowing how many parts are there). - * Check with UserBounds.is_sortable before comparing. - */ - fn cmp(&self, other: &Self) -> Ordering { - if self == other { - return Ordering::Equal; - } - - match (self.l, self.r, other.l, other.r) { - (_, Side::Some(s_r), Side::Some(o_l), _) if (s_r * o_l).is_positive() && s_r <= o_l => { - Ordering::Less - } - _ => Ordering::Greater, - } - } -} - impl PartialOrd for UserBounds { + /// Compare UserBounds. Note that you cannot reliably compare + /// bounds with a mix of positive/negative indices (you cannot + /// compare `-1` with `3` without knowing how many parts are there). + /// Check with UserBounds.is_sortable before comparing.
fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + self.r.partial_cmp(&other.l) } } From cc4f6465828230a7f8acfe33bfc2542c8018b733 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 15:45:24 -0800 Subject: [PATCH 19/22] Refactor last_bound_idx into last_interesting_field --- src/fast_lane.rs | 160 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 114 insertions(+), 46 deletions(-) diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 70cd785..5335481 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -197,7 +197,10 @@ impl TryFrom<&Opt> for FastOpt { #[derive(Debug)] struct ForwardBounds { pub list: UserBoundsList, - last_bound_idx: usize, + // Optimization that we can use to stop searching for fields + // It's available only when every bound uses positive indexes. + // When conditions do not apply, Side::Continue is used. + last_interesting_field: Side, } impl TryFrom<&UserBoundsList> for ForwardBounds { @@ -208,24 +211,20 @@ impl TryFrom<&UserBoundsList> for ForwardBounds { bail!("Cannot create ForwardBounds from an empty UserBoundsList"); } else { let value: UserBoundsList = UserBoundsList(value.iter().cloned().collect()); - let mut maybe_last_bound: Option = None; - value.iter().enumerate().rev().any(|(idx, bof)| { - if matches!(bof, BoundOrFiller::Bound(_)) { - maybe_last_bound = Some(idx); - true - } else { - false + + let mut rightmost_bound: Option = None; + value.iter().for_each(|bof| { + if let BoundOrFiller::Bound(b) = bof { + if rightmost_bound.is_none() || b.r > rightmost_bound.unwrap() { + rightmost_bound = Some(b.r); + } } }); - if let Some(last_bound_idx) = maybe_last_bound { - Ok(ForwardBounds { - list: value, - last_bound_idx, - }) - } else { - bail!("Cannot create ForwardBounds from UserBoundsList without bounds"); - } + Ok(ForwardBounds { + list: value, + last_interesting_field: rightmost_bound.unwrap_or(Side::Continue), + }) } } } @@ -239,12 +238,8 @@ impl Deref for
ForwardBounds { } impl ForwardBounds { - fn get_last_bound(&self) -> &UserBounds { - if let Some(BoundOrFiller::Bound(b)) = self.list.get(self.last_bound_idx) { - b - } else { - panic!("Invariant error: last_bound_idx failed to match a bound. The constructor should have verified that") - } + fn get_last_bound(&self) -> Side { + self.last_interesting_field } } @@ -263,8 +258,7 @@ pub fn read_and_cut_text_as_bytes( ) -> Result<()> { let mut fields: Vec> = Vec::with_capacity(16); - // ForwardBounds guarantees that there is at least one field to check - let last_interesting_field = opt.bounds.get_last_bound().r; + let last_interesting_field = opt.bounds.get_last_bound(); match opt.eol { EOL::Newline => stdin.for_byte_line(|line| { @@ -320,38 +314,64 @@ mod tests { #[test] fn cut_str_echo_non_delimited_strings() { let opt = make_fields_opt(); - let last_interesting_field = Side::Continue; // non-empty line missing the delimiter let line = b"foo"; let (mut output, mut fields) = make_cut_str_buffers(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"foo\n".as_slice()); // empty line let line = b""; let (mut output, mut fields) = make_cut_str_buffers(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"\n".as_slice()); } #[test] fn cut_str_skip_non_delimited_strings_when_requested() { let mut opt = make_fields_opt(); - let last_interesting_field = Side::Continue; opt.only_delimited = true; // non-empty line missing the delimiter let line = b"foo"; let (mut output, mut fields) = make_cut_str_buffers(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + 
&opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"".as_slice()); // empty line let line = b""; let (mut output, mut fields) = make_cut_str_buffers(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"".as_slice()); } @@ -359,12 +379,18 @@ mod tests { fn cut_str_it_cut_a_field() { let mut opt = make_fields_opt(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; let line = b"a-b-c"; opt.bounds = ForwardBounds::from_str("1").unwrap(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"a\n".as_slice()); } @@ -372,12 +398,18 @@ mod tests { fn cut_str_it_cut_consecutive_delimiters() { let mut opt = make_fields_opt(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; let line = b"a-b-c"; opt.bounds = ForwardBounds::from_str("1,3").unwrap(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"ac\n".as_slice()); } @@ -385,13 +417,19 @@ mod tests { fn cut_str_it_supports_zero_terminated_lines() { let mut opt = make_fields_opt(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; opt.eol = EOL::Zero; let line = b"a-b-c"; opt.bounds = ForwardBounds::from_str("2").unwrap(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + 
opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"b\0".as_slice()); } @@ -399,13 +437,19 @@ mod tests { fn cut_str_it_join_fields() { let mut opt = make_fields_opt(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; let line = b"a-b-c"; opt.bounds = ForwardBounds::from_str("1,3").unwrap(); opt.join = true; - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"a-c\n".as_slice()); } @@ -413,12 +457,18 @@ mod tests { fn cut_str_it_format_fields() { let mut opt = make_fields_opt(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; let line = b"a-b-c"; opt.bounds = ForwardBounds::from_str("{1} < {3} > {2}").unwrap(); - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"a < c > b\n".as_slice()); } @@ -432,8 +482,14 @@ mod tests { opt.bounds = ForwardBounds::from_str("1,3,-1").unwrap(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"abc\n".as_slice()); // check Trim::Left @@ -441,8 +497,14 @@ mod tests { opt.bounds = ForwardBounds::from_str("1,3,-3").unwrap(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + 
opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"abc\n".as_slice()); // check Trim::Right @@ -450,8 +512,14 @@ mod tests { opt.bounds = ForwardBounds::from_str("3,5,-1").unwrap(); let (mut output, mut fields) = make_cut_str_buffers(); - let last_interesting_field = Side::Continue; - cut_str_fast_lane(line, &opt, &mut output, &mut fields, last_interesting_field).unwrap(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); assert_eq!(output, b"abc\n".as_slice()); } } From 945cf5bc67b397e7d2c26dcb533bd221cff51507 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 16:16:49 -0800 Subject: [PATCH 20/22] Ensure that fast lane works with negative indices too --- src/bounds.rs | 2 +- src/fast_lane.rs | 77 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/src/bounds.rs b/src/bounds.rs index 75b1e7f..5128d1f 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -117,7 +117,7 @@ impl UserBoundsList { /// Detect whether the list can be sorted. /// It can be sorted only if every bound /// has the same sign (all positive or all negative). 
- fn is_sortable(&self) -> bool { + pub fn is_sortable(&self) -> bool { let mut has_positive_idx = false; let mut has_negative_idx = false; self.get_userbounds_only().for_each(|b| { diff --git a/src/fast_lane.rs b/src/fast_lane.rs index 5335481..fb53939 100644 --- a/src/fast_lane.rs +++ b/src/fast_lane.rs @@ -80,7 +80,7 @@ fn cut_str_fast_lane( let num_fields = fields.len(); match num_fields { - 1 if bounds.len() == 1 => { + 1 if bounds.len() == 1 && fields[0].end == buffer.len() => { stdout.write_all(buffer)?; } _ => { @@ -213,13 +213,15 @@ impl TryFrom<&UserBoundsList> for ForwardBounds { let value: UserBoundsList = UserBoundsList(value.iter().cloned().collect()); let mut rightmost_bound: Option = None; - value.iter().for_each(|bof| { - if let BoundOrFiller::Bound(b) = bof { - if rightmost_bound.is_none() || b.r > rightmost_bound.unwrap() { - rightmost_bound = Some(b.r); + if value.is_sortable() { + value.iter().for_each(|bof| { + if let BoundOrFiller::Bound(b) = bof { + if rightmost_bound.is_none() || b.r > rightmost_bound.unwrap() { + rightmost_bound = Some(b.r); + } } - } - }); + }); + } Ok(ForwardBounds { list: value, @@ -394,6 +396,67 @@ mod tests { assert_eq!(output, b"a\n".as_slice()); } + #[test] + fn cut_str_it_cut_with_negative_indices() { + let mut opt = make_fields_opt(); + + let line = b"a-b-c"; + + // just one negative index + opt.bounds = ForwardBounds::from_str("-1").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"c\n".as_slice()); + + // multiple negative indices, in forward order + opt.bounds = ForwardBounds::from_str("-2,-1").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"bc\n".as_slice()); + + // multiple negative indices, in 
non-forward order + opt.bounds = ForwardBounds::from_str("-1,-2").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"cb\n".as_slice()); + + // mix positive and negative indices + // (this is particularly useful to verify that we don't screw + // up optimizations on last field to check) + opt.bounds = ForwardBounds::from_str("-1,1").unwrap(); + let (mut output, mut fields) = make_cut_str_buffers(); + cut_str_fast_lane( + line, + &opt, + &mut output, + &mut fields, + opt.bounds.get_last_bound(), + ) + .unwrap(); + assert_eq!(output, b"ca\n".as_slice()); + } + #[test] fn cut_str_it_cut_consecutive_delimiters() { let mut opt = make_fields_opt(); From 5962a7743ab94026d10d3ec0975c234823dcc333 Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 16:38:02 -0800 Subject: [PATCH 21/22] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 455addd..284e27d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +- feat: much faster (3x) implementation for common scenarios + ## [1.2.0] - 2024-01-01 - feat: new option --json to format output as JSON array From f30e369e10c4617309929f368af982d9269fdecf Mon Sep 17 00:00:00 2001 From: Riccardo Attilio Galli Date: Sun, 11 Feb 2024 16:40:12 -0800 Subject: [PATCH 22/22] Always inline UserBounds matches --- src/bounds.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bounds.rs b/src/bounds.rs index 5128d1f..6258889 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -327,6 +327,7 @@ impl UserBounds { * * Fields are 1-indexed. */ + #[inline(always)] pub fn matches(&self, idx: i32) -> Result { match (self.l, self.r) { (Side::Some(left), _) if (left * idx).is_negative() => {