Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast lane #128

Merged
merged 22 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased]

- feat: much faster (3x) implementation for common scenarios

## [1.2.0] - 2024-01-01

- feat: new option --json to format output as JSON array
Expand Down
27 changes: 8 additions & 19 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ categories = ["command-line-utilities"]

[dependencies]
anyhow = "1.0.79"
bstr = "1.9.0"
memchr = "2.7.1"
pico-args = { version = "0.5.0", features = ["short-space-opt", "combined-flags", "eq-separator"] }
regex = { version = "1.10", default-features = false, features = ["std", "unicode-bool", "unicode-perl", "unicode-gencat"], optional = true }

Expand Down
8 changes: 6 additions & 2 deletions src/bin/tuc.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
use anyhow::Result;
use std::convert::TryFrom;
use std::io::Write;
use std::str::FromStr;
use tuc::bounds::{BoundOrFiller, BoundsType, UserBoundsList};
use tuc::cut_bytes::read_and_cut_bytes;
use tuc::cut_lines::read_and_cut_lines;
use tuc::cut_str::read_and_cut_str;
use tuc::fast_lane::{read_and_cut_text_as_bytes, FastOpt};
use tuc::options::{Opt, EOL};

#[cfg(feature = "regex")]
Expand Down Expand Up @@ -257,13 +259,15 @@ fn parse_args() -> Result<Opt, pico_args::Error> {
fn main() -> Result<()> {
let opt: Opt = parse_args()?;

let mut stdin = std::io::BufReader::new(std::io::stdin().lock());
let mut stdout = std::io::BufWriter::new(std::io::stdout().lock());
let mut stdin = std::io::BufReader::with_capacity(64 * 1024, std::io::stdin().lock());
let mut stdout = std::io::BufWriter::with_capacity(64 * 1024, std::io::stdout().lock());

if opt.bounds_type == BoundsType::Bytes {
read_and_cut_bytes(&mut stdin, &mut stdout, &opt)?;
} else if opt.bounds_type == BoundsType::Lines {
read_and_cut_lines(&mut stdin, &mut stdout, &opt)?;
} else if let Ok(fast_opt) = FastOpt::try_from(&opt) {
read_and_cut_text_as_bytes(&mut stdin, &mut stdout, &fast_opt)?;
} else {
read_and_cut_str(&mut stdin, &mut stdout, opt)?;
}
Expand Down
176 changes: 108 additions & 68 deletions src/bounds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@ use anyhow::{bail, Result};
use std::cmp::Ordering;
use std::convert::TryInto;
use std::fmt;
use std::ops::Range;
use std::ops::{Deref, Range};
use std::str::FromStr;

#[derive(Debug, Eq, PartialEq)]
#[derive(Debug, PartialEq)]
pub enum BoundsType {
Bytes,
Characters,
Fields,
Lines,
}

#[derive(Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub enum BoundOrFiller {
Bound(UserBounds),
Filler(String),
Expand Down Expand Up @@ -98,6 +98,14 @@ pub fn parse_bounds_list(s: &str) -> Result<Vec<BoundOrFiller>> {
/// Newtype wrapper around a `Vec<BoundOrFiller>`: the sequence of bounds
/// and filler strings, in the order they are stored in the inner vector.
#[derive(Debug)]
pub struct UserBoundsList(pub Vec<BoundOrFiller>);

impl Deref for UserBoundsList {
type Target = Vec<BoundOrFiller>;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl FromStr for UserBoundsList {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Expand All @@ -106,7 +114,10 @@ impl FromStr for UserBoundsList {
}

impl UserBoundsList {
fn is_sortable(&self) -> bool {
/// Detect whether the list can be sorted.
/// It can be sorted only if every bound
/// has the same sign (all positive or all negative).
pub fn is_sortable(&self) -> bool {
let mut has_positive_idx = false;
let mut has_negative_idx = false;
self.get_userbounds_only().for_each(|b| {
Expand Down Expand Up @@ -168,6 +179,10 @@ impl UserBoundsList {
})
}

/// Tell whether the list can be consumed in a single forward pass.
///
/// That is the case when all of the following hold:
/// - the bounds are in ascending order
/// - the bounds use solely positive indices
/// - the bounds don't overlap (but they can be adjacent, e.g. 1:2,2,3)
pub fn is_forward_only(&self) -> bool {
    // Checks run in the same order as before and stop at the first failure.
    if !self.is_sortable() {
        return false;
    }
    if !self.is_sorted() {
        return false;
    }
    !self.has_negative_indices()
}
Expand Down Expand Up @@ -220,6 +235,23 @@ impl fmt::Display for Side {
}
}

/// Partial ordering between two sides of a bound.
///
/// - Two `Side::Some` values are compared numerically, but only when both
///   are non-zero and have the same sign; otherwise the result is `None`
///   (a negative index cannot be ordered against a positive one without
///   knowing how many parts there are).
/// - `Side::Continue` is greater than any `Side::Some` and equal to
///   another `Side::Continue`.
impl PartialOrd for Side {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        match (self, other) {
            (Side::Some(s), Side::Some(o)) => {
                // Compare the signs through `signum` instead of multiplying
                // the raw values: `s * o` overflows `i32` for large indices
                // (e.g. 50000 * 60000), which panics in debug builds and
                // yields a bogus sign in release builds.
                if s.signum() * o.signum() <= 0 {
                    // We can't compare two sides with different sign
                    return None;
                }
                Some(s.cmp(o))
            }
            (Side::Continue, Side::Some(_)) => Some(Ordering::Greater),
            (Side::Some(_), Side::Continue) => Some(Ordering::Less),
            (Side::Continue, Side::Continue) => Some(Ordering::Equal),
        }
    }
}

#[derive(Debug, Eq, Clone)]
pub struct UserBounds {
pub l: Side,
Expand Down Expand Up @@ -287,12 +319,15 @@ impl UserBounds {
UserBounds { l, r }
}
/**
* Check if an index is between the bounds.
* Check if a field is between the bounds.
*
* It errors out if the index has different sign than the bounds
* (we can't verify if e.g. -1 idx is between 3:5 without knowing the number
* of matching bounds).
*
* Fields are 1-indexed.
*/
#[inline(always)]
pub fn matches(&self, idx: i32) -> Result<bool> {
match (self.l, self.r) {
(Side::Some(left), _) if (left * idx).is_negative() => {
Expand All @@ -317,6 +352,68 @@ impl UserBounds {
}
}

/// Transform UserBounds into std::ops::Range
///
/// UserBounds is 1-indexed and inclusive on both sides, while
/// the resulting range is 0-indexed and exclusive on the right side.
///
/// `parts_length` is necessary to calculate Side::Continue on
/// the right side, or any negative indexes.
///
/// e.g.
///
/// ```rust
/// # use tuc::bounds::UserBounds;
/// # use std::ops::Range;
/// # use tuc::bounds::Side;
///
/// assert_eq!(
///     (UserBounds { l: Side::Some(1), r: Side::Some(2) }).try_into_range(5).unwrap(),
///     Range { start: 0, end: 2} // 2, not 1, because it's exclusive
/// );
///
/// assert_eq!(
///     (UserBounds { l: Side::Some(1), r: Side::Continue }).try_into_range(5).unwrap(),
///     Range { start: 0, end: 5}
/// );
/// ```
///
/// # Errors
///
/// Returns an error when:
/// - the left side is `0` (bounds are 1-indexed, so `0` maps to no slot)
/// - the absolute value of either side is greater than `parts_length`
/// - the computed end is not strictly greater than the computed start
pub fn try_into_range(&self, parts_length: usize) -> Result<Range<usize>> {
    let start: usize = match self.l {
        Side::Continue => 0,
        // Without this guard `0 as usize - 1` underflows: a panic in debug
        // builds, a wrapped value (and a misleading error) in release builds.
        Side::Some(0) => bail!("Field value 0 is not allowed (fields are 1-indexed)"),
        Side::Some(v) => {
            let magnitude = v.unsigned_abs() as usize;
            if magnitude > parts_length {
                bail!("Out of bounds: {}", v);
            }
            if v < 0 {
                // Negative indices count from the end of the parts.
                parts_length - magnitude
            } else {
                // 1-indexed to 0-indexed; `v >= 1` here.
                magnitude - 1
            }
        }
    };

    let end: usize = match self.r {
        Side::Continue => parts_length,
        Side::Some(v) => {
            let magnitude = v.unsigned_abs() as usize;
            if magnitude > parts_length {
                bail!("Out of bounds: {}", v);
            }
            if v < 0 {
                // +1 because the right side of the range is exclusive.
                parts_length - magnitude + 1
            } else {
                magnitude
            }
        }
    };

    if end <= start {
        // `end` must always be 1 or more greater than start
        bail!("Field left value cannot be greater than right value");
    }

    Ok(Range { start, end })
}

/**
* Transform a ranged bound into a list of one or more
* 1 slot bound
Expand All @@ -340,40 +437,20 @@ impl UserBounds {
};

for i in start..=end {
bounds.push(UserBounds {
l: Side::Some(i),
r: Side::Some(i),
})
bounds.push(UserBounds::new(Side::Some(i), Side::Some(i)))
}

bounds
}
}

impl Ord for UserBounds {
    /// Compare UserBounds.
    ///
    /// Returns `Equal` when the two bounds are equal, `Less` when the right
    /// side of `self` is a number not greater than the (same-signed) left
    /// side of `other`, and `Greater` in every other case.
    ///
    /// Note that comparison gives wrong results if bounds happen to have a
    /// mix of positive/negative indexes (you cannot reliably compare -1 with
    /// 3 without knowing how many parts there are).
    /// Check with UserBounds.is_sortable before comparing.
    fn cmp(&self, other: &Self) -> Ordering {
        if self == other {
            return Ordering::Equal;
        }

        match (self.r, other.l) {
            // Signs are compared through `signum`: multiplying the raw
            // values overflows `i32` for large indices.
            (Side::Some(s_r), Side::Some(o_l))
                if s_r.signum() * o_l.signum() > 0 && s_r <= o_l =>
            {
                Ordering::Less
            }
            _ => Ordering::Greater,
        }
    }
}

impl PartialOrd for UserBounds {
/// Compare UserBounds. Note that you cannot reliably compare
/// bounds with a mix of positive/negative indices (you cannot
/// compare `-1` with `3` without knowing how many parts there are).
/// Check with UserBounds.is_sortable before comparing.
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
self.r.partial_cmp(&other.l)
}
}

Expand All @@ -389,43 +466,6 @@ impl Default for UserBounds {
}
}

/// Convert `bounds` (1-indexed, inclusive on both sides) into a
/// `std::ops::Range` (0-indexed, exclusive on the right side).
///
/// `parts_length` is the number of available parts; it is needed to resolve
/// `Side::Continue` on the right side and any negative index.
///
/// # Errors
///
/// Returns an error when:
/// - the left side is `0` (bounds are 1-indexed, so `0` maps to no slot)
/// - the absolute value of either side is greater than `parts_length`
/// - the computed end is not strictly greater than the computed start
pub fn bounds_to_std_range(parts_length: usize, bounds: &UserBounds) -> Result<Range<usize>> {
    let start: usize = match bounds.l {
        Side::Continue => 0,
        // Without this guard `0 as usize - 1` underflows: a panic in debug
        // builds, a wrapped value (and a misleading error) in release builds.
        Side::Some(0) => bail!("Field value 0 is not allowed (fields are 1-indexed)"),
        Side::Some(v) => {
            let magnitude = v.unsigned_abs() as usize;
            if magnitude > parts_length {
                bail!("Out of bounds: {}", v);
            }
            if v < 0 {
                // Negative indices count from the end of the parts.
                parts_length - magnitude
            } else {
                // 1-indexed to 0-indexed; `v >= 1` here.
                magnitude - 1
            }
        }
    };

    let end: usize = match bounds.r {
        Side::Continue => parts_length,
        Side::Some(v) => {
            let magnitude = v.unsigned_abs() as usize;
            if magnitude > parts_length {
                bail!("Out of bounds: {}", v);
            }
            if v < 0 {
                // +1 because the right side of the range is exclusive.
                parts_length - magnitude + 1
            } else {
                magnitude
            }
        }
    };

    if end <= start {
        // `end` must always be 1 or more greater than start
        bail!("Field left value cannot be greater than right value");
    }

    Ok(Range { start, end })
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
4 changes: 2 additions & 2 deletions src/cut_bytes.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use anyhow::Result;
use std::io::{Read, Write};

use crate::bounds::{bounds_to_std_range, BoundOrFiller};
use crate::bounds::BoundOrFiller;
use crate::options::Opt;
use crate::read_utils::read_bytes_to_end;

Expand All @@ -13,7 +13,7 @@ fn cut_bytes<W: Write>(data: &[u8], opt: &Opt, stdout: &mut W) -> Result<()> {
opt.bounds.0.iter().try_for_each(|bof| -> Result<()> {
let output = match bof {
BoundOrFiller::Bound(b) => {
let r = bounds_to_std_range(data.len(), b)?;
let r = b.try_into_range(data.len())?;
&data[r.start..r.end]
}
BoundOrFiller::Filler(f) => f.as_bytes(),
Expand Down
Loading
Loading