From 5f0e829279f4874f277b0154403d78c9bc714a84 Mon Sep 17 00:00:00 2001 From: Sheldon Young Date: Mon, 8 Mar 2021 18:56:57 -0800 Subject: [PATCH 1/3] Approximately 40% faster, not all numeric chars are digits --- CHANGES.md | 2 ++ src/tokenizer.rs | 75 ++++++++++++++++++++++-------------------------- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 5430091879..0e3d1873d5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,6 +3,8 @@ ## Unreleased * Add new entries here +* Approximately 40% faster according to `cargo bench`. +* Some "numeric" characters like `¾` and `①` were being treated as digits. * Support `POINT EMPTY` in conversion to `geo_types`. Converts to `MultiPoint([])`. * diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 96aa4521bc..c150ead338 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -29,19 +29,14 @@ where Word(String), } +#[inline] fn is_whitespace(c: char) -> bool { - match c { - '\n' | '\r' | '\t' | ' ' => true, - _ => false, - } + c == ' ' || c == '\n' || c == '\r' || c == '\t' } +#[inline] fn is_numberlike(c: char) -> bool { - match c { - c if c.is_numeric() => true, - '.' | '-' | '+' => true, - _ => false, - } + c == '.' || c == '-' || c == '+' || c.is_ascii_digit() } pub type PeekableTokens<'a, T> = Peekable>; @@ -66,7 +61,7 @@ where impl<'a, T> Iterator for Tokens<'a, T> where - T: WktFloat + str::FromStr + Default, + T: WktFloat + str::FromStr, { type Item = Token; @@ -85,14 +80,19 @@ where ')' => Some(Token::ParenClose), ',' => Some(Token::Comma), c if is_numberlike(c) => { - let number = c.to_string() + &self.read_until_whitespace().unwrap_or_default(); - match number.trim_start_matches('+').parse::() { + let mut number = self.read_until_whitespace().unwrap_or_default(); + if c != '+' { + // Prepend the character because the string likely has capacity. + number.insert(0, c); + } + match number.parse::() { Ok(parsed_num) => Some(Token::Number(parsed_num)), Err(_) => None, } } c => { - let word = c.to_string() + &self.read_until_whitespace().unwrap_or_default(); + let mut word = self.read_until_whitespace().unwrap_or_default(); + word.insert(0, c); Some(Token::Word(word)) } } @@ -101,40 +101,28 @@ where impl<'a, T> Tokens<'a, T> where - T: WktFloat + str::FromStr + Default, + T: str::FromStr, { fn read_until_whitespace(&mut self) -> Option { - let mut result = String::new(); + let mut result = None; while let Some(&next_char) = self.chars.peek() { - let marker = match next_char { - '\0' | '(' | ')' | ',' => true, - _ => false, - }; - - // Consume non-markers - if !marker { - let _ = self.chars.next(); - } - - let whitespace = is_whitespace(next_char); - - // Append non-whitespace, non-marker characters - if !marker && !whitespace { - result.push(next_char); - } - - // Stop reading when reached marker or whitespace - if marker || whitespace { - break; + match next_char { + '\0' | '(' | ')' | ',' => break, // Just stop on a marker + c if is_whitespace(c) => { + let _ = self.chars.next(); + break; + } + _ => { + let _ = self.chars.next(); + result + .get_or_insert_with(|| String::with_capacity(16)) + .push(next_char); + } } } - if result.is_empty() { - None - } else { - Some(result) - } + result } } @@ -186,6 +174,13 @@ fn test_tokenizer_invalid_number() { assert_eq!(tokens, vec![]); } +#[test] +fn test_tokenizer_not_a_number() { + let test_str = "¾"; // A number according to char.is_numeric() + let tokens: Vec> = Tokens::from_str(test_str).collect(); + assert_eq!(tokens, vec![Token::Word("¾".to_owned())]); +} + #[test] fn test_tokenizer_2numbers() { let test_str = ".4 -2"; From 07df26e15174a453b8d3fe01e1720ab4212cbc3f Mon Sep 17 00:00:00 2001 From: Sheldon Young Date: Tue, 9 Mar 2021 10:03:14 -0800 Subject: [PATCH 2/3] Tokenizer faster by localizing string building --- Cargo.toml | 2 ++ src/tokenizer.rs | 23 ++++++++--------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3625affdf0..0f48ef7457 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,3 +27,5 @@ default = ["geo-types"] name = "parse" harness = false +[profile.release] +debug = true diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c150ead338..83cce87201 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -80,21 +80,13 @@ where ')' => Some(Token::ParenClose), ',' => Some(Token::Comma), c if is_numberlike(c) => { - let mut number = self.read_until_whitespace().unwrap_or_default(); - if c != '+' { - // Prepend the character because the string likely has capacity. - number.insert(0, c); - } + let number = self.read_until_whitespace(if c == '+' { None } else { Some(c) }); match number.parse::() { Ok(parsed_num) => Some(Token::Number(parsed_num)), Err(_) => None, } } - c => { - let mut word = self.read_until_whitespace().unwrap_or_default(); - word.insert(0, c); - Some(Token::Word(word)) - } + c => Some(Token::Word(self.read_until_whitespace(Some(c)))), } } } @@ -103,8 +95,11 @@ impl<'a, T> Tokens<'a, T> where T: str::FromStr, { - fn read_until_whitespace(&mut self) -> Option { - let mut result = None; + fn read_until_whitespace(&mut self, first_char: Option) -> String { + let mut result = String::with_capacity(12); // Big enough for most tokens + if let Some(c) = first_char { + result.push(c); + } while let Some(&next_char) = self.chars.peek() { match next_char { @@ -114,10 +109,8 @@ where break; } _ => { + result.push(next_char); let _ = self.chars.next(); - result - .get_or_insert_with(|| String::with_capacity(16)) - .push(next_char); } } } From b4e0e5ca28cfb3e81c89826d44ac722b1025cb4f Mon Sep 17 00:00:00 2001 From: Sheldon Young Date: Tue, 9 Mar 2021 10:04:15 -0800 Subject: [PATCH 3/3] Update Cargo.toml --- Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0f48ef7457..48ccfc7e62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,3 @@ default = ["geo-types"] [[bench]] name = "parse" harness = false - -[profile.release] -debug = true