Skip to content

Commit

Permalink
Implement string parser (#24)
Browse files Browse the repository at this point in the history
* Implement string parser

* Fix regular expression to properly match strings

* Refactor string parsing into several functions and added tests

* Remove string allocation when handling unicode

* Add simple benchmarks for string parsing

* Add simple benchmarks for string parsing

* Improve unicode parsing

* Move `res.push()` outside `match`

* Format the code

* Use try operator when parsing unicode characters
  • Loading branch information
inikolaev authored Feb 4, 2024
1 parent 64905a9 commit e4b235f
Show file tree
Hide file tree
Showing 5 changed files with 476 additions and 3 deletions.
7 changes: 7 additions & 0 deletions parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,12 @@ categories = ["parsing", "cel"]
lalrpop-util = { version = "0.19.1", features = ["lexer"] }
regex = "1.4.2"

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }

[build-dependencies]
lalrpop = { version = "0.19.1", features = ["lexer"] }

[[bench]]
name = "runtime"
harness = false
19 changes: 19 additions & 0 deletions parser/benches/runtime.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use cel_parser::parse_string;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

/// Registers one Criterion benchmark per kind of string literal handled by
/// [`parse_string`]: plain text, a raw string, and single unicode, hex and
/// octal escape sequences.
///
/// Each benchmark is registered on `c` under the case's name and repeatedly
/// parses the literal's source text (including its surrounding quotes).
pub fn parse_string_benchmark(c: &mut Criterion) {
    // (benchmark name, source text of the literal). The set is fixed, so a
    // stack-allocated array is enough; no `Vec` is needed.
    let expressions = [
        ("text", "\"text\""),
        ("raw", "r\"text\""),
        ("single unicode escape sequence", "\"\\U0001f431\""),
        ("single hex escape sequence", "\"\\x0D\""),
        ("single oct escape sequence", "\"\\015\""),
    ];

    for (name, expr) in expressions {
        // Wrap the input in `black_box` inside the timed closure so the
        // optimizer must treat it as unknown on every iteration and cannot
        // hoist or constant-fold the `parse_string` call out of the loop.
        c.bench_function(name, |b| b.iter(|| parse_string(black_box(expr))));
    }
}

criterion_group!(benches, parse_string_benchmark);
criterion_main!(benches);
17 changes: 14 additions & 3 deletions parser/src/cel.lalrpop
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{RelationOp, ArithmeticOp, Expression, UnaryOp, Member, Atom};
use crate::{RelationOp, ArithmeticOp, Expression, UnaryOp, Member, Atom, parse_string};
use std::sync::Arc;

grammar;
Expand Down Expand Up @@ -138,11 +138,22 @@ Atom: Atom = {
// the LALRPOP parser.

// Double quoted string
r#""(\\.|[^"\n])*""# => Atom::String(Arc::from(<>[1..<>.len()-1].to_string())),
// I used ChatGPT to come up with this pattern and the explanation is as follows:
// 1. `"`: Match the opening double quote.
// 2. `([^"\\]*(?:\\.[^"\\]*)*)`: This is the main part of the regex which matches the content inside the double quotes.
// a. `[^"\\]*`: Match any sequence of characters that are neither a double quote nor a backslash.
// b. `(?:\\.[^"\\]*)*`: This part matches an escaped character followed by any sequence of characters that are
// neither a double quote nor a backslash. It uses a non-capturing group (?:...) to repeat the pattern.
// This handles sequences like \", \\, or any other escaped character.
// 3. `"`: Match the closing double quote.
r#""([^"\\]*(?:\\.[^"\\]*)*)""# => Atom::String(parse_string(<>).unwrap().into()),
r#"[rR]"([^"\\]*(?:\\.[^"\\]*)*)""# => Atom::String(parse_string(<>).unwrap().into()),
// r#""""(\\.|[^"{3}])*""""# => Atom::String(<>.to_string().into()),

// Single quoted string
r#"'(\\.|[^'\n])*'"# => Atom::String(Arc::from(<>[1..<>.len()-1].to_string())),
// Uses the same regex as above, with the double quotes replaced by single quotes
r#"'([^'\\]*(?:\\.[^'\\]*)*)'"# => Atom::String(parse_string(<>).unwrap().into()),
r#"[rR]'([^'\\]*(?:\\.[^'\\]*)*)'"# => Atom::String(parse_string(<>).unwrap().into()),
// r#"'''(\\.|[^'{3}])*'''"# => Atom::String(<>.to_string().into()),

// Double quoted bytes
Expand Down
3 changes: 3 additions & 0 deletions parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ use lalrpop_util::lalrpop_mod;
pub mod ast;
pub use ast::*;

pub mod parse;
pub use parse::*;

use std::fmt::Display;

lalrpop_mod!(#[allow(clippy::all)] pub parser, "/cel.rs");
Expand Down
Loading

0 comments on commit e4b235f

Please sign in to comment.