feat(tasks): benchmarks for lexer (#2101)

This PR adds benchmarks for the lexer. I'm doing some work on optimizing the lexer and I thought it'd be useful to see the effects of changes in isolation, separate from the parser. These benchmarks may not be ideal to keep long-term, but for now they'd be useful. In order to do so, it's necessary for the `oxc_parser` crate to expose the lexer, but I have done that without adding it to the docs, and using an alias `__lexer`.
oxc-project · Jan 21, 2024 · 36c718e · 36c718e
1 parent 16b3261
commit 36c718e
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 1 deletion.
diff --git a/crates/oxc_parser/src/lexer/token.rs b/crates/oxc_parser/src/lexer/token.rs
@@ -20,7 +20,10 @@ pub struct Token {
 
     /// True if the identifier / string / template kinds has escaped strings.
     /// The escaped strings are saved in [Lexer::escaped_strings] and [Lexer::escaped_templates] by
-    /// [Token::start]
+    /// [Token::start].
+    ///
+    /// [Lexer::escaped_strings]: super::Lexer::escaped_strings
+    /// [Lexer::escaped_templates]: super::Lexer::escaped_templates
     pub escaped: bool,
 }
 

diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs
@@ -84,6 +84,12 @@ use crate::{
     state::ParserState,
 };
 
+// Expose lexer for benchmarks; `#[doc(hidden)]` keeps this re-export module out of the docs
+#[doc(hidden)]
+pub mod __lexer {
+    pub use super::lexer::{Kind, Lexer, Token};
+}
+
 /// Maximum length of source in bytes which can be parsed (~4 GiB).
 // Span's start and end are u32s, so size limit is u32::MAX bytes.
 pub const MAX_LEN: usize = u32::MAX as usize;

diff --git a/tasks/benchmark/Cargo.toml b/tasks/benchmark/Cargo.toml
@@ -43,6 +43,10 @@ harness = false
 name    = "minifier"
 harness = false
 
+[[bench]]
+name    = "lexer"
+harness = false
+
 [dependencies]
 oxc_allocator    = { workspace = true }
 oxc_linter       = { workspace = true }

diff --git a/tasks/benchmark/benches/lexer.rs b/tasks/benchmark/benches/lexer.rs
@@ -0,0 +1,46 @@
+use oxc_allocator::Allocator;
+use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use oxc_parser::__lexer::{Kind, Lexer};
+use oxc_span::SourceType;
+use oxc_tasks_common::{TestFile, TestFiles};
+
+fn bench_lexer(criterion: &mut Criterion) {
+    let mut group = criterion.benchmark_group("lexer");
+
+    // Lexer lacks awareness of JS grammar, so it gets confused by a few things without the parser
+    // driving it, notably escapes in regexps and template strings.
+    // So simplify the input for it: replace every backslash with a space, and every backtick with
+    // a single quote so that template strings become normal string literals.
+    let files = TestFiles::complicated()
+        .files()
+        .iter()
+        .map(|file| TestFile {
+            url: file.url.clone(),
+            file_name: file.file_name.clone(),
+            source_text: file.source_text.replace('\\', " ").replace('`', "'"),
+        })
+        .collect::<Vec<_>>();
+
+    for file in files {
+        let source_type = SourceType::from_path(&file.file_name).unwrap();
+        group.bench_with_input(
+            BenchmarkId::from_parameter(&file.file_name),
+            &file.source_text,
+            |b, source_text| {
+                b.iter_with_large_drop(|| {
+                    // Create the allocator inside the benchmarked closure and return it, to make time
+                    // measurement consistent. Otherwise the allocator will allocate huge memory chunks
+                    // (by power of two), skewing long runs. NOTE(review): confirm drop is timed as intended.
+                    let allocator = Allocator::default();
+                    let mut lexer = Lexer::new(&allocator, source_text, source_type);
+                    while lexer.next_token().kind != Kind::Eof {}
+                    allocator
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+criterion_group!(lexer, bench_lexer);
+criterion_main!(lexer);