From cdb3e83c0e782dc39846c253dd0746336067e8bb Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Tue, 4 Feb 2025 17:41:40 -0800 Subject: [PATCH] add test for large select --- parser/src/earley/parser.rs | 6 ++ parser/src/factory.rs | 13 +++- sample_parser/tests/test_lark.rs | 103 ++++++++++++++++++++++++++----- scripts/test-guidance.sh | 1 + 4 files changed, 105 insertions(+), 18 deletions(-) diff --git a/parser/src/earley/parser.rs b/parser/src/earley/parser.rs index 2ac7557b..0191ef55 100644 --- a/parser/src/earley/parser.rs +++ b/parser/src/earley/parser.rs @@ -103,6 +103,12 @@ impl XorShift { x } + pub fn from_range(&mut self, r: Range) -> usize { + assert!(r.start < r.end); + assert!(r.end < std::u32::MAX as usize); + r.start + (self.next() as usize) % (r.end - r.start) + } + pub fn one_in(&mut self, n: u32) -> bool { self.next() % n == 0 } diff --git a/parser/src/factory.rs b/parser/src/factory.rs index ffa567da..0a55574a 100644 --- a/parser/src/factory.rs +++ b/parser/src/factory.rs @@ -86,18 +86,27 @@ impl ParserFactory { } pub fn create_parser(&self, grammar: TopLevelGrammar) -> Result { - self.create_parser_ext(grammar, self.buffer_log_level) + self.create_parser_ext2(grammar, self.buffer_log_level, self.stderr_log_level) } pub fn create_parser_ext( &self, grammar: TopLevelGrammar, buffer_log_level: u32, + ) -> Result { + self.create_parser_ext2(grammar, buffer_log_level, self.stderr_log_level) + } + + pub fn create_parser_ext2( + &self, + grammar: TopLevelGrammar, + buffer_log_level: u32, + stderr_log_level: u32, ) -> Result { let mut parser = TokenParser::from_llguidance_json( self.tok_env.clone(), grammar, - Logger::new(buffer_log_level, self.stderr_log_level), + Logger::new(buffer_log_level, stderr_log_level), self.inference_caps.clone(), self.limits.clone(), self.extra_lexemes(), diff --git a/sample_parser/tests/test_lark.rs b/sample_parser/tests/test_lark.rs index 97d6b622..3a3ab77d 100644 --- a/sample_parser/tests/test_lark.rs +++ b/sample_parser/tests/test_lark.rs @@ -1,10 +1,14 @@ use anyhow::Result; -use llguidance::{api::TopLevelGrammar, TokenParser}; +use llguidance::{api::TopLevelGrammar, earley::XorShift, TokenParser}; use sample_parser::*; -fn make_parser(lark: &str) -> Result { +fn make_parser(lark: &str, quiet: bool) -> Result { let grm = TopLevelGrammar::from_lark(lark.to_string()); - let mut parser = get_parser_factory().create_parser(grm)?; + let mut parser = get_parser_factory().create_parser_ext2( + grm, + if quiet { 0 } else { 2 }, + if quiet { 1 } else { 2 }, + )?; parser.start_without_prompt(); Ok(parser) } @@ -15,14 +19,14 @@ fn consume(parser: &mut TokenParser, tok: u32) { } fn lark_ok(lark: &str) { - match make_parser(lark) { + match make_parser(lark, false) { Err(e) => panic!("unexpected error: {}, grm:\n{}", e, lark), Ok(_) => {} } } fn lark_err_test(lark: &str, err: &str) { - match make_parser(lark) { + match make_parser(lark, false) { Err(e) => { let e = format!("{}", e); if !e.contains(err) { @@ -36,17 +40,21 @@ fn lark_err_test(lark: &str, err: &str) { } } -fn lark_str_test(lark: &str, should_accept: bool, s: &str) { +fn lark_str_test(lark: &str, should_accept: bool, s: &str, quiet: bool) { let trie = get_tok_env().tok_trie(); let tokens = get_tok_env().tokenize(s); - println!( - "\n\ntokens: {}, accpt={}\ngrm:\n{}\n", - trie.tokens_dbg(&tokens), - should_accept, - lark - ); + if !quiet { + println!( + "\n\ntokens: {}, accpt={}\ngrm:\n{}\n", + trie.tokens_dbg(&tokens), + should_accept, + lark + ); + } - let mut p = make_parser(lark).unwrap(); + // let t0 = std::time::Instant::now(); + let mut p = make_parser(lark, quiet).unwrap(); + // println!("make_parser: {:?}", t0.elapsed()); for tok in tokens.iter() { let m = p.compute_mask().unwrap(); @@ -70,15 +78,23 @@ fn lark_str_test(lark: &str, should_accept: bool, s: &str) { } } -fn lark_str_test_many(lark: &str, passing: &[&str], failing: &[&str]) { +fn lark_str_test_many_ext(quiet: bool, lark: &str, passing: &[&str], failing: &[&str]) { for s in passing { - lark_str_test(lark, true, s); + lark_str_test(lark, true, s, quiet); } for s in failing { - lark_str_test(lark, false, s); + lark_str_test(lark, false, s, quiet); } } +fn lark_str_test_many(lark: &str, passing: &[&str], failing: &[&str]) { + lark_str_test_many_ext(false, lark, passing, failing); +} + +fn lark_str_test_many_quiet(lark: &str, passing: &[&str], failing: &[&str]) { + lark_str_test_many_ext(true, lark, passing, failing); +} + #[test] fn test_dot_unicode() { lark_str_test_many( @@ -476,3 +492,58 @@ fn test_lexeme_substring_words_unicode() { &["른 갈색 여우", "여우가 게으", "갈색 여가"], ); } + +fn gen_words(seed: u32, num_words: usize) -> String { + let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,."; + let mut rnd = XorShift::new(seed + 1); + let mut words = vec![]; + let num_words = rnd.from_range((num_words / 2)..num_words); + for _ in 0..num_words { + let mut word = String::new(); + let len = rnd.from_range(1..15); + for _ in 0..len { + let idx = rnd.from_range(0..letters.len()); + word.push(letters.as_bytes()[idx as usize] as char); + } + words.push(word); + } + words.join(" ") +} + +fn quote_str(s: &str) -> String { + serde_json::to_string(s).unwrap() +} + +#[test] +fn test_large_select() { + let num_words = 500; + // it's kind of slow in non-release mode + let num_opt = if cfg!(debug_assertions) { 100 } else { 1500 }; + + let t0 = std::time::Instant::now(); + let mut grm_sz = 0; + + for start in &["start: OPTS\nOPTS: ", "start: opts\nopts: "] { + let mut grm_head = start.to_string(); + let mut grm_tail = "".to_string(); + let options = (0..num_opt) + .map(|i| gen_words(i, num_words)) + .collect::>(); + for (i, opt) in options.iter().enumerate() { + grm_head.push_str(&format!("OPT{} | ", i)); + grm_tail.push_str(&format!("OPT{}: {}\n", i, quote_str(opt))); + } + grm_head.push_str(" \"\"\n"); + let grm = format!("{}{}", grm_head, grm_tail); + grm_sz = grm.len(); + + lark_str_test_many_quiet( + &grm, + //&options.iter().map(|s| s.as_str()).collect::>(), + &[&options[2].as_str(), &options[7].as_str()], + &["something that is unlikely to be in the options"], + ); + } + + println!("large_select: {:?}; grm={}kB", t0.elapsed(), grm_sz / 1024); +} diff --git a/scripts/test-guidance.sh b/scripts/test-guidance.sh index bf485e25..8fbc0a08 100755 --- a/scripts/test-guidance.sh +++ b/scripts/test-guidance.sh @@ -43,6 +43,7 @@ if [ "$TEST_RUST" = 1 ] ; then cargo build --locked cargo test + cargo test --release echo "Running sample_parser" (cd sample_parser && ./run.sh >/dev/null)