From 33233919c221976ed818a0f562ab97448b41aa36 Mon Sep 17 00:00:00 2001
From: Maarten van Gompel
Date: Tue, 21 Feb 2023 17:07:38 +0100
Subject: [PATCH] Fixed important bug in anahashing and normalizing to alphabet for multibyte characters #17

Also added a 'testinput' mode and made alphabet debugging more verbose
---
 src/anahash.rs        | 22 ++++++++++++----------
 src/bin/analiticcl.rs | 25 +++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/anahash.rs b/src/anahash.rs
index b8098c8..a123e65 100644
--- a/src/anahash.rs
+++ b/src/anahash.rs
@@ -16,7 +16,7 @@ impl Anahashable for str {
     fn anahash(&self, alphabet: &Alphabet) -> AnaValue {
         let mut hash: AnaValue = AnaValue::empty();
         let mut skip = 0;
-        for (pos, _) in self.char_indices() {
+        for (bytepos, _c) in self.char_indices() {
             if skip > 0 {
                 skip -= 1;
                 continue;
@@ -24,13 +24,14 @@ impl Anahashable for str {
             let mut matched = false;
             'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
                 for element in chars.iter() {
-                    let l = element.chars().count();
-                    if let Some(slice) = self.get(pos..pos + l) {
+                    let charlen = element.chars().count();
+                    let bytelen = element.len();
+                    if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
                         if slice == element {
                             let charvalue = AnaValue::character(seqnr as CharIndexType);
                             hash = hash.insert(&charvalue);
                             matched = true;
-                            skip = l - 1;
+                            skip = charlen - 1;
                             break 'abciter;
                         }
                     }
@@ -49,21 +50,22 @@ impl Anahashable for str {
     fn normalize_to_alphabet(&self, alphabet: &Alphabet) -> NormString {
         let mut result = Vec::with_capacity(self.chars().count());
         let mut skip = 0;
-        for (pos, _) in self.char_indices() {
+        for (bytepos, _c) in self.char_indices() {
             if skip > 0 {
                 skip -= 1;
                 continue;
             }
             //does greedy matching in order of appearance in the alphabet file
             let mut matched = false;
-            'abciter: for (i, chars) in alphabet.iter().enumerate() {
+            'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
                 for element in chars.iter() {
-                    let l = element.chars().count();
-                    if let Some(slice) = self.get(pos..pos + l) {
+                    let charlen = element.chars().count();
+                    let bytelen = element.len();
+                    if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
                         if slice == element {
-                            result.push(i as CharIndexType);
+                            result.push(seqnr as CharIndexType);
                             matched = true;
-                            skip = l - 1;
+                            skip = charlen - 1;
                             break 'abciter;
                         }
                     }
diff --git a/src/bin/analiticcl.rs b/src/bin/analiticcl.rs
index 9166dfa..906dc18 100644
--- a/src/bin/analiticcl.rs
+++ b/src/bin/analiticcl.rs
@@ -911,6 +911,11 @@ fn main() {
                 .about("Compute and output the anagram index")
                 .args(&common_arguments())
         )
+        .subcommand(
+            SubCommand::with_name("testinput")
+                .about("Test whether the input can be encoded with the given alphabet")
+                .args(&common_arguments())
+        )
         .subcommand(
             SubCommand::with_name("search")
                 .about("Search entire text input and find and output all possible matches")
@@ -954,6 +959,8 @@ fn main() {
         args
     } else if let Some(args) = rootargs.subcommand_matches("search") {
         args
+    } else if let Some(args) = rootargs.subcommand_matches("testinput") {
+        args
     } else {
         eprintln!("No command specified, please see analiticcl --help");
         exit(2);
@@ -997,6 +1004,24 @@ fn main() {
             .expect("Debug level should be integer in range 0-4"),
     );
 
+    if rootargs.subcommand_matches("testinput").is_some() {
+        eprintln!("Testing whether input can be fully encoded...");
+        let stdin = io::stdin();
+        let f_buffer = BufReader::new(stdin);
+        for line in f_buffer.lines() {
+            if let Ok(input) = line {
+                let av: AnaValue = input.anahash(&model.alphabet);
+                let normstring: NormString = input.normalize_to_alphabet(&model.alphabet);
+                if av.contains(&AnaValue::character(model.alphabet_size() - 1)) {
+                    eprintln!("UNKNOWN: {}\t{}\t{:?}", input, av, normstring);
+                } else {
+                    println!("OK: {}\t{}\t{:?}", input, av, normstring);
+                }
+            }
+        }
+        exit(0);
+    }
+
     eprintln!("Loading lexicons...");
 
     //Gathering everything to load, in the exact order specified