Fixed important bug in anahashing and normalizing to alphabet for multibyte characters #17

Also added a 'testinput' mode and made alphabet debugging more verbose
proycon · Feb 21, 2023 · 3323391 · 3323391
1 parent 6d46fd1
commit 3323391
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 10 deletions.
diff --git a/src/anahash.rs b/src/anahash.rs
@@ -16,21 +16,22 @@ impl Anahashable for str {
     fn anahash(&self, alphabet: &Alphabet) -> AnaValue {
         let mut hash: AnaValue = AnaValue::empty();
         let mut skip = 0;
-        for (pos, _) in self.char_indices() {
+        for (bytepos, _c) in self.char_indices() {
             if skip > 0 {
                 skip -= 1;
                 continue;
             }
             let mut matched = false;
             'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
                 for element in chars.iter() {
-                    let l = element.chars().count();
-                    if let Some(slice) = self.get(pos..pos + l) {
+                    let charlen = element.chars().count();
+                    let bytelen = element.len();
+                    if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
                         if slice == element {
                             let charvalue = AnaValue::character(seqnr as CharIndexType);
                             hash = hash.insert(&charvalue);
                             matched = true;
-                            skip = l - 1;
+                            skip = charlen - 1;
                             break 'abciter;
                         }
                     }
@@ -49,21 +50,22 @@ impl Anahashable for str {
     fn normalize_to_alphabet(&self, alphabet: &Alphabet) -> NormString {
         let mut result = Vec::with_capacity(self.chars().count());
         let mut skip = 0;
-        for (pos, _) in self.char_indices() {
+        for (bytepos, _c) in self.char_indices() {
             if skip > 0 {
                 skip -= 1;
                 continue;
             }
             //does greedy matching in order of appearance in the alphabet file
             let mut matched = false;
-            'abciter: for (i, chars) in alphabet.iter().enumerate() {
+            'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
                 for element in chars.iter() {
-                    let l = element.chars().count();
-                    if let Some(slice) = self.get(pos..pos + l) {
+                    let charlen = element.chars().count();
+                    let bytelen = element.len();
+                    if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
                         if slice == element {
-                            result.push(i as CharIndexType);
+                            result.push(seqnr as CharIndexType);
                             matched = true;
-                            skip = l - 1;
+                            skip = charlen - 1;
                             break 'abciter;
                         }
                     }

diff --git a/src/bin/analiticcl.rs b/src/bin/analiticcl.rs
@@ -911,6 +911,11 @@ fn main() {
                             .about("Compute and output the anagram index")
                             .args(&common_arguments())
                     )
+                    .subcommand(
+                        SubCommand::with_name("testinput")
+                            .about("Test whether the input can be encoded with the given alphabet")
+                            .args(&common_arguments())
+                    )
                     .subcommand(
                         SubCommand::with_name("search")
                             .about("Search entire text input and find and output all possible matches")
@@ -954,6 +959,8 @@ fn main() {
         args
     } else if let Some(args) = rootargs.subcommand_matches("search") {
         args
+    } else if let Some(args) = rootargs.subcommand_matches("testinput") {
+        args
     } else {
         eprintln!("No command specified, please see analiticcl --help");
         exit(2);
@@ -997,6 +1004,24 @@ fn main() {
             .expect("Debug level should be integer in range 0-4"),
     );
 
+    if rootargs.subcommand_matches("testinput").is_some() {
+        eprintln!("Testing whether input can be fully encoded...");
+        let stdin = io::stdin();
+        let f_buffer = BufReader::new(stdin);
+        for line in f_buffer.lines() {
+            if let Ok(input) = line {
+                let av: AnaValue = input.anahash(&model.alphabet);
+                let normstring: NormString = input.normalize_to_alphabet(&model.alphabet);
+                if av.contains(&AnaValue::character(model.alphabet_size() - 1)) {
+                    eprintln!("UNKNOWN: {}\t{}\t{:?}", input, av, normstring);
+                } else {
+                    println!("OK: {}\t{}\t{:?}", input, av, normstring);
+                }
+            }
+        }
+        exit(0);
+    }
+
     eprintln!("Loading lexicons...");
 
     //Gathering everything to load, in the exact order specified