From 33233919c221976ed818a0f562ab97448b41aa36 Mon Sep 17 00:00:00 2001
From: Maarten van Gompel <proycon@anaproy.nl>
Date: Tue, 21 Feb 2023 17:07:38 +0100
Subject: [PATCH] Fixed important bug in anahashing and normalizing to alphabet
 for multibyte characters #17

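Alphabet elements were matched by slicing the input at a byte offset
(from char_indices) plus the element's *character* count. For elements
containing multibyte characters this produced a slice that was too short
or not on a character boundary, so those elements never matched. The
slice now uses the element's byte length; the character count is only
used to skip the input characters that the match already consumed.

Also added a 'testinput' mode and made alphabet debugging more verbose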
---
 src/anahash.rs        | 22 ++++++++++++----------
 src/bin/analiticcl.rs | 25 +++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/anahash.rs b/src/anahash.rs
index b8098c8..a123e65 100644
--- a/src/anahash.rs
+++ b/src/anahash.rs
@@ -16,7 +16,7 @@ impl Anahashable for str {
     fn anahash(&self, alphabet: &Alphabet) -> AnaValue {
         let mut hash: AnaValue = AnaValue::empty();
         let mut skip = 0;
-        for (pos, _) in self.char_indices() {
+        for (bytepos, _c) in self.char_indices() {
             if skip > 0 {
                 skip -= 1;
                 continue;
@@ -24,13 +24,14 @@ impl Anahashable for str {
             let mut matched = false;
             'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
                 for element in chars.iter() {
-                    let l = element.chars().count();
-                    if let Some(slice) = self.get(pos..pos + l) {
+                    let charlen = element.chars().count();
+                    let bytelen = element.len();
+                    if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
                         if slice == element {
                             let charvalue = AnaValue::character(seqnr as CharIndexType);
                             hash = hash.insert(&charvalue);
                             matched = true;
-                            skip = l - 1;
+                            skip = charlen - 1;
                             break 'abciter;
                         }
                     }
@@ -49,21 +50,22 @@ impl Anahashable for str {
     fn normalize_to_alphabet(&self, alphabet: &Alphabet) -> NormString {
         let mut result = Vec::with_capacity(self.chars().count());
         let mut skip = 0;
-        for (pos, _) in self.char_indices() {
+        for (bytepos, _c) in self.char_indices() {
             if skip > 0 {
                 skip -= 1;
                 continue;
             }
             //does greedy matching in order of appearance in the alphabet file
             let mut matched = false;
-            'abciter: for (i, chars) in alphabet.iter().enumerate() {
+            'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
                 for element in chars.iter() {
-                    let l = element.chars().count();
-                    if let Some(slice) = self.get(pos..pos + l) {
+                    let charlen = element.chars().count();
+                    let bytelen = element.len();
+                    if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
                         if slice == element {
-                            result.push(i as CharIndexType);
+                            result.push(seqnr as CharIndexType);
                             matched = true;
-                            skip = l - 1;
+                            skip = charlen - 1;
                             break 'abciter;
                         }
                     }
diff --git a/src/bin/analiticcl.rs b/src/bin/analiticcl.rs
index 9166dfa..906dc18 100644
--- a/src/bin/analiticcl.rs
+++ b/src/bin/analiticcl.rs
@@ -911,6 +911,11 @@ fn main() {
                             .about("Compute and output the anagram index")
                             .args(&common_arguments())
                     )
+                    .subcommand(
+                        SubCommand::with_name("testinput")
+                            .about("Test whether the input can be encoded with the given alphabet")
+                            .args(&common_arguments())
+                    )
                     .subcommand(
                         SubCommand::with_name("search")
                             .about("Search entire text input and find and output all possible matches")
@@ -954,6 +959,8 @@ fn main() {
         args
     } else if let Some(args) = rootargs.subcommand_matches("search") {
         args
+    } else if let Some(args) = rootargs.subcommand_matches("testinput") {
+        args
     } else {
         eprintln!("No command specified, please see analiticcl --help");
         exit(2);
@@ -997,6 +1004,24 @@ fn main() {
             .expect("Debug level should be integer in range 0-4"),
     );
 
+    if rootargs.subcommand_matches("testinput").is_some() {
+        eprintln!("Testing whether input can be fully encoded...");
+        let stdin = io::stdin();
+        let f_buffer = BufReader::new(stdin);
+        for line in f_buffer.lines() {
+            if let Ok(input) = line {
+                let av: AnaValue = input.anahash(&model.alphabet);
+                let normstring: NormString = input.normalize_to_alphabet(&model.alphabet);
+                if av.contains(&AnaValue::character(model.alphabet_size() - 1)) {
+                    eprintln!("UNKNOWN: {}\t{}\t{:?}", input, av, normstring);
+                } else {
+                    println!("OK: {}\t{}\t{:?}", input, av, normstring);
+                }
+            }
+        }
+        exit(0);
+    }
+
     eprintln!("Loading lexicons...");
 
     //Gathering everything to load, in the exact order specified