Skip to content

Commit

Permalink
Fixed important bug in anahashing and normalizing to alphabet for multibyte characters #17
Browse files Browse the repository at this point in the history

Also added a 'testinput' mode and made alphabet debugging more verbose
  • Loading branch information
proycon committed Feb 21, 2023
1 parent 6d46fd1 commit 3323391
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 10 deletions.
22 changes: 12 additions & 10 deletions src/anahash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,22 @@ impl Anahashable for str {
fn anahash(&self, alphabet: &Alphabet) -> AnaValue {
let mut hash: AnaValue = AnaValue::empty();
let mut skip = 0;
for (pos, _) in self.char_indices() {
for (bytepos, _c) in self.char_indices() {
if skip > 0 {
skip -= 1;
continue;
}
let mut matched = false;
'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
for element in chars.iter() {
let l = element.chars().count();
if let Some(slice) = self.get(pos..pos + l) {
let charlen = element.chars().count();
let bytelen = element.len();
if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
if slice == element {
let charvalue = AnaValue::character(seqnr as CharIndexType);
hash = hash.insert(&charvalue);
matched = true;
skip = l - 1;
skip = charlen - 1;
break 'abciter;
}
}
Expand All @@ -49,21 +50,22 @@ impl Anahashable for str {
fn normalize_to_alphabet(&self, alphabet: &Alphabet) -> NormString {
let mut result = Vec::with_capacity(self.chars().count());
let mut skip = 0;
for (pos, _) in self.char_indices() {
for (bytepos, _c) in self.char_indices() {
if skip > 0 {
skip -= 1;
continue;
}
//does greedy matching in order of appearance in the alphabet file
let mut matched = false;
'abciter: for (i, chars) in alphabet.iter().enumerate() {
'abciter: for (seqnr, chars) in alphabet.iter().enumerate() {
for element in chars.iter() {
let l = element.chars().count();
if let Some(slice) = self.get(pos..pos + l) {
let charlen = element.chars().count();
let bytelen = element.len();
if let Some(slice) = self.get(bytepos..bytepos + bytelen) {
if slice == element {
result.push(i as CharIndexType);
result.push(seqnr as CharIndexType);
matched = true;
skip = l - 1;
skip = charlen - 1;
break 'abciter;
}
}
Expand Down
25 changes: 25 additions & 0 deletions src/bin/analiticcl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,11 @@ fn main() {
.about("Compute and output the anagram index")
.args(&common_arguments())
)
.subcommand(
SubCommand::with_name("testinput")
.about("Test whether the input can be encoded with the given alphabet")
.args(&common_arguments())
)
.subcommand(
SubCommand::with_name("search")
.about("Search entire text input and find and output all possible matches")
Expand Down Expand Up @@ -954,6 +959,8 @@ fn main() {
args
} else if let Some(args) = rootargs.subcommand_matches("search") {
args
} else if let Some(args) = rootargs.subcommand_matches("testinput") {
args
} else {
eprintln!("No command specified, please see analiticcl --help");
exit(2);
Expand Down Expand Up @@ -997,6 +1004,24 @@ fn main() {
.expect("Debug level should be integer in range 0-4"),
);

if rootargs.subcommand_matches("testinput").is_some() {
eprintln!("Testing whether input can be fully encoded...");
let stdin = io::stdin();
let f_buffer = BufReader::new(stdin);
for line in f_buffer.lines() {
if let Ok(input) = line {
let av: AnaValue = input.anahash(&model.alphabet);
let normstring: NormString = input.normalize_to_alphabet(&model.alphabet);
if av.contains(&AnaValue::character(model.alphabet_size() - 1)) {
eprintln!("UNKNOWN: {}\t{}\t{:?}", input, av, normstring);
} else {
println!("OK: {}\t{}\t{:?}", input, av, normstring);
}
}
}
exit(0);
}

eprintln!("Loading lexicons...");

//Gathering everything to load, in the exact order specified
Expand Down

0 comments on commit 3323391

Please sign in to comment.