-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathlib.rs
2814 lines (2610 loc) · 112 KB
/
lib.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
extern crate ibig;
extern crate num_traits;
extern crate rayon;
extern crate rustfst;
extern crate sesdiff;
extern crate simple_error;
use rayon::prelude::*;
use rustfst::prelude::*;
use sesdiff::shortest_edit_script;
use std::cmp::min;
use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::error::Error;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::str::FromStr;
use std::sync::Arc;
use std::time::SystemTime;
pub mod anahash;
pub mod cache;
pub mod confusables;
pub mod distance;
pub mod index;
pub mod iterators;
pub mod search;
pub mod test;
pub mod types;
pub mod vocab;
pub use crate::anahash::*;
pub use crate::cache::*;
pub use crate::confusables::*;
pub use crate::distance::*;
pub use crate::index::*;
pub use crate::iterators::*;
pub use crate::search::*;
pub use crate::types::*;
pub use crate::vocab::*;
/// An absolute maximum on the anagram distance, even for long inputs;
/// acts as a safeguard cap when ratio-based thresholds are computed (see `find_variants()`)
const MAX_ANAGRAM_DISTANCE: u8 = 12;
/// An absolute maximum on the edit distance, even for long inputs;
/// same safeguard role as `MAX_ANAGRAM_DISTANCE` but for edit distance
const MAX_EDIT_DISTANCE: u8 = 12;
/// The VariantModel is the most high-level model of analiticcl, it holds
/// all data required for variant matching.
pub struct VariantModel {
    /// Maps Vocabulary IDs to their textual strings and other related properties
    pub decoder: VocabDecoder,
    /// Map strings to vocabulary IDs
    pub encoder: VocabEncoder,
    /// Defines the alphabet used for the variant model
    pub alphabet: Alphabet,
    /// The main index, mapping anagrams to instances
    pub index: AnaIndex,
    /// A secondary sorted index:
    /// keys are anagram character counts, values are the (sorted) anagram
    /// values with that many characters (populated by `build()`)
    pub sortedindex: BTreeMap<u16, Vec<AnaValue>>,
    /// Ngrams for simple context-sensitive language modelling
    /// when finding the most probable sequence of variants
    pub ngrams: HashMap<NGram, u32>,
    /// Total frequency, index corresponds to n-1 size, so this holds the total count for unigrams, bigrams, etc.
    pub freq_sum: Vec<usize>,
    /// Do we have frequency information for variant matching?
    pub have_freq: bool,
    /// Do we have a language model (any n-grams loaded)?
    pub have_lm: bool,
    /// Context rules, loaded via `read_contextrules()`/`add_contextrule()`
    pub context_rules: Vec<ContextRule>,
    /// Tags used by the context rules
    pub tags: Vec<String>,
    /// Weights used in distance scoring
    pub weights: Weights,
    /// Stores the names of the loaded lexicons, they will be referenced by index from individual
    /// items for provenance reasons
    pub lexicons: Vec<String>,
    /// Holds weighted confusable recipes that can be used in scoring and ranking
    pub confusables: Vec<Confusable>,
    /// Process confusables before pruning by max_matches
    pub confusables_before_pruning: bool,
    /// Debug verbosity level (0 = quiet; higher values emit more diagnostics to stderr)
    pub debug: u8,
}
impl VariantModel {
/// Instantiate a new variant model.
///
/// Loads the alphabet from `alphabet_file`; panics with
/// "Error loading alphabet file" if the file cannot be read.
pub fn new(alphabet_file: &str, weights: Weights, debug: u8) -> VariantModel {
    let mut model = VariantModel {
        alphabet: Vec::new(),
        encoder: HashMap::new(),
        decoder: Vec::new(),
        index: HashMap::new(),
        sortedindex: BTreeMap::new(),
        ngrams: HashMap::new(),
        freq_sum: vec![0], //slot 0 holds the unigram total
        have_freq: false,
        have_lm: false,
        weights,
        lexicons: Vec::new(),
        confusables: Vec::new(),
        confusables_before_pruning: false,
        context_rules: Vec::new(),
        tags: Vec::new(),
        debug,
    };
    model
        .read_alphabet(alphabet_file)
        .expect("Error loading alphabet file");
    // NOTE(review): init_vocab presumably seeds the reserved entries
    // (BOS/EOS/UNK, compared against in add_to_vocabulary) — confirm in vocab module
    init_vocab(&mut model.decoder, &mut model.encoder);
    model
}
/// Instantiate a new variant model, explicitly passing an alphabet rather than loading one
/// from file.
pub fn new_with_alphabet(alphabet: Alphabet, weights: Weights, debug: u8) -> VariantModel {
    let mut model = VariantModel {
        alphabet, //field-init shorthand (was the redundant `alphabet: alphabet`)
        decoder: Vec::new(),
        encoder: HashMap::new(),
        index: HashMap::new(),
        sortedindex: BTreeMap::new(),
        ngrams: HashMap::new(),
        freq_sum: vec![0], //slot 0 holds the unigram total
        have_freq: false,
        have_lm: false,
        weights,
        lexicons: Vec::new(),
        confusables: Vec::new(),
        confusables_before_pruning: false,
        context_rules: Vec::new(),
        tags: Vec::new(),
        debug,
    };
    // NOTE(review): init_vocab presumably seeds the reserved entries
    // (BOS/EOS/UNK, compared against in add_to_vocabulary) — confirm in vocab module
    init_vocab(&mut model.decoder, &mut model.encoder);
    model
}
/// Configure the model to match against known confusables prior to pruning on maximum weight.
/// This may lead to better results but may have a significant performance impact.
/// (One-way switch: there is no setter to turn it off again.)
pub fn set_confusables_before_pruning(&mut self) {
    self.confusables_before_pruning = true;
}
/// Returns the size of the alphabet, this is typically +1 longer than the actual alphabet file
/// as it includes the UNKNOWN symbol.
pub fn alphabet_size(&self) -> CharIndexType {
    self.alphabet.len() as CharIndexType + 1 //+1 for UNK
}
/// Get a mutable reference to the index node for the given anagram value,
/// inserting an empty node (with its character count precomputed) if it
/// doesn't exist yet.
pub fn get_or_create_index<'a, 'b>(
    &'a mut self,
    anahash: &'b AnaValue,
) -> &'a mut AnaIndexNode {
    // Precompute before calling entry(): the closure below may not borrow
    // `self` while `self.index` is mutably borrowed.
    let alphabet_size = self.alphabet_size();
    // Entry API: one hash lookup instead of the original
    // contains_key + get_mut / insert + get_mut sequence.
    // (entry() needs an owned key, so the clone now happens on every call;
    // the original cloned only on insert but paid a second lookup.)
    self.index
        .entry(anahash.clone())
        .or_insert_with(|| AnaIndexNode {
            instances: Vec::new(),
            charcount: anahash.char_count(alphabet_size),
        })
}
/// Build the anagram index (and secondary index) so the model
/// is ready for variant matching.
///
/// Clears and repopulates `index` and `sortedindex` from the current
/// vocabulary, then constructs the language model from LM-typed entries.
pub fn build(&mut self) {
    eprintln!("Computing anagram values for all items in the lexicon...");
    // Hash all strings in the lexicon first, into a temporary vector, so we
    // don't borrow self.decoder while mutating self.index afterwards
    let mut tmp_hashes: Vec<(AnaValue, VocabId)> = Vec::with_capacity(self.decoder.len());
    for (id, value) in self.decoder.iter().enumerate() {
        if value.vocabtype.check(VocabType::INDEXED) {
            //get the anahash
            let anahash = value.text.anahash(&self.alphabet);
            if self.debug >= 2 {
                eprintln!(
                    " -- Anavalue={} VocabId={} Text={}",
                    &anahash, id, value.text
                );
            }
            tmp_hashes.push((anahash, id as VocabId));
        }
    }
    eprintln!(" - Found {} instances", tmp_hashes.len());
    eprintln!("Adding all instances to the index...");
    self.index.clear();
    for (anahash, id) in tmp_hashes {
        //add it to the index
        let node = self.get_or_create_index(&anahash);
        node.instances.push(id);
    }
    eprintln!(" - Found {} anagrams", self.index.len());
    eprintln!("Creating sorted secondary index...");
    self.sortedindex.clear();
    for (anahash, node) in self.index.iter() {
        // entry API: single lookup instead of the original
        // contains_key + insert + get_mut (three lookups)
        self.sortedindex
            .entry(node.charcount)
            .or_insert_with(Vec::new)
            .push(anahash.clone()); //TODO: see if we can make this a reference later
    }
    eprintln!("Sorting secondary index...");
    // .copied() instead of .map(|x| *x); unstable sort is fine for primitive keys
    let mut sizes: Vec<u16> = self.sortedindex.keys().copied().collect();
    sizes.sort_unstable();
    for size in sizes {
        let keys = self
            .sortedindex
            .get_mut(&size)
            .expect("getting sorted index (2)");
        keys.sort();
        eprintln!(" - Found {} anagrams of length {}", keys.len(), size);
    }
    eprintln!("Constructing Language Model...");
    //extra unigrams extracted from n-grams that need to be added to the vocabulary decoder
    let mut unseen_parts: Option<VocabEncoder> = Some(VocabEncoder::new());
    for id in 0..self.decoder.len() {
        if self
            .decoder
            .get(id)
            .expect("item")
            .vocabtype
            .check(VocabType::LM)
        {
            //get the ngram and find any unseen parts
            if let Ok(ngram) = self.into_ngram(id as VocabId, &mut unseen_parts) {
                let freq = self.decoder.get(id).unwrap().frequency;
                if ngram.len() > 1 {
                    //reserve the space for the total counts
                    for _ in self.freq_sum.len()..ngram.len() {
                        self.freq_sum.push(0);
                    }
                    //add to the totals for this order of ngrams
                    self.freq_sum[ngram.len() - 1] += freq as usize;
                } else {
                    self.freq_sum[0] += freq as usize;
                }
                self.add_ngram(ngram, freq);
            }
        }
    }
    if let Some(unseen_parts) = unseen_parts {
        //add collected unseen n-gram parts to the decoder
        for (part, id) in unseen_parts {
            self.add_ngram(NGram::UniGram(id), 1);
            self.encoder.insert(part.clone(), id);
            self.decoder.push(VocabValue::new(part, VocabType::LM));
        }
    }
    if self.ngrams.is_empty() {
        eprintln!(" - No language model provided");
        self.have_lm = false;
    } else {
        eprintln!(
            " - Found {} n-grams for language modelling",
            self.ngrams.len()
        );
        self.have_lm = true;
    }
}
/// Tests if the anagram value exists in the index
/// (only meaningful after `build()` has populated the index)
pub fn contains_key(&self, key: &AnaValue) -> bool {
    self.index.contains_key(key)
}
/// Get all anagram instances for a specific entry: every vocabulary value
/// whose anagram hash matches that of `text`. Returns an empty vector when
/// the anagram value is not in the index.
pub fn get_anagram_instances(&self, text: &str) -> Vec<&VocabValue> {
    let anavalue = text.anahash(&self.alphabet);
    match self.index.get(&anavalue) {
        Some(node) => node
            .instances
            .iter()
            .map(|vocab_id| {
                self.decoder
                    .get(*vocab_id as usize)
                    .expect("vocab from decoder")
            })
            .collect(),
        None => Vec::new(),
    }
}
/// Get an exact item in the lexicon (if it exists).
/// Searches only within the bucket of entries sharing this text's anagram
/// value, returning the first instance whose text matches exactly.
pub fn get(&self, text: &str) -> Option<&VocabValue> {
    self.get_anagram_instances(text)
        .into_iter()
        .find(|instance| instance.text == text)
}
/// Tests if the lexicon has a specific entry, by exact text match
/// (anagram-bucket lookup followed by a text comparison).
pub fn has(&self, text: &str) -> bool {
    self.get_anagram_instances(text)
        .iter()
        .any(|instance| instance.text == text)
}
/// Resolves a vocabulary ID to its value;
/// returns None if the ID is out of range of the decoder
pub fn get_vocab(&self, vocab_id: VocabId) -> Option<&VocabValue> {
    self.decoder.get(vocab_id as usize)
}
/// Decomposes and decodes an anagram value into the characters that make it up.
/// Mostly intended for debugging purposes.
pub fn decompose_anavalue(&self, av: &AnaValue) -> Vec<&str> {
    av.iter(self.alphabet_size())
        .map(|c| {
            // resolve the character index to its first textual form
            self.alphabet
                .get(c.0.charindex as usize)
                .expect("alphabet item must exist")
                .get(0)
                .unwrap()
                .as_str()
        })
        .collect()
}
/// Read the alphabet from a TSV file.
/// The file contains one alphabet entry per line, but may
/// consist of multiple tab-separated alphabet entries on that line, which
/// will be treated as identical.
/// The alphabet is not limited to single characters but may consist
/// of longer strings; a greedy matching approach will be used so order
/// matters (but only for this).
/// Supports the escape sequences \s (space), \t (tab) and \n (newline).
pub fn read_alphabet(&mut self, filename: &str) -> Result<(), std::io::Error> {
    if self.debug >= 1 {
        eprintln!("Reading alphabet from {}...", filename);
    }
    let f = File::open(filename)?;
    let f_buffer = BufReader::new(f);
    for line in f_buffer.lines() {
        // NOTE(review): lines that fail to read are silently skipped
        if let Ok(line) = line {
            if !line.is_empty() {
                let fields = line
                    .split("\t")
                    .filter_map(|x| match x {
                        // unescape the supported escape sequences
                        "\\s" => Some(" ".to_owned()),
                        "\\t" => Some("\t".to_owned()),
                        "\\n" => Some("\n".to_owned()),
                        _ => {
                            if x.trim().is_empty() {
                                None //drop whitespace-only fields
                            } else {
                                Some(x.trim().to_owned())
                            }
                        }
                    })
                    .collect();
                self.alphabet.push(fields);
            }
        }
    }
    if self.debug >= 2 {
        eprintln!(" -- Read alphabet of size {}", self.alphabet.len());
        for (i, items) in self.alphabet.iter().enumerate() {
            let av = AnaValue::character(i as CharIndexType);
            eprintln!(" -- #{} -> {} - {:?}", i, av, items);
        }
    } else if self.debug >= 1 {
        eprintln!(" -- Read alphabet of size {}", self.alphabet.len());
    }
    Ok(())
}
/// Read a confusable list from a TSV file.
/// Contains edit scripts in the first column (formatted in sesdiff style)
/// and optionally a weight in the second column.
/// Favourable confusables have a weight > 1.0, unfavourable ones are < 1.0 (penalties).
/// Weight values should be relatively close to 1.0 as they are applied to the entire score.
pub fn read_confusablelist(&mut self, filename: &str) -> Result<(), std::io::Error> {
    if self.debug >= 1 {
        eprintln!("Reading confusables from {}...", filename);
    }
    let f = File::open(filename)?;
    let f_buffer = BufReader::new(f);
    for line in f_buffer.lines() {
        // NOTE(review): lines that fail to read are silently skipped
        if let Ok(line) = line {
            if !line.is_empty() {
                let fields: Vec<&str> = line.split("\t").collect();
                // optional second column: weight (defaults to the neutral 1.0);
                // panics if present but not parseable as a float
                let weight = if fields.len() >= 2 {
                    fields
                        .get(1)
                        .unwrap()
                        .parse::<f64>()
                        .expect("score should be a float")
                } else {
                    1.0
                };
                self.add_to_confusables(fields.get(0).unwrap(), weight)?;
            }
        }
    }
    if self.debug >= 1 {
        eprintln!(" -- Read {} confusables", self.confusables.len());
    }
    Ok(())
}
/// Add a single confusable from a sesdiff-style edit script with the given
/// weight; propagates any error from parsing the edit script.
pub fn add_to_confusables(
    &mut self,
    editscript: &str,
    weight: f64,
) -> Result<(), std::io::Error> {
    self.confusables.push(Confusable::new(editscript, weight)?);
    Ok(())
}
/// Add a (weighted) variant to the model, referring to a reference that already exists in
/// the model.
/// Variants will be added
/// to the lexicon automatically when necessary. Set VocabType::TRANSPARENT
/// if you want variants to only be used as an intermediate towards items that
/// have already been added previously through a more authoritative lexicon.
pub fn add_variant(
&mut self,
ref_id: VocabId,
variant: &str,
score: f64,
freq: Option<u32>,
params: &VocabParams,
) -> bool {
let variantid = self.add_to_vocabulary(variant, freq, ¶ms);
self.add_variant_by_id(ref_id, variantid, score)
}
/// Add a (weighted) variant to the model, referring to a reference that already exists in
/// the model.
/// Establishes a bidirectional link: `ReferenceFor` on the reference entry and
/// `VariantOf` on the variant entry. Returns false (no-op) when variant and
/// reference are the same item.
pub fn add_variant_by_id(&mut self, ref_id: VocabId, variantid: VocabId, score: f64) -> bool {
    if variantid != ref_id {
        //link reference to variant
        if let Some(vocabvalue) = self.decoder.get_mut(ref_id as usize) {
            let variantref = VariantReference::ReferenceFor((variantid, score));
            if vocabvalue.variants.is_none() {
                vocabvalue.variants = Some(vec![variantref]);
            } else if let Some(variantrefs) = vocabvalue.variants.as_mut() {
                //only add if it doesn't already exist (only the first mention counts, regardless of score)
                if !variantrefs.iter().any(|x| match x {
                    VariantReference::ReferenceFor((y, _)) => variantid == *y,
                    _ => false,
                }) {
                    variantrefs.push(variantref);
                }
            }
        }
        //link variant to reference
        if let Some(vocabvalue) = self.decoder.get_mut(variantid as usize) {
            let variantref = VariantReference::VariantOf((ref_id, score));
            if vocabvalue.variants.is_none() {
                vocabvalue.variants = Some(vec![variantref]);
            } else if let Some(variantrefs) = vocabvalue.variants.as_mut() {
                //only add if it doesn't already exist (only the first mention counts, regardless of score)
                //BUGFIX: a VariantOf entry stores the *reference* id, so dedup must
                //compare against ref_id; the original compared variantid, which could
                //not match and allowed duplicate VariantOf links to accumulate
                if !variantrefs.iter().any(|x| match x {
                    VariantReference::VariantOf((y, _)) => ref_id == *y,
                    _ => false,
                }) {
                    variantrefs.push(variantref);
                }
            }
        }
        true
    } else {
        false
    }
}
/// Read vocabulary (a lexicon or corpus-derived lexicon) from a TSV file.
/// May contain frequency information.
/// The parameters define what value can be read from what column.
pub fn read_vocabulary(
    &mut self,
    filename: &str,
    params: &VocabParams,
) -> Result<(), std::io::Error> {
    if self.debug >= 1 {
        eprintln!(
            "Reading vocabulary #{} from {} ({:?})...",
            self.lexicons.len() + 1,
            filename,
            params.vocab_type
        );
    }
    let beginlen = self.decoder.len();
    let f = File::open(filename)?;
    let f_buffer = BufReader::new(f);
    // clone the params so we can stamp in this lexicon's provenance index
    let mut params = params.clone();
    params.index = self.lexicons.len() as u8;
    for line in f_buffer.lines() {
        // NOTE(review): lines that fail to read are silently skipped
        if let Ok(line) = line {
            if !line.is_empty() {
                let fields: Vec<&str> = line.split("\t").collect();
                let text = fields
                    .get(params.text_column as usize)
                    .expect("Expected text column not found");
                let frequency = if let Some(freq_column) = params.freq_column {
                    if params.vocab_type.check(VocabType::INDEXED) {
                        self.have_freq = true;
                    }
                    // a missing frequency cell defaults to 1
                    fields
                        .get(freq_column as usize)
                        .unwrap_or(&"1")
                        .parse::<u32>()
                        .expect("frequency should be a valid integer")
                } else {
                    1
                };
                // Fix: the argument was mangled to `¶ms` (HTML-entity
                // corruption of `&params`).
                self.add_to_vocabulary(text, Some(frequency), &params);
            }
        }
    }
    if self.debug >= 1 {
        eprintln!(
            " - Read vocabulary of size {}",
            self.decoder.len() - beginlen
        );
    }
    self.lexicons.push(filename.to_string());
    Ok(())
}
/// Read context rules from a TSV file.
/// Column 1 holds a pattern (semicolon-separated expressions), column 2 a score,
/// optional column 3 tags (semicolon separated) and optional column 4 tag
/// offsets ("begin:length", semicolon separated, one per tag).
/// Lines starting with '#' are treated as comments and skipped.
pub fn read_contextrules(&mut self, filename: &str) -> Result<(), std::io::Error> {
    if self.debug >= 1 {
        eprintln!("Reading context rules {}...", filename);
    }
    let f = File::open(filename)?;
    let f_buffer = BufReader::new(f);
    let mut linenr = 0;
    for line in f_buffer.lines() {
        // NOTE(review): unreadable lines are silently skipped and do not
        // increment the line counter used in error messages
        if let Ok(line) = line {
            linenr += 1;
            if !line.is_empty() && !line.starts_with('#') {
                let fields: Vec<&str> = line.split("\t").collect();
                if fields.len() < 2 {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::Other,
                        format!(
                            "Expected at least two columns in context rules file {}, line {}",
                            filename, linenr
                        ),
                    ));
                }
                let pattern: &str = fields.get(0).unwrap();
                if pattern.is_empty() {
                    continue; //empty patterns are ignored
                }
                let score = fields.get(1).unwrap().parse::<f32>();
                if let Err(_) = score {
                    return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("context rule score should be a floating point value above or below 1.0, got {} ({}, line {})", fields.get(1).unwrap(), filename,linenr)));
                }
                let score = score.unwrap();
                // optional third column: semicolon-separated tags (blank items dropped)
                let tag: Vec<&str> = match fields.get(2) {
                    Some(s) => s
                        .split(";")
                        .filter_map(|w| {
                            let w = w.trim();
                            if w.is_empty() {
                                None
                            } else {
                                Some(w)
                            }
                        })
                        .collect(),
                    None => Vec::new(),
                };
                // optional fourth column: tag offsets, parallel to the tags
                let mut tagoffset: Vec<&str> = match fields.get(3) {
                    Some(s) => s
                        .split(";")
                        .filter_map(|w| {
                            let w = w.trim();
                            if w.is_empty() {
                                None
                            } else {
                                Some(w)
                            }
                        })
                        .collect(),
                    None => Vec::new(),
                };
                if tag.len() == 1 && tagoffset.len() == 0 {
                    //a single tag without an explicit offset covers the whole pattern
                    tagoffset.push("0:");
                } else if tag.len() != tagoffset.len() {
                    return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("Multiple tags are specified for a context rule, expected the same number of tag offsets! (semicolon separated) ({}, line {})", filename, linenr)));
                }
                if let Err(error) = self.add_contextrule(pattern, score, tag, tagoffset) {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::Other,
                        format!(
                            "Error adding context rule: {} ({}, line {})",
                            error, filename, linenr
                        ),
                    ));
                }
            }
        }
    }
    if self.debug >= 1 {
        eprintln!(" -- Read {} context rules", self.context_rules.len());
    }
    Ok(())
}
/// Add a single context rule to the model.
///
/// `pattern` is a semicolon-separated list of pattern-match expressions,
/// `score` the rule's weight, `tag` the tags to assign and `tagoffset` the
/// "begin:length" spans (parallel to `tag`) they apply to within the pattern.
/// Tags are interned into `self.tags` and stored by index.
pub fn add_contextrule(
    &mut self,
    pattern: &str,
    score: f32,
    tag: Vec<&str>,
    tagoffset: Vec<&str>,
) -> Result<(), std::io::Error> {
    // parse each semicolon-separated expression into a PatternMatch
    let expressions: Vec<&str> = pattern.split(";").map(|s| s.trim()).collect();
    let mut pattern: Vec<PatternMatch> = Vec::new();
    for expr in expressions {
        match PatternMatch::parse(expr, &self.lexicons, &self.encoder) {
            Ok(pm) => pattern.push(pm),
            Err(err) => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::Other,
                    format!("Error parsing context rule: {}", err),
                ))
            }
        }
    }
    // Intern the tags: reuse the index of an already-known tag, otherwise
    // append it. Errors inside the closure are funnelled through `errmsg`
    // because the closure itself cannot early-return from this function.
    let mut errmsg: Option<&str> = None;
    let tag: Vec<u16> = tag
        .iter()
        .map(|tag| {
            if tag.is_empty() {
                errmsg = Some("tag is empty");
            }
            let mut pos = None;
            for (i, t) in self.tags.iter().enumerate() {
                if t == tag {
                    pos = Some(i as u16);
                    break;
                }
            }
            if pos.is_none() {
                self.tags.push(tag.to_string());
                (self.tags.len() - 1) as u16
            } else {
                pos.unwrap()
            }
        })
        .collect();
    if let Some(errmsg) = errmsg {
        return Err(std::io::Error::new(std::io::ErrorKind::Other, errmsg));
    }
    // Parse the "begin:length" offsets; empty parts default to 0 /
    // the remainder of the pattern. Same error side-channel trick as above.
    let mut error: Option<&str> = None;
    let mut tagoffset: Vec<(u8, u8)> = tagoffset
        .iter()
        .map(|s| {
            let fields: Vec<&str> = s.split(":").collect();
            let tagbegin: u8 = if let Some(tagbegin) = fields.get(0) {
                if tagbegin.is_empty() {
                    0
                } else {
                    match tagbegin.parse::<u8>() {
                        Ok(x) => x,
                        Err(_) => {
                            error = Some("tag offset should be an integer");
                            0
                        }
                    }
                }
            } else {
                0
            };
            let taglength: u8 = if let Some(taglength) = fields.get(1) {
                if taglength.is_empty() {
                    pattern.len() as u8 - tagbegin
                } else {
                    match taglength.parse::<u8>() {
                        Ok(x) => x,
                        Err(_) => {
                            error = Some("tag length should be an integer");
                            0
                        }
                    }
                }
            } else {
                pattern.len() as u8 - tagbegin
            };
            (tagbegin, taglength)
        })
        .collect();
    if let Some(error) = error {
        return Err(std::io::Error::new(std::io::ErrorKind::Other, error));
    }
    // pad missing offsets so every tag has one (defaults to the full pattern)
    while tagoffset.len() < tag.len() {
        tagoffset.push((0, pattern.len() as u8));
    }
    if !pattern.is_empty() {
        self.context_rules.push(ContextRule {
            pattern,
            score,
            tag,
            tagoffset,
        });
    }
    Ok(())
}
/// Read a weighted variant list from a TSV file. Contains a canonical/reference form in the
/// first column, and variants with score (two columns) in the following columns. May also
/// contain frequency information (auto detected), in which case the first column has the
/// canonical/reference form, the second column the frequency, and all further columns hold
/// variants, their score and their frequency (three columns).
/// Consumes much more memory than equally weighted variants.
///
/// Fix: three argument references in this function were mangled to `¶ms`
/// (HTML-entity corruption of `&params`); restored below.
pub fn read_variants(
    &mut self,
    filename: &str,
    params: Option<&VocabParams>,
    transparent: bool,
) -> Result<(), std::io::Error> {
    // resolve the effective params, stamping in this lexicon's provenance index
    let params = if let Some(params) = params {
        let mut p = params.clone();
        p.index = self.lexicons.len() as u8;
        p
    } else {
        VocabParams {
            index: self.lexicons.len() as u8,
            ..Default::default()
        }
    };
    // a TRANSPARENT-flagged copy used for the variants when requested
    let transparent_params = if transparent {
        let mut p = params.clone();
        p.vocab_type |= VocabType::TRANSPARENT;
        p
    } else {
        params.clone()
    };
    if self.debug >= 1 {
        eprintln!("Reading variants from {}...", filename);
    }
    let mut count = 0;
    let mut has_freq = None; //None until autodetected from the first data line
    let f = File::open(filename)?;
    let f_buffer = BufReader::new(f);
    for (linenr, line) in f_buffer.lines().enumerate() {
        let linenr = linenr + 1;
        if let Ok(line) = line {
            if !line.is_empty() {
                let fields: Vec<&str> = line.split("\t").collect();
                let reference = fields.get(0).expect(
                    format!(
                        "reference item (line {}, column 1, of {})",
                        linenr, filename
                    )
                    .as_str(),
                );
                let freq = if has_freq.is_none() {
                    //autodetect whether we have frequency information or not
                    if (fields.len() - 2) % 3 == 0 {
                        let freq = fields.get(1).expect("second field");
                        match freq.parse::<u32>() {
                            Ok(freq) => {
                                has_freq = Some(true);
                                Some(freq)
                            }
                            _ => None,
                        }
                    } else {
                        //number of columns not consistent with holding frequency information
                        has_freq = Some(false);
                        None
                    }
                } else if has_freq == Some(true) {
                    let freq = fields.get(1).expect("score of reference item");
                    Some(
                        freq.parse::<u32>().expect(
                            format!(
                                "Frequency must be an integer (line {}, column 2, of {})",
                                linenr, filename
                            )
                            .as_str(),
                        ),
                    )
                } else {
                    None
                };
                let ref_id = self.add_to_vocabulary(reference, freq, &params);
                let mut iter = fields.iter();
                if has_freq == Some(true) {
                    //skip the reference and its frequency columns,
                    //then consume (variant, score, frequency) triples
                    iter.next();
                    iter.next();
                    while let (Some(variant), Some(score), Some(freq)) =
                        (iter.next(), iter.next(), iter.next())
                    {
                        let score = score.parse::<f64>().expect(format!("Variant scores must be a floating point value (line {} of {}, got {} instead), also parsing frequency", linenr, filename, score).as_str());
                        let freq = freq.parse::<u32>().expect(format!("Variant frequency must be an integer (line {} of {}), got {} instead", linenr, filename, freq).as_str());
                        if self.add_variant(
                            ref_id,
                            variant,
                            score,
                            Some(freq),
                            if transparent {
                                &transparent_params
                            } else {
                                &params
                            },
                        ) {
                            count += 1;
                        }
                    }
                } else {
                    //skip the reference column, then consume (variant, score) pairs
                    iter.next();
                    while let (Some(variant), Some(score)) = (iter.next(), iter.next()) {
                        let score = score.parse::<f64>().expect(format!("Variant scores must be a floating point value (line {} of {}, got {}), no frequency information", linenr, filename, score).as_str());
                        if self.add_variant(
                            ref_id,
                            variant,
                            score,
                            None,
                            if transparent {
                                &transparent_params
                            } else {
                                &params
                            },
                        ) {
                            count += 1;
                        }
                    }
                }
            }
        }
    }
    if self.debug >= 1 {
        eprintln!(" - Read weighted variants list, added {} references", count);
    }
    self.lexicons.push(filename.to_string());
    Ok(())
}
/// Adds an entry in the vocabulary, returning its vocabulary ID.
///
/// If the text already exists, its frequency is merged according to
/// `params.freq_handling` and the lexicon provenance bit is added;
/// otherwise a new entry is appended to encoder and decoder.
pub fn add_to_vocabulary(
    &mut self,
    text: &str,
    frequency: Option<u32>,
    params: &VocabParams,
) -> VocabId {
    let frequency = frequency.unwrap_or(1);
    if self.debug >= 2 {
        eprintln!(" -- Adding to vocabulary: {} ({})", text, frequency);
    }
    if let Some(vocab_id) = self.encoder.get(text) {
        //existing item: merge according to the configured frequency handling
        // (unwrap_or_else avoids the eager format! allocation of expect(&format!(..)))
        let item = self
            .decoder
            .get_mut(*vocab_id as usize)
            .unwrap_or_else(|| panic!("Retrieving existing vocabulary entry {}", vocab_id));
        match params.freq_handling {
            FrequencyHandling::Sum => {
                item.frequency += frequency;
            }
            FrequencyHandling::Max => {
                if frequency > item.frequency {
                    item.frequency = frequency;
                };
            }
            FrequencyHandling::Min => {
                if frequency < item.frequency {
                    item.frequency = frequency;
                };
            }
            FrequencyHandling::Replace => {
                item.frequency = frequency;
            }
        }
        if vocab_id == &BOS || vocab_id == &EOS || vocab_id == &UNK {
            item.vocabtype = VocabType::LM; //by definition
        } else if item.vocabtype.check(VocabType::TRANSPARENT)
            && !params.vocab_type.check(VocabType::TRANSPARENT)
        {
            //we can lose the transparency flag if a later lexicon doesn't provide it
            item.vocabtype ^= VocabType::TRANSPARENT;
        }
        //record provenance: one bit per loaded lexicon
        item.lexindex |= 1 << params.index;
        if self.debug >= 3 {
            eprintln!(
                " (updated) freq={}, lexindex+={}",
                item.frequency, params.index
            );
        }
        *vocab_id
    } else {
        //item is new
        self.encoder
            .insert(text.to_string(), self.decoder.len() as u64);
        self.decoder.push(VocabValue {
            text: text.to_string(),
            norm: text.normalize_to_alphabet(&self.alphabet),
            frequency, //field-init shorthand (was `frequency: frequency`)
            tokencount: text.chars().filter(|c| *c == ' ').count() as u8 + 1,
            lexindex: 1 << params.index,
            variants: None,
            vocabtype: params.vocab_type,
        });
        if self.debug >= 3 {
            eprintln!(" (new) lexindex={}", params.index);
        }
        self.decoder.len() as VocabId - 1
    }
}
/// Find variants in the vocabulary for a given string (in its totality), returns a vector of vocabulary ID and score pairs
/// Returns a vector of three-tuples (VocabId, distance_score, freq_score)
/// The resulting vocabulary Ids can be resolved through `get_vocab()`
pub fn find_variants(&self, input: &str, params: &SearchParameters) -> Vec<VariantResult> {
if self.index.is_empty() {
eprintln!("ERROR: Model has not been built yet! Call build() before find_variants()");
return vec![];
}
//Compute the anahash
let normstring = input.normalize_to_alphabet(&self.alphabet);
let anahash = input.anahash(&self.alphabet);
let max_anagram_distance: u8 = match params.max_anagram_distance {
DistanceThreshold::Ratio(x) => min(
(normstring.len() as f32 * x).floor() as u8,
MAX_ANAGRAM_DISTANCE, //absolute maximum as a safeguard
),
DistanceThreshold::RatioWithLimit(x, limit) => {
min((normstring.len() as f32 * x).floor() as u8, limit)
}
DistanceThreshold::Absolute(x) => min(
x,
(normstring.len() as f64 / 2.0).floor() as u8, //we still override the absolute threshold when dealing with very small inputs
),
};
//Compute neighbouring anahashes and find the nearest anahashes in the model
let anahashes =
self.find_nearest_anahashes(&anahash, max_anagram_distance, params.stop_criterion);
let max_edit_distance: u8 = match params.max_edit_distance {