Added verbose output support (suitable for import with STAM).
proycon committed Feb 6, 2024
1 parent 08e0fc4 commit 1681d38
Showing 2 changed files with 70 additions and 27 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -4,6 +4,7 @@ This is a simple lexicon matching tool that, given a lexicon of words or phrases
 It can be used to compute a frequency list for a lexicon on a target corpus.
 
 The implementation uses suffix arrays. The text must be plain-text UTF-8 and is limited to 2^32 bytes (about 4GB).
+The offsets outputted will be UTF-8 byte positions.
 
 
 ## Installation
@@ -33,3 +34,16 @@ $ lexmatch --lexicon lexicon.lst --text corpus.txt
 ```
 
 The lexicon must be plain-text UTF-8 containing one entry per line; an entry need not be a single word and is not constrained in length.
+
+Instead of a lexicon, you can also provide the patterns to query on the command line using ``--query``.
+For verbose output, add ``--verbose``. This produces TSV (tab-separated values) output that you can easily import into, for example, the [STAM tools](https://github.com/annotation/stam-tools):
+
+```
+$ lexmatch --verbose --query test --text /tmp/test.txt
+Reading text...
+Building suffix array (this may take a while)...
+Searching...
+Text	BeginUtf8Offset	EndUtf8Offset
+test	53	57
+test	11	15
+```
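Note: the verbose TSV above maps directly onto what the `suffix` crate (which lexmatch builds on) reports. `SuffixTable::positions` returns the UTF-8 byte offset of every occurrence, in no particular order (hence 53 before 11 above), and the end offset is simply the begin offset plus the pattern's byte length. A minimal standalone sketch of that mapping, not part of this commit, with a made-up example text:

```
extern crate suffix;

use suffix::SuffixTable;

fn main() {
    let text = "a test, then another test";
    let table = SuffixTable::new(text);
    // positions() returns UTF-8 byte offsets, in no particular order --
    // the same values lexmatch prints in its verbose TSV rows.
    println!("Text\tBeginUtf8Offset\tEndUtf8Offset");
    for begin in table.positions("test") {
        let end = begin + "test".len() as u32;
        println!("test\t{}\t{}", begin, end);
    }
}
```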
83 changes: 56 additions & 27 deletions src/main.rs
@@ -1,13 +1,12 @@
 extern crate clap;
 extern crate suffix;
 
+use clap::{App, Arg};
 use std::fs::File;
-use std::io::{Read,BufReader,BufRead};
+use std::io::{BufRead, BufReader, Read};
 use std::process::exit;
-use clap::{Arg, App};
 use suffix::SuffixTable;
 
-
 ///Read a lexicon, one entry per line
 fn read_lexicon(filename: &str) -> Result<Vec<String>, std::io::Error> {
     let mut lexicon: Vec<String> = Vec::new();
@@ -54,14 +53,19 @@ fn main() {
.arg(Arg::with_name("text")
.long("text")
.short("t")
.help("The text to operate on (plain text UTF-8, max 4GB)")
.help("The filename of the text to operate on (plain text UTF-8, max 4GB)")
.takes_value(true)
.required(true))
.arg(Arg::with_name("all")
.long("all")
.short("a")
.help("Return all matches (also as substrings), rather than only exact matches")
.required(false))
.arg(Arg::with_name("verbose")
.long("verbose")
.short("v")
.help("Return output verbosely as TSV with each match on a separate row. Will output a header on the first line.")
.required(false))
.arg(Arg::with_name("no-matches")
.long("no-matches")
.short("M")
@@ -75,7 +79,11 @@ fn main() {
.default_value("1"))
.get_matches();

let freq_threshold = args.value_of("freq").expect("frequency threshold").parse::<usize>().expect("Frequency threshold must be an integer value >= 0");
let freq_threshold = args
.value_of("freq")
.expect("frequency threshold")
.parse::<usize>()
.expect("Frequency threshold must be an integer value >= 0");

if !args.is_present("lexicon") && !args.is_present("query") {
eprintln!("ERROR: specify either --lexicon or --query");
@@ -103,16 +111,28 @@ fn main() {
     let suffixtable = build_suffixarray(&text);
 
     eprintln!("Searching...");
+    if args.is_present("verbose") {
+        println!("Text\tBeginUtf8Offset\tEndUtf8Offset");
+    }
     for entry in lexicon.iter() {
         let matches = suffixtable.positions(entry);
+        let length = entry.as_bytes().len() as u32;
+
         if args.is_present("all") {
             if matches.len() >= freq_threshold {
-                print!("{}\t{}",entry, matches.len());
-                if !args.is_present("no-matches") {
-                    for begin in matches.iter() {
-                        print!("\t{}",begin);
-                    }
-                }
-                println!();
+                if args.is_present("verbose") {
+                    for begin in matches.iter() {
+                        let end = *begin + length;
+                        println!("{}\t{}\t{}", entry, *begin, end);
+                    }
+                } else {
+                    print!("{}\t{}", entry, matches.len());
+                    if !args.is_present("no-matches") {
+                        for begin in matches.iter() {
+                            print!("\t{}", begin);
+                        }
+                    }
+                    println!();
+                }
             }
         } else {
@@ -121,32 +141,41 @@ fn main() {
             //boundaries are simple ascii-like spaces, punctuation etc.
             //
             let bytetext: &[u8] = text.as_bytes();
-            let entrylength = entry.as_bytes().len();
-            let matches_exact: Vec<u32> = matches.into_iter().filter_map(|begin| {
-                let begin = *begin as usize;
-                if begin > 0 {
-                    let c: char = bytetext[begin-1] as char;
-                    if c.is_alphanumeric() {
-                        return None;
-                    }
-                }
-                if begin + entrylength < bytetext.len() {
-                    let c: char = bytetext[begin +entrylength] as char;
-                    if c.is_alphanumeric() {
-                        return None;
-                    }
-                }
-                Some(begin as u32)
-            }).collect();
+            let matches_exact: Vec<u32> = matches
+                .into_iter()
+                .filter_map(|begin| {
+                    let begin = *begin as usize;
+                    if begin > 0 {
+                        let c: char = bytetext[begin - 1] as char;
+                        if c.is_alphanumeric() {
+                            return None;
+                        }
+                    }
+                    if (begin + length as usize) < bytetext.len() {
+                        let c: char = bytetext[begin + length as usize] as char;
+                        if c.is_alphanumeric() {
+                            return None;
+                        }
+                    }
+                    Some(begin as u32)
+                })
+                .collect();
 
             if matches_exact.len() >= freq_threshold {
-                print!("{}\t{}",entry, matches_exact.len());
-                if !args.is_present("no-matches") {
-                    for begin in matches_exact.iter() {
-                        print!("\t{}",begin);
-                    }
-                }
-                println!();
+                if args.is_present("verbose") {
+                    for begin in matches_exact.iter() {
+                        let end = begin + length;
+                        println!("{}\t{}\t{}", entry, *begin, end);
+                    }
+                } else {
+                    print!("{}\t{}", entry, matches_exact.len());
+                    if !args.is_present("no-matches") {
+                        for begin in matches_exact.iter() {
+                            print!("\t{}", begin);
+                        }
+                    }
+                    println!();
+                }
             }
         }
     }
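Note: the exact-match path above (the non-`--all` branch) decides word boundaries by checking whether the single bytes immediately before and after a hit are alphanumeric; as the comment in the source says, this only handles simple ASCII-like boundaries. A standalone sketch of the same check; the function name and example are illustrative, not from this commit:

```
/// True if the hit at text[begin..begin+length] is not embedded in a
/// longer alphanumeric token. Single-byte check, so ASCII-oriented.
fn is_exact_match(bytetext: &[u8], begin: usize, length: usize) -> bool {
    if begin > 0 && (bytetext[begin - 1] as char).is_alphanumeric() {
        return false; // preceded by a letter/digit: substring match
    }
    if begin + length < bytetext.len() && (bytetext[begin + length] as char).is_alphanumeric() {
        return false; // followed by a letter/digit: substring match
    }
    true
}

fn main() {
    let bytes = "attest or test".as_bytes();
    assert!(!is_exact_match(bytes, 2, 4)); // "test" inside "attest": rejected
    assert!(is_exact_match(bytes, 10, 4)); // standalone "test": kept
}
```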
