diff --git a/README.md b/README.md index 27bb249..c07689d 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ This is a simple lexicon matching tool that, given a lexicon of words or phrases It can be used compute a frequency list for a lexicon, on a target corpus. The implementation uses suffix arrays. The text must be plain-text UTF-8 and is limited to 2^32 bytes (about 4GB). +The offsets outputted will be UTF-8 byte positions. ## Installation @@ -33,3 +34,16 @@ $ lexmatch --lexicon lexicon.lst --text corpus.txt ``` The lexicon must be plain-text UTF-8 containing one entry per line, an entry need not be a single word and is not constrained in length. + +Instead of a lexicon you can also provide the patterns to query on the command line using ``--query``. +For verbose output, add ``--verbose``. This produces TSV (tab separated values) output that you can easily import in for example the [STAM tools](https://github.com/annotation/stam-tools): + +``` +$ lexmatch --verbose --query test --text /tmp/test.txt +Reading text... +Building suffix array (this may take a while)... +Searching... 
+Text BeginUtf8Offset EndUtf8Offset +test 53 57 +test 11 15 +``` diff --git a/src/main.rs b/src/main.rs index f6a1cf8..44cd078 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,13 +1,12 @@ extern crate clap; extern crate suffix; +use clap::{App, Arg}; use std::fs::File; -use std::io::{Read,BufReader,BufRead}; +use std::io::{BufRead, BufReader, Read}; use std::process::exit; -use clap::{Arg, App}; use suffix::SuffixTable; - ///Read a lexicon, one entry per line fn read_lexicon(filename: &str) -> Result<Vec<String>, std::io::Error> { let mut lexicon: Vec<String> = Vec::new(); @@ -54,7 +53,7 @@ fn main() { .arg(Arg::with_name("text") .long("text") .short("t") - .help("The text to operate on (plain text UTF-8, max 4GB)") + .help("The filename of the text to operate on (plain text UTF-8, max 4GB)") .takes_value(true) .required(true)) .arg(Arg::with_name("all") @@ -62,6 +61,11 @@ fn main() { .short("a") .help("Return all matches (also as substrings), rather than only exact matches") .required(false)) + .arg(Arg::with_name("verbose") + .long("verbose") + .short("v") + .help("Return output verbosely as TSV with each match on a separate row. 
Will output a header on the first line.") + .required(false)) .arg(Arg::with_name("no-matches") .long("no-matches") .short("M") @@ -75,7 +79,11 @@ fn main() { .default_value("1")) .get_matches(); - let freq_threshold = args.value_of("freq").expect("frequency threshold").parse::<usize>().expect("Frequency threshold must be an integer value >= 0"); + let freq_threshold = args + .value_of("freq") + .expect("frequency threshold") + .parse::<usize>() + .expect("Frequency threshold must be an integer value >= 0"); if !args.is_present("lexicon") && !args.is_present("query") { eprintln!("ERROR: specify either --lexicon or --query"); @@ -103,16 +111,28 @@ fn main() { let suffixtable = build_suffixarray(&text); eprintln!("Searching..."); + if args.is_present("verbose") { + println!("Text\tBeginUtf8Offset\tEndUtf8Offset"); + } for entry in lexicon.iter() { let matches = suffixtable.positions(entry); + let length = entry.as_bytes().len() as u32; if args.is_present("all") { if matches.len() >= freq_threshold { - print!("{}\t{}",entry, matches.len()); - if !args.is_present("no-matches") { + if args.is_present("verbose") { for begin in matches.iter() { - print!("\t{}",begin); + let end = *begin + length; + println!("{}\t{}\t{}", entry, *begin, end); + } + } else { + print!("{}\t{}", entry, matches.len()); + if !args.is_present("no-matches") { + for begin in matches.iter() { + print!("\t{}", begin); + } } + println!(); } } } else { @@ -121,32 +141,41 @@ fn main() { //boundaries are simple ascii-like spaces, punctuation etc. 
// let bytetext: &[u8] = text.as_bytes(); - let entrylength = entry.as_bytes().len(); - let matches_exact: Vec<u32> = matches.into_iter().filter_map(|begin| { - let begin = *begin as usize; - if begin > 0 { - let c: char = bytetext[begin-1] as char; - if c.is_alphanumeric() { - return None; + let matches_exact: Vec<u32> = matches + .into_iter() + .filter_map(|begin| { + let begin = *begin as usize; + if begin > 0 { + let c: char = bytetext[begin - 1] as char; + if c.is_alphanumeric() { + return None; + } } - } - if begin + entrylength < bytetext.len() { - let c: char = bytetext[begin +entrylength] as char; - if c.is_alphanumeric() { - return None; + if (begin + length as usize) < bytetext.len() { + let c: char = bytetext[begin + length as usize] as char; + if c.is_alphanumeric() { + return None; + } } - } - Some(begin as u32) - }).collect(); + Some(begin as u32) + }) + .collect(); if matches_exact.len() >= freq_threshold { - print!("{}\t{}",entry, matches_exact.len()); - if !args.is_present("no-matches") { + if args.is_present("verbose") { for begin in matches_exact.iter() { - print!("\t{}",begin); + let end = begin + length; + println!("{}\t{}\t{}", entry, *begin, end); + } + } else { + print!("{}\t{}", entry, matches_exact.len()); + if !args.is_present("no-matches") { + for begin in matches_exact.iter() { + print!("\t{}", begin); + } } + println!(); } - println!(); } } }