Skip to content

Commit

Permalink
implemented --min-token-length
Browse files Browse the repository at this point in the history
  • Loading branch information
proycon committed Jul 3, 2024
1 parent 2d2201e commit 64730ef
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ fn main() {
.long("coverage-matrix")
.help("For each line in the input, compute the coverage in the lexicons")
.required(false))
.arg(Arg::with_name("min-token-length")
.long("min-token-length")
.help("Minimum token length to consider, shorter tokens will be ignored and not matched (applies --tokens, --coverage and --coverage-matrix)")
.takes_value(true)
.required(false))
.arg(Arg::with_name("cjk")
.short('C')
.long("cjk")
Expand Down Expand Up @@ -243,6 +248,11 @@ fn main() {
.collect();

let do_coverage = args.is_present("coverage");
// Minimum token length (in chars) a token needs in order to be counted/matched.
// The option is declared as not required and has no default value, so
// `value_of` yields `None` when it is omitted; unwrapping that would panic on
// every invocation without the flag. Fall back to 0 instead: the checks below
// short-circuit on `min_token_length <= 1`, so 0 means "no length filtering".
let min_token_length = args
    .value_of("min-token-length")
    .map_or(0, |value| {
        // An explicitly supplied value that is not a non-negative integer is
        // still treated as a fatal usage error, as before.
        value.parse::<usize>().expect("Value must be integer")
    });

if args.is_present("verbose") || args.is_present("tokens") || args.is_present("cjk") {
print!("Text");
Expand Down Expand Up @@ -285,7 +295,10 @@ fn main() {
if c.is_alphanumeric() {
token.push(c);
} else if !token.is_empty() {
if token.chars().any(|c| c.is_alphabetic()) {
if token.chars().any(|c| c.is_alphabetic())
&& (min_token_length <= 1
|| token.chars().count() >= min_token_length)
{
totalcount += 1;
for (j, lexicon) in lexicons.iter().enumerate() {
if lexicon.contains(&token) {
Expand Down Expand Up @@ -328,7 +341,9 @@ fn main() {
if c.is_alphanumeric() {
token.push(c);
} else if !token.is_empty() {
if token.chars().any(|c| c.is_alphabetic()) {
if token.chars().any(|c| c.is_alphabetic())
&& (min_token_length <= 1 || token.chars().count() >= min_token_length)
{
let mut has_match = false;
for item in &mut matched_lexicon {
//reset matches
Expand Down

0 comments on commit 64730ef

Please sign in to comment.