From 90cc69f3c72dda101c378066e5ca9ccffca76cb9 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Mon, 1 Jul 2024 21:06:30 +0200 Subject: [PATCH] documentation and fixes for coverage --- README.md | 8 ++++++++ src/main.rs | 17 +++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0cf556..d716158 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,14 @@ bad 3488 3491 Unlike before, you will find the matches are now returned in reading order. +If you add `--coverage` then you will get an extra last line with some coverage +statistics. This is useful to see how much of the text is covered by your +lexicon. + +``` +#coverage (tokens) = 7/627 = 0.011164274322169059 +``` + When using ``--tokens`` we rely on whitespace and punctuation to delimit tokens. This does not work for languages such as Chinese, Japanese and Korean that are not delimited in such a way. For such languages, similar linear search diff --git a/src/main.rs b/src/main.rs index b1e5e02..2e4763c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -127,6 +127,11 @@ fn main() { exit(1); } + if (!args.is_present("tokens") && !args.is_present("cjk")) && args.is_present("coverage") { + eprintln!("ERROR: --coverage can only be used with --tokens or --cjk"); + exit(1); + } + let mut lexicons: Vec = if args.is_present("lexicon") { args.get_many("lexicon") .unwrap() @@ -237,7 +242,11 @@ fn main() { "#coverage (tokens) = {}/{} = {}", matchcount, totalcount, - matchcount as f64 / totalcount as f64 + if totalcount == 0 { + 0.0 + } else { + matchcount as f64 / totalcount as f64 + } ); } } else if args.is_present("cjk") { @@ -281,7 +290,11 @@ fn main() { "#coverage (chars) = {}/{} = {}", matchcount, totalcount, - matchcount as f64 / totalcount as f64 + if totalcount == 0 { + 0.0 + } else { + matchcount as f64 / totalcount as f64 + } ); } }