From be7072fe8b844bcb6b4b394dc6c6b57178aa678a Mon Sep 17 00:00:00 2001
From: Miguel Del Rio <pique0822@gmail.com>
Date: Thu, 19 Oct 2023 14:40:45 +0000
Subject: [PATCH] updating tests to reflect unk changes

---
 docs/Usage.md                           | 2 +-
 src/fstalign.cpp                        | 9 ---------
 test/data/align_1.aligned.punc_case.nlp | 2 +-
 test/data/align_1.ref.aligned.nlp       | 2 +-
 test/data/noise_1.hyp2.aligned          | 4 ++--
 5 files changed, 5 insertions(+), 14 deletions(-)
diff --git a/docs/Usage.md b/docs/Usage.md
index 0e09ce9..3f6a540 100644
--- a/docs/Usage.md
+++ b/docs/Usage.md
@@ -107,7 +107,7 @@ must also be disabled with `--disable-approx-alignment`.
 ### Synonyms
 Synonyms allow for reference words to be equivalent to similar forms (determined by the user) for error counting. They are accepted for any input formats and passed into the tool via the `--syn <path_to_synonym_file>` flag. For details see [Synonyms Format](https://github.com/revdotcom/fstalign/blob/develop/docs/Synonyms-Format.md). A standard set of synonyms we use at Rev.ai is available in the repository under `sample_data/synonyms.rules.txt`.
 
-In addition to allowing for custom synonyms to be passed in via CLI, fstalign also automatically generates synonyms based on the reference and hypothesis text. Currently, it does this for two cases: cutoff words (hello-) and compound hyphenated words (long-term). In both cases, a synonym is dynamically generated with the hyphen removed. Both of these synonym types can be disabled through the CLI by passing in `--disable-cutoffs` and `--disable-hyphen-ignore`, respectively.
+In addition to allowing for custom synonyms to be passed in via CLI, fstalign also automatically generates synonyms based on the reference and hypothesis text. Currently, it does this for three cases: cutoff words (e.g. `hello-`), compound hyphenated words (e.g. `long-term`), and tags or codes that match the regular expression `<.*>` (e.g. `<laugh>`). In the first two cases, a synonym is dynamically generated with the hyphen removed; these two synonym types can be disabled through the CLI by passing in `--disable-cutoffs` and `--disable-hyphen-ignore`, respectively. In the last case, `<unk>` is automatically accepted as a valid synonym for the tag; currently, this feature cannot be turned off.
 
 ### Normalizations
 Normalizations are a similar concept to synonyms. They allow a token or group of tokens to be represented by alternatives when calculating the WER alignment. Unlike synonyms, they are only accepted for NLP file inputs where the tokens are tagged with a unique ID. The normalizations are specified in a JSON format, with the unique ID as keys. Example to illustrate the schema:
diff --git a/src/fstalign.cpp b/src/fstalign.cpp
index 206e31a..e7cc2fb 100644
--- a/src/fstalign.cpp
+++ b/src/fstalign.cpp
@@ -594,15 +594,6 @@ void write_stitches_to_nlp(vector<Stitching>& stitches, ofstream &output_nlp_fil
         logger->warn("an unnormalized token was found: {}", ref_tk);
       }
     } else if (IsNoisecodeToken(original_nlp_token)) {
-      // if we have a noisecode  <.*> in the nlp token, we inject it here
-      if (stitch.comment.length() == 0) {
-        if (ref_tk == DEL || ref_tk == "") {
-          stitch.comment = "sub(<eps>)";
-        } else {
-          stitch.comment = "sub(" + ref_tk + ")";
-        }
-      }
-
       ref_tk = original_nlp_token;
     } else if (stitch.comment.find("ins") == 0) {
       assert(add_inserts);
diff --git a/test/data/align_1.aligned.punc_case.nlp b/test/data/align_1.aligned.punc_case.nlp
index a614c37..893829e 100644
--- a/test/data/align_1.aligned.punc_case.nlp
+++ b/test/data/align_1.aligned.punc_case.nlp
@@ -4,7 +4,7 @@ b|1|3.0000|4.0000|||LC|[]|[]||||
 c|1|5.0000|6.0000|||LC|[]|[]||||
 d|1|7.0000|8.0000|,||LC|[]|[]||||
 ,|1|7.0000|8.0000|||||[]||||
-<laugh>|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||sub(<unk>)|
+<laugh>|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]||||
 .|1|11.0000|12.0000|||||[]|||sub(e)|
 e|1|11.0000|12.0000|||LC|[]|[]|||sub(.)|
 f|1|13.0000|14.0000|||LC|[]|[]||||
diff --git a/test/data/align_1.ref.aligned.nlp b/test/data/align_1.ref.aligned.nlp
index a910d24..40cff43 100644
--- a/test/data/align_1.ref.aligned.nlp
+++ b/test/data/align_1.ref.aligned.nlp
@@ -3,7 +3,7 @@ a|1|1.0000|2.0000|||CA|[]|[]||||
 b|1|3.0000|4.0000|||LC|[]|[]||||
 c|1|5.0000|6.0000|||LC|[]|[]||||
 d|1|7.0000|8.0000|,||LC|[]|[]||||
-<laugh>|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||sub(<unk>)|
+<laugh>|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]||||
 e|1|11.0000|12.0000|||LC|[]|[]||||
 f|1|13.0000|14.0000|||LC|[]|[]||||
 g|1|15.0000|16.0000|||LC|[]|[]||||
diff --git a/test/data/noise_1.hyp2.aligned b/test/data/noise_1.hyp2.aligned
index 1fe99ac..f2449ef 100644
--- a/test/data/noise_1.hyp2.aligned
+++ b/test/data/noise_1.hyp2.aligned
@@ -3,11 +3,11 @@ a|1|1.0000|2.0000|||CA|[]|[]||||
 b|1|3.0000|4.0000|||LC|[]|[]||||
 c|1|5.0000|6.0000|||LC|[]|[]||||
 d|1|7.0000|8.0000|,||LC|[]|[]||||
-<inaudible>|1|9.0000|10.0000|,||LC|[]|[]|||sub(<unk>)|
+<inaudible>|1|9.0000|10.0000|,||LC|[]|[]||||
 e|1|11.0000|12.0000|||LC|[]|[]||||
 F|1|13.0000|14.0000|||LC|[]|[]||||
 G|1|15.0000|16.0000|||LC|[]|[]||||
 h|1|17.0000|18.0000|||LC|[]|[]||||
-<foreign>|1|19.0000|20.0000|,||LC|[]|[]|||sub(<unk>)|
+<foreign>|1|19.0000|20.0000|,||LC|[]|[]||||
 i|1|21.0000|22.0000|||LC|[]|[]||||
 j|1|23.0000|24.0000|||LC|[]|[]||||