From 7045a4f9dadac9cc4897eabf59d77dce272674dd Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Mon, 9 Oct 2023 15:13:34 +0000 Subject: [PATCH 01/14] case passed all over --- src/FstLoader.h | 2 ++ src/Nlp.cpp | 2 +- src/Nlp.h | 2 +- src/main.cpp | 12 ++++++++---- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/FstLoader.h b/src/FstLoader.h index ccd6a17..86673f3 100644 --- a/src/FstLoader.h +++ b/src/FstLoader.h @@ -28,11 +28,13 @@ class FstLoader { const std::string& wer_sidecar_filename, const std::string& json_norm_filename, bool use_punctuation, + bool use_case, bool symbols_file_included); static std::unique_ptr MakeHypothesisLoader(const std::string& hyp_filename, const std::string& hyp_json_norm_filename, bool use_punctuation, + bool use_case, bool symbols_file_included); diff --git a/src/Nlp.cpp b/src/Nlp.cpp index ff59a14..fbee735 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -20,7 +20,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma : NlpFstLoader(records, normalization, wer_sidecar, true) {} NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, - Json::Value wer_sidecar, bool processLabels, bool use_punctuation) + Json::Value wer_sidecar, bool processLabels, bool use_punctuation, bool use_case) : FstLoader() { mJsonNorm = normalization; mWerSidecar = wer_sidecar; diff --git a/src/Nlp.h b/src/Nlp.h index 8ce4092..4eae2b1 100644 --- a/src/Nlp.h +++ b/src/Nlp.h @@ -43,7 +43,7 @@ class NlpReader { class NlpFstLoader : public FstLoader { public: - NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar, bool processLabels, bool use_punctuation = false); + NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar, bool processLabels, bool use_punctuation = false, bool use_case = false); NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar); virtual ~NlpFstLoader(); virtual void addToSymbolTable(fst::SymbolTable &symbol) const; diff --git a/src/main.cpp b/src/main.cpp index c2d63fd..44dea16 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -33,6 +33,7 @@ int main(int argc, char **argv) { int levenstein_maximum_error_streak = 100; bool record_case_stats = false; bool use_punctuation = false; + bool use_case = false; bool disable_approximate_alignment = false; bool add_inserts_nlp = false; @@ -123,6 +124,7 @@ int main(int argc, char **argv) { "Record precision/recall for how well the hypothesis" "casing matches the reference."); get_wer->add_flag("--use-punctuation", use_punctuation, "Treat punctuation from nlp rows as separate tokens"); + get_wer->add_flag("--use-case", use_case, "Keeps token casing and considers tokens with different case as different tokens"); get_wer->add_flag("--add-inserts-nlp", add_inserts_nlp, "Add inserts to NLP output"); // CLI11_PARSE(app, argc, argv); @@ -154,8 +156,8 @@ int main(int argc, char **argv) { // loading "reference" inputs - std::unique_ptr hyp = FstLoader::MakeHypothesisLoader(hyp_filename, hyp_json_norm_filename, use_punctuation, !symbols_filename.empty()); - std::unique_ptr ref = FstLoader::MakeReferenceLoader(ref_filename, wer_sidecar_filename, json_norm_filename, use_punctuation, !symbols_filename.empty()); + std::unique_ptr hyp = FstLoader::MakeHypothesisLoader(hyp_filename, hyp_json_norm_filename, use_punctuation, use_case, !symbols_filename.empty()); + std::unique_ptr ref = FstLoader::MakeReferenceLoader(ref_filename, wer_sidecar_filename, json_norm_filename, use_punctuation, use_case, 
!symbols_filename.empty()); AlignerOptions alignerOptions; alignerOptions.speaker_switch_context_size = speaker_switch_context_size; @@ -219,6 +221,7 @@ std::unique_ptr FstLoader::MakeReferenceLoader(const std::string& ref const std::string& wer_sidecar_filename, const std::string& json_norm_filename, bool use_punctuation, + bool use_case, bool symbols_file_included) { auto console = logger::GetLogger("console"); Json::Value obj; @@ -265,7 +268,7 @@ std::unique_ptr FstLoader::MakeReferenceLoader(const std::string& ref NlpReader nlpReader = NlpReader(); console->info("reading reference nlp from {}", ref_filename); auto vec = nlpReader.read_from_disk(ref_filename); - return std::make_unique(vec, obj, wer_sidecar_obj, true, use_punctuation); + return std::make_unique(vec, obj, wer_sidecar_obj, true, use_punctuation, use_case); } else if (EndsWithCaseInsensitive(ref_filename, string(".ctm"))) { console->info("reading reference ctm from {}", ref_filename); CtmReader ctmReader = CtmReader(); @@ -288,6 +291,7 @@ std::unique_ptr FstLoader::MakeReferenceLoader(const std::string& ref std::unique_ptr FstLoader::MakeHypothesisLoader(const std::string& hyp_filename, const std::string& hyp_json_norm_filename, bool use_punctuation, + bool use_case, bool symbols_file_included) { auto console = logger::GetLogger("console"); @@ -329,7 +333,7 @@ std::unique_ptr FstLoader::MakeHypothesisLoader(const std::string& hy auto vec = nlpReader.read_from_disk(hyp_filename); // for now, nlp files passed as hypothesis won't have their labels handled as such // this also mean that json normalization will be ignored - return std::make_unique(vec, hyp_json_obj, hyp_empty_json, false, use_punctuation); + return std::make_unique(vec, hyp_json_obj, hyp_empty_json, false, use_punctuation, use_case); } else if (EndsWithCaseInsensitive(hyp_filename, string(".ctm"))) { console->info("reading hypothesis ctm from {}", hyp_filename); CtmReader ctmReader = CtmReader(); From 966a4a0341859acc3e32bf9e986000e9488bf63f Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Mon, 9 Oct 2023 16:01:46 +0000 Subject: [PATCH 02/14] updating version --- src/Nlp.cpp | 23 ++++++++++++++++------- src/Nlp.h | 2 ++ src/version.h | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index fbee735..6a216f1 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -19,11 +19,14 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma Json::Value wer_sidecar) : NlpFstLoader(records, normalization, wer_sidecar, true) {} -NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, +NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar, bool processLabels, bool use_punctuation, bool use_case) : FstLoader() { mJsonNorm = normalization; mWerSidecar = wer_sidecar; + mUsePunctuation = use_punctuation; + mUseCase = use_case; + std::string last_label; bool firstTk = true; @@ -81,8 +84,10 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma mJsonNorm[curr_label_id]["candidates"][last_idx]["verbalization"].append(curr_tk); } } else { - std::string lower_cased = UnicodeLowercase(curr_tk); - mToken.push_back(lower_cased); + if (!mUseCase) { + curr_tk = UnicodeLowercase(curr_tk); + } + mToken.push_back(curr_tk); mSpeakers.push_back(speaker); if (use_punctuation && punctuation != "") { mToken.push_back(punctuation); @@ -118,8 +123,10 @@ void NlpFstLoader::addToSymbolTable(fst::SymbolTable &symbol) const { auto candidate = 
candidates[i]["verbalization"]; for (auto tk_itr : candidate) { std::string token = tk_itr.asString(); - std::string lower_cased = UnicodeLowercase(token); - AddSymbolIfNeeded(symbol, lower_cased); + if (!mUseCase) { + token = UnicodeLowercase(token); + } + AddSymbolIfNeeded(symbol, token); } } } @@ -250,11 +257,13 @@ so we add 2 states auto candidate = candidates[i]["verbalization"]; for (auto tk_itr : candidate) { std::string ltoken = std::string(tk_itr.asString()); - std::string lower_cased = UnicodeLowercase(ltoken); + if (!mUseCase) { + ltoken = UnicodeLowercase(ltoken); + } transducer.AddState(); nextState++; - int token_sym = symbol.Find(lower_cased); + int token_sym = symbol.Find(ltoken); if (token_sym == -1) { token_sym = symbol.Find(options.symUnk); } diff --git a/src/Nlp.h b/src/Nlp.h index 4eae2b1..f53683a 100644 --- a/src/Nlp.h +++ b/src/Nlp.h @@ -56,6 +56,8 @@ class NlpFstLoader : public FstLoader { Json::Value mJsonNorm; Json::Value mWerSidecar; virtual const std::string &getToken(int index) const { return mToken.at(index); } + private: + bool mUsePunctuation, mUseCase; }; #endif /* NLP_H_ */ diff --git a/src/version.h b/src/version.h index eeb0162..56363c6 100644 --- a/src/version.h +++ b/src/version.h @@ -1,5 +1,5 @@ #pragma once #define FSTALIGNER_VERSION_MAJOR 1 -#define FSTALIGNER_VERSION_MINOR 10 +#define FSTALIGNER_VERSION_MINOR 11 #define FSTALIGNER_VERSION_PATCH 0 From 85ae0dd078c5c1cf89ac9d4a1b06bd1b929e4053 Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Mon, 9 Oct 2023 16:53:48 +0000 Subject: [PATCH 03/14] updating aligned to include case and adding testing --- src/fstalign.cpp | 24 ++++++++++----- src/fstalign.h | 2 +- src/main.cpp | 2 +- test/data/short.aligned.case.nlp | 33 ++++++++++++++++++++ test/data/short.aligned.punc_case.nlp | 43 +++++++++++++++++++++++++++ test/fstalign_Test.cc | 20 +++++++++++++ 6 files changed, 114 insertions(+), 10 deletions(-) create mode 100644 test/data/short.aligned.case.nlp create mode 100644 test/data/short.aligned.punc_case.nlp diff --git a/src/fstalign.cpp b/src/fstalign.cpp index af5b5b0..206e31a 100644 --- a/src/fstalign.cpp +++ b/src/fstalign.cpp @@ -218,7 +218,8 @@ wer_alignment Fstalign(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine vector make_stitches(wer_alignment &alignment, vector hyp_ctm_rows = {}, vector hyp_nlp_rows = {}, - vector one_best_tokens = {}) { + vector one_best_tokens = {}, + bool use_case = false) { auto logger = logger::GetOrCreateLogger("fstalign"); // Go through top alignment and create stitches @@ -287,7 +288,11 @@ vector make_stitches(wer_alignment &alignment, vector h part.hyp_orig = ctmPart.word; // sanity check - std::string ctmCopy = UnicodeLowercase(ctmPart.word); + std::string ctmCopy = std::string(ctmPart.word); + if (!use_case) { + ctmCopy = UnicodeLowercase(ctmPart.word); + } + if (hyp_tk != ctmCopy) { logger->warn( "hum, looks like the ctm and the alignment got out of sync? [{}] vs " @@ -326,7 +331,10 @@ vector make_stitches(wer_alignment &alignment, vector h part.hyp_orig = token; // sanity check - std::string token_copy = UnicodeLowercase(token); + std::string token_copy = std::string(token); + if (!use_case) { + token_copy = UnicodeLowercase(token); + } if (hyp_tk != token_copy) { logger->warn( "hum, looks like the text and the alignment got out of sync? 
[{}] vs " @@ -633,7 +641,7 @@ void write_stitches_to_nlp(vector& stitches, ofstream &output_nlp_fil } void HandleWer(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine &engine, const string& output_sbs, const string& output_nlp, - AlignerOptions alignerOptions, bool add_inserts_nlp) { + AlignerOptions alignerOptions, bool add_inserts_nlp, bool use_case) { // int speaker_switch_context_size, int numBests, int pr_threshold, string symbols_filename, // string composition_approach, bool record_case_stats) { auto logger = logger::GetOrCreateLogger("fstalign"); @@ -648,9 +656,9 @@ void HandleWer(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine &engine NlpFstLoader *nlp_hyp_loader = dynamic_cast(&hypLoader); OneBestFstLoader *best_loader = dynamic_cast(&hypLoader); if (ctm_hyp_loader) { - stitches = make_stitches(topAlignment, ctm_hyp_loader->mCtmRows, {}); + stitches = make_stitches(topAlignment, ctm_hyp_loader->mCtmRows, {}, {}, use_case); } else if (nlp_hyp_loader) { - stitches = make_stitches(topAlignment, {}, nlp_hyp_loader->mNlpRows); + stitches = make_stitches(topAlignment, {}, nlp_hyp_loader->mNlpRows, {}, use_case); } else if (best_loader) { vector tokens; tokens.reserve(best_loader->TokensSize()); @@ -658,9 +666,9 @@ void HandleWer(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine &engine string token = best_loader->getToken(i); tokens.push_back(token); } - stitches = make_stitches(topAlignment, {}, {}, tokens); + stitches = make_stitches(topAlignment, {}, {}, tokens, use_case); } else { - stitches = make_stitches(topAlignment); + stitches = make_stitches(topAlignment, {}, {}, {}, use_case); } NlpFstLoader *nlp_ref_loader = dynamic_cast(&refLoader); diff --git a/src/fstalign.h b/src/fstalign.h index 929c220..0320785 100644 --- a/src/fstalign.h +++ b/src/fstalign.h @@ -51,7 +51,7 @@ struct AlignerOptions { // int numBests, string symbols_filename, string composition_approach); void HandleWer(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine &engine, const string& output_sbs, const string& output_nlp, - AlignerOptions alignerOptions, bool add_inserts_nlp = false); + AlignerOptions alignerOptions, bool add_inserts_nlp = false, bool use_case = false); void HandleAlign(NlpFstLoader &refLoader, CtmFstLoader &hypLoader, SynonymEngine &engine, ofstream &output_nlp_file, AlignerOptions alignerOptions); diff --git a/src/main.cpp b/src/main.cpp index 44dea16..87c3e8f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -180,7 +180,7 @@ int main(int argc, char **argv) { } if (command == "wer") { - HandleWer(*ref, *hyp, engine, output_sbs, output_nlp, alignerOptions, add_inserts_nlp); + HandleWer(*ref, *hyp, engine, output_sbs, output_nlp, alignerOptions, add_inserts_nlp, use_case); } else if (command == "align") { if (output_nlp.empty()) { console->error("the output nlp file must be specified"); diff --git a/test/data/short.aligned.case.nlp b/test/data/short.aligned.case.nlp new file mode 100644 index 0000000..d2fd08c --- /dev/null +++ b/test/data/short.aligned.case.nlp @@ -0,0 +1,33 @@ +token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence +|2|0.0000|0.0000|||LC|[]|[]|||| +Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| +yeah|1|||,||LC|[]|[]|||del| +right|1|0.0000|0.0000|.||LC|[]|[]|||| +Yeah|1|||,||UC|[]|[]|||del| +all|1|||||LC|[]|[]|||del| +right|1|0.0000|0.0000|,||LC|[]|[]|||sub(I'll)| +probably|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| +just|1|0.0000|0.0000|||LC|[]|[]|||| +that|1|0.0000|0.0000|.||LC|[]|[]|||| 
+Are|3|0.0000|0.0000|||UC|[]|[]|||| +there|3|0.0000|0.0000|||LC|[]|[]|||| +any|3|0.0000|0.0000|||LC|[]|[]|||| +visuals|3|0.0000|0.0000|||LC|[]|[]|||| +that|3|0.0000|0.0000|||LC|[]|[]|||| +come|3|0.0000|0.0000|||LC|[]|[]|||| +to|3|0.0000|0.0000|||LC|[]|[]|||| +mind|3|0.0000|0.0000|||LC|[]|[]|||| +or|3|0.0000|0.0000|||LC|[]|[]|||| +Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| +sure|1|0.0000|0.0000|.||LC|[]|[]|||| +When|1|0.0000|0.0000|||UC|[]|[]|||| +I|1|0.0000|0.0000|||CA|[]|[]|||| +hear|1|0.0000|0.0000|||LC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +I|1|0.0000|0.0000|||CA|[]|[]|||| +think|1|0.0000|0.0000|||LC|[]|[]|||| +about|1|0.0000|0.0000|||LC|[]|[]|||| +just|1|0.0000|0.0000|||LC|[]|[]|||| +that|1|0.0000|0.0000|:||LC|[]|[]|||| +foo|1|0.0000|0.0000|||LC|[]|[]|||sub(Foobar)| +a|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.aligned.punc_case.nlp b/test/data/short.aligned.punc_case.nlp new file mode 100644 index 0000000..e8b58a9 --- /dev/null +++ b/test/data/short.aligned.punc_case.nlp @@ -0,0 +1,43 @@ +token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence +|2|0.0000|0.0000|||LC|[]|[]|||| +Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| +,|1|0.0000|0.0000|||UC|[]|[]|||| +yeah|1|||,||LC|[]|[]|||del| +,|1|||||LC|[]|[]|||del| +right|1|0.0000|0.0000|.||LC|[]|[]|||| +.|1|||||LC|[]|[]|||del| +Yeah|1|||,||UC|[]|[]|||del| +,|1|||||UC|[]|[]|||del| +all|1|||||LC|[]|[]|||del| +right|1|||,||LC|[]|[]|||del| +,|1|0.0000|0.0000|||LC|[]|[]|||sub(I'll)| +probably|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| +just|1|0.0000|0.0000|||LC|[]|[]|||| +that|1|0.0000|0.0000|.||LC|[]|[]|||| +.|1|0.0000|0.0000|||LC|[]|[]|||sub(?)| +Are|3|0.0000|0.0000|||UC|[]|[]|||| +there|3|0.0000|0.0000|||LC|[]|[]|||| +any|3|0.0000|0.0000|||LC|[]|[]|||| +visuals|3|0.0000|0.0000|||LC|[]|[]|||| +that|3|0.0000|0.0000|||LC|[]|[]|||| +come|3|0.0000|0.0000|||LC|[]|[]|||| +to|3|0.0000|0.0000|||LC|[]|[]|||| +mind|3|0.0000|0.0000|||LC|[]|[]|||| +or|3|0.0000|0.0000|||LC|[]|[]|||| +Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| +,|1|0.0000|0.0000|||UC|[]|[]|||| +sure|1|0.0000|0.0000|.||LC|[]|[]|||| +.|1|0.0000|0.0000|||LC|[]|[]|||| +When|1|0.0000|0.0000|||UC|[]|[]|||| +I|1|0.0000|0.0000|||CA|[]|[]|||| +hear|1|0.0000|0.0000|||LC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +,|1|0.0000|0.0000|||UC|[]|[]|||| +I|1|0.0000|0.0000|||CA|[]|[]|||| +think|1|0.0000|0.0000|||LC|[]|[]|||| +about|1|0.0000|0.0000|||LC|[]|[]|||| +just|1|0.0000|0.0000|||LC|[]|[]|||| +that|1|0.0000|0.0000|:||LC|[]|[]|||| +:|1|0.0000|0.0000|||LC|[]|[]|||| +foo|1|0.0000|0.0000|||LC|[]|[]|||sub(,)| +a|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index c0597fe..315d0c0 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -669,6 +669,26 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { REQUIRE_THAT(result, Contains("WER: INS:2 DEL:7 SUB:4")); } + SECTION("wer with case(nlp output)") { + const auto result = + exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-case"); + const auto testFile = std::string{TEST_DATA} + "short.aligned.case.nlp"; + + REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); + REQUIRE_THAT(result, Contains("WER: 6/32 = 0.1875")); + REQUIRE_THAT(result, Contains("WER: INS:0 DEL:3 SUB:3")); + } + + SECTION("wer with case and punctuation(nlp output)") { + const auto result = + exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, 
nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case"); + const auto testFile = std::string{TEST_DATA} + "short.aligned.punc_case.nlp"; + + REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); + REQUIRE_THAT(result, Contains("WER: 13/42 = 0.3095")); + REQUIRE_THAT(result, Contains("WER: INS:2 DEL:7 SUB:4")); + } + // alignment tests SECTION("align_1") { From 687397d7ac17d706442c34f1c66f45f01c071ee9 Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Mon, 9 Oct 2023 18:30:31 +0000 Subject: [PATCH 04/14] removing unnecessary private flag --- src/Nlp.cpp | 1 - src/Nlp.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index 6a216f1..f0fa7ae 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -24,7 +24,6 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma : FstLoader() { mJsonNorm = normalization; mWerSidecar = wer_sidecar; - mUsePunctuation = use_punctuation; mUseCase = use_case; std::string last_label; diff --git a/src/Nlp.h b/src/Nlp.h index f53683a..a9757f9 100644 --- a/src/Nlp.h +++ b/src/Nlp.h @@ -57,7 +57,7 @@ class NlpFstLoader : public FstLoader { Json::Value mWerSidecar; virtual const std::string &getToken(int index) const { return mToken.at(index); } private: - bool mUsePunctuation, mUseCase; + bool mUseCase; }; #endif /* NLP_H_ */ From 0288b5dba2a4332f1aab6f7fa00e3d1e6b13a64f Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Mon, 9 Oct 2023 20:19:28 +0000 Subject: [PATCH 05/14] CTM and Text use-case --- src/Ctm.cpp | 18 ++++++++++++------ src/Ctm.h | 4 +++- src/OneBestFstLoader.cpp | 19 +++++++++++++++---- src/OneBestFstLoader.h | 4 +++- src/main.cpp | 8 ++++---- 5 files changed, 37 insertions(+), 16 deletions(-) diff --git a/src/Ctm.cpp b/src/Ctm.cpp index 6353b19..e674e43 100644 --- a/src/Ctm.cpp +++ b/src/Ctm.cpp @@ -17,12 +17,16 @@ using namespace fst; /*************************************** CTM FST Loader Class Start ***************************************/ -CtmFstLoader::CtmFstLoader(vector &records) : FstLoader() { +CtmFstLoader::CtmFstLoader(vector &records, bool use_case) : FstLoader() { { mCtmRows = records; + mUseCase = use_case; for (auto &row : mCtmRows) { - std::string lower_cased = UnicodeLowercase(row.word); - mToken.push_back(lower_cased); + std::string token = std::string(row.word); + if (!mUseCase) { + token = UnicodeLowercase(row.word); + } + mToken.push_back(token); } } } @@ -51,13 +55,15 @@ StdVectorFst CtmFstLoader::convertToFst(const SymbolTable &symbol, std::vector wc && map[wc] > 0) { - transducer.AddArc(prevState, StdArc(symbol.Find(lower_cased), symbol.Find(lower_cased), 1.0f, nextState)); + transducer.AddArc(prevState, StdArc(symbol.Find(token), symbol.Find(token), 1.0f, nextState)); } else { - transducer.AddArc(prevState, StdArc(symbol.Find(lower_cased), symbol.Find(lower_cased), 0.0f, nextState)); + transducer.AddArc(prevState, StdArc(symbol.Find(token), symbol.Find(token), 0.0f, nextState)); } prevState = nextState; diff --git a/src/Ctm.h b/src/Ctm.h index efd008b..5d65bbd 100644 --- a/src/Ctm.h +++ b/src/Ctm.h @@ -27,13 +27,15 @@ struct RawCtmRecord { class CtmFstLoader : public FstLoader { public: - CtmFstLoader(std::vector &records); + CtmFstLoader(std::vector &records, bool use_case = false); ~CtmFstLoader(); vector mCtmRows; virtual void addToSymbolTable(fst::SymbolTable &symbol) const; virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; virtual const 
std::string &getToken(int index) const { return mToken.at(index); } + private: + bool mUseCase; }; class CtmReader { diff --git a/src/OneBestFstLoader.cpp b/src/OneBestFstLoader.cpp index 6b70a00..b705786 100644 --- a/src/OneBestFstLoader.cpp +++ b/src/OneBestFstLoader.cpp @@ -12,7 +12,9 @@ #include "utilities.h" // empty constructor -OneBestFstLoader::OneBestFstLoader() : FstLoader() {} +OneBestFstLoader::OneBestFstLoader(bool use_case) : FstLoader() { + mUseCase = use_case; +} void OneBestFstLoader::BuildFromString(const std::string content) { std::istringstream mystream(content); @@ -33,7 +35,10 @@ void OneBestFstLoader::LoadTextFile(const std::string filename) { void OneBestFstLoader::addToSymbolTable(fst::SymbolTable &symbol) const { for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { - std::string token = UnicodeLowercase(*i); + std::string token = *i; + if (!mUseCase) { + token = UnicodeLowercase(token); + } // fst::kNoSymbol if (symbol.Find(token) == -1) { symbol.AddSymbol(token); @@ -57,7 +62,10 @@ fst::StdVectorFst OneBestFstLoader::convertToFst(const fst::SymbolTable &symbol, int map_sz = map.size(); int wc = 0; for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { - std::string token = UnicodeLowercase(*i); + std::string token = *i; + if (!mUseCase) { + token = UnicodeLowercase(token); + } transducer.AddState(); int tk_idx = symbol.Find(token); @@ -92,7 +100,10 @@ std::vector OneBestFstLoader::convertToIntVector(fst::SymbolTable &symbol) FstAlignOption options; for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { - std::string token = UnicodeLowercase(*i); + std::string token = *i; + if (!mUseCase) { + token = UnicodeLowercase(token); + } int token_sym = symbol.Find(token); if (token_sym == -1) { token_sym = symbol.Find(options.symUnk); diff --git a/src/OneBestFstLoader.h b/src/OneBestFstLoader.h index b43a9e5..9be9fed 100644 --- a/src/OneBestFstLoader.h +++ b/src/OneBestFstLoader.h @@ -11,7 +11,7 @@ class OneBestFstLoader : public FstLoader { public: - OneBestFstLoader(); + OneBestFstLoader(bool use_case = false); virtual ~OneBestFstLoader(); void LoadTextFile(const std::string filename); void BuildFromString(const std::string content); @@ -21,6 +21,8 @@ class OneBestFstLoader : public FstLoader { virtual const std::string &getToken(int index) const { return mToken.at(index); } virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; int TokensSize() { return mToken.size(); } + private: + bool mUseCase; }; #endif /* ONEBESTFSTLOADER_H_ */ diff --git a/src/main.cpp b/src/main.cpp index 87c3e8f..d87ddbc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -273,7 +273,7 @@ std::unique_ptr FstLoader::MakeReferenceLoader(const std::string& ref console->info("reading reference ctm from {}", ref_filename); CtmReader ctmReader = CtmReader(); auto vect = ctmReader.read_from_disk(ref_filename); - return std::make_unique(vect); + return std::make_unique(vect, use_case); } else if (EndsWithCaseInsensitive(ref_filename, string(".fst"))) { if (!symbols_file_included) { console->error("a symbols file must be specified if reading an FST."); @@ -282,7 +282,7 @@ std::unique_ptr FstLoader::MakeReferenceLoader(const std::string& ref return std::make_unique(ref_filename); } else { console->info("reading reference plain text from {}", ref_filename); - auto oneBestFst = std::make_unique(); + auto oneBestFst = std::make_unique(use_case); oneBestFst->LoadTextFile(ref_filename); return oneBestFst; } @@ -338,7 +338,7 @@ 
std::unique_ptr FstLoader::MakeHypothesisLoader(const std::string& hy console->info("reading hypothesis ctm from {}", hyp_filename); CtmReader ctmReader = CtmReader(); auto vect = ctmReader.read_from_disk(hyp_filename); - return std::make_unique(vect); + return std::make_unique(vect, use_case); } else if (EndsWithCaseInsensitive(hyp_filename, string(".fst"))) { if (!symbols_file_included) { console->error("a symbols file must be specified if reading an FST."); @@ -347,7 +347,7 @@ std::unique_ptr FstLoader::MakeHypothesisLoader(const std::string& hy return std::make_unique(hyp_filename); } else { console->info("reading hypothesis plain text from {}", hyp_filename); - auto hypOneBest = std::make_unique(); + auto hypOneBest = std::make_unique(use_case); hypOneBest->LoadTextFile(hyp_filename); return hypOneBest; } From 97a4902d27d51b144f56c0a339789b4bb157792d Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Mon, 9 Oct 2023 21:03:12 +0000 Subject: [PATCH 06/14] adding tests --- test/data/align_1.aligned.punc_case.nlp | 15 ++++++++++++++ test/data/align_1.hyp.punc_case.ctm | 13 ++++++++++++ test/data/twenty.aligned.punc_case.nlp | 8 ++++++++ test/data/twenty.hyp.punc_case.txt | 2 ++ test/fstalign_Test.cc | 27 ++++++++++++++++++++++++- 5 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 test/data/align_1.aligned.punc_case.nlp create mode 100644 test/data/align_1.hyp.punc_case.ctm create mode 100644 test/data/twenty.aligned.punc_case.nlp create mode 100644 test/data/twenty.hyp.punc_case.txt diff --git a/test/data/align_1.aligned.punc_case.nlp b/test/data/align_1.aligned.punc_case.nlp new file mode 100644 index 0000000..2ff0ddf --- /dev/null +++ b/test/data/align_1.aligned.punc_case.nlp @@ -0,0 +1,15 @@ +token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence +a|1|1.0000|2.0000|||CA|[]|[]|||sub(A)| +b|1|3.0000|4.0000|||LC|[]|[]|||| +c|1|5.0000|6.0000|||LC|[]|[]|||| +d|1|7.0000|8.0000|,||LC|[]|[]|||| +,|1|7.0000|8.0000|||LC|[]|[]|||| +|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||sub()| +e|1|11.0000|12.0000|||LC|[]|[]|||| +f|1|13.0000|14.0000|||LC|[]|[]|||| +g|1|15.0000|16.0000|||LC|[]|[]|||| +h|1|17.0000|18.0000|||LC|[]|[]|||| +|1|||,||LC|[]|[]|||del| +,|1|||||LC|[]|[]|||del| +i|1|21.0000|22.0000|||LC|[]|[]|||sub(I)| +j|1|23.0000|24.0000|||LC|[]|[]|||sub(J)| diff --git a/test/data/align_1.hyp.punc_case.ctm b/test/data/align_1.hyp.punc_case.ctm new file mode 100644 index 0000000..4da53d8 --- /dev/null +++ b/test/data/align_1.hyp.punc_case.ctm @@ -0,0 +1,13 @@ +recording.wav 1 1 1 A +recording.wav 1 3 1 b +recording.wav 1 5 1 c +recording.wav 1 7 1 d +recording.wav 1 7 1 , +recording.wav 1 9 1 +recording.wav 1 11 1 e +recording.wav 1 11 1 . 
+recording.wav 1 13 1 f +recording.wav 1 15 1 g +recording.wav 1 17 1 h +recording.wav 1 21 1 I +recording.wav 1 23 1 J diff --git a/test/data/twenty.aligned.punc_case.nlp b/test/data/twenty.aligned.punc_case.nlp new file mode 100644 index 0000000..fa9542e --- /dev/null +++ b/test/data/twenty.aligned.punc_case.nlp @@ -0,0 +1,8 @@ +token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence +20|2|0.0000|0.0000|||CA|['1:CARDINAL']|['1']|84.6600|85.0600|sub(in)| +in|2|0.0000|0.0000|||LC|[]|[]|89.1600|89.2800|sub(Twenty)| +2020|2|0.0000|0.0000|||CA|['0:YEAR']|['0', '2']|89.7400|89.9900|sub(tHiRtY)| +is|2|0.0000|0.0000|||LC|[]|[]|89.1600|89.2800|| +one|2|0.0000|0.0000|||CA|['3:CARDINAL']|['3']|89.7400|89.9900|,push_last| +twenty|2|0.0000|0.0000|||LC|['3:CARDINAL']|['3']|89.7400|89.9900|sub(two),push_last| +three|2|0.0000|0.0000|||LC|['3:CARDINAL']|['3']|89.7400|89.9900|,push_last| diff --git a/test/data/twenty.hyp.punc_case.txt b/test/data/twenty.hyp.punc_case.txt new file mode 100644 index 0000000..9bbad89 --- /dev/null +++ b/test/data/twenty.hyp.punc_case.txt @@ -0,0 +1,2 @@ +in Twenty tHiRtY , is one TWENTY two three + diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index 315d0c0..4f0d509 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -679,7 +679,7 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { REQUIRE_THAT(result, Contains("WER: INS:0 DEL:3 SUB:3")); } - SECTION("wer with case and punctuation(nlp output)") { + SECTION("NLP Hypothesis: wer with case and punctuation(nlp output)") { const auto result = exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case"); const auto testFile = std::string{TEST_DATA} + "short.aligned.punc_case.nlp"; @@ -689,6 +689,31 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { REQUIRE_THAT(result, Contains("WER: INS:2 DEL:7 SUB:4")); } + SECTION("CTM Hypothesis: wer with case and punctuation(nlp output)") { + const auto result = + exec(command("wer", approach, "align_1.ref.nlp", "align_1.hyp.punc_case.ctm", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case"); + const auto testFile = std::string{TEST_DATA} + "align_1.aligned.punc_case.nlp"; + + REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); + REQUIRE_THAT(result, Contains("WER: 6/14 = 0.4286")); + REQUIRE_THAT(result, Contains("WER: INS:1 DEL:2 SUB:3")); + } + + SECTION("TXT Hypothesis: wer with case and punctuation(nlp output)") { + const auto result = + exec(command("wer", approach, "twenty.ref.testing.nlp", "twenty.hyp.punc_case.txt", sbs_output, nlp_output, TEST_SYNONYMS, + "twenty.ref.testing.norm.json")+" --use-punctuation --use-case"); + const auto testFile = std::string{TEST_DATA} + "twenty.aligned.punc_case.nlp"; + + REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); + REQUIRE_THAT(result, Contains("WER: 6/7 = 0.8571")); + REQUIRE_THAT(result, Contains("WER: INS:2 DEL:0 SUB:4")); + REQUIRE_THAT(result, Contains("Wer Entity ID 1 WER: 1/1 = 1.0000")); + REQUIRE_THAT(result, Contains("Wer Entity ID 0 WER: 1/1 = 1.0000")); + REQUIRE_THAT(result, Contains("Wer Entity ID 2 WER: 1/1 = 1.0000")); + REQUIRE_THAT(result, Contains("Wer Entity ID 3 WER: 2/3 = 0.6667")); + } + // alignment tests SECTION("align_1") { From 6ca1dba465c9d5cc61d268b49b3921d5b62465ab Mon Sep 17 00:00:00 2001 From: Miguel Angel Del Rio Fernandez Date: Mon, 9 Oct 2023 17:39:47 -0400 Subject: 
[PATCH 07/14] Update Advanced-Usage.md to include `use-punctuation` and `use-case` --- docs/Advanced-Usage.md | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/docs/Advanced-Usage.md b/docs/Advanced-Usage.md index 9443927..e036265 100644 --- a/docs/Advanced-Usage.md +++ b/docs/Advanced-Usage.md @@ -12,6 +12,23 @@ Much of the advanced features for fstalign come from providing [NLP file inputs] - Speaker-switch WER: similarly, fstalign will report the error rate of words around a speaker switch - The window size for the context of a speaker switch can be adjusted with the `--speaker-switch-context ` flag. By default this is set to 5. + +## Table of Contents +* [Inputs](#inputs) + * [CTM](#ctm) + * [NLP](#nlp) + * [FST](#fst) + * [Synonyms](#synonyms) + * [Normalizations](#normalizations) + * [WER Sidecar](#wer-sidecar) +* [Text Transforms](#text-transforms) + * [use-punctuation](#use-punctuation) + * [use-case](#use-case) +* [Outputs](#outputs) + * [Text Log](#text-log) + * [Side-by-side](#sbs) + * [JSON Log](#json-log) + * [Aligned NLP](#nlp-1) ## Inputs ### CTM @@ -83,6 +100,40 @@ CLI flag: `--wer-sidecar` Only usable for NLP format reference files. This passes a [WER sidecar](https://github.com/revdotcom/fstalign/blob/develop/docs//NLP-Format.md#wer-tag-sidecar) file to add extra information to some outputs. Optional. +## Text Transforms +In this section, we outline transforms that can be applied to input files. These will modify the handling of the files by `fstalign`. +### `use-punctuation` +Adding the `--use-punctuation` flag will treat punctuation from NLP files as individual tokens for `fstalign`. All other file formats that desire this behavior are expected to handle punctuation on their own by splitting it into separate tokens. + +The following files are equivalent with this flag set: + +**example.nlp** +``` +token|speaker|ts|endTs|punctuation|case|tags|wer_tags +Good|0||||UC|[]|[] +morning|0|||.|LC|['5:TIME']|['5'] +Welcome|0|||!|LC|[]|[] +``` + +**example.txt** +``` +good morning . welcome ! +``` + +_Note that when this flag is set, WER measures errors in both the words and the punctuation output by the ASR._ + +### `use-case` +Adding the `--use-case` flag will take a word's letter case into consideration. In other words, the same word with different letters capitalized will now be considered a different word. For example, consider the following: + +**Ref:** `Hi this is an example` + +**Hyp:** `hi THIS iS An ExAmPlE` + +Without this flag, `fstalign` considers these two strings to be equivalent, resulting in 0 errors. With `--use-case` set, none of these words would be equivalent because they have different letter cases. 
+ +_Note that WER when this flag is set, measures errors in the words output by the ASR, taking into account letter casing._ + + ## Outputs ### Text Log From 8584e558d9d3beecb07727f72f0c59e83166c6d8 Mon Sep 17 00:00:00 2001 From: Miguel Angel Del Rio Fernandez Date: Tue, 10 Oct 2023 10:05:59 -0400 Subject: [PATCH 08/14] Moving to "Documentation" style MD instead of "Advanced Usage" --- docs/{Advanced-Usage.md => Usage.md} | 89 +++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 15 deletions(-) rename docs/{Advanced-Usage.md => Usage.md} (77%) diff --git a/docs/Advanced-Usage.md b/docs/Usage.md similarity index 77% rename from docs/Advanced-Usage.md rename to docs/Usage.md index e036265..0e09ce9 100644 --- a/docs/Advanced-Usage.md +++ b/docs/Usage.md @@ -1,19 +1,9 @@ -## Advanced Usage -Much of the advanced features for fstalign come from providing [NLP file inputs](#NLP) to the references. Some of these features include: -- Entity category WER and normalization: based on labels in the NLP file, entities are grouped into classes in the WER output - - For example: if the NLP has `2020|0||||CA|['0:YEAR']|` you will see -```s -[+++] [22:36:50] [approach1] class YEAR         WER: 0/8 = 0.0000 -``` - - - Another useful feature here is normalization, which allows tokens with entity labels to have multiple normalizations accepted as correct by fstalign. This functionality is enabled when the tool is invoked with `--ref-json ` (passed in addition to the `--ref`). This enables something like `2020` to be treated equivalent to `twenty twenty`. More details on the specification for this file are specified in the [Inputs](#Inputs) section below. Note that only reference-side normalization is currently supported. - -- Speaker-wise WER: since the NLP file contains a speaker column, fstalign logs and output will provide a breakdown of WER by speaker ID if non-null - -- Speaker-switch WER: similarly, fstalign will report the error rate of words around a speaker switch - - The window size for the context of a speaker switch can be adjusted with the `--speaker-switch-context ` flag. By default this is set to 5. - +# Documentation ## Table of Contents +* [Quickstart](#quickstart) +* [Subcommands](#subcommands) + * [`wer`](#wer) + * [`align`](#align) * [Inputs](#inputs) * [CTM](#ctm) * [NLP](#nlp) @@ -29,6 +19,60 @@ Much of the advanced features for fstalign come from providing [NLP file inputs] * [Side-by-side](#sbs) * [JSON Log](#json-log) * [Aligned NLP](#nlp-1) +* [Advanced Usage](#advanced-usage) + +In this document, we outline the functions of `fstalign` and the features that make this tool unique. Please feel free to start an issue if any of this documentation is lacking / needs further clarification. + +## Quickstart +``` +Rev FST Align +Usage: ./fstalign [OPTIONS] [SUBCOMMAND] + +Options: + -h,--help Print this help message and exit + --help-all Expand all help + --version Show fstalign version. + +Subcommands: + wer Get the WER between a reference and an hypothesis. + align Produce an alignment between an NLP file and a CTM-like input. +``` +## Subcommands +### `wer` + +The wer subcommand is the most frequent usage of this tool. Required are two arguments traditional to WER calculation: a reference (`--ref `) and a hypothesis (`--hyp `) transcript. Currently the tool is configured to simply look at the file extension to determine the file format of the input transcripts and parse accordingly. 
+ +| File Extension | Reference Support | Hypothesis Support | +| ----------- | ----------- | ----------- | +| `.ctm` | :white_check_mark: | :white_check_mark: | +| `.nlp` | :white_check_mark: | :white_check_mark: | +| `.fst` | :white_check_mark: | :white_check_mark: | +| All other file extensions, assumed to be plain text | :white_check_mark: | :white_check_mark: | + +Basic Example: +``` +ref.txt +this is the best sentence + +hyp.txt +this is a test sentence + +./bin/fstalign wer --ref ref.txt --hyp hyp.txt +``` + +When run, fstalign will dump a log to STDOUT with summary WER information at the bottom. For the above example: +``` +[+++] [20:37:10] [fstalign] done walking the graph +[+++] [20:37:10] [wer] best WER: 2/5 = 0.4000 (Total words in reference: 5) +[+++] [20:37:10] [wer] best WER: INS:0 DEL:0 SUB:2 +[+++] [20:37:10] [wer] best WER: Precision:0.600000 Recall:0.600000 +``` + +Note that in addition to general WER, the insertion/deletion/substitution breakdown is also printed. fstalign also has other useful outputs, including a JSON log for downstream machine parsing, and a side-by-side view of the alignment and errors generated. For more details, see the [Outputs](#outputs) section below. + +### `align` +Usage of the `align` subcommand is almost identical to the `wer` subcommand. The exception is that `align` can only be run if the provided reference is an NLP and the provided hypothesis is a CTM. This is because the core function of the subcommand is to align an NLP without timestamps to a CTM that has timestamps, producing an output of tokens from the reference with timings from the hypothesis. + ## Inputs ### CTM @@ -221,3 +265,18 @@ The “bigrams” and “unigrams” fields are only populated with unigrams and CLI flag: `--output-nlp` Writes out the reference [NLP](https://github.com/revdotcom/fstalign/blob/develop/docs/NLP-Format.md), but with timings provided by a hypothesis CTM. Mostly relevant for the `align` subcommand. + +## Advanced Usage +Many of the advanced features for fstalign come from providing [NLP file inputs](#NLP) to the references. Some of these features include: +- Entity category WER and normalization: based on labels in the NLP file, entities are grouped into classes in the WER output + - For example: if the NLP has `2020|0||||CA|['0:YEAR']|` you will see +```s +[+++] [22:36:50] [approach1] class YEAR         WER: 0/8 = 0.0000 +``` + + - Another useful feature here is normalization, which allows tokens with entity labels to have multiple normalizations accepted as correct by fstalign. This functionality is enabled when the tool is invoked with `--ref-json ` (passed in addition to the `--ref`). This enables something like `2020` to be treated equivalent to `twenty twenty`. More details on the specification for this file are specified in the [Inputs](#Inputs) section below. Note that only reference-side normalization is currently supported. + +- Speaker-wise WER: since the NLP file contains a speaker column, fstalign logs and output will provide a breakdown of WER by speaker ID if non-null + +- Speaker-switch WER: similarly, fstalign will report the error rate of words around a speaker switch + - The window size for the context of a speaker switch can be adjusted with the `--speaker-switch-context ` flag. By default this is set to 5. 
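The case handling that the patches above thread through `NlpFstLoader`, `CtmFstLoader`, and `OneBestFstLoader` follows a single pattern: leave the token untouched when `use_case` is set, otherwise lowercase it before it goes into the symbol table. The following is a minimal standalone sketch of that pattern; `lowercase_ascii` and `normalize_tokens` are illustrative stand-ins (the project itself uses its `UnicodeLowercase` helper inside the loaders shown in the diffs), not actual fstalign code.

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for fstalign's UnicodeLowercase helper (ASCII-only here).
static std::string lowercase_ascii(std::string s) {
  std::transform(s.begin(), s.end(), s.begin(),
                 [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
  return s;
}

// Normalize a stream of tokens the way the patched loaders do:
// keep the original spelling when use_case is true, lowercase otherwise.
static std::vector<std::string> normalize_tokens(const std::vector<std::string> &raw,
                                                 bool use_case) {
  std::vector<std::string> out;
  out.reserve(raw.size());
  for (const auto &tk : raw) {
    out.push_back(use_case ? tk : lowercase_ascii(tk));
  }
  return out;
}

int main() {
  const std::vector<std::string> hyp = {"hi", "THIS", "iS", "An", "ExAmPlE"};
  for (const auto &tk : normalize_tokens(hyp, /*use_case=*/false)) std::cout << tk << ' ';
  std::cout << '\n';
  for (const auto &tk : normalize_tokens(hyp, /*use_case=*/true)) std::cout << tk << ' ';
  std::cout << '\n';
}
```

With `use_case` false, the hypothesis tokens from the `--use-case` example above collapse to lowercase and compare equal to the reference; with `use_case` true, every token keeps its original spelling and is treated as a distinct symbol.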
From 6be81ce1cb75e0f1456a7eea753ef7d6a6ed74d8 Mon Sep 17 00:00:00 2001 From: Miguel Angel Del Rio Fernandez Date: Tue, 10 Oct 2023 10:07:14 -0400 Subject: [PATCH 09/14] README points to documentation and removes explanations --- README.md | 54 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 408e31e..518e6b8 100644 --- a/README.md +++ b/README.md @@ -75,55 +75,5 @@ For development you can also build the docker image locally using: docker build . -t fstalign-dev ``` -## Quickstart -``` -Rev FST Align -Usage: ./fstalign [OPTIONS] [SUBCOMMAND] - -Options: - -h,--help Print this help message and exit - --help-all Expand all help - --version Show fstalign version. - -Subcommands: - wer Get the WER between a reference and an hypothesis. - align Produce an alignment between an NLP file and a CTM-like input. -``` - -### WER Subcommand - -The wer subcommand is the most frequent usage of this tool. Required are two arguments traditional to WER calculation: a reference (`--ref `) and a hypothesis (`--hyp `) transcript. Currently the tool is configured to simply look at the file extension to determine the file format of the input transcripts and parse accordingly. - -| File Extension | Reference Support | Hypothesis Supprt | -| ----------- | ----------- | ----------- | -| `.ctm` | :white_check_mark: | :white_check_mark: | -| `.nlp` | :white_check_mark: | :white_check_mark: | -| `.fst` | :white_check_mark: | :white_check_mark: | -| All other file extensions, assumed to be plain text | :white_check_mark: | :white_check_mark: | - -Basic Example: -``` -ref.txt -this is the best sentence - -hyp.txt -this is a test sentence - -./bin/fstalign wer --ref ref.txt --hyp hyp.txt -``` - -When run, fstalign will dump a log to STDOUT with summary WER information at the bottom. For the above example: -``` -[+++] [20:37:10] [fstalign] done walking the graph -[+++] [20:37:10] [wer] best WER: 2/5 = 0.4000 (Total words in reference: 5) -[+++] [20:37:10] [wer] best WER: INS:0 DEL:0 SUB:2 -[+++] [20:37:10] [wer] best WER: Precision:0.600000 Recall:0.600000 -``` - -Note that in addition to general WER, the insertion/deletion/substitution breakdown is also printed. fstalign also has other useful outputs, including a JSON log for downstream machine parsing, and a side-by-side view of the alignment and errors generated. For more details, see the [Outputs](https://github.com/revdotcom/fstalign/blob/develop/docs/Advanced-Usage.md#outputs) section in the [Advanced Usage](https://github.com/revdotcom/fstalign/blob/develop/docs/Advanced-Usage.md) doc. - -### Align Subcommand -Usage of the `align` subcommand is almost identical to the `wer` subcommand. The exception is that `align` can only be run if the provided reference is a NLP and the provided hypothesis is a CTM. This is because the core function of the subcommand is to align an NLP without timestamps to a CTM that has timestamps, producing an output of tokens from the reference with timings from the hypothesis. - -## Advanced Usage -See [the advanced usage doc](https://github.com/revdotcom/fstalign/blob/develop/docs/Advanced-Usage.md) for more details. +## Documentation +For more information on how to use `fstalign` see our [documentation](https://github.com/revdotcom/fstalign/blob/develop/docs/Usage.md) for more details. 
From 373d004304df102a6904679a06c91afb58fd7263 Mon Sep 17 00:00:00 2001 From: Miguel Angel Del Rio Fernandez Date: Tue, 10 Oct 2023 10:07:44 -0400 Subject: [PATCH 10/14] Updating ToC --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 518e6b8..80a48e3 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,7 @@ * [Dependencies](#Dependencies) * [Build](#Build) * [Docker](#Docker) -- [Quickstart](#Quickstart) - * [WER Subcommand](#WER-Subcommand) - * [Align Subcommand](#Align-Subcommand) -- [Advanced Usage](#Advanced-Usage) +- [Documentation](#Documentation) ## Overview `fstalign` is a tool for creating alignment between two sequences of tokens (here out referred to as “reference” and “hypothesis”). It has two key functions: computing word error rate (WER) and aligning [NLP-formatted](https://github.com/revdotcom/fstalign/blob/develop/docs/NLP-Format.md) references with CTM hypotheses. From 9aba0da48b55d704f3216407f2dca3354563fc21 Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Tue, 10 Oct 2023 21:20:17 +0000 Subject: [PATCH 11/14] moving code pushing punctuation out of else --- src/Nlp.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index f0fa7ae..65578b2 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -38,6 +38,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma auto curr_label_id = row.best_label_id; auto punctuation = row.punctuation; auto curr_row_tags = row.wer_tags; + // Update wer tags in records to real string labels vector real_wer_tags; for (auto &tag: curr_row_tags) { @@ -88,16 +89,15 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma } mToken.push_back(curr_tk); mSpeakers.push_back(speaker); - if (use_punctuation && punctuation != "") { - mToken.push_back(punctuation); - mSpeakers.push_back(speaker); - RawNlpRecord nlp_row = row; - nlp_row.token = nlp_row.punctuation; - nlp_row.punctuation = ""; - mNlpRows.push_back(nlp_row); - } } - + if (use_punctuation && punctuation != "") { + mToken.push_back(punctuation); + mSpeakers.push_back(speaker); + RawNlpRecord punc_row = row; + punc_row.token = punc_row.punctuation; + punc_row.punctuation = ""; + mNlpRows.push_back(punc_row); + } firstTk = false; last_label = curr_label; } From 1a9d4d4140e90c507cf9433740b1d0decc546b93 Mon Sep 17 00:00:00 2001 From: qmac Date: Wed, 11 Oct 2023 14:44:51 -0500 Subject: [PATCH 12/14] patch to entity treatment of punctuation --- src/Nlp.cpp | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index 65578b2..c0dce53 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -15,12 +15,11 @@ /*********************************** NLP FstLoader class start ************************************/ -NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, - Json::Value wer_sidecar) +NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar) : NlpFstLoader(records, normalization, wer_sidecar, true) {} -NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, - Json::Value wer_sidecar, bool processLabels, bool use_punctuation, bool use_case) +NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar, + bool processLabels, bool use_punctuation, bool use_case) : FstLoader() { mJsonNorm = normalization; mWerSidecar = wer_sidecar; @@ -29,7 +28,6 @@ 
NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma std::string last_label; bool firstTk = true; - // fuse multiple rows that have the same id/label into one entry only for (auto &row : records) { mNlpRows.push_back(row); @@ -41,10 +39,10 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma // Update wer tags in records to real string labels vector real_wer_tags; - for (auto &tag: curr_row_tags) { + for (auto &tag : curr_row_tags) { auto real_tag = tag; if (mWerSidecar != Json::nullValue) { - real_tag = "###"+ real_tag + "_" + mWerSidecar[real_tag]["entity_type"].asString() + "###"; + real_tag = "###" + real_tag + "_" + mWerSidecar[real_tag]["entity_type"].asString() + "###"; } real_wer_tags.push_back(real_tag); } @@ -85,7 +83,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma } } else { if (!mUseCase) { - curr_tk = UnicodeLowercase(curr_tk); + curr_tk = UnicodeLowercase(curr_tk); } mToken.push_back(curr_tk); mSpeakers.push_back(speaker); @@ -93,7 +91,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma if (use_punctuation && punctuation != "") { mToken.push_back(punctuation); mSpeakers.push_back(speaker); - RawNlpRecord punc_row = row; + RawNlpRecord punc_row; punc_row.token = punc_row.punctuation; punc_row.punctuation = ""; mNlpRows.push_back(punc_row); @@ -341,19 +339,20 @@ std::vector NlpReader::read_from_disk(const std::string &filename) std::vector vect; io::CSVReader<13, io::trim_chars<' ', '\t'>, io::no_quote_escape<'|'>> input_nlp(filename); // token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|ali_comment|oldTs|oldEndTs - input_nlp.read_header(io::ignore_missing_column | io::ignore_extra_column, - "token", "speaker", "ts", "endTs", "punctuation", "prepunctuation", - "case", "tags", "wer_tags", "ali_comment", "oldTs", "oldEndTs", "confidence"); - - std::string token, speaker, ts, endTs, punctuation, prepunctuation, casing, tags, wer_tags, ali_comment, oldTs, oldEndTs, confidence; - while (input_nlp.read_row(token, speaker, ts, endTs, punctuation, prepunctuation, casing, tags, wer_tags, ali_comment, oldTs, - oldEndTs, confidence)) { + input_nlp.read_header(io::ignore_missing_column | io::ignore_extra_column, "token", "speaker", "ts", "endTs", + "punctuation", "prepunctuation", "case", "tags", "wer_tags", "ali_comment", "oldTs", "oldEndTs", + "confidence"); + + std::string token, speaker, ts, endTs, punctuation, prepunctuation, casing, tags, wer_tags, ali_comment, oldTs, + oldEndTs, confidence; + while (input_nlp.read_row(token, speaker, ts, endTs, punctuation, prepunctuation, casing, tags, wer_tags, ali_comment, + oldTs, oldEndTs, confidence)) { RawNlpRecord record; record.speakerId = speaker; record.casing = casing; record.punctuation = punctuation; if (input_nlp.has_column("prepunctuation")) { - record.prepunctuation = prepunctuation; + record.prepunctuation = prepunctuation; } record.ts = ts; record.endTs = endTs; @@ -365,7 +364,7 @@ std::vector NlpReader::read_from_disk(const std::string &filename) record.wer_tags = GetWerTags(wer_tags); } if (input_nlp.has_column("confidence")) { - record.confidence = confidence; + record.confidence = confidence; } vect.push_back(record); } From c883ac33a609260c575e38351a3ab826d0c2096a Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Wed, 11 Oct 2023 21:04:33 +0000 Subject: [PATCH 13/14] updating tests --- src/Nlp.cpp | 1 + test/data/align_1.aligned.punc_case.nlp | 7 ++++--- test/data/short.aligned.punc.nlp | 20 ++++++++++---------- 
test/data/short.aligned.punc_case.nlp | 20 ++++++++++---------- test/fstalign_Test.cc | 4 ++-- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index c0dce53..06cdfe8 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -93,6 +93,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma mSpeakers.push_back(speaker); RawNlpRecord punc_row; punc_row.token = punc_row.punctuation; + punc_row.speakerId = speaker; punc_row.punctuation = ""; mNlpRows.push_back(punc_row); } diff --git a/test/data/align_1.aligned.punc_case.nlp b/test/data/align_1.aligned.punc_case.nlp index 2ff0ddf..a614c37 100644 --- a/test/data/align_1.aligned.punc_case.nlp +++ b/test/data/align_1.aligned.punc_case.nlp @@ -3,13 +3,14 @@ a|1|1.0000|2.0000|||CA|[]|[]|||sub(A)| b|1|3.0000|4.0000|||LC|[]|[]|||| c|1|5.0000|6.0000|||LC|[]|[]|||| d|1|7.0000|8.0000|,||LC|[]|[]|||| -,|1|7.0000|8.0000|||LC|[]|[]|||| +,|1|7.0000|8.0000|||||[]|||| |1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||sub()| -e|1|11.0000|12.0000|||LC|[]|[]|||| +.|1|11.0000|12.0000|||||[]|||sub(e)| +e|1|11.0000|12.0000|||LC|[]|[]|||sub(.)| f|1|13.0000|14.0000|||LC|[]|[]|||| g|1|15.0000|16.0000|||LC|[]|[]|||| h|1|17.0000|18.0000|||LC|[]|[]|||| |1|||,||LC|[]|[]|||del| -,|1|||||LC|[]|[]|||del| +,|1|||||||[]|||del| i|1|21.0000|22.0000|||LC|[]|[]|||sub(I)| j|1|23.0000|24.0000|||LC|[]|[]|||sub(J)| diff --git a/test/data/short.aligned.punc.nlp b/test/data/short.aligned.punc.nlp index 8cfc53e..421d1ac 100644 --- a/test/data/short.aligned.punc.nlp +++ b/test/data/short.aligned.punc.nlp @@ -1,20 +1,20 @@ token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence |2|0.0000|0.0000|||LC|[]|[]|||| Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| -,|1|0.0000|0.0000|||UC|[]|[]|||| +,|1|0.0000|0.0000|||||[]|||| yeah|1|||,||LC|[]|[]|||del| -,|1|||||LC|[]|[]|||del| +,|1|||||||[]|||del| right|1|0.0000|0.0000|.||LC|[]|[]|||| -.|1|||||LC|[]|[]|||del| +.|1|||||||[]|||del| Yeah|1|||,||UC|[]|[]|||del| -,|1|||||UC|[]|[]|||del| +,|1|||||||[]|||del| all|1|||||LC|[]|[]|||del| right|1|||,||LC|[]|[]|||del| -,|1|0.0000|0.0000|||LC|[]|[]|||sub(i'll)| +,|1|0.0000|0.0000|||||[]|||sub(i'll)| probably|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| just|1|0.0000|0.0000|||LC|[]|[]|||| that|1|0.0000|0.0000|.||LC|[]|[]|||| -.|1|0.0000|0.0000|||LC|[]|[]|||sub(?)| +.|1|0.0000|0.0000|||||[]|||sub(?)| Are|3|0.0000|0.0000|||UC|[]|[]|||| there|3|0.0000|0.0000|||LC|[]|[]|||| any|3|0.0000|0.0000|||LC|[]|[]|||| @@ -25,19 +25,19 @@ to|3|0.0000|0.0000|||LC|[]|[]|||| mind|3|0.0000|0.0000|||LC|[]|[]|||| or|3|0.0000|0.0000|||LC|[]|[]|||| Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| -,|1|0.0000|0.0000|||UC|[]|[]|||| +,|1|0.0000|0.0000|||||[]|||| sure|1|0.0000|0.0000|.||LC|[]|[]|||| -.|1|0.0000|0.0000|||LC|[]|[]|||| +.|1|0.0000|0.0000|||||[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| -,|1|0.0000|0.0000|||UC|[]|[]|||| +,|1|0.0000|0.0000|||||[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| about|1|0.0000|0.0000|||LC|[]|[]|||| just|1|0.0000|0.0000|||LC|[]|[]|||| that|1|0.0000|0.0000|:||LC|[]|[]|||| -:|1|0.0000|0.0000|||LC|[]|[]|||| +:|1|0.0000|0.0000|||||[]|||| foo|1|0.0000|0.0000|||LC|[]|[]|||sub(,)| a|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.aligned.punc_case.nlp b/test/data/short.aligned.punc_case.nlp index e8b58a9..affb08c 100644 --- a/test/data/short.aligned.punc_case.nlp +++ 
b/test/data/short.aligned.punc_case.nlp @@ -1,20 +1,20 @@ token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence |2|0.0000|0.0000|||LC|[]|[]|||| Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| -,|1|0.0000|0.0000|||UC|[]|[]|||| +,|1|0.0000|0.0000|||||[]|||| yeah|1|||,||LC|[]|[]|||del| -,|1|||||LC|[]|[]|||del| +,|1|||||||[]|||del| right|1|0.0000|0.0000|.||LC|[]|[]|||| -.|1|||||LC|[]|[]|||del| +.|1|||||||[]|||del| Yeah|1|||,||UC|[]|[]|||del| -,|1|||||UC|[]|[]|||del| +,|1|||||||[]|||del| all|1|||||LC|[]|[]|||del| right|1|||,||LC|[]|[]|||del| -,|1|0.0000|0.0000|||LC|[]|[]|||sub(I'll)| +,|1|0.0000|0.0000|||||[]|||sub(I'll)| probably|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| just|1|0.0000|0.0000|||LC|[]|[]|||| that|1|0.0000|0.0000|.||LC|[]|[]|||| -.|1|0.0000|0.0000|||LC|[]|[]|||sub(?)| +.|1|0.0000|0.0000|||||[]|||sub(?)| Are|3|0.0000|0.0000|||UC|[]|[]|||| there|3|0.0000|0.0000|||LC|[]|[]|||| any|3|0.0000|0.0000|||LC|[]|[]|||| @@ -25,19 +25,19 @@ to|3|0.0000|0.0000|||LC|[]|[]|||| mind|3|0.0000|0.0000|||LC|[]|[]|||| or|3|0.0000|0.0000|||LC|[]|[]|||| Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| -,|1|0.0000|0.0000|||UC|[]|[]|||| +,|1|0.0000|0.0000|||||[]|||| sure|1|0.0000|0.0000|.||LC|[]|[]|||| -.|1|0.0000|0.0000|||LC|[]|[]|||| +.|1|0.0000|0.0000|||||[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| -,|1|0.0000|0.0000|||UC|[]|[]|||| +,|1|0.0000|0.0000|||||[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| about|1|0.0000|0.0000|||LC|[]|[]|||| just|1|0.0000|0.0000|||LC|[]|[]|||| that|1|0.0000|0.0000|:||LC|[]|[]|||| -:|1|0.0000|0.0000|||LC|[]|[]|||| +:|1|0.0000|0.0000|||||[]|||| foo|1|0.0000|0.0000|||LC|[]|[]|||sub(,)| a|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index 4f0d509..4b23e63 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -695,8 +695,8 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { const auto testFile = std::string{TEST_DATA} + "align_1.aligned.punc_case.nlp"; REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); - REQUIRE_THAT(result, Contains("WER: 6/14 = 0.4286")); - REQUIRE_THAT(result, Contains("WER: INS:1 DEL:2 SUB:3")); + REQUIRE_THAT(result, Contains("WER: 7/15 = 0.4667")); + REQUIRE_THAT(result, Contains("WER: INS:0 DEL:2 SUB:5")); } SECTION("TXT Hypothesis: wer with case and punctuation(nlp output)") { From be7072fe8b844bcb6b4b394dc6c6b57178aa678a Mon Sep 17 00:00:00 2001 From: Miguel Del Rio Date: Thu, 19 Oct 2023 14:40:45 +0000 Subject: [PATCH 14/14] updating tests to reflect unk changes --- docs/Usage.md | 2 +- src/fstalign.cpp | 9 --------- test/data/align_1.aligned.punc_case.nlp | 2 +- test/data/align_1.ref.aligned.nlp | 2 +- test/data/noise_1.hyp2.aligned | 4 ++-- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/docs/Usage.md b/docs/Usage.md index 0e09ce9..3f6a540 100644 --- a/docs/Usage.md +++ b/docs/Usage.md @@ -107,7 +107,7 @@ must also be disabled with `--disable-approx-alignment`. ### Synonyms Synonyms allow for reference words to be equivalent to similar forms (determined by the user) for error counting. They are accepted for any input formats and passed into the tool via the `--syn ` flag. For details see [Synonyms Format](https://github.com/revdotcom/fstalign/blob/develop/docs/Synonyms-Format.md). 
A standard set of synonyms we use at Rev.ai is available in the repository under `sample_data/synonyms.rules.txt`. -In addition to allowing for custom synonyms to be passed in via CLI, fstalign also automatically generates synonyms based on the reference and hypothesis text. Currently, it does this for two cases: cutoff words (hello-) and compound hyphenated words (long-term). In both cases, a synonym is dynamically generated with the hyphen removed. Both of these synonym types can be disabled through the CLI by passing in `--disable-cutoffs` and `--disable-hyphen-ignore`, respectively. +In addition to allowing for custom synonyms to be passed in via CLI, fstalign also automatically generates synonyms based on the reference and hypothesis text. Currently, it does this for three cases: cutoff words (e.g. hello-), compound hyphenated words (e.g. long-term), and tags or codes that follow the regular expression: `<.*>` (e.g. ). In the first two cases, a synonym is dynamically generated with the hyphen removed. Both of these synonym types can be disabled through the CLI by passing in `--disable-cutoffs` and `--disable-hyphen-ignore`, respectively. For the last case of tags, we will automatically allow for `` to be a valid synonym -- currently, this feature cannot be turned off. ### Normalizations Normalizations are a similar concept to synonyms. They allow a token or group of tokens to be represented by alternatives when calculating the WER alignment. Unlike synonyms, they are only accepted for NLP file inputs where the tokens are tagged with a unique ID. The normalizations are specified in a JSON format, with the unique ID as keys. Example to illustrate the schema: diff --git a/src/fstalign.cpp b/src/fstalign.cpp index 206e31a..e7cc2fb 100644 --- a/src/fstalign.cpp +++ b/src/fstalign.cpp @@ -594,15 +594,6 @@ void write_stitches_to_nlp(vector& stitches, ofstream &output_nlp_fil logger->warn("an unnormalized token was found: {}", ref_tk); } } else if (IsNoisecodeToken(original_nlp_token)) { - // if we have a noisecode <.*> in the nlp token, we inject it here - if (stitch.comment.length() == 0) { - if (ref_tk == DEL || ref_tk == "") { - stitch.comment = "sub()"; - } else { - stitch.comment = "sub(" + ref_tk + ")"; - } - } - ref_tk = original_nlp_token; } else if (stitch.comment.find("ins") == 0) { assert(add_inserts); diff --git a/test/data/align_1.aligned.punc_case.nlp b/test/data/align_1.aligned.punc_case.nlp index a614c37..893829e 100644 --- a/test/data/align_1.aligned.punc_case.nlp +++ b/test/data/align_1.aligned.punc_case.nlp @@ -4,7 +4,7 @@ b|1|3.0000|4.0000|||LC|[]|[]|||| c|1|5.0000|6.0000|||LC|[]|[]|||| d|1|7.0000|8.0000|,||LC|[]|[]|||| ,|1|7.0000|8.0000|||||[]|||| -|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||sub()| +|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||| .|1|11.0000|12.0000|||||[]|||sub(e)| e|1|11.0000|12.0000|||LC|[]|[]|||sub(.)| f|1|13.0000|14.0000|||LC|[]|[]|||| diff --git a/test/data/align_1.ref.aligned.nlp b/test/data/align_1.ref.aligned.nlp index a910d24..40cff43 100644 --- a/test/data/align_1.ref.aligned.nlp +++ b/test/data/align_1.ref.aligned.nlp @@ -3,7 +3,7 @@ a|1|1.0000|2.0000|||CA|[]|[]|||| b|1|3.0000|4.0000|||LC|[]|[]|||| c|1|5.0000|6.0000|||LC|[]|[]|||| d|1|7.0000|8.0000|,||LC|[]|[]|||| -|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||sub()| +|1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||| e|1|11.0000|12.0000|||LC|[]|[]|||| f|1|13.0000|14.0000|||LC|[]|[]|||| g|1|15.0000|16.0000|||LC|[]|[]|||| diff --git a/test/data/noise_1.hyp2.aligned b/test/data/noise_1.hyp2.aligned 
index 1fe99ac..f2449ef 100644 --- a/test/data/noise_1.hyp2.aligned +++ b/test/data/noise_1.hyp2.aligned @@ -3,11 +3,11 @@ a|1|1.0000|2.0000|||CA|[]|[]|||| b|1|3.0000|4.0000|||LC|[]|[]|||| c|1|5.0000|6.0000|||LC|[]|[]|||| d|1|7.0000|8.0000|,||LC|[]|[]|||| -|1|9.0000|10.0000|,||LC|[]|[]|||sub()| +|1|9.0000|10.0000|,||LC|[]|[]|||| e|1|11.0000|12.0000|||LC|[]|[]|||| F|1|13.0000|14.0000|||LC|[]|[]|||| G|1|15.0000|16.0000|||LC|[]|[]|||| h|1|17.0000|18.0000|||LC|[]|[]|||| -|1|19.0000|20.0000|,||LC|[]|[]|||sub()| +|1|19.0000|20.0000|,||LC|[]|[]|||| i|1|21.0000|22.0000|||LC|[]|[]|||| j|1|23.0000|24.0000|||LC|[]|[]||||
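A closing note on the tag handling that the last patch documents: docs/Usage.md now states that tokens matching the regular expression `<.*>` automatically receive a synonym, and the noisecode-specific substitution comments were dropped from `write_stitches_to_nlp`. The sketch below only illustrates what a `<.*>`-shaped check looks like in isolation; it is not fstalign's `IsNoisecodeToken` implementation, and the sample tokens are made up.

```cpp
#include <iostream>
#include <regex>
#include <string>

// Illustrative check for "tag"/noise-code tokens of the <.*> shape
// mentioned in docs/Usage.md; not the project's IsNoisecodeToken().
static bool looks_like_tag(const std::string &token) {
  static const std::regex tag_re("<.*>");
  return std::regex_match(token, tag_re);  // full-token match only
}

int main() {
  for (const std::string token : {"<laugh>", "hello", "a<b>c"}) {
    std::cout << token << " -> " << (looks_like_tag(token) ? "tag" : "word") << '\n';
  }
}
```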