Adding ILM beam search and decoding #1291

Open · wants to merge 7 commits into base: master
121 changes: 98 additions & 23 deletions egs/librispeech/ASR/RESULTS.md
@@ -1,5 +1,103 @@
## Results

### zipformer hybrid autoregressive transducer (HAT)

See <https://github.com/k2-fsa/icefall/pull/1291> for more details.

[zipformer_hat](./zipformer_hat)
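
The HAT model factorizes the transducer output into a sigmoid-based blank probability and a softmax over non-blank labels, which is what makes its internal language model straightforward to estimate and subtract. Below is a minimal sketch of that factorization; the function and tensor names are illustrative and not necessarily those used in `zipformer_hat`:

```python
import torch
import torch.nn.functional as F


def hat_log_probs(joiner_logits: torch.Tensor) -> torch.Tensor:
    """Illustrative HAT output layer (not the exact zipformer_hat code).

    joiner_logits: (..., V) raw joiner outputs, where index 0 is blank.
    Returns log-probs of the same shape where
      log p(blank)   = log sigmoid(blank_logit)
      log p(label k) = log(1 - sigmoid(blank_logit)) + log_softmax(label_logits)[k]
    """
    blank_logit = joiner_logits[..., :1]    # (..., 1)
    label_logits = joiner_logits[..., 1:]   # (..., V-1)

    log_p_blank = F.logsigmoid(blank_logit)
    # log(1 - sigmoid(x)) == logsigmoid(-x)
    log_p_labels = F.logsigmoid(-blank_logit) + label_logits.log_softmax(dim=-1)

    return torch.cat([log_p_blank, log_p_labels], dim=-1)
```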

Results with RNNLM shallow fusion and internal language model (ILM) subtraction, where the RNNLM is trained on the same data (LibriSpeech):

| Model | Train | Decode | LM scale | ILM scale | test-clean | test-other |
| -- | -- | -- | -- | -- | -- | -- |
| Zipformer-HAT | train-960 | greedy_search | - | - | 2.22 | 5.01 |
| | | modified_beam_search | 0 | 0 | 2.18 | 4.96 |
| | | + RNNLM shallow fusion | 0.29 | 0 | 1.96 | 4.55 |
| | | - ILME | 0.29 | 0.1 | 1.95 | 4.55 |
| | | - ILME | 0.29 | 0.3 | 1.97 | 4.50 |

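During `modified_beam_search` with shallow fusion and ILM subtraction, each candidate token's score roughly combines the HAT label log-probability, the external RNNLM, and the subtracted internal LM, with weights corresponding to the LM scale and ILM scale columns above. A hedged sketch of the per-token score (variable names are illustrative, not the exact icefall code):

```python
def fused_token_score(
    hat_logp: float,
    rnnlm_logp: float,
    ilm_logp: float,
    lm_scale: float = 0.29,
    ilm_scale: float = 0.1,
) -> float:
    """Illustrative combination of scores for one candidate token k.

    hat_logp:   log p_HAT(k | x, y)   label log-prob from the HAT model
    rnnlm_logp: log p_LM(k | y)       external RNNLM (shallow fusion, --lm-scale)
    ilm_logp:   log p_ILM(k | y)      estimated internal LM (--ilm-scale)
    """
    return hat_logp + lm_scale * rnnlm_logp - ilm_scale * ilm_logp
```
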
The training command is:

```bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./zipformer_hat/train.py \
--world-size 4 \
--num-epochs 40 \
--lr-batches 5000 \
--lr-epochs 4 \
--exp-dir ./zipformer_hat/exp \
--use-fp16 1 \
--bpe-model data/lang_bpe_500/bpe.model \
--max-duration 800 \
--causal 0 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--use-transducer 1
```

The decoding command is:
```bash
export CUDA_VISIBLE_DEVICES="0"
for method in modified_beam_search greedy_search; do
./zipformer_hat/decode.py \
--epoch 40 --avg 16 --use-averaged-model True \
--beam-size 4 \
--exp-dir ./zipformer_hat/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--max-contexts 4 \
--max-states 8 \
--max-duration 600 \
--decoding-method $method
done
```
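
For reference, greedy search on a HAT model can be pictured as emitting a non-blank label on a frame only when the blank probability drops below 0.5. A simplified single-utterance sketch (at most one symbol per frame, blank assumed to be index 0, and `decoder`/`joiner` are hypothetical module names standing in for the actual prediction and joint networks):

```python
import torch


@torch.no_grad()
def hat_greedy_search(encoder_out: torch.Tensor, decoder, joiner) -> list:
    """Simplified greedy search for a HAT model on one utterance.

    encoder_out: (T, C) encoder frames.
    decoder, joiner: hypothetical prediction and joint networks.
    Emits at most one non-blank symbol per frame, only when p(blank) < 0.5.
    """
    blank_id = 0  # assumption: blank is the first output unit
    hyp = []
    dec_out = decoder(torch.tensor([[blank_id]]))      # (1, 1, D), start context
    for t in range(encoder_out.size(0)):
        enc_t = encoder_out[t : t + 1].unsqueeze(0)    # (1, 1, C)
        logits = joiner(enc_t, dec_out).squeeze()      # (V,)
        if torch.sigmoid(logits[blank_id]) >= 0.5:
            continue                                   # blank: move to next frame
        label = int(logits[1:].argmax()) + 1           # best non-blank label
        hyp.append(label)
        dec_out = decoder(torch.tensor([[label]]))     # update label context
    return hyp
```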

The decoding command with RNNLM shallow fusion and ILM subtraction is:
```bash
for method in modified_beam_search_auxlm_shallow_fusion; do
./zipformer_hat/decode.py \
--epoch 40 --avg 16 --use-averaged-model True \
--beam-size 4 \
--exp-dir ./zipformer_hat/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--max-contexts 4 \
--max-states 8 \
--max-duration 800 \
--decoding-method $method \
--subtract-ilm True \
--ilm-scale 0.1 \
--use-shallow-fusion 1 \
--lm-type rnn \
--lm-exp-dir rnn_lm/exp \
--lm-epoch 25 \
--lm-scale 0.45 \
--lm-avg 5 \
--lm-vocab-size 500 \
--rnn-lm-embedding-dim 512 \
--rnn-lm-hidden-dim 512 \
--rnn-lm-num-layers 2
done
```
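
The ILM score that `--subtract-ilm` removes is commonly estimated by running the prediction network and joiner with the acoustic (encoder) contribution zeroed out, so that only the label history drives the output distribution. A self-contained sketch under that assumption, again with hypothetical `decoder`/`joiner` names rather than the exact zipformer_hat modules:

```python
import torch


@torch.no_grad()
def ilm_log_probs(decoder, joiner, ys: torch.Tensor) -> torch.Tensor:
    """Illustrative internal-LM estimation for a HAT/transducer model.

    decoder: prediction network, label history (N, U) -> (N, U, D)
    joiner:  joint network, (encoder_out, decoder_out) -> (N, U, V)
    ys:      (N, U) label-history token ids

    The encoder contribution is replaced by zeros, so the resulting
    distribution depends on the label history only; it is renormalized
    over non-blank labels (blank assumed to be index 0).
    """
    decoder_out = decoder(ys)                      # (N, U, D)
    zero_enc = torch.zeros_like(decoder_out)       # stand-in for encoder output
    logits = joiner(zero_enc, decoder_out)         # (N, U, V)
    return logits[..., 1:].log_softmax(dim=-1)     # drop blank, renormalize
```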

### zipformer (zipformer + CTC/AED)

See <https://github.com/k2-fsa/icefall/pull/1389> for more details.
@@ -155,29 +253,6 @@
export CUDA_VISIBLE_DEVICES="0,1,2,3"
--master-port 12345
```

The decoding command is:
```bash
export CUDA_VISIBLE_DEVICES="0"
for m in ctc-decoding attention-decoder-rescoring-no-ngram; do
./zipformer/ctc_decode.py \
--epoch 50 \
--avg 29 \
--exp-dir zipformer/exp-large \
--use-ctc 1 \
--use-transducer 0 \
--use-attention-decoder 1 \
--attention-decoder-loss-scale 0.9 \
--num-encoder-layers 2,2,4,5,4,2 \
--feedforward-dim 512,768,1536,2048,1536,768 \
--encoder-dim 192,256,512,768,512,256 \
--encoder-unmasked-dim 192,192,256,320,256,192 \
--max-duration 100 \
--causal 0 \
--num-paths 100 \
--decoding-method $m
done
```


### zipformer (zipformer + pruned stateless transducer + CTC)

1 change: 1 addition & 0 deletions egs/librispeech/ASR/zipformer_hat/asr_datamodule.py