Commit 4c25b50

Merge branch 'master' into dev/bilingual

JinZr committed Nov 23, 2023
2 parents de71456 + 238b45b
Showing 54 changed files with 3,723 additions and 54 deletions.
5 changes: 2 additions & 3 deletions egs/aishell/ASR/prepare.sh
@@ -261,10 +261,9 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   fi
 
   if [ ! -f $lang_char_dir/HLG.fst ]; then
-    lang_phone_dir=data/lang_phone
     ./local/prepare_lang_fst.py \
-      --lang-dir $lang_phone_dir \
-      --ngram-G ./data/lm/G_3_gram.fst.txt
+      --lang-dir $lang_char_dir \
+      --ngram-G ./data/lm/G_3_gram_char.fst.txt
   fi
 fi
 

2 changes: 1 addition & 1 deletion egs/aishell/ASR/pruned_transducer_stateless7/decode.py
@@ -641,7 +641,7 @@ def main():
                     contexts_text.append(line.strip())
             contexts = graph_compiler.texts_to_ids(contexts_text)
             context_graph = ContextGraph(params.context_score)
-            context_graph.build(contexts)
+            context_graph.build([(c, 0.0) for c in contexts])
         else:
             context_graph = None
     else:
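
The same `ContextGraph` change recurs in another `decode.py` hunk below. For reference, a minimal sketch of the new call shape, assuming icefall's `icefall.context_graph.ContextGraph`; the token ids and the score value are made-up stand-ins for what `graph_compiler.texts_to_ids()` and `params.context_score` supply in the recipe:

```python
# Minimal sketch (not from this commit): ContextGraph.build() now takes
# (token_ids, score) pairs instead of bare token-id lists.
from icefall.context_graph import ContextGraph

# Stand-in token ids for two biasing phrases; the recipe derives these
# with graph_compiler.texts_to_ids(contexts_text).
contexts = [[23, 7, 102], [56, 9]]

context_graph = ContextGraph(2.0)  # 2.0 stands in for params.context_score
# The updated recipes pass 0.0 as the per-phrase score.
context_graph.build([(c, 0.0) for c in contexts])
```
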
@@ -1234,6 +1234,7 @@ def scan_pessimistic_batches_for_oom(
 
 
 def main():
+    raise RuntimeError("Please don't use this file directly!")
     parser = get_parser()
     AsrDataModule.add_arguments(parser)
     args = parser.parse_args()
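
The one-line guard above is added to the `main()` of several training scripts in this commit. A condensed, self-contained sketch of the pattern, with hypothetical helper names (the real files keep their full training code after the `raise`):

```python
# Fail-fast guard pattern (hypothetical module contents): the module must
# stay importable, because export scripts import helpers such as
# get_params() from it, but executing it as a script aborts immediately,
# steering users to the supported train.py entry point.


def get_params() -> dict:
    """Stand-in for the recipe's real helper, imported by export scripts."""
    return {"feature_dim": 80}  # illustrative value


def main():
    raise RuntimeError("Please don't use this file directly!")
    # The original training entry point follows here; it is unreachable
    # when the file is run directly, yet untouched for importers.


if __name__ == "__main__":
    main()
```
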
@@ -56,7 +56,7 @@
 from decoder2 import Decoder
 from onnxruntime.quantization import QuantType, quantize_dynamic
 from scaling_converter import convert_scaled_to_non_scaled
-from train2 import add_model_arguments, get_params, get_transducer_model
+from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from zipformer import Zipformer
 
 from icefall.checkpoint import (
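
A hypothetical condensed view of why the export script needs these imports; the real ONNX export script does considerably more (checkpoint loading and averaging, graph export, dynamic quantization):

```python
# Assumed flow, not the actual export script: build the model via the
# renamed training module, then strip the Scaled* layers for export.
import argparse

from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
from scaling_converter import convert_scaled_to_non_scaled

parser = argparse.ArgumentParser()
add_model_arguments(parser)  # keeps export-time model args in sync with training
args = parser.parse_args()

params = get_params()
params.update(vars(args))  # the usual icefall pattern for merging CLI args

model = get_transducer_model(params)
model.eval()
# Replace Scaled* modules with plain torch equivalents so export works.
model = convert_scaled_to_non_scaled(model, inplace=True)
```
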
@@ -686,7 +686,7 @@ def main():
                     contexts_text.append(line.strip())
             contexts = graph_compiler.texts_to_ids(contexts_text)
             context_graph = ContextGraph(params.context_score)
-            context_graph.build(contexts)
+            context_graph.build([(c, 0.0) for c in contexts])
         else:
             context_graph = None
     else:
@@ -1233,6 +1233,7 @@ def scan_pessimistic_batches_for_oom(
 
 
 def main():
+    raise RuntimeError("Please don't use this file directly!")
     parser = get_parser()
     AishellAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
@@ -4,6 +4,6 @@ See https://k2-fsa.github.io/icefall/recipes/Streaming-ASR/librispeech/zipformer
 
 [./emformer.py](./emformer.py) and [./train.py](./train.py)
 are basically the same as
-[./emformer2.py](./emformer2.py) and [./train2.py](./train2.py).
-The only purpose of [./emformer2.py](./emformer2.py) and [./train2.py](./train2.py)
+[./emformer2.py](./emformer2.py) and [./do_not_use_it_directly.py](./do_not_use_it_directly.py).
+The only purpose of [./emformer2.py](./emformer2.py) and [./do_not_use_it_directly.py](./do_not_use_it_directly.py)
 is for exporting to [sherpa-ncnn](https://github.com/k2-fsa/sherpa-ncnn).
@@ -1237,6 +1237,7 @@ def scan_pessimistic_batches_for_oom(
 
 
 def main():
+    raise RuntimeError("Please don't use this file directly!")
     parser = get_parser()
     CommonVoiceAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
@@ -1274,6 +1274,7 @@ def scan_pessimistic_batches_for_oom(
 
 
 def main():
+    raise RuntimeError("Please don't use this file directly!")
     parser = get_parser()
     CSJAsrDataModule.add_arguments(parser)
     Tokenizer.add_arguments(parser)
@@ -72,7 +72,7 @@
 import torch
 from scaling_converter import convert_scaled_to_non_scaled
 from tokenizer import Tokenizer
-from train2 import add_model_arguments, get_params, get_transducer_model
+from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 
 from icefall.checkpoint import (
6 changes: 6 additions & 0 deletions egs/libriheavy/ASR/README.md
@@ -0,0 +1,6 @@
# Libriheavy: a 50,000 hours ASR corpus with punctuation casing and context

Libriheavy is a labeled version of [Librilight](https://arxiv.org/pdf/1912.07875.pdf). Please refer to our repository [k2-fsa/libriheavy](https://github.com/k2-fsa/libriheavy) for more details. We also have a paper: *Libriheavy: a 50,000 hours ASR corpus with punctuation casing and context*, [preprint available on arXiv](https://arxiv.org/abs/2309.08105).

See [RESULTS](./RESULTS.md) for the results of icefall recipes.
114 changes: 112 additions & 2 deletions egs/libriheavy/ASR/RESULTS.md
@@ -1,6 +1,116 @@
-## Results
+# Results
 
-### Zipformer PromptASR (zipformer + PromptASR + BERT text encoder)
+## zipformer (zipformer + pruned stateless transducer)

See <https://github.com/k2-fsa/icefall/pull/1261> for more details.

[zipformer](./zipformer)

### Non-streaming

#### Training on normalized text, i.e., upper case without punctuation

##### normal-scaled model, number of model parameters: 65805511, i.e., 65.81 M

You can find a pretrained model and training logs at:
<https://www.modelscope.cn/models/pkufool/icefall-asr-zipformer-libriheavy-20230926/summary>

Note: The repository above contains three models trained on different subsets of libriheavy: exp (large set), exp_medium_subset (medium set), and exp_small_subset (small set).

Results of models:

| training set | decoding method | librispeech clean (WER) | librispeech other (WER) | libriheavy clean (WER) | libriheavy other (WER) | comment |
|---------------|---------------------|-------------------|-------------------|------------------|------------------|--------------------|
| small | greedy search | 4.19 | 9.99 | 4.75 | 10.25 |--epoch 90 --avg 20 |
| small | modified beam search| 4.05 | 9.89 | 4.68 | 10.01 |--epoch 90 --avg 20 |
| medium | greedy search | 2.39 | 4.85 | 2.90 | 6.6 |--epoch 60 --avg 20 |
| medium | modified beam search| 2.35 | 4.82 | 2.90 | 6.57 |--epoch 60 --avg 20 |
| large | greedy search | 1.67 | 3.32 | 2.24 | 5.61 |--epoch 16 --avg 3 |
| large | modified beam search| 1.62 | 3.36 | 2.20 | 5.57 |--epoch 16 --avg 3 |

The training command for the medium subset is:
```bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"

# Use --num-epochs 16 --lr-hours 20000 for the large subset,
# and --num-epochs 90 --lr-hours 5000 for the small subset.
python ./zipformer/train.py \
  --world-size 4 \
  --master-port 12365 \
  --exp-dir zipformer/exp \
  --num-epochs 60 \
  --lr-hours 15000 \
  --use-fp16 1 \
  --start-epoch 1 \
  --bpe-model data/lang_bpe_500/bpe.model \
  --max-duration 1000 \
  --subset medium
```

The decoding command (shown with --epoch 16 --avg 3, the large-subset settings from the table above) is:
```bash
export CUDA_VISIBLE_DEVICES="0"
for m in greedy_search modified_beam_search; do
./zipformer/decode.py \
--epoch 16 \
--avg 3 \
--exp-dir zipformer/exp \
--max-duration 1000 \
--causal 0 \
--decoding-method $m
done
```

#### Training on full formatted text, i.e., with casing and punctuation

##### normal-scaled model, number of model parameters: 66074067, i.e., 66.07 M

You can find a pretrained model and training logs at:
<https://www.modelscope.cn/models/pkufool/icefall-asr-zipformer-libriheavy-punc-20230830/summary>

Note: The repository above contains three models trained on different subsets of libriheavy: exp (large set), exp_medium_subset (medium set), and exp_small_subset (small set).

Results of models:

| training set | decoding method | libriheavy clean (WER) | libriheavy other (WER) | libriheavy clean (CER) | libriheavy other (CER) | comment |
|---------------|---------------------|-------------------|-------------------|------------------|------------------|--------------------|
| small | modified beam search| 13.04 | 19.54 | 4.51 | 7.90 |--epoch 88 --avg 41 |
| medium | modified beam search| 9.84 | 13.39 | 3.02 | 5.10 |--epoch 50 --avg 15 |
| large | modified beam search| 7.76 | 11.32 | 2.41 | 4.22 |--epoch 16 --avg 2 |

The training command for the medium subset is:
```bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"

# Use --num-epochs 16 --lr-hours 20000 for the large subset,
# and --num-epochs 90 --lr-hours 10000 for the small subset.
python ./zipformer/train.py \
  --world-size 4 \
  --master-port 12365 \
  --exp-dir zipformer/exp \
  --num-epochs 60 \
  --lr-hours 15000 \
  --use-fp16 1 \
  --train-with-punctuation 1 \
  --start-epoch 1 \
  --bpe-model data/lang_punc_bpe_756/bpe.model \
  --max-duration 1000 \
  --subset medium
```

The decoding command is (adjust --epoch and --avg to match the table above):
```bash
export CUDA_VISIBLE_DEVICES="0"
for m in greedy_search modified_beam_search; do
./zipformer/decode.py \
--epoch 16 \
--avg 3 \
--exp-dir zipformer/exp \
--max-duration 1000 \
--causal 0 \
--decoding-method $m
done
```

## Zipformer PromptASR (zipformer + PromptASR + BERT text encoder)

#### [zipformer_prompt_asr](./zipformer_prompt_asr)

