From 785f3f0bcf3d65d3f00232e9c084c78d4bff5bc1 Mon Sep 17 00:00:00 2001
From: Zengwei Yao
Date: Tue, 9 Jul 2024 20:04:47 +0800
Subject: [PATCH] Update RESULTS.md, adding results and model links of
 zipformer-small/medium CTC/AED models (#1683)

---
 egs/librispeech/ASR/README.md  |   2 +-
 egs/librispeech/ASR/RESULTS.md | 113 ++++++++++++++++++++++++++++++++-
 2 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index 93fef7a079..8b87ee19b4 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -50,7 +50,7 @@ We place an additional Conv1d layer right after the input embedding layer.
 | `conformer-ctc2` | Reworked Conformer | Use auxiliary attention head |
 | `conformer-ctc3` | Reworked Conformer | Streaming version + delay penalty |
 | `zipformer-ctc` | Zipformer | Use auxiliary attention head |
-| `zipformer` | Upgraded Zipformer | Use auxiliary transducer head | The latest recipe |
+| `zipformer` | Upgraded Zipformer | Use auxiliary transducer head / attention-decoder head | The latest recipe |

 # MMI

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index 6f00bc14d9..66b147764b 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -8,6 +8,117 @@ See for more details.

#### Non-streaming

##### small-scale model, number of model parameters: 46282107, i.e., 46.3 M

You can find a pretrained model, training logs, decoding logs, and decoding results at:

You can use to deploy it.

| decoding method                       | test-clean | test-other | comment             |
|---------------------------------------|------------|------------|---------------------|
| ctc-decoding                          | 3.04       | 7.04       | --epoch 50 --avg 30 |
| attention-decoder-rescoring-no-ngram  | 2.45       | 6.08       | --epoch 50 --avg 30 |

The training command is:
```bash
export CUDA_VISIBLE_DEVICES="0,1"
# For non-streaming model training:
./zipformer/train.py \
  --world-size 2 \
  --num-epochs 50 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp-small \
  --full-libri 1 \
  --use-ctc 1 \
  --use-transducer 0 \
  --use-attention-decoder 1 \
  --ctc-loss-scale 0.1 \
  --attention-decoder-loss-scale 0.9 \
  --num-encoder-layers 2,2,2,2,2,2 \
  --feedforward-dim 512,768,768,768,768,768 \
  --encoder-dim 192,256,256,256,256,256 \
  --encoder-unmasked-dim 192,192,192,192,192,192 \
  --base-lr 0.04 \
  --max-duration 1700 \
  --master-port 12345
```

The decoding command is:
```bash
export CUDA_VISIBLE_DEVICES="0"
for m in ctc-decoding attention-decoder-rescoring-no-ngram; do
  ./zipformer/ctc_decode.py \
    --epoch 50 \
    --avg 30 \
    --exp-dir zipformer/exp-small \
    --use-ctc 1 \
    --use-transducer 0 \
    --use-attention-decoder 1 \
    --attention-decoder-loss-scale 0.9 \
    --num-encoder-layers 2,2,2,2,2,2 \
    --feedforward-dim 512,768,768,768,768,768 \
    --encoder-dim 192,256,256,256,256,256 \
    --encoder-unmasked-dim 192,192,192,192,192,192 \
    --max-duration 100 \
    --causal 0 \
    --num-paths 100 \
    --decoding-method $m
done
```

##### medium-scale model, number of model parameters: 89987295, i.e., 90.0 M

You can find a pretrained model, training logs, decoding logs, and decoding results at:

You can use to deploy it.
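If you would rather download the released checkpoints and logs than train from scratch, a minimal sketch using git-lfs is given below. It is only a sketch: `<hf-repo-url>` stands in for the model link of this section (the link itself is not reproduced here), and the destination directory name is just an example.

```bash
# Sketch only: replace <hf-repo-url> with the pretrained-model link for this section.
git lfs install                      # the large checkpoint files are stored via git-lfs
git clone <hf-repo-url> downloaded-model
ls downloaded-model                  # inspect the checkpoints, logs, and decoding results it ships
```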

| decoding method                       | test-clean | test-other | comment             |
|---------------------------------------|------------|------------|---------------------|
| ctc-decoding                          | 2.46       | 5.57       | --epoch 50 --avg 22 |
| attention-decoder-rescoring-no-ngram  | 2.23       | 4.98       | --epoch 50 --avg 22 |

The training command is:
```bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
# For non-streaming model training:
./zipformer/train.py \
  --world-size 4 \
  --num-epochs 50 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir zipformer/exp \
  --full-libri 1 \
  --use-ctc 1 \
  --use-transducer 0 \
  --use-attention-decoder 1 \
  --ctc-loss-scale 0.1 \
  --attention-decoder-loss-scale 0.9 \
  --max-duration 1200 \
  --master-port 12345
```

The decoding command is:
```bash
export CUDA_VISIBLE_DEVICES="0"
for m in ctc-decoding attention-decoder-rescoring-no-ngram; do
  ./zipformer/ctc_decode.py \
    --epoch 50 \
    --avg 22 \
    --exp-dir zipformer/exp \
    --use-ctc 1 \
    --use-transducer 0 \
    --use-attention-decoder 1 \
    --attention-decoder-loss-scale 0.9 \
    --max-duration 100 \
    --causal 0 \
    --num-paths 100 \
    --decoding-method $m
done
```

##### large-scale model, number of model parameters: 174319650, i.e., 174.3 M

You can find a pretrained model, training logs, decoding logs, and decoding results at:

@@ -15,8 +126,6 @@ You can find a pretrained model, training logs, decoding logs, and decoding resu

You can use to deploy it.

-Results of the CTC head:
-
| decoding method                       | test-clean | test-other | comment             |
|---------------------------------------|------------|------------|---------------------|
| ctc-decoding                          | 2.29       | 5.14       | --epoch 50 --avg 29 |
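
The `--epoch`/`--avg` values in the comment columns above control which checkpoints are averaged before decoding. If you want to tune the averaging window yourself, a sweep such as the sketch below can be used; it reuses only the flags from the medium-scale decoding command in this file (adjust `--exp-dir` and the model-size flags for the small- or large-scale configurations), and the candidate `--avg` values are arbitrary examples.

```bash
# Sketch: sweep the number of averaged checkpoints for the medium-scale CTC/AED model.
# Every flag except --avg is copied from the decoding command shown earlier;
# the list of candidate values is only an example.
export CUDA_VISIBLE_DEVICES="0"
for avg in 18 20 22 24 26; do
  ./zipformer/ctc_decode.py \
    --epoch 50 \
    --avg $avg \
    --exp-dir zipformer/exp \
    --use-ctc 1 \
    --use-transducer 0 \
    --use-attention-decoder 1 \
    --attention-decoder-loss-scale 0.9 \
    --max-duration 100 \
    --causal 0 \
    --num-paths 100 \
    --decoding-method attention-decoder-rescoring-no-ngram
done
```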