diff --git a/egs/librispeech/ASR/zipformer/attention_decoder.py b/egs/librispeech/ASR/zipformer/attention_decoder.py
index 81682e87b5..bff536f90b 100644
--- a/egs/librispeech/ASR/zipformer/attention_decoder.py
+++ b/egs/librispeech/ASR/zipformer/attention_decoder.py
@@ -236,7 +236,7 @@ def forward(
         causal_mask = subsequent_mask(x.shape[0], device=x.device)  # (seq_len, seq_len)
         attn_mask = torch.logical_or(
             padding_mask.unsqueeze(1),  # (batch, 1, seq_len)
-            torch.logical_not(causal_mask).unsqueeze(0)  # (1, seq_len, seq_len)
+            torch.logical_not(causal_mask).unsqueeze(0),  # (1, seq_len, seq_len)
         )  # (batch, seq_len, seq_len)

         if memory is not None:
@@ -367,7 +367,9 @@ def __init__(
         self.num_heads = num_heads
         self.head_dim = attention_dim // num_heads
         assert self.head_dim * num_heads == attention_dim, (
-            self.head_dim, num_heads, attention_dim
+            self.head_dim,
+            num_heads,
+            attention_dim,
         )
         self.dropout = dropout
         self.name = None  # will be overwritten in training code; for diagnostics.
@@ -437,15 +439,19 @@ def forward(
         if key_padding_mask is not None:
             assert key_padding_mask.shape == (batch, src_len), key_padding_mask.shape
             attn_weights = attn_weights.masked_fill(
-                key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"),
+                key_padding_mask.unsqueeze(1).unsqueeze(2),
+                float("-inf"),
             )

         if attn_mask is not None:
-            assert (
-                attn_mask.shape == (batch, 1, src_len)
-                or attn_mask.shape == (batch, tgt_len, src_len)
+            assert attn_mask.shape == (batch, 1, src_len) or attn_mask.shape == (
+                batch,
+                tgt_len,
+                src_len,
             ), attn_mask.shape
-            attn_weights = attn_weights.masked_fill(attn_mask.unsqueeze(1), float("-inf"))
+            attn_weights = attn_weights.masked_fill(
+                attn_mask.unsqueeze(1), float("-inf")
+            )

         attn_weights = attn_weights.view(batch * num_heads, tgt_len, src_len)
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
@@ -456,7 +462,11 @@ def forward(
         # (batch * head, tgt_len, head_dim)
         attn_output = torch.bmm(attn_weights, v)

-        assert attn_output.shape == (batch * num_heads, tgt_len, head_dim), attn_output.shape
+        assert attn_output.shape == (
+            batch * num_heads,
+            tgt_len,
+            head_dim,
+        ), attn_output.shape

         attn_output = attn_output.transpose(0, 1).contiguous()
         attn_output = attn_output.view(tgt_len, batch, num_heads * head_dim)
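The mask construction reformatted above ORs a per-utterance padding mask with an inverted causal mask before it is fed to masked_fill(). A minimal standalone sketch of the same shape logic, using torch.tril as a stand-in for icefall's subsequent_mask() helper (an assumption; only the shapes and the True/False convention are taken from the code above):

import torch

# True entries in attn_mask are the positions that later get filled with -inf.
batch, seq_len = 2, 5
lengths = torch.tensor([5, 3])

# padding_mask: True at padded positions, shape (batch, seq_len)
padding_mask = torch.arange(seq_len).unsqueeze(0) >= lengths.unsqueeze(1)

# causal mask: True where attention is allowed; stand-in for subsequent_mask()
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

attn_mask = torch.logical_or(
    padding_mask.unsqueeze(1),                    # (batch, 1, seq_len)
    torch.logical_not(causal_mask).unsqueeze(0),  # (1, seq_len, seq_len)
)  # (batch, seq_len, seq_len)
assert attn_mask.shape == (batch, seq_len, seq_len)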
diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
index 88c58f5818..a35eb52877 100755
--- a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
@@ -487,6 +487,7 @@ def build_inputs_outputs(tensors, i):

     add_meta_data(filename=encoder_filename, meta_data=meta_data)

+
 def export_decoder_model_onnx(
     decoder_model: OnnxDecoder,
     decoder_filename: str,
@@ -754,30 +755,31 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")

-    if(params.fp16) :
+    if params.fp16:
         from onnxconverter_common import float16
+
         logging.info("Generate fp16 models")

         encoder = onnx.load(encoder_filename)
         encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
         encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
-        onnx.save(encoder_fp16,encoder_filename_fp16)
+        onnx.save(encoder_fp16, encoder_filename_fp16)

         decoder = onnx.load(decoder_filename)
         decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
         decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
-        onnx.save(decoder_fp16,decoder_filename_fp16)
+        onnx.save(decoder_fp16, decoder_filename_fp16)

         joiner = onnx.load(joiner_filename)
         joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
         joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
-        onnx.save(joiner_fp16,joiner_filename_fp16)
+        onnx.save(joiner_fp16, joiner_filename_fp16)

     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection

     logging.info("Generate int8 quantization models")
-
+
     encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx"
     quantize_dynamic(
         model_input=encoder_filename,
diff --git a/egs/librispeech/ASR/zipformer/export-onnx.py b/egs/librispeech/ASR/zipformer/export-onnx.py
index ca3cbf0d59..a56a7a3e67 100755
--- a/egs/librispeech/ASR/zipformer/export-onnx.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx.py
@@ -592,23 +592,23 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")

-    if(params.fp16) :
+    if params.fp16:
         logging.info("Generate fp16 models")

         encoder = onnx.load(encoder_filename)
         encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
         encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
-        onnx.save(encoder_fp16,encoder_filename_fp16)
+        onnx.save(encoder_fp16, encoder_filename_fp16)

         decoder = onnx.load(decoder_filename)
         decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
         decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
-        onnx.save(decoder_fp16,decoder_filename_fp16)
+        onnx.save(decoder_fp16, decoder_filename_fp16)

         joiner = onnx.load(joiner_filename)
         joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
         joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
-        onnx.save(joiner_fp16,joiner_filename_fp16)
+        onnx.save(joiner_fp16, joiner_filename_fp16)

     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
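The fp16 and int8 post-processing that both export scripts gain above reduces to the sketch below. "model.onnx" is a placeholder path, and the quantize_dynamic arguments mirror what the icefall export scripts usually pass (an assumption, since the int8 hunk is unchanged by this patch):

import onnx
from onnxconverter_common import float16
from onnxruntime.quantization import QuantType, quantize_dynamic

# fp16: convert weights to float16 while keeping fp32 model inputs/outputs.
model = onnx.load("model.onnx")
model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)
onnx.save(model_fp16, "model.fp16.onnx")

# int8: dynamic (weight-only) quantization of MatMul weights.
quantize_dynamic(
    model_input="model.onnx",
    model_output="model.int8.onnx",
    op_types_to_quantize=["MatMul"],
    weight_type=QuantType.QInt8,
)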
diff --git a/egs/libritts/ASR/local/compute_fbank_libritts.py b/egs/libritts/ASR/local/compute_fbank_libritts.py
index 5e78af18b1..b6e2a4c436 100755
--- a/egs/libritts/ASR/local/compute_fbank_libritts.py
+++ b/egs/libritts/ASR/local/compute_fbank_libritts.py
@@ -124,7 +124,7 @@ def compute_fbank_libritts(
             supervisions=m["supervisions"],
         )
         if sampling_rate != 24000:
-            logging.info(f"Resampling audio to {sampling_rate}")
+            logging.info(f"Resampling audio to {sampling_rate}Hz")
             cut_set = cut_set.resample(sampling_rate)
         if "train" in partition:
             if perturb_speed:
diff --git a/egs/libritts/ASR/local/convert_transcript_words_to_tokens.py b/egs/libritts/ASR/local/convert_transcript_words_to_tokens.py
new file mode 120000
index 0000000000..2ce13fd69a
--- /dev/null
+++ b/egs/libritts/ASR/local/convert_transcript_words_to_tokens.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/convert_transcript_words_to_tokens.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/download_lm.py b/egs/libritts/ASR/local/download_lm.py
new file mode 120000
index 0000000000..c9668bd2dc
--- /dev/null
+++ b/egs/libritts/ASR/local/download_lm.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/download_lm.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/norm_text.py b/egs/libritts/ASR/local/norm_text.py
new file mode 120000
index 0000000000..dea3c051f8
--- /dev/null
+++ b/egs/libritts/ASR/local/norm_text.py
@@ -0,0 +1 @@
+../../../libriheavy/ASR/local/norm_text.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/prepare_lang.py b/egs/libritts/ASR/local/prepare_lang.py
new file mode 120000
index 0000000000..747f2ab398
--- /dev/null
+++ b/egs/libritts/ASR/local/prepare_lang.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/prepare_lang_bpe.py b/egs/libritts/ASR/local/prepare_lang_bpe.py
new file mode 120000
index 0000000000..36b40e7fc2
--- /dev/null
+++ b/egs/libritts/ASR/local/prepare_lang_bpe.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_bpe.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/prepare_lang_fst.py b/egs/libritts/ASR/local/prepare_lang_fst.py
new file mode 120000
index 0000000000..c5787c5340
--- /dev/null
+++ b/egs/libritts/ASR/local/prepare_lang_fst.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_fst.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/prepare_lm_training_data.py b/egs/libritts/ASR/local/prepare_lm_training_data.py
new file mode 120000
index 0000000000..abc00d421f
--- /dev/null
+++ b/egs/libritts/ASR/local/prepare_lm_training_data.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lm_training_data.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/train_bpe_model.py b/egs/libritts/ASR/local/train_bpe_model.py
new file mode 120000
index 0000000000..6fad36421e
--- /dev/null
+++ b/egs/libritts/ASR/local/train_bpe_model.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/train_bpe_model.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/local/validate_bpe_lexicon.py b/egs/libritts/ASR/local/validate_bpe_lexicon.py
new file mode 120000
index 0000000000..721bb48e7c
--- /dev/null
+++ b/egs/libritts/ASR/local/validate_bpe_lexicon.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/validate_bpe_lexicon.py
\ No newline at end of file
diff --git a/egs/libritts/ASR/prepare.sh b/egs/libritts/ASR/prepare.sh
index 23c84e8386..4b551385f5 100755
--- a/egs/libritts/ASR/prepare.sh
+++ b/egs/libritts/ASR/prepare.sh
@@ -7,9 +7,15 @@ set -eou pipefail

 stage=0
 stop_stage=100
-sampling_rate=24000
+sampling_rate=16000
 nj=32
 perturb_speed=true
+vocab_sizes=(
+  # 5000
+  # 2000
+  # 1000
+  500
+)

 dl_dir=$PWD/download

@@ -27,6 +33,15 @@ log() {

 log "dl_dir: $dl_dir"

+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  log "Stage -1: Download LM"  # we directly use the librispeech lm here
+  mkdir -p $dl_dir/lm
+  if [ ! -e $dl_dir/lm/.done ]; then
+    ./local/download_lm.py --out-dir=$dl_dir/lm
+    touch $dl_dir/lm/.done
+  fi
+fi
+
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "Stage 0: Download data"

@@ -107,3 +122,73 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     touch data/fbank/.msuan.done
   fi
 fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Train BPE model for normalized text"
+
+  if [ ! -f data/texts ]; then
+    gunzip -c data/manifests/libritts_supervisions_train-clean-100.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py > data/texts
+
+    gunzip -c data/manifests/libritts_supervisions_train-clean-360.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py >> data/texts
+
+    gunzip -c data/manifests/libritts_supervisions_train-other-500.jsonl.gz \
+      | jq ".text" | sed 's/"//g' \
+      | ./local/norm_text.py >> data/texts
+  fi
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+
+    cp data/texts $lang_dir/text
+
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/text
+    fi
+  done
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Prepare phone based lang"
+  lang_dir=data/lang_phone
+  mkdir -p $lang_dir
+
+  if [ ! -f $dl_dir/lm/librispeech-lexicon.txt ]; then
+    log "No lexicon file in $dl_dir/lm, please run :"
+    log "prepare.sh --stage -1 --stop-stage -1"
+    exit -1
+  fi
+
+  if [ ! -f $lang_dir/lexicon.txt ]; then
+    (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+      cat - $dl_dir/lm/librispeech-lexicon.txt |
+      sort | uniq > $lang_dir/lexicon.txt
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang.py --lang-dir $lang_dir
+  fi
+
+  if [ ! -f $lang_dir/L.fst ]; then
+    log "Converting L.pt to L.fst"
+    ./shared/convert-k2-to-openfst.py \
+      --olabels aux_labels \
+      $lang_dir/L.pt \
+      $lang_dir/L.fst
+  fi
+
+  if [ ! -f $lang_dir/L_disambig.fst ]; then
+    log "Converting L_disambig.pt to L_disambig.fst"
+    ./shared/convert-k2-to-openfst.py \
+      --olabels aux_labels \
+      $lang_dir/L_disambig.pt \
+      $lang_dir/L_disambig.fst
+  fi
+fi
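Stage 5 above delegates the actual BPE training to ./local/train_bpe_model.py, which is symlinked from the librispeech recipe. Roughly, that script wraps sentencepiece as sketched below; the special-symbol layout is an assumption based on the usual icefall convention, not something this patch changes:

import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="data/lang_bpe_500/text",        # normalized transcripts from stage 5
    model_prefix="data/lang_bpe_500/bpe",  # writes bpe.model / bpe.vocab
    model_type="unigram",
    vocab_size=500,
    character_coverage=1.0,
    user_defined_symbols=["<blk>", "<sos/eos>"],  # assumed icefall symbol layout
    unk_id=2,
    bos_id=-1,
    eos_id=-1,
)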
diff --git a/egs/libritts/ASR/prepare_lm.sh b/egs/libritts/ASR/prepare_lm.sh
new file mode 100755
index 0000000000..1c690983b8
--- /dev/null
+++ b/egs/libritts/ASR/prepare_lm.sh
@@ -0,0 +1,264 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+# This script generate Ngram LM / NNLM and related files that needed by decoding.
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+#  - $dl_dir/lm
+#      This directory contains the following files downloaded from
+#      http://www.openslr.org/resources/11
+#
+#        - 3-gram.pruned.1e-7.arpa.gz
+#        - 3-gram.pruned.1e-7.arpa
+#        - 4-gram.arpa.gz
+#        - 4-gram.arpa
+#        - librispeech-vocab.txt
+#        - librispeech-lexicon.txt
+#        - librispeech-lm-norm.txt.gz
+#
+
+. prepare.sh --stage -1 --stop-stage 6 || exit 1
+
+log "Running prepare_lm.sh"
+
+stage=0
+stop_stage=100
+
+. shared/parse_options.sh || exit 1
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Prepare BPE based lexicon."
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    # We reuse words.txt from phone based lexicon
+    # so that the two can share G.pt later.
+    cp data/lang_phone/words.txt $lang_dir
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+
+      log "Validating $lang_dir/lexicon.txt"
+      ./local/validate_bpe_lexicon.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --bpe-model $lang_dir/bpe.model
+    fi
+
+    if [ ! -f $lang_dir/L.fst ]; then
+      log "Converting L.pt to L.fst"
+      ./shared/convert-k2-to-openfst.py \
+        --olabels aux_labels \
+        $lang_dir/L.pt \
+        $lang_dir/L.fst
+    fi
+
+    if [ ! -f $lang_dir/L_disambig.fst ]; then
+      log "Converting L_disambig.pt to L_disambig.fst"
+      ./shared/convert-k2-to-openfst.py \
+        --olabels aux_labels \
+        $lang_dir/L_disambig.pt \
+        $lang_dir/L_disambig.fst
+    fi
+  done
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare word level G"
+  # We assume you have installed kaldilm, if not, please install
+  # it using: pip install kaldilm
+
+  mkdir -p data/lm
+  if [ ! -f data/lm/G_3_gram.fst.txt ]; then
+    # It is used in building HLG
+    python3 -m kaldilm \
+      --read-symbol-table="data/lang_phone/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=3 \
+      $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
+  fi
+
+  if [ ! -f data/lm/G_4_gram.fst.txt ]; then
+    # It is used for LM rescoring
+    python3 -m kaldilm \
+      --read-symbol-table="data/lang_phone/words.txt" \
+      --disambig-symbol='#0' \
+      --max-order=4 \
+      $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt
+  fi
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+
+    if [ ! -f $lang_dir/HL.fst ]; then
+      ./local/prepare_lang_fst.py \
+        --lang-dir $lang_dir \
+        --ngram-G ./data/lm/G_3_gram.fst.txt
+    fi
+  done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Compile HLG"
+  ./local/compile_hlg.py --lang-dir data/lang_phone
+
+  # Note If ./local/compile_hlg.py throws OOM,
+  # please switch to the following command
+  #
+  # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/compile_hlg.py --lang-dir $lang_dir
+
+    # Note If ./local/compile_hlg.py throws OOM,
+    # please switch to the following command
+    #
+    # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
+  done
+fi
+
+# Compile LG for RNN-T fast_beam_search decoding
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compile LG"
+  ./local/compile_lg.py --lang-dir data/lang_phone
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/compile_lg.py --lang-dir $lang_dir
+  done
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Prepare token level ngram G"
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+
+    if [ ! -f $lang_dir/transcript_tokens.txt ]; then
+      ./local/convert_transcript_words_to_tokens.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --transcript $lang_dir/transcript_words.txt \
+        --oov "<UNK>" \
+        > $lang_dir/transcript_tokens.txt
+    fi
+
+    for ngram in 2 3 4 5; do
+      if [ ! -f $lang_dir/${ngram}gram.arpa ]; then
+        ./shared/make_kn_lm.py \
+          -ngram-order ${ngram} \
+          -text $lang_dir/transcript_tokens.txt \
+          -lm $lang_dir/${ngram}gram.arpa
+      fi
+
+      if [ ! -f $lang_dir/${ngram}gram.fst.txt ]; then
+        python3 -m kaldilm \
+          --read-symbol-table="$lang_dir/tokens.txt" \
+          --disambig-symbol='#0' \
+          --max-order=${ngram} \
+          $lang_dir/${ngram}gram.arpa > $lang_dir/${ngram}gram.fst.txt
+      fi
+    done
+  done
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Generate NNLM training data"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    lang_dir=data/lang_bpe_${vocab_size}
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
+
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $dl_dir/lm/librispeech-lm-norm.txt \
+      --lm-archive $out_dir/lm_data.pt
+  done
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Generate NNLM validation data"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
+
+    if [ ! -f $out_dir/valid.txt ]; then
+      gunzip -c data/manifests/libritts_supervisions_dev-clean.jsonl.gz \
+        | jq ".text" | sed 's/"//g' \
+        | ./local/norm_text.py > $out_dir/valid.txt
+
+      gunzip -c data/manifests/libritts_supervisions_dev-other.jsonl.gz \
+        | jq ".text" | sed 's/"//g' \
+        | ./local/norm_text.py >> $out_dir/valid.txt
+    fi
+
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/valid.txt \
+      --lm-archive $out_dir/lm_data-valid.pt
+  done
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Generate NNLM test data"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    log "Processing vocab_size == ${vocab_size}"
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
+
+    if [ ! -f $out_dir/test.txt ]; then
+      gunzip -c data/manifests/libritts_supervisions_test-clean.jsonl.gz \
+        | jq ".text" | sed 's/"//g' \
+        | ./local/norm_text.py > $out_dir/test.txt
+
+      gunzip -c data/manifests/libritts_supervisions_test-other.jsonl.gz \
+        | jq ".text" | sed 's/"//g' \
+        | ./local/norm_text.py >> $out_dir/test.txt
+    fi
+
+    lang_dir=data/lang_bpe_${vocab_size}
+    ./local/prepare_lm_training_data.py \
+      --bpe-model $lang_dir/bpe.model \
+      --lm-data $out_dir/test.txt \
+      --lm-archive $out_dir/lm_data-test.pt
+  done
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Sort NNLM training data"
+  # Sort LM training data by sentence length in descending order
+  # for ease of training.
+  #
+  # Sentence length equals to the number of BPE tokens
+  # in a sentence.
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    out_dir=data/lm_training_bpe_${vocab_size}
+    mkdir -p $out_dir
+    ./local/sort_lm_training_data.py \
+      --in-lm-data $out_dir/lm_data.pt \
+      --out-lm-data $out_dir/sorted_lm_data.pt \
+      --out-statistics $out_dir/statistics.txt
+
+    ./local/sort_lm_training_data.py \
+      --in-lm-data $out_dir/lm_data-valid.pt \
+      --out-lm-data $out_dir/sorted_lm_data-valid.pt \
+      --out-statistics $out_dir/statistics-valid.txt
+
+    ./local/sort_lm_training_data.py \
+      --in-lm-data $out_dir/lm_data-test.pt \
+      --out-lm-data $out_dir/sorted_lm_data-test.pt \
+      --out-statistics $out_dir/statistics-test.txt
+  done
+fi
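The gunzip | jq | ./local/norm_text.py pipelines used in stages 5-7 can also be written in Python with lhotse, which is handy when debugging the manifests; norm() below is only a placeholder for whatever ./local/norm_text.py does and is not part of this patch:

from lhotse import load_manifest_lazy

def norm(text: str) -> str:
    return text  # placeholder for ./local/norm_text.py

with open("data/lm_training_bpe_500/valid.txt", "w") as f:
    for part in ("dev-clean", "dev-other"):
        sups = load_manifest_lazy(
            f"data/manifests/libritts_supervisions_{part}.jsonl.gz"
        )
        for s in sups:
            print(norm(s.text), file=f)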
diff --git a/egs/libritts/ASR/zipformer/decode.py b/egs/libritts/ASR/zipformer/decode.py
index 8b033ce90f..15267b0cb2 100755
--- a/egs/libritts/ASR/zipformer/decode.py
+++ b/egs/libritts/ASR/zipformer/decode.py
@@ -1041,13 +1041,13 @@ def main():

     # we need cut ids to display recognition results.
     args.return_cuts = True
-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_other_cuts = libritts.test_other_cuts()

-    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
-    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+    test_clean_dl = libritts.test_dataloaders(test_clean_cuts)
+    test_other_dl = libritts.test_dataloaders(test_other_cuts)

     test_sets = ["test-clean", "test-other"]
     test_dl = [test_clean_dl, test_other_dl]
diff --git a/egs/libritts/ASR/zipformer/streaming_decode.py b/egs/libritts/ASR/zipformer/streaming_decode.py
index 4e2f1ecb9f..3ecc5c94f1 100755
--- a/egs/libritts/ASR/zipformer/streaming_decode.py
+++ b/egs/libritts/ASR/zipformer/streaming_decode.py
@@ -864,10 +864,10 @@ def main():
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")

-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

-    test_clean_cuts = librispeech.test_clean_cuts()
-    test_other_cuts = librispeech.test_other_cuts()
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_other_cuts = libritts.test_other_cuts()

     test_sets = ["test-clean", "test-other"]
     test_cuts = [test_clean_cuts, test_other_cuts]
diff --git a/egs/libritts/ASR/zipformer/train.py b/egs/libritts/ASR/zipformer/train.py
index 5eb90efbcb..98bbafc4a3 100755
--- a/egs/libritts/ASR/zipformer/train.py
+++ b/egs/libritts/ASR/zipformer/train.py
@@ -603,6 +603,15 @@ def _to_int_tuple(s: str):
     return tuple(map(int, s.split(",")))


+def remove_punc_to_upper(text: str) -> str:
+    text = text.replace("‘", "'")
+    text = text.replace("’", "'")
+    tokens = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'")
+    s_list = [x.upper() if x in tokens else " " for x in text]
+    s = " ".join("".join(s_list).split()).strip()
+    return s
+
+
 def get_encoder_embed(params: AttributeDict) -> nn.Module:
     # encoder_embed converts the input of shape (N, T, num_features)
     # to the shape (N, (T - 7) // 2, encoder_dims).
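A quick check of what the new remove_punc_to_upper() above (and the normalize_text() wrapper added in the next hunk) does to a punctuated LibriTTS-style transcript; the example string is made up:

print(remove_punc_to_upper("Hello, world! It's 3 o'clock."))
# -> HELLO WORLD IT'S 3 O'CLOCK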
@@ -1284,21 +1293,26 @@ def run(rank, world_size, args):
     if params.inf_check:
         register_inf_check_hooks(model)

-    librispeech = LibriTTSAsrDataModule(args)
+    libritts = LibriTTSAsrDataModule(args)

     if params.full_libri:
-        train_cuts = librispeech.train_all_shuf_cuts()
+        train_cuts = libritts.train_all_shuf_cuts()

         # previously we used the following code to load all training cuts,
         # strictly speaking, shuffled training cuts should be used instead,
         # but we leave the code here to demonstrate that there is an option
         # like this to combine multiple cutsets

-        # train_cuts = librispeech.train_clean_100_cuts()
-        # train_cuts += librispeech.train_clean_360_cuts()
-        # train_cuts += librispeech.train_other_500_cuts()
+        # train_cuts = libritts.train_clean_100_cuts()
+        # train_cuts += libritts.train_clean_360_cuts()
+        # train_cuts += libritts.train_other_500_cuts()
     else:
-        train_cuts = librispeech.train_clean_100_cuts()
+        train_cuts = libritts.train_clean_100_cuts()
+
+    def normalize_text(c: Cut):
+        text = remove_punc_to_upper(c.supervisions[0].text)
+        c.supervisions[0].text = text
+        return c

     def remove_short_and_long_utt(c: Cut):
         # Keep only utterances with duration between 1 second and 20 seconds
@@ -1338,6 +1352,7 @@ def remove_short_and_long_utt(c: Cut):
         return True

     train_cuts = train_cuts.filter(remove_short_and_long_utt)
+    train_cuts = train_cuts.map(normalize_text)

     if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
         # We only load the sampler's state dict when it loads a checkpoint
@@ -1346,13 +1361,13 @@ def remove_short_and_long_utt(c: Cut):
     else:
         sampler_state_dict = None

-    train_dl = librispeech.train_dataloaders(
+    train_dl = libritts.train_dataloaders(
         train_cuts, sampler_state_dict=sampler_state_dict
     )

-    valid_cuts = librispeech.dev_clean_cuts()
-    valid_cuts += librispeech.dev_other_cuts()
-    valid_dl = librispeech.valid_dataloaders(valid_cuts)
+    valid_cuts = libritts.dev_clean_cuts()
+    valid_cuts += libritts.dev_other_cuts()
+    valid_dl = libritts.valid_dataloaders(valid_cuts)

     if not params.print_diagnostics:
         scan_pessimistic_batches_for_oom(
diff --git a/egs/libritts/CODEC/prepare.sh b/egs/libritts/CODEC/prepare.sh
index 3dcb734745..6a471c3adc 100755
--- a/egs/libritts/CODEC/prepare.sh
+++ b/egs/libritts/CODEC/prepare.sh
@@ -37,15 +37,6 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   if [ ! -d $dl_dir/LibriTTS ]; then
     lhotse download libritts $dl_dir
   fi
-
-  # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/musan $dl_dir/musan
-  #
-  if [ ! -d $dl_dir/musan ]; then
-    lhotse download musan $dl_dir
-  fi
 fi

 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then