lifeiteng · RuntimeRacer · Apr 27, 2023 · Apr 27, 2023 · Apr 28, 2023 · Apr 28, 2023
diff --git a/egs/commonvoice/README.md b/egs/commonvoice/README.md
@@ -0,0 +1,149 @@
+# Mozilla CommonVoice
+
+## Install deps
+```
+pip install librosa==0.8.1 matplotlib h5py 
+```
+
+## Prepare Dataset
+```
+cd egs/commonvoice
+
+# These stages are very time-consuming
+bash prepare.sh --stage -1 --stop-stage 3
+# NOTE: these parameters do not seem to take effect (not fixed yet); as a workaround, set stage/stop_stage manually at the top of prepare.sh.
+
+##  train
+Cut statistics:
+╒═══════════════════════════╤════════════╕
+│ Cuts count:               │ 5122876    │
+├───────────────────────────┼────────────┤
+│ Total duration (hh:mm:ss) │ 7598:10:45 │
+├───────────────────────────┼────────────┤
+│ mean                      │ 5.3        │
+├───────────────────────────┼────────────┤
+│ std                       │ 1.7        │
+├───────────────────────────┼────────────┤
+│ min                       │ 0.1        │
+├───────────────────────────┼────────────┤
+│ 25%                       │ 4.1        │
+├───────────────────────────┼────────────┤
+│ 50%                       │ 5.2        │
+├───────────────────────────┼────────────┤
+│ 75%                       │ 6.4        │
+├───────────────────────────┼────────────┤
+│ 99%                       │ 9.7        │
+├───────────────────────────┼────────────┤
+│ 99.5%                     │ 10.1       │
+├───────────────────────────┼────────────┤
+│ 99.9%                     │ 10.5       │
+├───────────────────────────┼────────────┤
+│ max                       │ 49.9       │
+├───────────────────────────┼────────────┤
+│ Recordings available:     │ 5122876    │
+├───────────────────────────┼────────────┤
+│ Features available:       │ 5122876    │
+├───────────────────────────┼────────────┤
+│ Supervisions available:   │ 5122876    │
+╘═══════════════════════════╧════════════╛
+SUPERVISION custom fields:
+Speech duration statistics:
+╒══════════════════════════════╤════════════╤══════════════════════╕
+│ Total speech duration        │ 7598:10:45 │ 100.00% of recording │
+├──────────────────────────────┼────────────┼──────────────────────┤
+│ Total speaking time duration │ 7598:10:45 │ 100.00% of recording │
+├──────────────────────────────┼────────────┼──────────────────────┤
+│ Total silence duration       │ 00:00:01   │ 0.00% of recording   │
+╘══════════════════════════════╧════════════╧══════════════════════╛
+
+
+##  dev
+Cut statistics:
+╒═══════════════════════════╤══════════╕
+│ Cuts count:               │ 9600     │
+├───────────────────────────┼──────────┤
+│ Total duration (hh:mm:ss) │ 13:35:01 │
+├───────────────────────────┼──────────┤
+│ mean                      │ 5.1      │
+├───────────────────────────┼──────────┤
+│ std                       │ 1.8      │
+├───────────────────────────┼──────────┤
+│ min                       │ 1.3      │
+├───────────────────────────┼──────────┤
+│ 25%                       │ 3.7      │
+├───────────────────────────┼──────────┤
+│ 50%                       │ 4.9      │
+├───────────────────────────┼──────────┤
+│ 75%                       │ 6.3      │
+├───────────────────────────┼──────────┤
+│ 99%                       │ 9.9      │
+├───────────────────────────┼──────────┤
+│ 99.5%                     │ 10.1     │
+├───────────────────────────┼──────────┤
+│ 99.9%                     │ 10.5     │
+├───────────────────────────┼──────────┤
+│ max                       │ 11.0     │
+├───────────────────────────┼──────────┤
+│ Recordings available:     │ 9600     │
+├───────────────────────────┼──────────┤
+│ Features available:       │ 9600     │
+├───────────────────────────┼──────────┤
+│ Supervisions available:   │ 9600     │
+╘═══════════════════════════╧══════════╛
+SUPERVISION custom fields:
+Speech duration statistics:
+╒══════════════════════════════╤══════════╤══════════════════════╕
+│ Total speech duration        │ 13:35:01 │ 100.00% of recording │
+├──────────────────────────────┼──────────┼──────────────────────┤
+│ Total speaking time duration │ 13:35:01 │ 100.00% of recording │
+├──────────────────────────────┼──────────┼──────────────────────┤
+│ Total silence duration       │ 00:00:01 │ 0.00% of recording   │
+╘══════════════════════════════╧══════════╧══════════════════════╛
+
+
+##  test
+Cut statistics:
+╒═══════════════════════════╤══════════╕
+│ Cuts count:               │ 9600     │
+├───────────────────────────┼──────────┤
+│ Total duration (hh:mm:ss) │ 14:20:55 │
+├───────────────────────────┼──────────┤
+│ mean                      │ 5.4      │
+├───────────────────────────┼──────────┤
+│ std                       │ 1.9      │
+├───────────────────────────┼──────────┤
+│ min                       │ 1.0      │
+├───────────────────────────┼──────────┤
+│ 25%                       │ 3.9      │
+├───────────────────────────┼──────────┤
+│ 50%                       │ 5.1      │
+├───────────────────────────┼──────────┤
+│ 75%                       │ 6.7      │
+├───────────────────────────┼──────────┤
+│ 99%                       │ 10.2     │
+├───────────────────────────┼──────────┤
+│ 99.5%                     │ 10.4     │
+├───────────────────────────┼──────────┤
+│ 99.9%                     │ 10.9     │
+├───────────────────────────┼──────────┤
+│ max                       │ 21.7     │
+├───────────────────────────┼──────────┤
+│ Recordings available:     │ 9600     │
+├───────────────────────────┼──────────┤
+│ Features available:       │ 9600     │
+├───────────────────────────┼──────────┤
+│ Supervisions available:   │ 9600     │
+╘═══════════════════════════╧══════════╛
+SUPERVISION custom fields:
+Speech duration statistics:
+╒══════════════════════════════╤══════════╤══════════════════════╕
+│ Total speech duration        │ 14:20:55 │ 100.00% of recording │
+├──────────────────────────────┼──────────┼──────────────────────┤
+│ Total speaking time duration │ 14:20:55 │ 100.00% of recording │
+├──────────────────────────────┼──────────┼──────────────────────┤
+│ Total silence duration       │ 00:00:01 │ 0.00% of recording   │
+╘══════════════════════════════╧══════════╧══════════════════════╛
+```
+
+## Training & Inference
+Refer to [Training & Inference](../../README.md#traininginference).
diff --git a/egs/commonvoice/bin b/egs/commonvoice/bin
@@ -0,0 +1 @@
+../../valle/bin
diff --git a/egs/commonvoice/prepare.sh b/egs/commonvoice/prepare.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+
+# Stop on errors, on unset variables and on failures inside pipelines.
+set -euo pipefail
+
+# Work around the segmentation fault reported in
+# https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"
+
+# Defaults; presumably overridable through shared/parse_options.sh - TODO confirm
+stage=-1
+stop_stage=4
+nj=16  # passed as -j to "lhotse prepare" in stage 1
+
+# The corpus is looked up under $dl_dir/$release/<language>/clips. Languages
+# that are missing there are downloaded by stage 0. The corpus can also be
+# obtained manually from https://commonvoice.mozilla.org/
+dl_dir="${PWD}/download"
+release="cv-corpus-13.0-2023-03-09"
+languages="en de fr cy tt kab ca zh-TW it fa eu es ru tr nl eo zh-CN rw pt zh-HK cs pl uk ja"
+
+# Splits processed for every language.
+dataset_parts="train dev test"
+
+# Output directory of stage 2 and the extractor it uses ("Encodec" or "Fbank").
+audio_feats_dir="data/tokenized"
+audio_extractor="Encodec"
+
+. shared/parse_options.sh || exit 1
+
+# Everything this script generates is saved in "data"; that directory can be
+# removed safely and regenerated by rerunning the script.
+
+mkdir -p "data"
+
+log() {
+  # Borrowed from espnet: echo a timestamped message tagged with the caller's file, line and function.
+  local caller_file="${BASH_SOURCE[1]##*/}" stamp="$(date '+%Y-%m-%d %H:%M:%S')"
+  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
+  log "dl_dir: $dl_dir"
+  log "Stage 0: Download data"
+
+  # A language counts as downloaded when $dl_dir/$release/<lang>/clips exists.
+  # If the release was pre-downloaded elsewhere, symlink it into place:
+  #
+  #   ln -sfv /path/to/$release $dl_dir/$release
+  #
+  for lang in $languages
+  do
+    log "Checking if language ${lang} has been downloaded..."
+    if [ ! -d "$dl_dir/$release/$lang/clips" ]; then
+      lhotse download commonvoice --languages "$lang" --release "$release" "$dl_dir"
+    fi
+  done
+fi
+
+if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
+  log "Stage 1: Prepare commonvoice manifest"
+  # The corpus is read from $dl_dir/$release, the location stage 0 checks.
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.commonvoice.done ]; then
+    for lang in $languages
+    do
+      # Rename inside the guard: on a rerun the cv-* files of a finished language are gone and mv would abort (set -e).
+      if [ ! -e "data/manifests/.commonvoice-${lang}.done" ]; then
+        lhotse prepare commonvoice --language "$lang" -j "$nj" "$dl_dir/$release" data/manifests
+        # Rename to match Tokenizer Pattern
+        for part in $dataset_parts
+        do
+          mv "data/manifests/cv-${lang}_recordings_${part}.jsonl.gz" "data/manifests/commonvoice_recordings_${lang}_${part}.jsonl.gz"
+          mv "data/manifests/cv-${lang}_supervisions_${part}.jsonl.gz" "data/manifests/commonvoice_supervisions_${lang}_${part}.jsonl.gz"
+        done
+        touch "data/manifests/.commonvoice-${lang}.done"
+      fi
+    done
+    touch data/manifests/.commonvoice.done
+  fi
+fi
+
+
+if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
+  log "Stage 2: Tokenize/Fbank commonvoice"
+
+  # Space-separated "<lang>_<part>" names for every language/split pair;
+  # handed to tokenizer.py as one --dataset-parts argument.
+  dynamicPartList=""
+  for lang in $languages
+  do
+    for part in $dataset_parts
+    do
+      dynamicPartList+="${lang}_${part} "
+    done
+  done
+
+  mkdir -p "${audio_feats_dir}"
+  if [ ! -e "${audio_feats_dir}/.commonvoice.tokenize.done" ]; then
+    python3 bin/tokenizer.py --dataset-parts "${dynamicPartList}" \
+        --audio-extractor "${audio_extractor}" \
+        --batch-duration 400 \
+        --prefix "commonvoice" \
+        --src-dir "data/manifests" \
+        --output-dir "${audio_feats_dir}"
+  fi
+  touch "${audio_feats_dir}/.commonvoice.tokenize.done"
+fi
+
+if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
+  log "Stage 3: Prepare commonvoice train/dev/test"
+  if [ ! -e "${audio_feats_dir}/.commonvoice.train.done" ]; then
+
+    # The per-language dev and test sets are too large to use in full, so keep
+    # only the first 400 cuts of each to make computing validation loss feasible.
+    for lang in $languages
+    do
+      for part in dev test
+      do
+        lhotse subset --first 400 \
+          "${audio_feats_dir}/commonvoice_cuts_${lang}_${part}.jsonl.gz" \
+          "${audio_feats_dir}/commonvoice_cuts_${lang}_${part}_subset.jsonl.gz"
+      done
+    done
+
+    # Collect the per-language cut manifests in arrays (one path per element)
+    # so each path is passed intact even if it contains whitespace.
+    cutsTrainList=()
+    cutsDevList=()
+    cutsTestList=()
+    for lang in $languages
+    do
+      cutsTrainList+=("${audio_feats_dir}/commonvoice_cuts_${lang}_train.jsonl.gz")
+      cutsDevList+=("${audio_feats_dir}/commonvoice_cuts_${lang}_dev_subset.jsonl.gz")
+      cutsTestList+=("${audio_feats_dir}/commonvoice_cuts_${lang}_test_subset.jsonl.gz")
+    done
+    # printf '%s\n' "${cutsTrainList[@]}" # debug
+    # printf '%s\n' "${cutsDevList[@]}" # debug
+    # printf '%s\n' "${cutsTestList[@]}" # debug
+
+    # Combine all languages into a single manifest per split
+    lhotse combine "${cutsTrainList[@]}" "${audio_feats_dir}/cuts_train.jsonl.gz"
+    lhotse combine "${cutsDevList[@]}" "${audio_feats_dir}/cuts_dev.jsonl.gz"
+    lhotse combine "${cutsTestList[@]}" "${audio_feats_dir}/cuts_test.jsonl.gz"
+
+    touch "${audio_feats_dir}/.commonvoice.train.done"
+  fi
+fi
+
+python3 ./bin/display_manifest_statistics.py --manifest-dir "${audio_feats_dir}"
diff --git a/egs/commonvoice/prompts/ch_24k.txt b/egs/commonvoice/prompts/ch_24k.txt
@@ -0,0 +1 @@
+甚至 出现 交易 几乎 停滞 的 情况
diff --git a/egs/commonvoice/prompts/ch_24k.wav b/egs/commonvoice/prompts/ch_24k.wav
diff --git a/egs/commonvoice/prompts/ch_24k_loudness_normalized20.wav b/egs/commonvoice/prompts/ch_24k_loudness_normalized20.wav
diff --git a/egs/commonvoice/shared b/egs/commonvoice/shared
@@ -0,0 +1 @@
+../libritts/shared/
diff --git a/valle/bin/tokenizer.py b/valle/bin/tokenizer.py
@@ -180,7 +180,7 @@ def main():
                         f"{args.output_dir}/{args.prefix}_fbank_{partition}"
                     )
 
-                if args.prefix.lower() in ["ljspeech", "aishell", "baker"]:
+                if args.prefix.lower() in ["ljspeech", "aishell", "baker", "commonvoice"]:
                     cut_set = cut_set.resample(24000)
                     # https://github.com/lifeiteng/vall-e/issues/90
                     # if args.prefix == "aishell":
@@ -233,8 +233,7 @@ def main():
                                 text_tokenizer, text=c.supervisions[0].text
                             )
                             c.supervisions[0].custom = {}
-                        else:
-                            assert args.prefix == "libritts"
+                        else:  # libritts, commonvoice
                             phonemes = tokenize_text(
                                 text_tokenizer, text=c.supervisions[0].text
                             )