diff --git a/egs/commonvoice/README.md b/egs/commonvoice/README.md new file mode 100644 index 0000000..698ccf9 --- /dev/null +++ b/egs/commonvoice/README.md @@ -0,0 +1,149 @@ +# Mozilla CommonVoice + +## Install deps +``` +pip install librosa==0.8.1 matplotlib h5py +``` + +## Prepare Dataset +``` +cd egs/commonvoice + +# Those stages are very time-consuming +bash prepare.sh --stage -1 --stop-stage 3 +# NOTE: the --stage/--stop-stage options did not seem to take effect when this was written (not fixed yet); if they are ignored, set stage and stop_stage manually at the top of prepare.sh. + +## train +Cut statistics: +╒═══════════════════════════╤════════════╕ +│ Cuts count: │ 5122876 │ +├───────────────────────────┼────────────┤ +│ Total duration (hh:mm:ss) │ 7598:10:45 │ +├───────────────────────────┼────────────┤ +│ mean │ 5.3 │ +├───────────────────────────┼────────────┤ +│ std │ 1.7 │ +├───────────────────────────┼────────────┤ +│ min │ 0.1 │ +├───────────────────────────┼────────────┤ +│ 25% │ 4.1 │ +├───────────────────────────┼────────────┤ +│ 50% │ 5.2 │ +├───────────────────────────┼────────────┤ +│ 75% │ 6.4 │ +├───────────────────────────┼────────────┤ +│ 99% │ 9.7 │ +├───────────────────────────┼────────────┤ +│ 99.5% │ 10.1 │ +├───────────────────────────┼────────────┤ +│ 99.9% │ 10.5 │ +├───────────────────────────┼────────────┤ +│ max │ 49.9 │ +├───────────────────────────┼────────────┤ +│ Recordings available: │ 5122876 │ +├───────────────────────────┼────────────┤ +│ Features available: │ 5122876 │ +├───────────────────────────┼────────────┤ +│ Supervisions available: │ 5122876 │ +╘═══════════════════════════╧════════════╛ +SUPERVISION custom fields: +Speech duration statistics: +╒══════════════════════════════╤════════════╤══════════════════════╕ +│ Total speech duration │ 7598:10:45 │ 100.00% of recording │ +├──────────────────────────────┼────────────┼──────────────────────┤ +│ Total speaking time duration │ 7598:10:45 │ 100.00% of recording │ 
+├──────────────────────────────┼────────────┼──────────────────────┤ +│ Total silence duration │ 00:00:01 │ 0.00% of recording │ +╘══════════════════════════════╧════════════╧══════════════════════╛ + + +## dev +Cut statistics: +╒═══════════════════════════╤══════════╕ +│ Cuts count: │ 9600 │ +├───────────────────────────┼──────────┤ +│ Total duration (hh:mm:ss) │ 13:35:01 │ +├───────────────────────────┼──────────┤ +│ mean │ 5.1 │ +├───────────────────────────┼──────────┤ +│ std │ 1.8 │ +├───────────────────────────┼──────────┤ +│ min │ 1.3 │ +├───────────────────────────┼──────────┤ +│ 25% │ 3.7 │ +├───────────────────────────┼──────────┤ +│ 50% │ 4.9 │ +├───────────────────────────┼──────────┤ +│ 75% │ 6.3 │ +├───────────────────────────┼──────────┤ +│ 99% │ 9.9 │ +├───────────────────────────┼──────────┤ +│ 99.5% │ 10.1 │ +├───────────────────────────┼──────────┤ +│ 99.9% │ 10.5 │ +├───────────────────────────┼──────────┤ +│ max │ 11.0 │ +├───────────────────────────┼──────────┤ +│ Recordings available: │ 9600 │ +├───────────────────────────┼──────────┤ +│ Features available: │ 9600 │ +├───────────────────────────┼──────────┤ +│ Supervisions available: │ 9600 │ +╘═══════════════════════════╧══════════╛ +SUPERVISION custom fields: +Speech duration statistics: +╒══════════════════════════════╤══════════╤══════════════════════╕ +│ Total speech duration │ 13:35:01 │ 100.00% of recording │ +├──────────────────────────────┼──────────┼──────────────────────┤ +│ Total speaking time duration │ 13:35:01 │ 100.00% of recording │ +├──────────────────────────────┼──────────┼──────────────────────┤ +│ Total silence duration │ 00:00:01 │ 0.00% of recording │ +╘══════════════════════════════╧══════════╧══════════════════════╛ + + +## test +Cut statistics: +╒═══════════════════════════╤══════════╕ +│ Cuts count: │ 9600 │ +├───────────────────────────┼──────────┤ +│ Total duration (hh:mm:ss) │ 14:20:55 │ +├───────────────────────────┼──────────┤ +│ mean │ 5.4 │ 
+├───────────────────────────┼──────────┤
+│ std │ 1.9 │
+├───────────────────────────┼──────────┤
+│ min │ 1.0 │
+├───────────────────────────┼──────────┤
+│ 25% │ 3.9 │
+├───────────────────────────┼──────────┤
+│ 50% │ 5.1 │
+├───────────────────────────┼──────────┤
+│ 75% │ 6.7 │
+├───────────────────────────┼──────────┤
+│ 99% │ 10.2 │
+├───────────────────────────┼──────────┤
+│ 99.5% │ 10.4 │
+├───────────────────────────┼──────────┤
+│ 99.9% │ 10.9 │
+├───────────────────────────┼──────────┤
+│ max │ 21.7 │
+├───────────────────────────┼──────────┤
+│ Recordings available: │ 9600 │
+├───────────────────────────┼──────────┤
+│ Features available: │ 9600 │
+├───────────────────────────┼──────────┤
+│ Supervisions available: │ 9600 │
+╘═══════════════════════════╧══════════╛
+SUPERVISION custom fields:
+Speech duration statistics:
+╒══════════════════════════════╤══════════╤══════════════════════╕
+│ Total speech duration │ 14:20:55 │ 100.00% of recording │
+├──────────────────────────────┼──────────┼──────────────────────┤
+│ Total speaking time duration │ 14:20:55 │ 100.00% of recording │
+├──────────────────────────────┼──────────┼──────────────────────┤
+│ Total silence duration │ 00:00:01 │ 0.00% of recording │
+╘══════════════════════════════╧══════════╧══════════════════════╛
+```
+
+## Training & Inference
+refer to [Training](../../README.md##Training&Inference)
diff --git a/egs/commonvoice/bin b/egs/commonvoice/bin
new file mode 120000
index 0000000..501d03e
--- /dev/null
+++ b/egs/commonvoice/bin
@@ -0,0 +1 @@
+../../valle/bin
\ No newline at end of file
diff --git a/egs/commonvoice/prepare.sh b/egs/commonvoice/prepare.sh
new file mode 100755
index 0000000..9a4914d
--- /dev/null
+++ b/egs/commonvoice/prepare.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+# Prepare the Mozilla CommonVoice data: download the release (stage 0),
+# build lhotse manifests (stage 1), run bin/tokenizer.py (stage 2) and
+# combine the per-language cuts into train/dev/test cut sets (stage 3).
+
+set -eou pipefail
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+nj=16
+stage=-1
+stop_stage=4
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+#  - $dl_dir/$release
+#      You can download commonvoice from https://commonvoice.mozilla.org/
+#
+
+dl_dir=$PWD/download
+release=cv-corpus-13.0-2023-03-09
+languages="en de fr cy tt kab ca zh-TW it fa eu es ru tr nl eo zh-CN rw pt zh-HK cs pl uk ja"
+
+dataset_parts="train dev test"
+
+audio_extractor="Encodec"  # or Fbank
+audio_feats_dir=data/tokenized
+
+. shared/parse_options.sh || exit 1
+
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "dl_dir: $dl_dir"
+  log "Stage 0: Download data"
+
+  # If you have pre-downloaded the release to /path/to/$release,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/$release $dl_dir/$release
+  #
+  for lang in $languages
+  do
+    log "Checking if language ${lang} has been downloaded..."
+    if [ ! -d $dl_dir/$release/$lang/clips ]; then
+      lhotse download commonvoice --languages $lang --release $release $dl_dir
+    fi
+  done
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare commonvoice manifest"
+  # We assume that you have downloaded the commonvoice corpus
+  # to $dl_dir/$release
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.commonvoice.done ]; then
+    for lang in $languages
+    do
+      if [ ! -e data/manifests/.commonvoice-${lang}.done ]; then
+        lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/manifests
+        # Rename to the commonvoice_<type>_<lang>_<part> names that bin/tokenizer.py
+        # is pointed at (--prefix commonvoice). This runs inside the per-language guard,
+        # before the marker is touched, so a rerun never mv's already-renamed files.
+        for part in $dataset_parts
+        do
+          mv data/manifests/cv-${lang}_recordings_${part}.jsonl.gz data/manifests/commonvoice_recordings_${lang}_${part}.jsonl.gz
+          mv data/manifests/cv-${lang}_supervisions_${part}.jsonl.gz data/manifests/commonvoice_supervisions_${lang}_${part}.jsonl.gz
+        done
+        touch data/manifests/.commonvoice-${lang}.done
+      fi
+    done
+    touch data/manifests/.commonvoice.done
+  fi
+fi
+
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Tokenize/Fbank commonvoice"
+
+  # Build all Parts for all the commonvoice datasets
+  dynamicPartList=""
+  for lang in $languages
+  do
+    for part in $dataset_parts
+    do
+      dynamicPartList+="${lang}_${part} "
+    done
+  done
+  # echo "${dynamicPartList}" # debug
+
+  mkdir -p ${audio_feats_dir}
+  if [ ! -e ${audio_feats_dir}/.commonvoice.tokenize.done ]; then
+    python3 bin/tokenizer.py --dataset-parts "${dynamicPartList}" \
+      --audio-extractor ${audio_extractor} \
+      --batch-duration 400 \
+      --prefix "commonvoice" \
+      --src-dir "data/manifests" \
+      --output-dir "${audio_feats_dir}"
+  fi
+  touch ${audio_feats_dir}/.commonvoice.tokenize.done
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Prepare commonvoice train/dev/test"
+  if [ ! -e ${audio_feats_dir}/.commonvoice.train.done ]; then
+
+    # Due to sheer size of the dev and train sets for the individual languages
+    # we need to only take small subsets from each language so we're able to calculate validation loss
+    for lang in $languages
+    do
+      lhotse subset --first 400 \
+        ${audio_feats_dir}/commonvoice_cuts_${lang}_dev.jsonl.gz \
+        ${audio_feats_dir}/commonvoice_cuts_${lang}_dev_subset.jsonl.gz
+
+      lhotse subset --first 400 \
+        ${audio_feats_dir}/commonvoice_cuts_${lang}_test.jsonl.gz \
+        ${audio_feats_dir}/commonvoice_cuts_${lang}_test_subset.jsonl.gz
+    done
+
+    # Build all Parts for all the commonvoice datasets
+    cutsTrainList=""
+    cutsDevList=""
+    cutsTestList=""
+    for lang in $languages
+    do
+      cutsTrainList+="${audio_feats_dir}/commonvoice_cuts_${lang}_train.jsonl.gz "
+      cutsDevList+="${audio_feats_dir}/commonvoice_cuts_${lang}_dev_subset.jsonl.gz "
+      cutsTestList+="${audio_feats_dir}/commonvoice_cuts_${lang}_test_subset.jsonl.gz "
+    done
+    # echo "${cutsTrainList}" # debug
+    # echo "${cutsDevList}" # debug
+    # echo "${cutsTestList}" # debug
+
+    # Combine all datasets
+    lhotse combine ${cutsTrainList} ${audio_feats_dir}/cuts_train.jsonl.gz
+    lhotse combine ${cutsDevList} ${audio_feats_dir}/cuts_dev.jsonl.gz
+    lhotse combine ${cutsTestList} ${audio_feats_dir}/cuts_test.jsonl.gz
+
+    touch ${audio_feats_dir}/.commonvoice.train.done
+  fi
+fi
+
+python3 ./bin/display_manifest_statistics.py --manifest-dir ${audio_feats_dir}
diff --git a/egs/commonvoice/prompts/ch_24k.txt b/egs/commonvoice/prompts/ch_24k.txt
new file mode 100644
index 0000000..4d2fbbb
--- /dev/null
+++ b/egs/commonvoice/prompts/ch_24k.txt
@@ -0,0 +1 @@
+甚至 出现 交易 几乎 停滞 的 情况
diff --git a/egs/commonvoice/prompts/ch_24k.wav b/egs/commonvoice/prompts/ch_24k.wav
new file mode 100644
index 0000000..ab68b98
Binary files /dev/null and b/egs/commonvoice/prompts/ch_24k.wav differ
diff --git a/egs/commonvoice/prompts/ch_24k_loudness_normalized20.wav
b/egs/commonvoice/prompts/ch_24k_loudness_normalized20.wav new file mode 100644 index 0000000..28a54ec Binary files /dev/null and b/egs/commonvoice/prompts/ch_24k_loudness_normalized20.wav differ diff --git a/egs/commonvoice/shared b/egs/commonvoice/shared new file mode 120000 index 0000000..71e59eb --- /dev/null +++ b/egs/commonvoice/shared @@ -0,0 +1 @@ +../libritts/shared/ \ No newline at end of file diff --git a/valle/bin/tokenizer.py b/valle/bin/tokenizer.py index 0c88be6..f3fa8b2 100644 --- a/valle/bin/tokenizer.py +++ b/valle/bin/tokenizer.py @@ -180,7 +180,7 @@ def main(): f"{args.output_dir}/{args.prefix}_fbank_{partition}" ) - if args.prefix.lower() in ["ljspeech", "aishell", "baker"]: + if args.prefix.lower() in ["ljspeech", "aishell", "baker", "commonvoice"]: cut_set = cut_set.resample(24000) # https://github.com/lifeiteng/vall-e/issues/90 # if args.prefix == "aishell": @@ -233,8 +233,7 @@ def main(): text_tokenizer, text=c.supervisions[0].text ) c.supervisions[0].custom = {} - else: - assert args.prefix == "libritts" + else: # libritts, commonvoice phonemes = tokenize_text( text_tokenizer, text=c.supervisions[0].text )