From ef8f317440e614c72d8421d35f84628aa5002ed8 Mon Sep 17 00:00:00 2001
From: HighPriest
Date: Fri, 16 Feb 2024 03:37:15 +0100
Subject: [PATCH 1/2] WIP: Replace bash run script with Python for VALLE

So now it can be executed on Windows.
---
 egs/tts/VALLE/README.md |  16 ++--
 egs/tts/VALLE/run.py    | 105 ++++++++++++++++++++++++++
 egs/tts/VALLE/run.sh    | 158 ----------------------------------------
 3 files changed, 113 insertions(+), 166 deletions(-)
 create mode 100644 egs/tts/VALLE/run.py
 delete mode 100644 egs/tts/VALLE/run.sh

diff --git a/egs/tts/VALLE/README.md b/egs/tts/VALLE/README.md
index e9fe0da9..7b1ebe5b 100644
--- a/egs/tts/VALLE/README.md
+++ b/egs/tts/VALLE/README.md
@@ -54,10 +54,10 @@ Specify the `processed_dir` and the `log_dir` and for saving the processed data
 
-Run the `run.sh` as the preproces stage (set `--stage 1`):
+Run `run.py` as the preprocessing stage (set `--stage 1`):
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 1
+python egs/tts/VALLE/run.py --stage 1
 ```
 
-> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.py`, e.g., `--gpu "1"`.
 
 ## 3. Training
@@ -74,7 +74,7 @@ We provide the default hyparameters in the `exp_config.json`. They can work on s
 
 ### Run
 
-Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
+Run `run.py` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
 
 Specifically, VALL-E need to train a autoregressive (AR) model and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train AR model, and set `--model_train_stage 2` to train NAR model, where `--ar_model_ckpt_dir` should be set as the ckeckpoint path to the trained AR model.
 
@@ -82,23 +82,23 @@ Specifically, VALL-E need to train a autoregressive (AR) model and then a non-au
-Train a AR moel, just run:
+Train an AR model, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName]
+python egs/tts/VALLE/run.py --stage 2 --model_train_stage 1 --name [YourExptName]
 ```
 
 Train a NAR model, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
+python egs/tts/VALLE/run.py --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
 ```
 
-> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.py`, e.g., `--gpu "0,1,2,3"`.
 
 ## 4. Inference
 
 ### Configuration
 
-For inference, you need to specify the following configurations when running `run.sh`:
+For inference, you need to specify the following configurations when running `run.py`:
 
@@ -117,7 +117,7 @@ For inference, you need to specify the following configurations when running `ru
 For example, if you want to generate a single clip of speech, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
+python egs/tts/VALLE/run.py --stage 3 --gpu "0" \
     --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
     --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
     --infer_mode "single" \
diff --git a/egs/tts/VALLE/run.py b/egs/tts/VALLE/run.py
new file mode 100644
index 00000000..3bab758a
--- /dev/null
+++ b/egs/tts/VALLE/run.py
@@ -0,0 +1,105 @@
+import os
+import sys
+import subprocess
+from argparse import ArgumentParser
+
+
+# Set up directories
+work_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+print(f"Work Directory: {work_dir}")
+
+os.environ['WORK_DIR'] = work_dir
+os.environ['PYTHONPATH'] = work_dir
+os.environ['PYTHONIOENCODING'] = 'UTF-8'
+
+# Build Monotonic Align Module
+os.chdir(os.path.join(work_dir, 'modules', 'monotonic_align'))
+os.makedirs('monotonic_align', exist_ok=True)
+subprocess.run([sys.executable, 'setup.py', 'build_ext', '--inplace'], check=True)
+os.chdir(work_dir)
+
+# Parse parameters
+parser = ArgumentParser()
+parser.add_argument('-c', '--config', help='Experimental Configuration File')
+parser.add_argument('-n', '--name', help='Experimental Name')
+parser.add_argument('-s', '--stage', type=int, help='Running Stage')
+parser.add_argument('--gpu', type=str, help='Visible GPU machines')
+parser.add_argument('--model_train_stage', type=str, help='Model Training Stage')
+parser.add_argument('--ar_model_ckpt_dir', type=str, help='The stage1 ckpt dir')
+parser.add_argument('--infer_expt_dir', type=str, help='The experiment dir')
+parser.add_argument('--infer_output_dir', type=str, help='The output dir to save inferred audios')
+parser.add_argument('--infer_mode', type=str, help='The inference mode')
+parser.add_argument('--infer_test_list_file', type=str, help='The inference test list file')
+parser.add_argument('--infer_text', type=str, help='The text to be synthesized from')
+parser.add_argument('--infer_text_prompt', type=str, default='', help='The inference text prompt')
+parser.add_argument('--infer_audio_prompt', type=str, default='', help='The inference audio prompt')
+args = parser.parse_args()
+
+# Check required parameters
+if args.stage is None:
+    print("Error: Please specify the running stage")
+    sys.exit(1)
+
+if args.config is None:
+    # Default to the exp_config.json next to this script, as the old run.sh did
+    args.config = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'exp_config.json')
+print(f"Experimental Configuration File: {args.config}")
+
+if args.gpu is None:
+    args.gpu = '0'
+
+# Features Extraction
+if args.stage == 1:
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    cmd = [sys.executable, os.path.join(work_dir, "bins", "tts", "preprocess.py"), '--config', args.config, '--num_workers', '4']
+    subprocess.run(cmd, check=True, cwd=work_dir)
+
+# Training
+if args.stage == 2:
+    if args.name is None:
+        print("Error: Please specify the experiments name")
+        sys.exit(1)
+
+    if args.model_train_stage == '2' and args.ar_model_ckpt_dir is None:
+        print("Error: Please specify the checkpoint path to the trained model in stage1.")
+        sys.exit(1)
+
+    if args.model_train_stage == '1':
+        # Pass the literal string "None", matching the behaviour of the old run.sh
+        args.ar_model_ckpt_dir = 'None'
+
+    print(f"Experimental Name: {args.name}")
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    # Launch training through Accelerate, as the original run.sh did
+    cmd = ['accelerate', 'launch', '--main_process_port', '29510', os.path.join(work_dir, "bins", "tts", "train.py"), '--config', args.config, '--exp_name', args.name, '--log_level', 'debug', '--train_stage', args.model_train_stage, '--checkpoint_path', args.ar_model_ckpt_dir]
+    subprocess.run(cmd, check=True)
+
+# Inference
+if args.stage == 3:
+    if args.infer_expt_dir is None:
+        print("Error: Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]")
+        sys.exit(1)
+
+    if args.infer_output_dir is None:
+        args.infer_output_dir = os.path.join(args.infer_expt_dir, 'result')
+
+    if args.infer_mode is None:
+        print("Error: Please specify the inference mode, e.g., \"batch\", \"single\"")
+        sys.exit(1)
+
+    if args.infer_mode == 'batch' and args.infer_test_list_file is None:
+        print("Error: Please specify the test list file used in inference when the inference mode is batch")
+        sys.exit(1)
+
+    if args.infer_mode == 'single' and args.infer_text is None:
+        print("Error: Please specify the text to be synthesized when the inference mode is single")
+        sys.exit(1)
+
+    if args.infer_mode == 'single':
+        print(f'Text: {args.infer_text}')
+        # Pass the literal string "None", matching the behaviour of the old run.sh
+        args.infer_test_list_file = 'None'
+    elif args.infer_mode == 'batch':
+        args.infer_text = ""
+        args.infer_text_prompt = ""
+        args.infer_audio_prompt = ""
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    # Launch inference through Accelerate, as the original run.sh did
+    cmd = ['accelerate', 'launch', os.path.join(work_dir, "bins", "tts", "inference.py"), '--config', args.config, '--log_level', 'debug', '--acoustics_dir', args.infer_expt_dir, '--output_dir', args.infer_output_dir, '--mode', args.infer_mode, '--text', args.infer_text, '--text_prompt', args.infer_text_prompt, '--audio_prompt', args.infer_audio_prompt, '--test_list_file', args.infer_test_list_file]
+    subprocess.run(cmd, check=True)
diff --git a/egs/tts/VALLE/run.sh b/egs/tts/VALLE/run.sh
deleted file mode 100644
index 104f9f32..00000000
--- a/egs/tts/VALLE/run.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2023 Amphion.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-######## Build Experiment Environment ###########
-exp_dir=$(cd `dirname $0`; pwd)
-work_dir=$(dirname $(dirname $(dirname $exp_dir)))
-
-export WORK_DIR=$work_dir
-export PYTHONPATH=$work_dir
-export PYTHONIOENCODING=UTF-8
-
-cd $work_dir/modules/monotonic_align
-mkdir -p monotonic_align
-python setup.py build_ext --inplace
-cd $work_dir
-
-######## Parse the Given Parameters from the Commond ###########
-options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,ar_model_ckpt_dir:,infer_output_dir:,infer_mode:,infer_test_list_file:,infer_text:,infer_text_prompt:,infer_audio_prompt:,model_train_stage:,name:,stage: -- "$@")
-eval set -- "$options"
-
-while true; do
-  case $1 in
-    # Experimental Configuration File
-    -c | --config) shift; exp_config=$1 ; shift ;;
-    # Experimental Name
-    -n | --name) shift; exp_name=$1 ; shift ;;
-    # Running Stage
-    -s | --stage) shift; running_stage=$1 ; shift ;;
-    # Visible GPU machines. The default value is "0".
-    --gpu) shift; gpu=$1 ; shift ;;
-
-    # [Only for Training] Model training stage.
-    --model_train_stage) shift; model_train_stage=$1 ; shift ;;
-    # [Only for Training] The stage1 ckpt dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
-    --ar_model_ckpt_dir) shift; ar_model_ckpt_dir=$1 ; shift ;;
-
The value is like "[Your path to save logs and checkpoints]/[YourExptName]" - --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; - # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" - --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; - - # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. - --infer_mode) shift; infer_mode=$1 ; shift ;; - # [Only for Inference] The inference test list file. It is only used when the inference model is "batch". - --infer_test_list_file) shift; infer_test_list_file=$1 ; shift ;; - # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". - --infer_text) shift; infer_text=$1 ; shift ;; - # [Only for Inference] The inference text prompt. It is only used when the inference model is "single". - --infer_text_prompt) shift; infer_text_prompt=$1 ; shift ;; - # [Only for Inference] The inference audio prompt. It is only used when the inference model is "single". - --infer_audio_prompt) shift; infer_audio_prompt=$1 ; shift ;; - - --) shift ; break ;; - *) echo "Invalid option: $1" exit 1 ;; - esac -done - - -### Value check ### -if [ -z "$running_stage" ]; then - echo "[Error] Please specify the running stage" - exit 1 -fi - -if [ -z "$exp_config" ]; then - exp_config="${exp_dir}"/exp_config.json -fi -echo "Exprimental Configuration File: $exp_config" - -if [ -z "$gpu" ]; then - gpu="0" -fi - -######## Features Extraction ########### -if [ $running_stage -eq 1 ]; then - CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ - --config=$exp_config \ - --num_workers=4 -fi - -######## Training ########### -if [ $running_stage -eq 2 ]; then - if [ -z "$exp_name" ]; then - echo "[Error] Please specify the experiments name" - exit 1 - fi - - if [ "$model_train_stage" = "2" ] && [ -z "$ar_model_ckpt_dir" ]; then - echo "[Error] Please specify the ckeckpoint path to the trained model in stage1." - exit 1 - fi - - if [ "$model_train_stage" = "1" ]; then - ar_model_ckpt_dir=None - fi - - echo "Exprimental Name: $exp_name" - - CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \ - "${work_dir}"/bins/tts/train.py \ - --config $exp_config \ - --exp_name $exp_name \ - --log_level debug \ - --train_stage $model_train_stage \ - --checkpoint_path $ar_model_ckpt_dir -fi - - -######## Inference ########### -if [ $running_stage -eq 3 ]; then - if [ -z "$infer_expt_dir" ]; then - echo "[Error] Please specify the experimental directionary. 
-        echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
-        exit 1
-    fi
-
-    if [ -z "$infer_output_dir" ]; then
-        infer_output_dir="$expt_dir/result"
-    fi
-
-    if [ -z "$infer_mode" ]; then
-        echo "[Error] Please specify the inference mode, e.g., "batch", "single""
-        exit 1
-    fi
-
-    if [ "$infer_mode" = "batch" ] && [ -z "$infer_test_list_file" ]; then
-        echo "[Error] Please specify the test list file used in inference when the inference mode is batch"
-        exit 1
-    fi
-
-    if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
-        echo "[Error] Please specify the text to be synthesized when the inference mode is single"
-        exit 1
-    fi
-
-    if [ "$infer_mode" = "single" ]; then
-        echo 'Text: ' ${infer_text}
-        infer_test_list_file=None
-    elif [ "$infer_mode" = "batch" ]; then
-        infer_text=""
-        infer_text_prompt=""
-        infer_audio_prompt=""
-    fi
-
-
-    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
-        --config $exp_config \
-        --log_level debug \
-        --acoustics_dir $infer_expt_dir \
-        --output_dir $infer_output_dir \
-        --mode $infer_mode \
-        --text "$infer_text" \
-        --text_prompt "$infer_text_prompt" \
-        --audio_prompt $infer_audio_prompt\
-        --test_list_file $infer_test_list_file \
-
-fi

From f7348b8aae16467b6ec4a8b29fbb36eb4c45a892 Mon Sep 17 00:00:00 2001
From: HighPriest
Date: Fri, 16 Feb 2024 03:42:48 +0100
Subject: [PATCH 2/2] WIP: Use OS-agnostic path handling for filesystem
 operations

NOTE: This is not final! It only covers the code paths needed to run VALL-E
with LibriTTS.
---
 bins/calc_metrics.py                          |  8 ++--
 bins/svc/inference.py                         |  4 +-
 .../similarity/resemblyzer_similarity.py      |  2 +-
 models/base/base_inference.py                 |  4 +-
 models/vocoders/vocoder_inference.py          |  6 +--
 preprocessors/cdmusiceval.py                  |  6 +--
 preprocessors/coco.py                         |  4 +-
 preprocessors/cocoeval.py                     |  6 +--
 preprocessors/csd.py                          |  6 +--
 preprocessors/customsvcdataset.py             | 10 ++---
 preprocessors/libritts.py                     | 44 +++++++++----------
 preprocessors/pjs.py                          |  5 ++-
 utils/audio_slicer.py                         |  2 +-
 13 files changed, 55 insertions(+), 52 deletions(-)

diff --git a/bins/calc_metrics.py b/bins/calc_metrics.py
index f093bc15..78749554 100644
--- a/bins/calc_metrics.py
+++ b/bins/calc_metrics.py
@@ -86,14 +86,16 @@ def calc_metric(
             continue
 
         audios_ref = []
+        import os
+
         audios_deg = []
 
-        files = glob(ref_dir + "/*.wav")
+        files = glob(os.path.join(ref_dir, "*.wav"))
 
         for file in files:
             audios_ref.append(file)
-            uid = file.split("/")[-1].split(".wav")[0]
-            file_gt = deg_dir + "/{}.wav".format(uid)
+            uid = os.path.splitext(os.path.basename(file))[0]
+            file_gt = os.path.join(deg_dir, f"{uid}.wav")
             audios_deg.append(file_gt)
 
         if metric in ["v_uv_f1"]:
diff --git a/bins/svc/inference.py b/bins/svc/inference.py
index da0031d1..dac4ec9f 100644
--- a/bins/svc/inference.py
+++ b/bins/svc/inference.py
@@ -227,14 +227,14 @@ def main():
     audio_list = []
     for suffix in ["wav", "flac", "mp3"]:
         audio_list += glob.glob(
-            os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
+            os.path.join(source_audio_dir, "**", "*.{}".format(suffix)), recursive=True
         )
     print("There are {} source audios: ".format(len(audio_list)))
 
     # Infer for every file as dataset
     output_root_path = args.output_dir
     for audio_path in tqdm(audio_list):
-        audio_name = audio_path.split("/")[-1].split(".")[0]
+        audio_name = os.path.splitext(os.path.basename(audio_path))[0]
         args.output_dir = os.path.join(output_root_path, audio_name)
         print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
diff --git a/evaluation/metrics/similarity/resemblyzer_similarity.py b/evaluation/metrics/similarity/resemblyzer_similarity.py
index 15d89d5b..d02a4669 100644
--- a/evaluation/metrics/similarity/resemblyzer_similarity.py
+++ b/evaluation/metrics/similarity/resemblyzer_similarity.py
@@ -60,7 +60,7 @@ def extract_resemblyzer_similarity(target_path, reference_path, dump_dir):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    filename = target_path.split("/")[-1]
+    filename = os.path.basename(target_path)
     csv_file_name = f"similarity_results_{filename}.csv"
     dump_dir = dump_dir + "/" + csv_file_name
 
diff --git a/models/base/base_inference.py b/models/base/base_inference.py
index 2713f19a..88348216 100644
--- a/models/base/base_inference.py
+++ b/models/base/base_inference.py
@@ -85,8 +85,8 @@ def get_vocoder_info(self):
             os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
         )
         self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
-        self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
-        self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]
+        self.vocoder_tag = os.path.basename(os.path.dirname(self.checkpoint_dir_vocoder)).split(":")[-1]
+        self.vocoder_steps = os.path.splitext(os.path.basename(self.checkpoint_dir_vocoder))[0]
 
     def build_test_utt_data(self):
         raise NotImplementedError
diff --git a/models/vocoders/vocoder_inference.py b/models/vocoders/vocoder_inference.py
index 95e354c5..6aebd026 100644
--- a/models/vocoders/vocoder_inference.py
+++ b/models/vocoders/vocoder_inference.py
@@ -179,7 +179,7 @@ def _build_tmp_dataset_from_feature(self):
         utts = []
         mels = glob(os.path.join(self.args.feature_folder, "mels", "*.npy"))
         for i, mel in enumerate(mels):
-            uid = mel.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(mel))[0]
             utt = {"Dataset": "tmp", "Uid": uid, "index": i}
             utts.append(utt)
 
@@ -199,7 +199,7 @@ def _build_tmp_dataset_from_feature(self):
 
         features = glob(os.path.join(self.args.feature_folder, "*"))
         for feature in features:
-            feature_name = feature.split("/")[-1]
+            feature_name = os.path.basename(feature)
             if os.path.isfile(feature):
                 continue
             shutil.copytree(
@@ -214,7 +214,7 @@ def _build_tmp_dataset_from_audio(self):
         utts = []
         audios = glob(os.path.join(self.args.audio_folder, "*"))
         for i, audio in enumerate(audios):
-            uid = audio.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(audio))[0]
             utt = {"Dataset": "tmp", "Uid": uid, "index": i, "Path": audio}
             utts.append(utt)
 
diff --git a/preprocessors/cdmusiceval.py b/preprocessors/cdmusiceval.py
index 1c4af97c..b8da228f 100644
--- a/preprocessors/cdmusiceval.py
+++ b/preprocessors/cdmusiceval.py
@@ -60,12 +60,12 @@ def statistics(utterance_dir):
     singer_infos = glob(utterance_dir + "/*")
 
     for singer_info in singer_infos:
-        singer = singer_info.split("/")[-1]
+        singer = os.path.basename(singer_info)
 
         song_infos = glob(singer_info + "/*")
 
         for song_info in song_infos:
-            song = song_info.split("/")[-1]
+            song = os.path.basename(song_info)
 
             singers.append(singer)
             songs.append(song)
 
             utts = glob(song_info + "/*.wav")
 
             for utt in utts:
-                uid = utt.split("/")[-1].split(".")[0]
+                uid = os.path.splitext(os.path.basename(utt))[0]
                 singers2songs[singer][song].append(uid)
 
     unique_singers = list(set(singers))
diff --git a/preprocessors/coco.py b/preprocessors/coco.py
index 9ac462f3..01bff952 100644
--- a/preprocessors/coco.py
+++ b/preprocessors/coco.py
@@ -24,10 +24,10 @@ def coco_statistics(data_dir):
     song_infos = glob(data_dir + "/*")
 
     for song in song_infos:
-        song_name = song.split("/")[-1]
+        song_name = os.path.basename(song)
         utts = glob(song + "/*.wav")
         for utt in utts:
-            uid = utt.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(utt))[0]
             song2utts[song_name].append(uid)
 
     print("Coco: {} songs".format(len(song_infos)))
diff --git a/preprocessors/cocoeval.py b/preprocessors/cocoeval.py
index 3cb3604c..1e918f4a 100644
--- a/preprocessors/cocoeval.py
+++ b/preprocessors/cocoeval.py
@@ -25,7 +25,7 @@ def _split_utts():
     vocal_files = glob(os.path.join(raw_dir, "*/vocal.wav"))
 
     for vocal_f in tqdm(vocal_files):
-        song_name = vocal_f.split("/")[-2]
+        song_name = os.path.basename(os.path.dirname(vocal_f))
         output_dir = os.path.join(output_root, song_name)
         os.makedirs(output_dir, exist_ok=True)
 
@@ -39,10 +39,10 @@ def cocoeval_statistics(data_dir):
     song_infos = glob(data_dir + "/*")
 
     for song in song_infos:
-        song_name = song.split("/")[-1]
+        song_name = os.path.basename(song)
         utts = glob(song + "/*.wav")
         for utt in utts:
-            uid = utt.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(utt))[0]
             song2utts[song_name].append(uid)
 
     print("Cocoeval: {} songs".format(len(song_infos)))
diff --git a/preprocessors/csd.py b/preprocessors/csd.py
index 645a8b3d..8d854206 100644
--- a/preprocessors/csd.py
+++ b/preprocessors/csd.py
@@ -38,7 +38,7 @@ def split_to_utterances(language_dir, output_dir):
     pitches = set()
     for wav_file in tqdm(glob("{}/*.wav".format(wav_dir))):
         # Load waveform
-        song_name = wav_file.split("/")[-1].split(".")[0]
+        song_name = os.path.splitext(os.path.basename(wav_file))[0]
         waveform, fs = torchaudio.load(wav_file)
 
         # Load utterances
@@ -100,7 +100,7 @@ def csd_statistics(data_dir):
     folder_infos = glob(data_dir + "/*")
 
     for folder_info in folder_infos:
-        folder_info_split = folder_info.split("/")[-1]
+        folder_info_split = os.path.basename(folder_info)
 
         language = folder_info_split[:2]
         song = folder_info_split[2:]
 
         utts = glob(folder_info + "/*")
 
         for utt in utts:
-            uid = utt.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(utt))[0]
             languages2songs[language][song].append(uid)
 
     unique_languages = list(set(languages))
diff --git a/preprocessors/customsvcdataset.py b/preprocessors/customsvcdataset.py
index 84f204de..da52e43b 100644
--- a/preprocessors/customsvcdataset.py
+++ b/preprocessors/customsvcdataset.py
@@ -19,15 +19,15 @@ def statistics(utterance_dir):
     utts_all = []
     singers2songs = defaultdict(lambda: defaultdict(list))
 
-    singer_infos = glob(utterance_dir + "/*")
+    singer_infos = os.scandir(utterance_dir)
 
     for singer_info in singer_infos:
-        singer = singer_info.split("/")[-1]
+        singer = singer_info.name
 
-        song_infos = glob(singer_info + "/*")
+        song_infos = os.scandir(singer_info.path)
 
         for song_info in song_infos:
-            song = song_info.split("/")[-1]
+            song = song_info.name
 
             singers.append(singer)
             songs.append(song)
 
-            utts = glob(song_info + "/*.wav")
+            utts = glob(os.path.join(song_info.path, "*.wav"))
             utts_all.extend(utts)
 
             for utt in utts:
-                uid = utt.split("/")[-1].split(".")[0]
+                uid = os.path.splitext(os.path.basename(utt))[0]
                 singers2songs[singer][song].append(uid)
 
     unique_singers = list(set(singers))
diff --git a/preprocessors/libritts.py b/preprocessors/libritts.py
index adc86f0c..6c724268 100644
--- a/preprocessors/libritts.py
+++ b/preprocessors/libritts.py
@@ -19,34 +19,33 @@ def libritts_statistics(data_dir):
         lambda: defaultdict(lambda: defaultdict(list))
     )
 
-    distribution_infos = glob(data_dir + "/*")
+    distribution_infos = os.scandir(data_dir)
 
     for distribution_info in distribution_infos:
-        distribution = distribution_info.split("/")[-1]
-        print(distribution)
+        if distribution_info.is_dir():
+            distribution = distribution_info.name
+            print(distribution)
 
-        speaker_infos = glob(distribution_info + "/*")
+            speaker_infos = os.scandir(distribution_info.path)
 
-        if len(speaker_infos) == 0:
-            continue
+            for speaker_info in speaker_infos:
+                if speaker_info.is_dir():
+                    speaker = speaker_info.name
 
-        for speaker_info in speaker_infos:
-            speaker = speaker_info.split("/")[-1]
+                    speakers.append(speaker)
 
-            speakers.append(speaker)
+                    pharase_infos = os.scandir(speaker_info.path)
 
-            pharase_infos = glob(speaker_info + "/*")
+                    for pharase_info in pharase_infos:
+                        if pharase_info.is_dir():
+                            pharase = pharase_info.name
 
-            for pharase_info in pharase_infos:
-                pharase = pharase_info.split("/")[-1]
+                            utts = os.scandir(pharase_info.path)
 
-                utts = glob(pharase_info + "/*.wav")
-
-                for utt in utts:
-                    uid = utt.split("/")[-1].split(".")[0]
-                    distribution2speakers2pharases2utts[distribution][speaker][
-                        pharase
-                    ].append(uid)
+                            for utt in utts:
+                                if utt.is_file() and utt.name.endswith(".wav"):
+                                    uid = os.path.splitext(utt.name)[0]
+                                    distribution2speakers2pharases2utts[distribution][speaker][pharase].append(uid)
 
     unique_speakers = list(set(speakers))
     unique_speakers.sort()
@@ -105,9 +104,10 @@ def main(output_path, dataset_path):
                     distribution, speaker, chosen_pharase, chosen_uid
                 ),
             }
-            res["Path"] = "{}/{}/{}/{}.wav".format(
-                distribution, speaker, chosen_pharase, chosen_uid
-            )
+            res["Path"] = os.path.join("{}".format(distribution),
+                                       "{}".format(speaker),
+                                       "{}".format(chosen_pharase),
+                                       "{}".format(chosen_uid)) + ".wav"
             res["Path"] = os.path.join(libritts_path, res["Path"])
 
             assert os.path.exists(res["Path"])
diff --git a/preprocessors/pjs.py b/preprocessors/pjs.py
index 78d69bc5..2dbbdf63 100644
--- a/preprocessors/pjs.py
+++ b/preprocessors/pjs.py
@@ -11,6 +11,7 @@
 from utils.util import has_existed
 from utils.io import save_audio
+from tqdm import tqdm
 
 
 def get_splitted_utterances(
@@ -27,7 +28,7 @@ def get_splitted_utterances(
     if len(raw_song_files) * n_utterance_splits == len(trimed_song_files):
         print("Splitted done...")
         for wav_file in tqdm(trimed_song_files):
-            uid = wav_file.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(wav_file))[0]
             utt = {"Dataset": "pjs", "Singer": "male1", "Uid": uid, "Path": wav_file}
 
             waveform, sample_rate = torchaudio.load(wav_file)
@@ -38,7 +39,7 @@ def get_splitted_utterances(
     else:
         for wav_file in tqdm(raw_song_files):
-            song_id = wav_file.split("/")[-1].split(".")[0]
+            song_id = os.path.splitext(os.path.basename(wav_file))[0]
 
             waveform, sample_rate = torchaudio.load(wav_file)
             trimed_waveform = torchaudio.functional.vad(waveform, sample_rate)
diff --git a/utils/audio_slicer.py b/utils/audio_slicer.py
index 28474596..96f64fa2 100644
--- a/utils/audio_slicer.py
+++ b/utils/audio_slicer.py
@@ -230,7 +230,7 @@ def split_utterances_from_audio(
         min_interval (millisecond): The smaller min_interval is, the more sliced audio clips this script is likely to generate.
     """
-    print("File:", wav_file.split("/")[-1])
+    print("File:", os.path.basename(wav_file))
     waveform, fs = torchaudio.load(wav_file)
 
     slicer = Slicer(sr=fs, min_interval=min_interval, threshold=db_threshold)
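
For reference, a minimal sketch of the OS-agnostic path idioms this series standardizes on. The helper names below are illustrative only and are not part of the patch; the calls themselves are plain standard-library usage that behaves the same with "/" and "\" separators.

```python
import os


def uid_from_path(p):
    # "/data/song/utt_001.wav" or r"C:\data\song\utt_001.wav" -> "utt_001"
    # Replaces the p.split("/")[-1].split(".")[0] pattern.
    return os.path.splitext(os.path.basename(p))[0]


def parent_dir_name(p):
    # Replaces p.split("/")[-2]: the name of the directory containing p.
    return os.path.basename(os.path.dirname(p))


def wav_uids_in(directory):
    # os.scandir-based listing, as used for the LibriTTS directory layout.
    with os.scandir(directory) as entries:
        return [
            os.path.splitext(entry.name)[0]
            for entry in entries
            if entry.is_file() and entry.name.endswith(".wav")
        ]
```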