From ef8f317440e614c72d8421d35f84628aa5002ed8 Mon Sep 17 00:00:00 2001
From: HighPriest
Date: Fri, 16 Feb 2024 03:37:15 +0100
Subject: [PATCH 1/2] WIP: Replace bash run script with Python for VALLE

So now it can be executed on Windows.
---
 egs/tts/VALLE/README.md |  16 ++--
 egs/tts/VALLE/run.py    | 105 ++++++++++++++++++++++++++
 egs/tts/VALLE/run.sh    | 158 ----------------------------------------
 3 files changed, 113 insertions(+), 166 deletions(-)
 create mode 100644 egs/tts/VALLE/run.py
 delete mode 100644 egs/tts/VALLE/run.sh

diff --git a/egs/tts/VALLE/README.md b/egs/tts/VALLE/README.md
index e9fe0da9..7b1ebe5b 100644
--- a/egs/tts/VALLE/README.md
+++ b/egs/tts/VALLE/README.md
@@ -54,10 +54,10 @@ Specify the `processed_dir` and the `log_dir` and for saving the processed data
 
-Run the `run.sh` as the preproces stage (set `--stage 1`):
+Run `run.py` as the preprocessing stage (set `--stage 1`):
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 1
+python egs/tts/VALLE/run.py --stage 1
 ```
 
-> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.py`, e.g., `--gpu "1"`.
 
 ## 3. Training
@@ -74,7 +74,7 @@ We provide the default hyparameters in the `exp_config.json`. They can work on s
 
 ### Run
 
-Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
+Run `run.py` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
 
 Specifically, VALL-E need to train a autoregressive (AR) model and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train AR model, and set `--model_train_stage 2` to train NAR model, where `--ar_model_ckpt_dir` should be set as the ckeckpoint path to the trained AR model.
 
@@ -82,23 +82,23 @@ Specifically, VALL-E need to train a autoregressive (AR) model and then a non-au
-Train a AR moel, just run:
+Train an AR model, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName]
+python egs/tts/VALLE/run.py --stage 2 --model_train_stage 1 --name [YourExptName]
 ```
 
 Train a NAR model, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
+python egs/tts/VALLE/run.py --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
 ```
 
-> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.py`, e.g., `--gpu "0,1,2,3"`.
 
 ## 4. Inference
 
 ### Configuration
 
-For inference, you need to specify the following configurations when running `run.sh`:
+For inference, you need to specify the following configurations when running `run.py`:
 
@@ -117,7 +117,7 @@ For inference, you need to specify the following configurations when running `ru
 For example, if you want to generate a single clip of speech, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
+python egs/tts/VALLE/run.py --stage 3 --gpu "0" \
     --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
     --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
     --infer_mode "single" \
diff --git a/egs/tts/VALLE/run.py b/egs/tts/VALLE/run.py
new file mode 100644
index 00000000..3bab758a
--- /dev/null
+++ b/egs/tts/VALLE/run.py
@@ -0,0 +1,105 @@
+import os
+import sys
+import subprocess
+from argparse import ArgumentParser
+
+
+# Set up directories
+work_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+print(f"Work Directory: {work_dir}")
+
+os.environ['WORK_DIR'] = work_dir
+os.environ['PYTHONPATH'] = work_dir
+os.environ['PYTHONIOENCODING'] = 'UTF-8'
+
+# Build Monotonic Align Module
+os.chdir(os.path.join(work_dir, 'modules', 'monotonic_align'))
+os.makedirs('monotonic_align', exist_ok=True)
+subprocess.run([sys.executable, 'setup.py', 'build_ext', '--inplace'], check=True)
+os.chdir(work_dir)
+
+# Parse parameters
+parser = ArgumentParser()
+parser.add_argument('-c', '--config', help='Experimental Configuration File')
+parser.add_argument('-n', '--name', help='Experimental Name')
+parser.add_argument('-s', '--stage', type=int, help='Running Stage')
+parser.add_argument('--gpu', type=str, help='Visible GPU machines')
+parser.add_argument('--model_train_stage', type=str, help='Model Training Stage')
+parser.add_argument('--ar_model_ckpt_dir', type=str, help='The stage1 ckpt dir')
+parser.add_argument('--infer_expt_dir', type=str, help='The experiment dir')
+parser.add_argument('--infer_output_dir', type=str, help='The output dir to save inferred audios')
+parser.add_argument('--infer_mode', type=str, help='The inference mode')
+parser.add_argument('--infer_test_list_file', type=str, help='The inference test list file')
+parser.add_argument('--infer_text', type=str, help='The text to be synthesized from')
+parser.add_argument('--infer_text_prompt', type=str, default='', help='The inference text prompt')
+parser.add_argument('--infer_audio_prompt', type=str, default='', help='The inference audio prompt')
+args = parser.parse_args()
+
+# Check required parameters
+if args.stage is None:
+    print("Error: Please specify the running stage")
+    sys.exit(1)
+
+if args.config is None:
+    # Default to the exp_config.json next to this script, as the old run.sh did
+    args.config = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'exp_config.json')
+print(f"Experimental Configuration File: {args.config}")
+
+if args.gpu is None:
+    args.gpu = '0'
+
+# Features Extraction
+if args.stage == 1:
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    cmd = [sys.executable, os.path.join(work_dir, "bins", "tts", "preprocess.py"), '--config', args.config, '--num_workers', '4']
+    subprocess.run(cmd, check=True, cwd=work_dir)
+
+# Training
+if args.stage == 2:
+    if args.name is None:
+        print("Error: Please specify the experiments name")
+        sys.exit(1)
+
+    if args.model_train_stage == '2' and args.ar_model_ckpt_dir is None:
+        print("Error: Please specify the checkpoint path to the trained model in stage1.")
+        sys.exit(1)
+
+    if args.model_train_stage == '1':
+        # Pass the literal string "None", matching the behaviour of the old run.sh
+        args.ar_model_ckpt_dir = 'None'
+
+    print(f"Experimental Name: {args.name}")
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    # Launch training through Accelerate, as the original run.sh did
+    cmd = ['accelerate', 'launch', '--main_process_port', '29510', os.path.join(work_dir, "bins", "tts", "train.py"), '--config', args.config, '--exp_name', args.name, '--log_level', 'debug', '--train_stage', args.model_train_stage, '--checkpoint_path', args.ar_model_ckpt_dir]
+    subprocess.run(cmd, check=True)
+
+# Inference
+if args.stage == 3:
+    if args.infer_expt_dir is None:
+        print("Error: Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]")
+        sys.exit(1)
+
+    if args.infer_output_dir is None:
+        args.infer_output_dir = os.path.join(args.infer_expt_dir, 'result')
+
+    if args.infer_mode is None:
+        print("Error: Please specify the inference mode, e.g., \"batch\", \"single\"")
+        sys.exit(1)
+
+    if args.infer_mode == 'batch' and args.infer_test_list_file is None:
+        print("Error: Please specify the test list file used in inference when the inference mode is batch")
+        sys.exit(1)
+
+    if args.infer_mode == 'single' and args.infer_text is None:
+        print("Error: Please specify the text to be synthesized when the inference mode is single")
+        sys.exit(1)
+
+    if args.infer_mode == 'single':
+        print(f'Text: {args.infer_text}')
+        # Pass the literal string "None", matching the behaviour of the old run.sh
+        args.infer_test_list_file = 'None'
+    elif args.infer_mode == 'batch':
+        args.infer_text = ""
+        args.infer_text_prompt = ""
+        args.infer_audio_prompt = ""
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    # Launch inference through Accelerate, as the original run.sh did
+    cmd = ['accelerate', 'launch', os.path.join(work_dir, "bins", "tts", "inference.py"), '--config', args.config, '--log_level', 'debug', '--acoustics_dir', args.infer_expt_dir, '--output_dir', args.infer_output_dir, '--mode', args.infer_mode, '--text', args.infer_text, '--text_prompt', args.infer_text_prompt, '--audio_prompt', args.infer_audio_prompt, '--test_list_file', args.infer_test_list_file]
+    subprocess.run(cmd, check=True)
diff --git a/egs/tts/VALLE/run.sh b/egs/tts/VALLE/run.sh
deleted file mode 100644
index 104f9f32..00000000
--- a/egs/tts/VALLE/run.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2023 Amphion.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-######## Build Experiment Environment ###########
-exp_dir=$(cd `dirname $0`; pwd)
-work_dir=$(dirname $(dirname $(dirname $exp_dir)))
-
-export WORK_DIR=$work_dir
-export PYTHONPATH=$work_dir
-export PYTHONIOENCODING=UTF-8
-
-cd $work_dir/modules/monotonic_align
-mkdir -p monotonic_align
-python setup.py build_ext --inplace
-cd $work_dir
-
-######## Parse the Given Parameters from the Commond ###########
-options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,ar_model_ckpt_dir:,infer_output_dir:,infer_mode:,infer_test_list_file:,infer_text:,infer_text_prompt:,infer_audio_prompt:,model_train_stage:,name:,stage: -- "$@")
-eval set -- "$options"
-
-while true; do
-  case $1 in
-    # Experimental Configuration File
-    -c | --config) shift; exp_config=$1 ; shift ;;
-    # Experimental Name
-    -n | --name) shift; exp_name=$1 ; shift ;;
-    # Running Stage
-    -s | --stage) shift; running_stage=$1 ; shift ;;
-    # Visible GPU machines. The default value is "0".
-    --gpu) shift; gpu=$1 ; shift ;;
-
-    # [Only for Training] Model training stage.
-    --model_train_stage) shift; model_train_stage=$1 ; shift ;;
-    # [Only for Training] The stage1 ckpt dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
-    --ar_model_ckpt_dir) shift; ar_model_ckpt_dir=$1 ; shift ;;
-
The value is like "[Your path to save logs and checkpoints]/[YourExptName]" - --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; - # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" - --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; - - # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. - --infer_mode) shift; infer_mode=$1 ; shift ;; - # [Only for Inference] The inference test list file. It is only used when the inference model is "batch". - --infer_test_list_file) shift; infer_test_list_file=$1 ; shift ;; - # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". - --infer_text) shift; infer_text=$1 ; shift ;; - # [Only for Inference] The inference text prompt. It is only used when the inference model is "single". - --infer_text_prompt) shift; infer_text_prompt=$1 ; shift ;; - # [Only for Inference] The inference audio prompt. It is only used when the inference model is "single". - --infer_audio_prompt) shift; infer_audio_prompt=$1 ; shift ;; - - --) shift ; break ;; - *) echo "Invalid option: $1" exit 1 ;; - esac -done - - -### Value check ### -if [ -z "$running_stage" ]; then - echo "[Error] Please specify the running stage" - exit 1 -fi - -if [ -z "$exp_config" ]; then - exp_config="${exp_dir}"/exp_config.json -fi -echo "Exprimental Configuration File: $exp_config" - -if [ -z "$gpu" ]; then - gpu="0" -fi - -######## Features Extraction ########### -if [ $running_stage -eq 1 ]; then - CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ - --config=$exp_config \ - --num_workers=4 -fi - -######## Training ########### -if [ $running_stage -eq 2 ]; then - if [ -z "$exp_name" ]; then - echo "[Error] Please specify the experiments name" - exit 1 - fi - - if [ "$model_train_stage" = "2" ] && [ -z "$ar_model_ckpt_dir" ]; then - echo "[Error] Please specify the ckeckpoint path to the trained model in stage1." - exit 1 - fi - - if [ "$model_train_stage" = "1" ]; then - ar_model_ckpt_dir=None - fi - - echo "Exprimental Name: $exp_name" - - CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \ - "${work_dir}"/bins/tts/train.py \ - --config $exp_config \ - --exp_name $exp_name \ - --log_level debug \ - --train_stage $model_train_stage \ - --checkpoint_path $ar_model_ckpt_dir -fi - - -######## Inference ########### -if [ $running_stage -eq 3 ]; then - if [ -z "$infer_expt_dir" ]; then - echo "[Error] Please specify the experimental directionary. 
-        echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
-        exit 1
-    fi
-
-    if [ -z "$infer_output_dir" ]; then
-        infer_output_dir="$expt_dir/result"
-    fi
-
-    if [ -z "$infer_mode" ]; then
-        echo "[Error] Please specify the inference mode, e.g., "batch", "single""
-        exit 1
-    fi
-
-    if [ "$infer_mode" = "batch" ] && [ -z "$infer_test_list_file" ]; then
-        echo "[Error] Please specify the test list file used in inference when the inference mode is batch"
-        exit 1
-    fi
-
-    if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
-        echo "[Error] Please specify the text to be synthesized when the inference mode is single"
-        exit 1
-    fi
-
-    if [ "$infer_mode" = "single" ]; then
-        echo 'Text: ' ${infer_text}
-        infer_test_list_file=None
-    elif [ "$infer_mode" = "batch" ]; then
-        infer_text=""
-        infer_text_prompt=""
-        infer_audio_prompt=""
-    fi
-
-
-    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
-        --config $exp_config \
-        --log_level debug \
-        --acoustics_dir $infer_expt_dir \
-        --output_dir $infer_output_dir \
-        --mode $infer_mode \
-        --text "$infer_text" \
-        --text_prompt "$infer_text_prompt" \
-        --audio_prompt $infer_audio_prompt\
-        --test_list_file $infer_test_list_file \
-
-fi

From f7348b8aae16467b6ec4a8b29fbb36eb4c45a892 Mon Sep 17 00:00:00 2001
From: HighPriest
Date: Fri, 16 Feb 2024 03:42:48 +0100
Subject: [PATCH 2/2] WIP: Use OS-agnostic path handling for filesystem
 operations

NOTE: This is not final! It only covers the code paths needed to run VALL-E
with LibriTTS.
---
 bins/calc_metrics.py                          |  8 ++--
 bins/svc/inference.py                         |  4 +-
 .../similarity/resemblyzer_similarity.py      |  2 +-
 models/base/base_inference.py                 |  4 +-
 models/vocoders/vocoder_inference.py          |  6 +--
 preprocessors/cdmusiceval.py                  |  6 +--
 preprocessors/coco.py                         |  4 +-
 preprocessors/cocoeval.py                     |  6 +--
 preprocessors/csd.py                          |  6 +--
 preprocessors/customsvcdataset.py             | 10 ++---
 preprocessors/libritts.py                     | 44 +++++++++----------
 preprocessors/pjs.py                          |  5 ++-
 utils/audio_slicer.py                         |  2 +-
 13 files changed, 55 insertions(+), 52 deletions(-)

diff --git a/bins/calc_metrics.py b/bins/calc_metrics.py
index f093bc15..78749554 100644
--- a/bins/calc_metrics.py
+++ b/bins/calc_metrics.py
@@ -86,14 +86,16 @@ def calc_metric(
             continue
 
         audios_ref = []
+        import os
+
         audios_deg = []
 
-        files = glob(ref_dir + "/*.wav")
+        files = glob(os.path.join(ref_dir, "*.wav"))
 
         for file in files:
             audios_ref.append(file)
-            uid = file.split("/")[-1].split(".wav")[0]
-            file_gt = deg_dir + "/{}.wav".format(uid)
+            uid = os.path.splitext(os.path.basename(file))[0]
+            file_gt = os.path.join(deg_dir, f"{uid}.wav")
             audios_deg.append(file_gt)
 
         if metric in ["v_uv_f1"]:
diff --git a/bins/svc/inference.py b/bins/svc/inference.py
index da0031d1..dac4ec9f 100644
--- a/bins/svc/inference.py
+++ b/bins/svc/inference.py
@@ -227,14 +227,14 @@ def main():
     audio_list = []
     for suffix in ["wav", "flac", "mp3"]:
         audio_list += glob.glob(
-            os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
+            os.path.join(source_audio_dir, "**", "*.{}".format(suffix)), recursive=True
         )
     print("There are {} source audios: ".format(len(audio_list)))
 
     # Infer for every file as dataset
     output_root_path = args.output_dir
     for audio_path in tqdm(audio_list):
-        audio_name = audio_path.split("/")[-1].split(".")[0]
+        audio_name = os.path.splitext(os.path.basename(audio_path))[0]
         args.output_dir = os.path.join(output_root_path, audio_name)
         print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
diff --git a/evaluation/metrics/similarity/resemblyzer_similarity.py b/evaluation/metrics/similarity/resemblyzer_similarity.py
index 15d89d5b..d02a4669 100644
--- a/evaluation/metrics/similarity/resemblyzer_similarity.py
+++ b/evaluation/metrics/similarity/resemblyzer_similarity.py
@@ -60,7 +60,7 @@ def extract_resemblyzer_similarity(target_path, reference_path, dump_dir):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    filename = target_path.split("/")[-1]
+    filename = os.path.basename(target_path)
     csv_file_name = f"similarity_results_{filename}.csv"
     dump_dir = dump_dir + "/" + csv_file_name
 
diff --git a/models/base/base_inference.py b/models/base/base_inference.py
index 2713f19a..88348216 100644
--- a/models/base/base_inference.py
+++ b/models/base/base_inference.py
@@ -85,8 +85,8 @@ def get_vocoder_info(self):
             os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
         )
         self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
-        self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
-        self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]
+        self.vocoder_tag = os.path.basename(os.path.dirname(self.checkpoint_dir_vocoder)).split(":")[-1]
+        self.vocoder_steps = os.path.splitext(os.path.basename(self.checkpoint_dir_vocoder))[0]
 
     def build_test_utt_data(self):
         raise NotImplementedError
diff --git a/models/vocoders/vocoder_inference.py b/models/vocoders/vocoder_inference.py
index 95e354c5..6aebd026 100644
--- a/models/vocoders/vocoder_inference.py
+++ b/models/vocoders/vocoder_inference.py
@@ -179,7 +179,7 @@ def _build_tmp_dataset_from_feature(self):
         utts = []
         mels = glob(os.path.join(self.args.feature_folder, "mels", "*.npy"))
         for i, mel in enumerate(mels):
-            uid = mel.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(mel))[0]
             utt = {"Dataset": "tmp", "Uid": uid, "index": i}
             utts.append(utt)
 
@@ -199,7 +199,7 @@ def _build_tmp_dataset_from_feature(self):
 
         features = glob(os.path.join(self.args.feature_folder, "*"))
         for feature in features:
-            feature_name = feature.split("/")[-1]
+            feature_name = os.path.basename(feature)
             if os.path.isfile(feature):
                 continue
             shutil.copytree(
@@ -214,7 +214,7 @@ def _build_tmp_dataset_from_audio(self):
         utts = []
         audios = glob(os.path.join(self.args.audio_folder, "*"))
         for i, audio in enumerate(audios):
-            uid = audio.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(audio))[0]
             utt = {"Dataset": "tmp", "Uid": uid, "index": i, "Path": audio}
             utts.append(utt)
 
diff --git a/preprocessors/cdmusiceval.py b/preprocessors/cdmusiceval.py
index 1c4af97c..b8da228f 100644
--- a/preprocessors/cdmusiceval.py
+++ b/preprocessors/cdmusiceval.py
@@ -60,12 +60,12 @@ def statistics(utterance_dir):
     singer_infos = glob(utterance_dir + "/*")
 
     for singer_info in singer_infos:
-        singer = singer_info.split("/")[-1]
+        singer = os.path.basename(singer_info)
 
         song_infos = glob(singer_info + "/*")
 
         for song_info in song_infos:
-            song = song_info.split("/")[-1]
+            song = os.path.basename(song_info)
 
             singers.append(singer)
             songs.append(song)
 
             utts = glob(song_info + "/*.wav")
 
             for utt in utts:
-                uid = utt.split("/")[-1].split(".")[0]
+                uid = os.path.splitext(os.path.basename(utt))[0]
                 singers2songs[singer][song].append(uid)
 
     unique_singers = list(set(singers))
diff --git a/preprocessors/coco.py b/preprocessors/coco.py
index 9ac462f3..01bff952 100644
--- a/preprocessors/coco.py
+++ b/preprocessors/coco.py
@@ -24,10 +24,10 @@ def coco_statistics(data_dir):
     song_infos = glob(data_dir + "/*")
 
     for song in song_infos:
-        song_name = song.split("/")[-1]
+        song_name = os.path.basename(song)
         utts = glob(song + "/*.wav")
         for utt in utts:
-            uid = utt.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(utt))[0]
             song2utts[song_name].append(uid)
 
     print("Coco: {} songs".format(len(song_infos)))
diff --git a/preprocessors/cocoeval.py b/preprocessors/cocoeval.py
index 3cb3604c..1e918f4a 100644
--- a/preprocessors/cocoeval.py
+++ b/preprocessors/cocoeval.py
@@ -25,7 +25,7 @@ def _split_utts():
     vocal_files = glob(os.path.join(raw_dir, "*/vocal.wav"))
 
     for vocal_f in tqdm(vocal_files):
-        song_name = vocal_f.split("/")[-2]
+        song_name = os.path.basename(os.path.dirname(vocal_f))
         output_dir = os.path.join(output_root, song_name)
         os.makedirs(output_dir, exist_ok=True)
 
@@ -39,10 +39,10 @@ def cocoeval_statistics(data_dir):
     song_infos = glob(data_dir + "/*")
 
     for song in song_infos:
-        song_name = song.split("/")[-1]
+        song_name = os.path.basename(song)
         utts = glob(song + "/*.wav")
         for utt in utts:
-            uid = utt.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(utt))[0]
             song2utts[song_name].append(uid)
 
     print("Cocoeval: {} songs".format(len(song_infos)))
diff --git a/preprocessors/csd.py b/preprocessors/csd.py
index 645a8b3d..8d854206 100644
--- a/preprocessors/csd.py
+++ b/preprocessors/csd.py
@@ -38,7 +38,7 @@ def split_to_utterances(language_dir, output_dir):
     pitches = set()
     for wav_file in tqdm(glob("{}/*.wav".format(wav_dir))):
         # Load waveform
-        song_name = wav_file.split("/")[-1].split(".")[0]
+        song_name = os.path.splitext(os.path.basename(wav_file))[0]
         waveform, fs = torchaudio.load(wav_file)
 
         # Load utterances
@@ -100,7 +100,7 @@ def csd_statistics(data_dir):
     folder_infos = glob(data_dir + "/*")
 
     for folder_info in folder_infos:
-        folder_info_split = folder_info.split("/")[-1]
+        folder_info_split = os.path.basename(folder_info)
 
         language = folder_info_split[:2]
         song = folder_info_split[2:]
 
         utts = glob(folder_info + "/*")
 
         for utt in utts:
-            uid = utt.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(utt))[0]
             languages2songs[language][song].append(uid)
 
     unique_languages = list(set(languages))
diff --git a/preprocessors/customsvcdataset.py b/preprocessors/customsvcdataset.py
index 84f204de..da52e43b 100644
--- a/preprocessors/customsvcdataset.py
+++ b/preprocessors/customsvcdataset.py
@@ -19,15 +19,15 @@ def statistics(utterance_dir):
     utts_all = []
     singers2songs = defaultdict(lambda: defaultdict(list))
 
-    singer_infos = glob(utterance_dir + "/*")
+    singer_infos = os.scandir(utterance_dir)
 
     for singer_info in singer_infos:
-        singer = singer_info.split("/")[-1]
+        singer = singer_info.name
 
-        song_infos = glob(singer_info + "/*")
+        song_infos = os.scandir(singer_info.path)
 
         for song_info in song_infos:
-            song = song_info.split("/")[-1]
+            song = song_info.name
 
             singers.append(singer)
             songs.append(song)
 
-            utts = glob(song_info + "/*.wav")
+            utts = glob(os.path.join(song_info.path, "*.wav"))
             utts_all.extend(utts)
 
             for utt in utts:
-                uid = utt.split("/")[-1].split(".")[0]
+                uid = os.path.splitext(os.path.basename(utt))[0]
                 singers2songs[singer][song].append(uid)
 
     unique_singers = list(set(singers))
diff --git a/preprocessors/libritts.py b/preprocessors/libritts.py
index adc86f0c..6c724268 100644
--- a/preprocessors/libritts.py
+++ b/preprocessors/libritts.py
@@ -19,34 +19,33 @@ def libritts_statistics(data_dir):
         lambda: defaultdict(lambda: defaultdict(list))
     )
 
-    distribution_infos = glob(data_dir + "/*")
+    distribution_infos = os.scandir(data_dir)
 
     for distribution_info in distribution_infos:
-        distribution = distribution_info.split("/")[-1]
-        print(distribution)
+        if distribution_info.is_dir():
+            distribution = distribution_info.name
+            print(distribution)
 
-        speaker_infos = glob(distribution_info + "/*")
+            speaker_infos = os.scandir(distribution_info.path)
 
-        if len(speaker_infos) == 0:
-            continue
+            for speaker_info in speaker_infos:
+                if speaker_info.is_dir():
+                    speaker = speaker_info.name
 
-        for speaker_info in speaker_infos:
-            speaker = speaker_info.split("/")[-1]
+                    speakers.append(speaker)
 
-            speakers.append(speaker)
+                    pharase_infos = os.scandir(speaker_info.path)
 
-            pharase_infos = glob(speaker_info + "/*")
+                    for pharase_info in pharase_infos:
+                        if pharase_info.is_dir():
+                            pharase = pharase_info.name
 
-            for pharase_info in pharase_infos:
-                pharase = pharase_info.split("/")[-1]
+                            utts = os.scandir(pharase_info.path)
 
-                utts = glob(pharase_info + "/*.wav")
-
-                for utt in utts:
-                    uid = utt.split("/")[-1].split(".")[0]
-                    distribution2speakers2pharases2utts[distribution][speaker][
-                        pharase
-                    ].append(uid)
+                            for utt in utts:
+                                if utt.is_file() and utt.name.endswith(".wav"):
+                                    uid = os.path.splitext(utt.name)[0]
+                                    distribution2speakers2pharases2utts[distribution][speaker][pharase].append(uid)
 
     unique_speakers = list(set(speakers))
     unique_speakers.sort()
@@ -105,9 +104,10 @@ def main(output_path, dataset_path):
                     distribution, speaker, chosen_pharase, chosen_uid
                 ),
             }
-            res["Path"] = "{}/{}/{}/{}.wav".format(
-                distribution, speaker, chosen_pharase, chosen_uid
-            )
+            res["Path"] = os.path.join("{}".format(distribution),
+                                       "{}".format(speaker),
+                                       "{}".format(chosen_pharase),
+                                       "{}".format(chosen_uid)) + ".wav"
             res["Path"] = os.path.join(libritts_path, res["Path"])
 
             assert os.path.exists(res["Path"])
diff --git a/preprocessors/pjs.py b/preprocessors/pjs.py
index 78d69bc5..2dbbdf63 100644
--- a/preprocessors/pjs.py
+++ b/preprocessors/pjs.py
@@ -11,6 +11,7 @@
 from utils.util import has_existed
 from utils.io import save_audio
+from tqdm import tqdm
 
 
 def get_splitted_utterances(
@@ -27,7 +28,7 @@ def get_splitted_utterances(
     if len(raw_song_files) * n_utterance_splits == len(trimed_song_files):
         print("Splitted done...")
         for wav_file in tqdm(trimed_song_files):
-            uid = wav_file.split("/")[-1].split(".")[0]
+            uid = os.path.splitext(os.path.basename(wav_file))[0]
             utt = {"Dataset": "pjs", "Singer": "male1", "Uid": uid, "Path": wav_file}
 
             waveform, sample_rate = torchaudio.load(wav_file)
@@ -38,7 +39,7 @@ def get_splitted_utterances(
     else:
         for wav_file in tqdm(raw_song_files):
-            song_id = wav_file.split("/")[-1].split(".")[0]
+            song_id = os.path.splitext(os.path.basename(wav_file))[0]
 
             waveform, sample_rate = torchaudio.load(wav_file)
             trimed_waveform = torchaudio.functional.vad(waveform, sample_rate)
diff --git a/utils/audio_slicer.py b/utils/audio_slicer.py
index 28474596..96f64fa2 100644
--- a/utils/audio_slicer.py
+++ b/utils/audio_slicer.py
@@ -230,7 +230,7 @@ def split_utterances_from_audio(
         min_interval (millisecond): The smaller min_interval is, the more sliced audio clips this script is likely to generate.
     """
-    print("File:", wav_file.split("/")[-1])
+    print("File:", os.path.basename(wav_file))
     waveform, fs = torchaudio.load(wav_file)
 
     slicer = Slicer(sr=fs, min_interval=min_interval, threshold=db_threshold)
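
For reference, a minimal sketch of the OS-agnostic path idioms this series standardizes on. The helper names below are illustrative only and are not part of the patch; the calls themselves are plain standard-library usage that behaves the same with "/" and "\" separators.

```python
import os


def uid_from_path(p):
    # "/data/song/utt_001.wav" or r"C:\data\song\utt_001.wav" -> "utt_001"
    # Replaces the p.split("/")[-1].split(".")[0] pattern.
    return os.path.splitext(os.path.basename(p))[0]


def parent_dir_name(p):
    # Replaces p.split("/")[-2]: the name of the directory containing p.
    return os.path.basename(os.path.dirname(p))


def wav_uids_in(directory):
    # os.scandir-based listing, as used for the LibriTTS directory layout.
    with os.scandir(directory) as entries:
        return [
            os.path.splitext(entry.name)[0]
            for entry in entries
            if entry.is_file() and entry.name.endswith(".wav")
        ]
```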