From 86890fe0fa066a515fe6a591c3e5195766dacc79 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:48:29 -0500
Subject: [PATCH 01/13] Update import statement to use 'binary_dilation' directly from 'scipy.ndimage' instead of the deprecated 'morphology' module.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index 799aa8354..764b83091 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -1,4 +1,4 @@
-from scipy.ndimage.morphology import binary_dilation
+from scipy.ndimage import binary_dilation
 from encoder.params_data import *
 from pathlib import Path
 from typing import Optional, Union

From 3013482856da79adabb632064be42b0f3b375a9b Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:57:08 -0500
Subject: [PATCH 02/13] Catch 'ImportError' specifically in the try-except block for importing 'webrtcvad' to improve code clarity.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index 764b83091..95369c949 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -9,7 +9,7 @@
 
 try:
     import webrtcvad
-except:
+except ImportError:
     warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
     webrtcvad=None

From 33c25d11ef62e8f8824fa75f9d22a640becc4471 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:58:16 -0500
Subject: [PATCH 03/13] Simplify type checking for file-path input by passing a tuple to 'isinstance'.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index 95369c949..f1b688661 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -32,7 +32,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
     this argument will be ignored.
     """
     # Load the wav from disk if needed
-    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+    if isinstance(fpath_or_wav, (str, Path)):
         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
     else:
         wav = fpath_or_wav

From 19497f360b59e884d52fed98d177227dc34ca119 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:59:36 -0500
Subject: [PATCH 04/13] Update the 'librosa.resample' call to use keyword arguments for clarity and compatibility.

---
 encoder/audio.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index f1b688661..fb2aafef0 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -39,8 +39,8 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
 
     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
-
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
     # Apply the preprocessing: normalize volume and shorten long silences
     if normalize:
         wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)

From 8c1ba11f8d205fbf3d41a235ec8e9d2e4fd1928c Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 22:01:57 -0500
Subject: [PATCH 05/13] Add an 'sr' parameter to 'wav_to_mel_spectrogram' to allow specifying the sample rate.
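
For reference, a minimal usage sketch once the parameter is in place (the
default value `sr=sampling_rate` is only wired into the signature in PATCH 07;
`sampling_rate` and `mel_n_channels` come from `encoder/params_data.py` and are
assumed here to be 16000 and 40):

```python
import numpy as np
from encoder.audio import wav_to_mel_spectrogram

wav = np.zeros(16000, dtype=np.float32)        # one second of silence at 16 kHz
mel = wav_to_mel_spectrogram(wav)              # uses the default sr=sampling_rate
mel_8k = wav_to_mel_spectrogram(wav, sr=8000)  # caller-specified sample rate
print(mel.shape)                               # (n_frames, mel_n_channels)
```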
---
 encoder/audio.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index fb2aafef0..f35551328 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -57,9 +57,9 @@ def wav_to_mel_spectrogram(wav):
     """
     frames = librosa.feature.melspectrogram(
         wav,
-        sampling_rate,
-        n_fft=int(sampling_rate * mel_window_length / 1000),
-        hop_length=int(sampling_rate * mel_window_step / 1000),
+        sr,
+        n_fft=int(sr * mel_window_length / 1000),
+        hop_length=int(sr * mel_window_step / 1000),
         n_mels=mel_n_channels
     )
     return frames.astype(np.float32).T

From df96670415ed59c0023a68e59c5209858015c57f Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 22:02:55 -0500
Subject: [PATCH 06/13] Replace deprecated 'np.bool' with the built-in 'bool' for type conversion.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index f35551328..dc6ebcd00 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -99,7 +99,7 @@ def moving_average(array, width):
         return ret[width - 1:] / width
 
     audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
+    audio_mask = np.round(audio_mask).astype(bool)
 
     # Dilate the voiced regions
     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))

From 96434c3811371f7fe9d4cf4798ad14608d75ad21 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 21:22:07 -0500
Subject: [PATCH 07/13] Major refactoring: add the 'sr' parameter to 'wav_to_mel_spectrogram', pass keyword arguments to 'librosa.feature.melspectrogram', strip trailing whitespace, and update 'requirements.txt' and 'toolbox/ui.py'.

---
 encoder/audio.py | 30 +++++++++++++++---------------
 requirements.txt | Bin 562 -> 580 bytes
 toolbox/ui.py    |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index dc6ebcd00..86308f8f1 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -36,28 +36,28 @@
         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
     else:
         wav = fpath_or_wav
-    
+
     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
         wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
-    
-    # Apply the preprocessing: normalize volume and shorten long silences 
+
+    # Apply the preprocessing: normalize volume and shorten long silences
     if normalize:
         wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
     if webrtcvad and trim_silence:
         wav = trim_long_silences(wav)
-    
+
     return wav
 
 
-def wav_to_mel_spectrogram(wav):
+def wav_to_mel_spectrogram(wav, sr=sampling_rate):
     """
     Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
-    Note: this not a log-mel spectrogram.
+    Note: this is not a log-mel spectrogram.
""" frames = librosa.feature.melspectrogram( - wav, - sr, + y=wav, + sr=sr, n_fft=int(sr * mel_window_length / 1000), hop_length=int(sr * mel_window_step / 1000), n_mels=mel_n_channels @@ -75,13 +75,13 @@ def trim_long_silences(wav): """ # Compute the voice detection window size samples_per_window = (vad_window_length * sampling_rate) // 1000 - + # Trim the end of the audio to have a multiple of the window size wav = wav[:len(wav) - (len(wav) % samples_per_window)] - + # Convert the float waveform to 16-bit mono PCM pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) - + # Perform voice activation detection voice_flags = [] vad = webrtcvad.Vad(mode=3) @@ -90,21 +90,21 @@ def trim_long_silences(wav): voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], sample_rate=sampling_rate)) voice_flags = np.array(voice_flags) - + # Smooth the voice detection with a moving average def moving_average(array, width): array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) ret = np.cumsum(array_padded, dtype=float) ret[width:] = ret[width:] - ret[:-width] return ret[width - 1:] / width - + audio_mask = moving_average(voice_flags, vad_moving_average_width) audio_mask = np.round(audio_mask).astype(bool) - + # Dilate the voiced regions audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) audio_mask = np.repeat(audio_mask, samples_per_window) - + return wav[audio_mask == True] diff --git a/requirements.txt b/requirements.txt index f0c24bfe22b275f8b81c8ad61df318e843dd71e6..1e4600c0af2c6e3dd40e7dc108e7c8107e0b71f6 100644 GIT binary patch delta 225 zcmdnQa)f1q2&XxN9)mFu8%~s!W;bLo05Xjx+A1bLCfa;+bWJ4KHrkEjxA)g@^EDLrZP=(24 Xb|!6^bg)c0LmpTTekt!s( delta 186 zcmX@YvWaDa2&XB79)mFu8%&gyX0-q^3@6$vGMY|Il$ACBOYt&rF$6GVGUNbZK0`T! 
zEf88voUO`eHu113D@cve#JA$C<{-tBxf#_NjVBv3D(XTs>oF7nH5D_IFc>m`3@TwL zWJqDi1sh@l)M3V81lCu|sLE(Gc{ZamklfEG&TR-X7AUL7U_SXiqa-JY4|1I4 Date: Mon, 22 Jul 2024 21:34:24 -0500 Subject: [PATCH 08/13] models updated --- utils/default_models.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/utils/default_models.py b/utils/default_models.py index a0fb9276e..6e3bf57f3 100644 --- a/utils/default_models.py +++ b/utils/default_models.py @@ -1,39 +1,28 @@ -import urllib.request +import gdown from pathlib import Path from threading import Thread -from urllib.error import HTTPError - from tqdm import tqdm - default_models = { "encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379), - "synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559), + "synthesizer": ("https://drive.google.com/uc?export=download&id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download", 370554559), "vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290), } - class DownloadProgressBar(tqdm): def update_to(self, b=1, bsize=1, tsize=None): if tsize is not None: self.total = tsize self.update(b * bsize - self.n) - def download(url: str, target: Path, bar_pos=0): - # Ensure the directory exists target.parent.mkdir(exist_ok=True, parents=True) - desc = f"Downloading {target.name}" - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: - try: - urllib.request.urlretrieve(url, filename=target, reporthook=t.update_to) - except HTTPError: - return + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: + gdown.download(url, str(target), quiet=False) def ensure_default_models(models_dir: Path): - # Define download tasks jobs = [] for model_name, (url, size) in default_models.items(): target_path = models_dir / "default" / f"{model_name}.pt" @@ -47,7 +36,6 @@ def ensure_default_models(models_dir: Path): thread.start() jobs.append((thread, target_path, size)) - # Run and join threads for thread, target_path, size in jobs: thread.join() From 2bedf6ade76ff3b8fcb13b49c8103b1b8f1e86d1 Mon Sep 17 00:00:00 2001 From: Kaled Dahleh Date: Mon, 22 Jul 2024 21:42:15 -0500 Subject: [PATCH 09/13] rollback url --- utils/default_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/default_models.py b/utils/default_models.py index 6e3bf57f3..9d14247c1 100644 --- a/utils/default_models.py +++ b/utils/default_models.py @@ -5,7 +5,7 @@ default_models = { "encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379), - "synthesizer": ("https://drive.google.com/uc?export=download&id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download", 370554559), + "synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559), "vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290), } From b06f7de3c739d5d54a98ed2a51d6a2e3bc70bca9 Mon Sep 17 00:00:00 2001 From: Kaled Dahleh Date: Mon, 22 Jul 2024 21:49:55 -0500 Subject: [PATCH 10/13] python version upgrade --- README.md | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8025f3201..888f1c22f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Real-Time Voice 
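
A note on the download path: PATCH 08 replaces 'urllib.request.urlretrieve'
with 'gdown', which handles Google Drive's download-confirmation step for
large files. A minimal standalone sketch of the same call (assuming `gdown`
is installed; the URL and target are taken from the 'encoder' entry of
`default_models`, and the size check is omitted here):

```python
from pathlib import Path

import gdown

url = "https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1"
target = Path("saved_models/default/encoder.pt")

target.parent.mkdir(exist_ok=True, parents=True)  # ensure the directory exists
gdown.download(url, str(target), quiet=False)     # same call the patch introduces
```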
From b06f7de3c739d5d54a98ed2a51d6a2e3bc70bca9 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 21:49:55 -0500
Subject: [PATCH 10/13] Upgrade the recommended Python version in the README.

---
 README.md | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 8025f3201..888f1c22f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 # Real-Time Voice Cloning
+
 This repository is an implementation of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
@@ -8,18 +9,19 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
 
+### Papers implemented
 
-
-### Papers implemented
-| URL | Designation | Title | Implementation source |
-| --- | ----------- | ----- | --------------------- |
-|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
-|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
-|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
-|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
+| URL | Designation | Title | Implementation source |
+| ------------------------------------------------------ | ---------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------- |
+| [**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
+| [1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+| [1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+| [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## Heads up
+
 Like everything else in Deep Learning, this repo has quickly gotten old. Many SaaS apps (often paying) will give you a better audio quality than this repository will. If you wish for an open-source solution with a high voice quality:
+
 - Check out [paperswithcode](https://paperswithcode.com/task/speech-synthesis/) for other repositories and recent research in the field of speech synthesis.
 - Check out [CoquiTTS](https://github.com/coqui-ai/tts) for a repository with a better voice cloning quality and more functionalities.
 - Check out [MetaVoice-1B](https://github.com/metavoiceio/metavoice-src) for a large voice model with high voice quality
 
 ## Setup
 
 ### 1. Install Requirements
+
 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
-2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
+2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
 5. Install the remaining requirements with `pip install -r requirements.txt`
 
 ### 2. (Optional) Download Pretrained Models
+
 Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
 
 ### 3. (Optional) Test Configuration
+
 Before you download any dataset, you can begin by testing your configuration with:
 
 `python demo_cli.py`
 
 If all tests pass, you're good to go.
 
 ### 4. (Optional) Download Datasets
+
 For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
 
 ### 5. Launch the Toolbox
+
 You can then try the toolbox:
 
 `python demo_toolbox.py -d <datasets_root>`
 or
-`python demo_toolbox.py`
+`python demo_toolbox.py` 
 
 depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).

From 356bc0f8375e33a5f8321354c32b76bfbb7baacc Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 21:59:14 -0500
Subject: [PATCH 11/13] Roll back README formatting.

---
 README.md | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 888f1c22f..ca8099b86 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
 
+
 ### Papers implemented
 
 | URL | Designation | Title | Implementation source |
@@ -19,17 +20,14 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 | [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## Heads up
-
 Like everything else in Deep Learning, this repo has quickly gotten old. Many SaaS apps (often paying) will give you a better audio quality than this repository will. If you wish for an open-source solution with a high voice quality:
-
 - Check out [paperswithcode](https://paperswithcode.com/task/speech-synthesis/) for other repositories and recent research in the field of speech synthesis.
 - Check out [CoquiTTS](https://github.com/coqui-ai/tts) for a repository with a better voice cloning quality and more functionalities.
 - Check out [MetaVoice-1B](https://github.com/metavoiceio/metavoice-src) for a large voice model with high voice quality
 
 ## Setup
 
 ### 1. Install Requirements
-
 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
 2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
 5. Install the remaining requirements with `pip install -r requirements.txt`
 
 ### 2. (Optional) Download Pretrained Models
-
 Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
 
 ### 3. (Optional) Test Configuration
-
 Before you download any dataset, you can begin by testing your configuration with:
 
 `python demo_cli.py`
 
 If all tests pass, you're good to go.
 
 ### 4. (Optional) Download Datasets
-
 For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
 
 ### 5. Launch the Toolbox
-
 You can then try the toolbox:
 
 `python demo_toolbox.py -d <datasets_root>`
 or
-`python demo_toolbox.py`
+`python demo_toolbox.py` 
 
 depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).

From 85c315d0df1085f87c8df92e31e150bb085e3267 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 22:05:31 -0500
Subject: [PATCH 12/13] Roll back VSCode auto-formatting in the README.

---
 README.md | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ca8099b86..0073b1b9c 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 # Real-Time Voice Cloning
-
 This repository is an implementation of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
@@ -10,21 +9,21 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
 
-### Papers implemented
-| URL | Designation | Title | Implementation source |
-| ------------------------------------------------------ | ---------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------- |
-| [**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
-| [1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
-| [1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
-| [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
+### Papers implemented
+| URL | Designation | Title | Implementation source |
+| --- | ----------- | ----- | --------------------- |
+|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
+|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
+|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## Heads up
 Like everything else in Deep Learning, this repo has quickly gotten old. Many SaaS apps (often paying) will give you a better audio quality than this repository will. If you wish for an open-source solution with a high voice quality:
-
 - Check out [paperswithcode](https://paperswithcode.com/task/speech-synthesis/) for other repositories and recent research in the field of speech synthesis.
 - Check out [CoquiTTS](https://github.com/coqui-ai/tts) for a repository with a better voice cloning quality and more functionalities.
 - Check out [MetaVoice-1B](https://github.com/metavoiceio/metavoice-src) for a large voice model with high voice quality
+
 ## Setup
 
 ### 1. Install Requirements
@@ -52,6 +51,6 @@ You can then try the toolbox:
 
 `python demo_toolbox.py -d <datasets_root>`
 or
-`python demo_toolbox.py` 
+`python demo_toolbox.py`
 
 depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
From c5972f5b4996683de3792bd0942d29cfb88cede5 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 22:08:33 -0500
Subject: [PATCH 13/13] Restore the explanatory comments in 'utils/default_models.py'.

---
 utils/default_models.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/default_models.py b/utils/default_models.py
index 9d14247c1..7e3d0708c 100644
--- a/utils/default_models.py
+++ b/utils/default_models.py
@@ -23,6 +23,7 @@ def download(url: str, target: Path, bar_pos=0):
     gdown.download(url, str(target), quiet=False)
 
 def ensure_default_models(models_dir: Path):
+    # Define download tasks
     jobs = []
     for model_name, (url, size) in default_models.items():
         target_path = models_dir / "default" / f"{model_name}.pt"
@@ -35,7 +36,8 @@ def ensure_default_models(models_dir: Path):
         thread = Thread(target=download, args=(url, target_path, len(jobs)))
         thread.start()
         jobs.append((thread, target_path, size))
-
+
+    # Run and join threads
     for thread, target_path, size in jobs:
         thread.join()
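
With the comments restored, the series can be smoke-tested end to end; a
minimal sketch, run from the repository root (assuming `saved_models` is the
models directory the demos expect):

```python
from pathlib import Path

from utils.default_models import ensure_default_models

# Spawns one download thread per missing model and waits for all of them.
ensure_default_models(Path("saved_models"))
# saved_models/default/ should now contain encoder.pt, synthesizer.pt and
# vocoder.pt, with the file sizes listed in default_models.
```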