From 86890fe0fa066a515fe6a591c3e5195766dacc79 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:48:29 -0500
Subject: [PATCH 01/13] Update import statement to use 'binary_dilation' directly from 'scipy.ndimage' instead of the deprecated 'morphology' module.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index 799aa8354..764b83091 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -1,4 +1,4 @@
-from scipy.ndimage.morphology import binary_dilation
+from scipy.ndimage import binary_dilation
 from encoder.params_data import *
 from pathlib import Path
 from typing import Optional, Union

From 3013482856da79adabb632064be42b0f3b375a9b Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:57:08 -0500
Subject: [PATCH 02/13] Catch 'ImportError' specifically in the try-except block for importing 'webrtcvad' to improve code clarity.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index 764b83091..95369c949 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -9,7 +9,7 @@
 
 try:
     import webrtcvad
-except:
+except ImportError:
     warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
     webrtcvad=None

From 33c25d11ef62e8f8824fa75f9d22a640becc4471 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:58:16 -0500
Subject: [PATCH 03/13] Simplify type checking for file-path input by passing a tuple to 'isinstance'.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index 95369c949..f1b688661 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -32,7 +32,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
     this argument will be ignored.
     """
     # Load the wav from disk if needed
-    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+    if isinstance(fpath_or_wav, (str, Path)):
         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
     else:
         wav = fpath_or_wav

From 19497f360b59e884d52fed98d177227dc34ca119 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 21:59:36 -0500
Subject: [PATCH 04/13] Update the 'librosa.resample' call to use keyword arguments for clarity and compatibility.

---
 encoder/audio.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index f1b688661..fb2aafef0 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -39,8 +39,8 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
 
     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
-
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
     # Apply the preprocessing: normalize volume and shorten long silences
     if normalize:
         wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)

From 8c1ba11f8d205fbf3d41a235ec8e9d2e4fd1928c Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 22:01:57 -0500
Subject: [PATCH 05/13] Add an 'sr' parameter to 'wav_to_mel_spectrogram' to allow specifying the sample rate.
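
For reference, a minimal usage sketch once the parameter is in place (the
default value `sr=sampling_rate` is only wired into the signature in PATCH 07;
`sampling_rate` and `mel_n_channels` come from `encoder/params_data.py` and are
assumed here to be 16000 and 40):

```python
import numpy as np
from encoder.audio import wav_to_mel_spectrogram

wav = np.zeros(16000, dtype=np.float32)        # one second of silence at 16 kHz
mel = wav_to_mel_spectrogram(wav)              # uses the default sr=sampling_rate
mel_8k = wav_to_mel_spectrogram(wav, sr=8000)  # caller-specified sample rate
print(mel.shape)                               # (n_frames, mel_n_channels)
```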
---
 encoder/audio.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index fb2aafef0..f35551328 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -57,9 +57,9 @@ def wav_to_mel_spectrogram(wav):
     """
     frames = librosa.feature.melspectrogram(
         wav,
-        sampling_rate,
-        n_fft=int(sampling_rate * mel_window_length / 1000),
-        hop_length=int(sampling_rate * mel_window_step / 1000),
+        sr,
+        n_fft=int(sr * mel_window_length / 1000),
+        hop_length=int(sr * mel_window_step / 1000),
         n_mels=mel_n_channels
     )
     return frames.astype(np.float32).T

From df96670415ed59c0023a68e59c5209858015c57f Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Sun, 21 Jul 2024 22:02:55 -0500
Subject: [PATCH 06/13] Replace deprecated 'np.bool' with the built-in 'bool' for type conversion.

---
 encoder/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index f35551328..dc6ebcd00 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -99,7 +99,7 @@ def moving_average(array, width):
         return ret[width - 1:] / width
 
     audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
+    audio_mask = np.round(audio_mask).astype(bool)
 
     # Dilate the voiced regions
     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))

From 96434c3811371f7fe9d4cf4798ad14608d75ad21 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 21:22:07 -0500
Subject: [PATCH 07/13] Major refactoring: add the 'sr' parameter to 'wav_to_mel_spectrogram', pass keyword arguments to 'librosa.feature.melspectrogram', strip trailing whitespace, and update 'requirements.txt' and 'toolbox/ui.py'.

---
 encoder/audio.py | 30 +++++++++++++++---------------
 requirements.txt | Bin 562 -> 580 bytes
 toolbox/ui.py    |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/encoder/audio.py b/encoder/audio.py
index dc6ebcd00..86308f8f1 100644
--- a/encoder/audio.py
+++ b/encoder/audio.py
@@ -36,28 +36,28 @@
         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
     else:
         wav = fpath_or_wav
-    
+
     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
         wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
-    
-    # Apply the preprocessing: normalize volume and shorten long silences 
+
+    # Apply the preprocessing: normalize volume and shorten long silences
     if normalize:
         wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
     if webrtcvad and trim_silence:
         wav = trim_long_silences(wav)
-    
+
     return wav
 
 
-def wav_to_mel_spectrogram(wav):
+def wav_to_mel_spectrogram(wav, sr=sampling_rate):
     """
     Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
-    Note: this not a log-mel spectrogram.
+    Note: this is not a log-mel spectrogram.
""" frames = librosa.feature.melspectrogram( - wav, - sr, + y=wav, + sr=sr, n_fft=int(sr * mel_window_length / 1000), hop_length=int(sr * mel_window_step / 1000), n_mels=mel_n_channels @@ -75,13 +75,13 @@ def trim_long_silences(wav): """ # Compute the voice detection window size samples_per_window = (vad_window_length * sampling_rate) // 1000 - + # Trim the end of the audio to have a multiple of the window size wav = wav[:len(wav) - (len(wav) % samples_per_window)] - + # Convert the float waveform to 16-bit mono PCM pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) - + # Perform voice activation detection voice_flags = [] vad = webrtcvad.Vad(mode=3) @@ -90,21 +90,21 @@ def trim_long_silences(wav): voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], sample_rate=sampling_rate)) voice_flags = np.array(voice_flags) - + # Smooth the voice detection with a moving average def moving_average(array, width): array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) ret = np.cumsum(array_padded, dtype=float) ret[width:] = ret[width:] - ret[:-width] return ret[width - 1:] / width - + audio_mask = moving_average(voice_flags, vad_moving_average_width) audio_mask = np.round(audio_mask).astype(bool) - + # Dilate the voiced regions audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) audio_mask = np.repeat(audio_mask, samples_per_window) - + return wav[audio_mask == True] diff --git a/requirements.txt b/requirements.txt index f0c24bfe22b275f8b81c8ad61df318e843dd71e6..1e4600c0af2c6e3dd40e7dc108e7c8107e0b71f6 100644 GIT binary patch delta 225 zcmdnQa)f1q2&XxN9)mFu8%~s!W;bLo05Xjx+A1bLCfa;+bWJ4KHrkEjxA)g@^EDLrZP=(24 Xb|!6^bg)c0LmpTTekt!s( delta 186 zcmX@YvWaDa2&XB79)mFu8%&gyX0-q^3@6$vGMY|Il$ACBOYt&rF$6GVGUNbZK0`T! 
zEf88voUO`eHu113D@cve#JA$C<{-tBxf#_NjVBv3D(XTs>oF7nH5D_IFc>m`3@TwL zWJqDi1sh@l)M3V81lCu|sLE(Gc{ZamklfEG&TR-X7AUL7U_SXiqa-JY4|1I4 Date: Mon, 22 Jul 2024 21:34:24 -0500 Subject: [PATCH 08/13] models updated --- utils/default_models.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/utils/default_models.py b/utils/default_models.py index a0fb9276e..6e3bf57f3 100644 --- a/utils/default_models.py +++ b/utils/default_models.py @@ -1,39 +1,28 @@ -import urllib.request +import gdown from pathlib import Path from threading import Thread -from urllib.error import HTTPError - from tqdm import tqdm - default_models = { "encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379), - "synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559), + "synthesizer": ("https://drive.google.com/uc?export=download&id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download", 370554559), "vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290), } - class DownloadProgressBar(tqdm): def update_to(self, b=1, bsize=1, tsize=None): if tsize is not None: self.total = tsize self.update(b * bsize - self.n) - def download(url: str, target: Path, bar_pos=0): - # Ensure the directory exists target.parent.mkdir(exist_ok=True, parents=True) - desc = f"Downloading {target.name}" - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: - try: - urllib.request.urlretrieve(url, filename=target, reporthook=t.update_to) - except HTTPError: - return + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: + gdown.download(url, str(target), quiet=False) def ensure_default_models(models_dir: Path): - # Define download tasks jobs = [] for model_name, (url, size) in default_models.items(): target_path = models_dir / "default" / f"{model_name}.pt" @@ -47,7 +36,6 @@ def ensure_default_models(models_dir: Path): thread.start() jobs.append((thread, target_path, size)) - # Run and join threads for thread, target_path, size in jobs: thread.join() From 2bedf6ade76ff3b8fcb13b49c8103b1b8f1e86d1 Mon Sep 17 00:00:00 2001 From: Kaled Dahleh Date: Mon, 22 Jul 2024 21:42:15 -0500 Subject: [PATCH 09/13] rollback url --- utils/default_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/default_models.py b/utils/default_models.py index 6e3bf57f3..9d14247c1 100644 --- a/utils/default_models.py +++ b/utils/default_models.py @@ -5,7 +5,7 @@ default_models = { "encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379), - "synthesizer": ("https://drive.google.com/uc?export=download&id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download", 370554559), + "synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559), "vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290), } From b06f7de3c739d5d54a98ed2a51d6a2e3bc70bca9 Mon Sep 17 00:00:00 2001 From: Kaled Dahleh Date: Mon, 22 Jul 2024 21:49:55 -0500 Subject: [PATCH 10/13] python version upgrade --- README.md | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8025f3201..888f1c22f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Real-Time Voice 
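
A note on the download path: PATCH 08 replaces 'urllib.request.urlretrieve'
with 'gdown', which handles Google Drive's download-confirmation step for
large files. A minimal standalone sketch of the same call (assuming `gdown`
is installed; the URL and target are taken from the 'encoder' entry of
`default_models`, and the size check is omitted here):

```python
from pathlib import Path

import gdown

url = "https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1"
target = Path("saved_models/default/encoder.pt")

target.parent.mkdir(exist_ok=True, parents=True)  # ensure the directory exists
gdown.download(url, str(target), quiet=False)     # same call the patch introduces
```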
From b06f7de3c739d5d54a98ed2a51d6a2e3bc70bca9 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 21:49:55 -0500
Subject: [PATCH 10/13] Upgrade the recommended Python version in the README.

---
 README.md | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 8025f3201..888f1c22f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 # Real-Time Voice Cloning
+
 This repository is an implementation of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
@@ -8,18 +9,19 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
 
+### Papers implemented
 
-
-### Papers implemented
-| URL | Designation | Title | Implementation source |
-| --- | ----------- | ----- | --------------------- |
-|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
-|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
-|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
-|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
+| URL | Designation | Title | Implementation source |
+| ------------------------------------------------------ | ---------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------- |
+| [**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
+| [1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+| [1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+| [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## Heads up
+
 Like everything else in Deep Learning, this repo has quickly gotten old. Many SaaS apps (often paying) will give you a better audio quality than this repository will. If you wish for an open-source solution with a high voice quality:
+
 - Check out [paperswithcode](https://paperswithcode.com/task/speech-synthesis/) for other repositories and recent research in the field of speech synthesis.
 - Check out [CoquiTTS](https://github.com/coqui-ai/tts) for a repository with a better voice cloning quality and more functionalities.
 - Check out [MetaVoice-1B](https://github.com/metavoiceio/metavoice-src) for a large voice model with high voice quality
 
 ## Setup
 
 ### 1. Install Requirements
+
 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
-2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
+2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
 5. Install the remaining requirements with `pip install -r requirements.txt`
 
 ### 2. (Optional) Download Pretrained Models
+
 Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
 
 ### 3. (Optional) Test Configuration
+
 Before you download any dataset, you can begin by testing your configuration with:
 
 `python demo_cli.py`
 
 If all tests pass, you're good to go.
 
 ### 4. (Optional) Download Datasets
+
 For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
 
 ### 5. Launch the Toolbox
+
 You can then try the toolbox:
 
 `python demo_toolbox.py -d <datasets_root>`
 or
-`python demo_toolbox.py`
+`python demo_toolbox.py` 
 
 depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).

From 356bc0f8375e33a5f8321354c32b76bfbb7baacc Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 21:59:14 -0500
Subject: [PATCH 11/13] Roll back README formatting.

---
 README.md | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 888f1c22f..ca8099b86 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
 
+
 ### Papers implemented
 
 | URL | Designation | Title | Implementation source |
@@ -19,17 +20,14 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 | [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## Heads up
-
 Like everything else in Deep Learning, this repo has quickly gotten old. Many SaaS apps (often paying) will give you a better audio quality than this repository will. If you wish for an open-source solution with a high voice quality:
-
 - Check out [paperswithcode](https://paperswithcode.com/task/speech-synthesis/) for other repositories and recent research in the field of speech synthesis.
 - Check out [CoquiTTS](https://github.com/coqui-ai/tts) for a repository with a better voice cloning quality and more functionalities.
 - Check out [MetaVoice-1B](https://github.com/metavoiceio/metavoice-src) for a large voice model with high voice quality
 
 ## Setup
 
 ### 1. Install Requirements
-
 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
 2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
 5. Install the remaining requirements with `pip install -r requirements.txt`
 
 ### 2. (Optional) Download Pretrained Models
-
 Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
 
 ### 3. (Optional) Test Configuration
-
 Before you download any dataset, you can begin by testing your configuration with:
 
 `python demo_cli.py`
 
 If all tests pass, you're good to go.
 
 ### 4. (Optional) Download Datasets
-
 For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
 
 ### 5. Launch the Toolbox
-
 You can then try the toolbox:
 
 `python demo_toolbox.py -d <datasets_root>`
 or
-`python demo_toolbox.py`
+`python demo_toolbox.py` 
 
 depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).

From 85c315d0df1085f87c8df92e31e150bb085e3267 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 22:05:31 -0500
Subject: [PATCH 12/13] Roll back VSCode auto-formatting in the README.

---
 README.md | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ca8099b86..0073b1b9c 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 # Real-Time Voice Cloning
-
 This repository is an implementation of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
@@ -10,21 +9,21 @@ SV2TTS is a deep learning framework in three stages. In the first stage, one cre
 [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
 
-### Papers implemented
-| URL | Designation | Title | Implementation source |
-| ------------------------------------------------------ | ---------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------- |
-| [**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
-| [1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
-| [1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
-| [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
+### Papers implemented
+| URL | Designation | Title | Implementation source |
+| --- | ----------- | ----- | --------------------- |
+|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
+|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
+|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## Heads up
 Like everything else in Deep Learning, this repo has quickly gotten old. Many SaaS apps (often paying) will give you a better audio quality than this repository will. If you wish for an open-source solution with a high voice quality:
-
 - Check out [paperswithcode](https://paperswithcode.com/task/speech-synthesis/) for other repositories and recent research in the field of speech synthesis.
 - Check out [CoquiTTS](https://github.com/coqui-ai/tts) for a repository with a better voice cloning quality and more functionalities.
 - Check out [MetaVoice-1B](https://github.com/metavoiceio/metavoice-src) for a large voice model with high voice quality
+
 ## Setup
 
 ### 1. Install Requirements
@@ -52,6 +51,6 @@ You can then try the toolbox:
 
 `python demo_toolbox.py -d <datasets_root>`
 or
-`python demo_toolbox.py` 
+`python demo_toolbox.py`
 
 depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
From c5972f5b4996683de3792bd0942d29cfb88cede5 Mon Sep 17 00:00:00 2001
From: Kaled Dahleh
Date: Mon, 22 Jul 2024 22:08:33 -0500
Subject: [PATCH 13/13] Restore the explanatory comments in 'utils/default_models.py'.

---
 utils/default_models.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/default_models.py b/utils/default_models.py
index 9d14247c1..7e3d0708c 100644
--- a/utils/default_models.py
+++ b/utils/default_models.py
@@ -23,6 +23,7 @@ def download(url: str, target: Path, bar_pos=0):
     gdown.download(url, str(target), quiet=False)
 
 def ensure_default_models(models_dir: Path):
+    # Define download tasks
     jobs = []
     for model_name, (url, size) in default_models.items():
         target_path = models_dir / "default" / f"{model_name}.pt"
@@ -35,7 +36,8 @@ def ensure_default_models(models_dir: Path):
         thread = Thread(target=download, args=(url, target_path, len(jobs)))
         thread.start()
         jobs.append((thread, target_path, size))
-
+
+    # Run and join threads
     for thread, target_path, size in jobs:
         thread.join()
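
With the comments restored, the series can be smoke-tested end to end; a
minimal sketch, run from the repository root (assuming `saved_models` is the
models directory the demos expect):

```python
from pathlib import Path

from utils.default_models import ensure_default_models

# Spawns one download thread per missing model and waits for all of them.
ensure_default_models(Path("saved_models"))
# saved_models/default/ should now contain encoder.pt, synthesizer.pt and
# vocoder.pt, with the file sizes listed in default_models.
```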