diff --git a/README.md b/README.md index 8025f3201..0073b1b9c 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Like everything else in Deep Learning, this repo has quickly gotten old. Many Sa ### 1. Install Requirements 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. -2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. +2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. 5. Install the remaining requirements with `pip install -r requirements.txt` diff --git a/encoder/audio.py b/encoder/audio.py index 799aa8354..86308f8f1 100644 --- a/encoder/audio.py +++ b/encoder/audio.py @@ -1,4 +1,4 @@ -from scipy.ndimage.morphology import binary_dilation +from scipy.ndimage import binary_dilation from encoder.params_data import * from pathlib import Path from typing import Optional, Union @@ -9,7 +9,7 @@ try: import webrtcvad -except: +except ImportError: warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.") webrtcvad=None @@ -32,34 +32,34 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], this argument will be ignored. 
""" # Load the wav from disk if needed - if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + if isinstance(fpath_or_wav, (str, Path)): wav, source_sr = librosa.load(str(fpath_or_wav), sr=None) else: wav = fpath_or_wav - + # Resample the wav if needed if source_sr is not None and source_sr != sampling_rate: - wav = librosa.resample(wav, source_sr, sampling_rate) - - # Apply the preprocessing: normalize volume and shorten long silences + wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences if normalize: wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) if webrtcvad and trim_silence: wav = trim_long_silences(wav) - + return wav -def wav_to_mel_spectrogram(wav): +def wav_to_mel_spectrogram(wav, sr=sampling_rate): """ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. - Note: this not a log-mel spectrogram. + Note: this is not a log-mel spectrogram. 
""" frames = librosa.feature.melspectrogram( - wav, - sampling_rate, - n_fft=int(sampling_rate * mel_window_length / 1000), - hop_length=int(sampling_rate * mel_window_step / 1000), + y=wav, + sr=sr, + n_fft=int(sr * mel_window_length / 1000), + hop_length=int(sr * mel_window_step / 1000), n_mels=mel_n_channels ) return frames.astype(np.float32).T @@ -75,13 +75,13 @@ def trim_long_silences(wav): """ # Compute the voice detection window size samples_per_window = (vad_window_length * sampling_rate) // 1000 - + # Trim the end of the audio to have a multiple of the window size wav = wav[:len(wav) - (len(wav) % samples_per_window)] - + # Convert the float waveform to 16-bit mono PCM pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) - + # Perform voice activation detection voice_flags = [] vad = webrtcvad.Vad(mode=3) @@ -90,21 +90,21 @@ def trim_long_silences(wav): voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], sample_rate=sampling_rate)) voice_flags = np.array(voice_flags) - + # Smooth the voice detection with a moving average def moving_average(array, width): array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) ret = np.cumsum(array_padded, dtype=float) ret[width:] = ret[width:] - ret[:-width] return ret[width - 1:] / width - + audio_mask = moving_average(voice_flags, vad_moving_average_width) - audio_mask = np.round(audio_mask).astype(np.bool) - + audio_mask = np.round(audio_mask).astype(bool) + # Dilate the voiced regions audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) audio_mask = np.repeat(audio_mask, samples_per_window) - + return wav[audio_mask == True] diff --git a/requirements.txt b/requirements.txt index f0c24bfe2..1e4600c0a 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/toolbox/ui.py b/toolbox/ui.py index e33998ba9..3cfbc6e02 100644 --- a/toolbox/ui.py +++ b/toolbox/ui.py @@ -34,7 +34,7 @@ [0, 
0, 0], [183, 183, 183], [76, 255, 0], -], dtype=np.float) / 255 +], dtype=np.float64) / 255 default_text = \ "Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \ diff --git a/utils/default_models.py b/utils/default_models.py index a0fb9276e..7e3d0708c 100644 --- a/utils/default_models.py +++ b/utils/default_models.py @@ -1,36 +1,27 @@ -import urllib.request +import gdown from pathlib import Path from threading import Thread -from urllib.error import HTTPError - from tqdm import tqdm - default_models = { "encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379), "synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559), "vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290), } - class DownloadProgressBar(tqdm): def update_to(self, b=1, bsize=1, tsize=None): if tsize is not None: self.total = tsize self.update(b * bsize - self.n) - def download(url: str, target: Path, bar_pos=0): - # Ensure the directory exists target.parent.mkdir(exist_ok=True, parents=True) desc = f"Downloading {target.name}" - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: - try: - urllib.request.urlretrieve(url, filename=target, reporthook=t.update_to) - except HTTPError: - return + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: + gdown.download(url, str(target), quiet=False) def ensure_default_models(models_dir: Path): # Define download tasks @@ -46,7 +36,7 @@ def ensure_default_models(models_dir: Path): thread = Thread(target=download, args=(url, target_path, len(jobs))) thread.start() jobs.append((thread, target_path, size)) - + # Run and join threads for thread, target_path, size in jobs: thread.join()