Stability upgrades #1307

Open · wants to merge 13 commits into master
README.md (2 changes: 1 addition & 1 deletion)
@@ -28,7 +28,7 @@ Like everything else in Deep Learning, this repo has quickly gotten old. Many Sa

### 1. Install Requirements
1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
5. Install the remaining requirements with `pip install -r requirements.txt`. A quick sanity check of the result is sketched below.
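After step 5, one quick way to confirm that the interpreter and PyTorch are set up correctly (a hypothetical check, not part of the repo):

```python
import sys
import torch

# Expect 3.11 or newer, per the updated recommendation above
print(sys.version_info[:3])
print(torch.__version__)
print(torch.cuda.is_available())  # True only with a CUDA build of PyTorch and a visible GPU
```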
encoder/audio.py (44 changes: 22 additions & 22 deletions)
@@ -1,4 +1,4 @@
from scipy.ndimage.morphology import binary_dilation
from scipy.ndimage import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
@@ -9,7 +9,7 @@

try:
import webrtcvad
except:
except ImportError:
warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
webrtcvad=None

@@ -32,34 +32,34 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
this argument will be ignored.
"""
# Load the wav from disk if needed
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
if isinstance(fpath_or_wav, (str, Path)):
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
else:
wav = fpath_or_wav

# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)
# Apply the preprocessing: normalize volume and shorten long silences
wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
if webrtcvad and trim_silence:
wav = trim_long_silences(wav)

return wav


def wav_to_mel_spectrogram(wav):
def wav_to_mel_spectrogram(wav, sr=sampling_rate):
"""
Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
Note: this not a log-mel spectrogram.
Note: this is not a log-mel spectrogram.
"""
frames = librosa.feature.melspectrogram(
wav,
sampling_rate,
n_fft=int(sampling_rate * mel_window_length / 1000),
hop_length=int(sampling_rate * mel_window_step / 1000),
y=wav,
sr=sr,
n_fft=int(sr * mel_window_length / 1000),
hop_length=int(sr * mel_window_step / 1000),
n_mels=mel_n_channels
)
return frames.astype(np.float32).T
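Both librosa calls in this hunk track the librosa 0.10 API, which dropped the old positional signatures: `resample` and `melspectrogram` now take sample rates as keyword arguments. A standalone sketch of the updated calls, with parameter values assumed from typical encoder settings (16 kHz, 25 ms window, 10 ms hop, 40 mel channels):

```python
import numpy as np
import librosa

sr = 16000                            # sampling_rate (assumed)
wav = np.zeros(sr, dtype=np.float32)  # one second of silence as a stand-in waveform

# librosa >= 0.10: sample rates must be passed by keyword
wav = librosa.resample(wav, orig_sr=sr, target_sr=sr)
mels = librosa.feature.melspectrogram(
    y=wav,
    sr=sr,
    n_fft=int(sr * 25 / 1000),        # 25 ms window (mel_window_length, assumed)
    hop_length=int(sr * 10 / 1000),   # 10 ms hop (mel_window_step, assumed)
    n_mels=40,                        # mel_n_channels (assumed)
)
print(mels.astype(np.float32).T.shape)  # (n_frames, 40)
```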
@@ -75,13 +75,13 @@ def trim_long_silences(wav):
"""
# Compute the voice detection window size
samples_per_window = (vad_window_length * sampling_rate) // 1000

# Trim the end of the audio to have a multiple of the window size
wav = wav[:len(wav) - (len(wav) % samples_per_window)]

# Convert the float waveform to 16-bit mono PCM
pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

# Perform voice activation detection
voice_flags = []
vad = webrtcvad.Vad(mode=3)
@@ -90,21 +90,21 @@ def trim_long_silences(wav):
voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
voice_flags = np.array(voice_flags)

# Smooth the voice detection with a moving average
def moving_average(array, width):
array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
ret = np.cumsum(array_padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
return ret[width - 1:] / width

audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)

# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
audio_mask = np.repeat(audio_mask, samples_per_window)

return wav[audio_mask == True]
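Two fixes in `trim_long_silences` track renamed or removed APIs: `binary_dilation` now lives directly in `scipy.ndimage` (the `scipy.ndimage.morphology` path is deprecated in recent SciPy releases), and `np.bool` was removed in NumPy 1.24 in favor of the builtin `bool`. A self-contained sketch of the mask smoothing and dilation with the updated spellings (the flag values are invented for illustration):

```python
import numpy as np
from scipy.ndimage import binary_dilation  # no longer scipy.ndimage.morphology

# Hypothetical per-window VAD flags (1 = speech detected)
voice_flags = np.array([0, 0, 1, 1, 0, 1, 0, 0], dtype=float)

def moving_average(array, width):
    # Cumulative-sum trick: an O(n) moving average, zero-padded at both edges
    padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
    ret = np.cumsum(padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width

mask = np.round(moving_average(voice_flags, 3)).astype(bool)  # bool, not np.bool
mask = binary_dilation(mask, np.ones(3))  # widen voiced regions by one window per side
print(mask)
```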


requirements.txt (binary file modified, contents not shown)
toolbox/ui.py (2 changes: 1 addition & 1 deletion)
@@ -34,7 +34,7 @@
[0, 0, 0],
[183, 183, 183],
[76, 255, 0],
], dtype=np.float) / 255
], dtype=np.float64) / 255

default_text = \
"Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \
utils/default_models.py (18 changes: 4 additions & 14 deletions)
@@ -1,36 +1,26 @@
import urllib.request
import gdown
from pathlib import Path
from threading import Thread
from urllib.error import HTTPError

from tqdm import tqdm


default_models = {
"encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379),
"synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559),
"vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290),
}


class DownloadProgressBar(tqdm):
def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None:
self.total = tsize
self.update(b * bsize - self.n)


def download(url: str, target: Path, bar_pos=0):
# Ensure the directory exists
target.parent.mkdir(exist_ok=True, parents=True)

desc = f"Downloading {target.name}"
with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t:
try:
urllib.request.urlretrieve(url, filename=target, reporthook=t.update_to)
except HTTPError:
return

with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t:
gdown.download(url, str(target), quiet=False)

def ensure_default_models(models_dir: Path):
# Define download tasks
@@ -46,7 +36,7 @@ def ensure_default_models(models_dir: Path):
thread = Thread(target=download, args=(url, target_path, len(jobs)))
thread.start()
jobs.append((thread, target_path, size))

# Run and join threads
for thread, target_path, size in jobs:
thread.join()
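The rewritten helper drops the urllib/tqdm machinery in favor of gdown, which handles Google Drive's confirmation step for large files (the step that tends to make plain `urlretrieve` fetch an HTML page instead of the model) and prints its own progress bar. A minimal sketch of the gdown-based helper; the target path is a placeholder:

```python
from pathlib import Path
import gdown

def download(url: str, target: Path):
    # gdown follows Google Drive's confirm redirect for large files and
    # renders its own progress bar (quiet=False), so no tqdm hook is needed.
    target.parent.mkdir(exist_ok=True, parents=True)
    gdown.download(url, str(target), quiet=False)

# Hypothetical usage with the encoder URL from default_models:
# download("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1",
#          Path("saved_models/default/encoder.pt"))
```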