Stability upgrades #1307

Open · wants to merge 13 commits into master
README.md (2 changes: 1 addition & 1 deletion)
@@ -28,7 +28,7 @@ Like everything else in Deep Learning, this repo has quickly gotten old. Many Sa

### 1. Install Requirements
1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
2. Python 3.11.7 is recommended. Python 3.11 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
5. Install the remaining requirements with `pip install -r requirements.txt`. A quick sanity check of the result is sketched below.
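After step 5, one quick way to confirm that the interpreter and PyTorch are set up correctly (a hypothetical check, not part of the repo):

```python
import sys
import torch

# Expect 3.11 or newer, per the updated recommendation above
print(sys.version_info[:3])
print(torch.__version__)
print(torch.cuda.is_available())  # True only with a CUDA build of PyTorch and a visible GPU
```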
encoder/audio.py (44 changes: 22 additions & 22 deletions)
@@ -1,4 +1,4 @@
from scipy.ndimage.morphology import binary_dilation
from scipy.ndimage import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
@@ -9,7 +9,7 @@

try:
import webrtcvad
except:
except ImportError:
warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
webrtcvad=None

@@ -32,34 +32,34 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
this argument will be ignored.
"""
# Load the wav from disk if needed
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
if isinstance(fpath_or_wav, (str, Path)):
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
else:
wav = fpath_or_wav

# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)
# Apply the preprocessing: normalize volume and shorten long silences
wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
if webrtcvad and trim_silence:
wav = trim_long_silences(wav)

return wav


def wav_to_mel_spectrogram(wav):
def wav_to_mel_spectrogram(wav, sr=sampling_rate):
"""
Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
Note: this not a log-mel spectrogram.
Note: this is not a log-mel spectrogram.
"""
frames = librosa.feature.melspectrogram(
wav,
sampling_rate,
n_fft=int(sampling_rate * mel_window_length / 1000),
hop_length=int(sampling_rate * mel_window_step / 1000),
y=wav,
sr=sr,
n_fft=int(sr * mel_window_length / 1000),
hop_length=int(sr * mel_window_step / 1000),
n_mels=mel_n_channels
)
return frames.astype(np.float32).T
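Both librosa calls in this hunk track the librosa 0.10 API, which dropped the old positional signatures: `resample` and `melspectrogram` now take sample rates as keyword arguments. A standalone sketch of the updated calls, with parameter values assumed from typical encoder settings (16 kHz, 25 ms window, 10 ms hop, 40 mel channels):

```python
import numpy as np
import librosa

sr = 16000                            # sampling_rate (assumed)
wav = np.zeros(sr, dtype=np.float32)  # one second of silence as a stand-in waveform

# librosa >= 0.10: sample rates must be passed by keyword
wav = librosa.resample(wav, orig_sr=sr, target_sr=sr)
mels = librosa.feature.melspectrogram(
    y=wav,
    sr=sr,
    n_fft=int(sr * 25 / 1000),        # 25 ms window (mel_window_length, assumed)
    hop_length=int(sr * 10 / 1000),   # 10 ms hop (mel_window_step, assumed)
    n_mels=40,                        # mel_n_channels (assumed)
)
print(mels.astype(np.float32).T.shape)  # (n_frames, 40)
```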
@@ -75,13 +75,13 @@ def trim_long_silences(wav):
"""
# Compute the voice detection window size
samples_per_window = (vad_window_length * sampling_rate) // 1000

# Trim the end of the audio to have a multiple of the window size
wav = wav[:len(wav) - (len(wav) % samples_per_window)]

# Convert the float waveform to 16-bit mono PCM
pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

# Perform voice activation detection
voice_flags = []
vad = webrtcvad.Vad(mode=3)
@@ -90,21 +90,21 @@ def trim_long_silences(wav):
voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
voice_flags = np.array(voice_flags)

# Smooth the voice detection with a moving average
def moving_average(array, width):
array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
ret = np.cumsum(array_padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
return ret[width - 1:] / width

audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)

# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
audio_mask = np.repeat(audio_mask, samples_per_window)

return wav[audio_mask == True]
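Two fixes in `trim_long_silences` track renamed or removed APIs: `binary_dilation` now lives directly in `scipy.ndimage` (the `scipy.ndimage.morphology` path is deprecated in recent SciPy releases), and `np.bool` was removed in NumPy 1.24 in favor of the builtin `bool`. A self-contained sketch of the mask smoothing and dilation with the updated spellings (the flag values are invented for illustration):

```python
import numpy as np
from scipy.ndimage import binary_dilation  # no longer scipy.ndimage.morphology

# Hypothetical per-window VAD flags (1 = speech detected)
voice_flags = np.array([0, 0, 1, 1, 0, 1, 0, 0], dtype=float)

def moving_average(array, width):
    # Cumulative-sum trick: an O(n) moving average, zero-padded at both edges
    padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
    ret = np.cumsum(padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width

mask = np.round(moving_average(voice_flags, 3)).astype(bool)  # bool, not np.bool
mask = binary_dilation(mask, np.ones(3))  # widen voiced regions by one window per side
print(mask)
```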


requirements.txt (binary file modified, contents not shown)
toolbox/ui.py (2 changes: 1 addition & 1 deletion)
@@ -34,7 +34,7 @@
[0, 0, 0],
[183, 183, 183],
[76, 255, 0],
], dtype=np.float) / 255
], dtype=np.float64) / 255

default_text = \
"Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \
utils/default_models.py (18 changes: 4 additions & 14 deletions)
@@ -1,36 +1,26 @@
import urllib.request
import gdown
from pathlib import Path
from threading import Thread
from urllib.error import HTTPError

from tqdm import tqdm


default_models = {
"encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379),
"synthesizer": ("https://drive.google.com/u/0/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s&export=download&confirm=t", 370554559),
"vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290),
}


class DownloadProgressBar(tqdm):
def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None:
self.total = tsize
self.update(b * bsize - self.n)


def download(url: str, target: Path, bar_pos=0):
# Ensure the directory exists
target.parent.mkdir(exist_ok=True, parents=True)

desc = f"Downloading {target.name}"
with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t:
try:
urllib.request.urlretrieve(url, filename=target, reporthook=t.update_to)
except HTTPError:
return

with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t:
gdown.download(url, str(target), quiet=False)

def ensure_default_models(models_dir: Path):
# Define download tasks
@@ -46,7 +36,7 @@ def ensure_default_models(models_dir: Path):
thread = Thread(target=download, args=(url, target_path, len(jobs)))
thread.start()
jobs.append((thread, target_path, size))

# Run and join threads
for thread, target_path, size in jobs:
thread.join()
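The rewritten helper drops the urllib/tqdm machinery in favor of gdown, which handles Google Drive's confirmation step for large files (the step that tends to make plain `urlretrieve` fetch an HTML page instead of the model) and prints its own progress bar. A minimal sketch of the gdown-based helper; the target path is a placeholder:

```python
from pathlib import Path
import gdown

def download(url: str, target: Path):
    # gdown follows Google Drive's confirm redirect for large files and
    # renders its own progress bar (quiet=False), so no tqdm hook is needed.
    target.parent.mkdir(exist_ok=True, parents=True)
    gdown.download(url, str(target), quiet=False)

# Hypothetical usage with the encoder URL from default_models:
# download("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1",
#          Path("saved_models/default/encoder.pt"))
```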