Merge pull request #256 from idiap/knnvc
Add kNN-VC
eginhard authored Jan 15, 2025
2 parents 309c31c + 240aae4 commit 205eed3
Showing 32 changed files with 534 additions and 129 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -98,6 +98,7 @@ repository are also still a useful source of information.

### Voice Conversion
- [FreeVC](https://arxiv.org/abs/2210.15418)
- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419)
- [OpenVoice](https://arxiv.org/abs/2312.01479)

### Others
@@ -234,7 +235,7 @@ tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)

#### Voice conversion (VC)

Converting the voice in `source_wav` to the voice of `target_wav`
Converting the voice in `source_wav` to the voice of `target_wav`:

```python
tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda")
@@ -246,9 +247,13 @@ tts.voice_conversion_to_file(
```

Other available voice conversion models:
- `voice_conversion_models/multilingual/multi-dataset/knnvc`
- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1`
- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2`

For more details, see the
[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html).
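A minimal sketch (paths are illustrative) of how the new kNN-VC entry could be used with the same API as the FreeVC example above; passing a list of target clips relies on the `target_wav: Union[str, list[str]]` change in `TTS/api.py` further down:

```python
from TTS.api import TTS

tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc").to("cuda")
tts.voice_conversion_to_file(
    source_wav="my/source.wav",
    # target_wav accepts a single path or a list of reference clips of the target voice.
    target_wav=["my/target_1.wav", "my/target_2.wav"],
    file_path="output.wav",
)
```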

#### Voice cloning by combining single speaker TTS model with the default VC model

This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is
26 changes: 26 additions & 0 deletions TTS/.models.json
@@ -787,6 +787,22 @@
"license": "apache 2.0"
}
},
"librispeech100": {
"wavlm-hifigan": {
"description": "HiFiGAN vocoder for WavLM features from kNN-VC",
"github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip",
"commit": "cfba7e0",
"author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
"license": "MIT"
},
"wavlm-hifigan_prematched": {
"description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC",
"github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip",
"commit": "cfba7e0",
"author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
"license": "MIT"
}
},
"ljspeech": {
"multiband-melgan": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
@@ -927,18 +943,27 @@
"freevc24": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"default_vocoder": null,
"author": "Jing-Yi Li @OlaWod",
"license": "MIT",
"commit": null
}
},
"multi-dataset": {
"knnvc": {
"description": "kNN-VC model from https://github.com/bshall/knn-vc",
"default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched",
"author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
"license": "MIT",
"commit": null
},
"openvoice_v1": {
"hf_url": [
"https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json",
"https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth"
],
"description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
"default_vocoder": null,
"author": "MyShell.ai",
"license": "MIT",
"commit": null
@@ -949,6 +974,7 @@
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth"
],
"description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
"default_vocoder": null,
"author": "MyShell.ai",
"license": "MIT",
"commit": null
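To check that the registry entries above are picked up, the public API can list them; this sketch assumes `TTS.list_models()` is callable as a static method, as its signature in `TTS/api.py` below suggests:

```python
from TTS.api import TTS

# Print the newly registered kNN-VC model and its WavLM HiFiGAN vocoders.
for name in TTS.list_models():
    if "knnvc" in name or "wavlm-hifigan" in name:
        print(name)
```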
59 changes: 36 additions & 23 deletions TTS/api.py
@@ -4,7 +4,7 @@
import tempfile
import warnings
from pathlib import Path
from typing import Optional
from typing import Optional, Union

from torch import nn

@@ -77,8 +77,8 @@ def __init__(
super().__init__()
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar)
self.config = load_config(config_path) if config_path else None
self.synthesizer = None
self.voice_converter = None
self.synthesizer: Optional[Synthesizer] = None
self.voice_converter: Optional[Synthesizer] = None
self.model_name = ""

self.vocoder_path = vocoder_path
@@ -95,7 +95,7 @@ def __init__(
if "tts_models" in model_name:
self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu=gpu)
self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu)
# To allow just TTS("xtts")
else:
self.load_model_by_name(model_name, vocoder_name, gpu=gpu)
@@ -157,22 +157,24 @@ def list_models() -> list[str]:

def download_model_by_name(
self, model_name: str, vocoder_name: Optional[str] = None
) -> tuple[Optional[Path], Optional[Path], Optional[Path]]:
) -> tuple[Optional[Path], Optional[Path], Optional[Path], Optional[Path], Optional[Path]]:
model_path, config_path, model_item = self.manager.download_model(model_name)
if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, model_path
return None, None, None, None, model_path
if model_item.get("default_vocoder") is None:
return model_path, config_path, None
return model_path, config_path, None, None, None
if vocoder_name is None:
vocoder_name = model_item["default_vocoder"]
vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
# A local vocoder model will take precedence if specified via vocoder_path
if self.vocoder_path is None or self.vocoder_config_path is None:
self.vocoder_path = vocoder_path
self.vocoder_config_path = vocoder_config_path
return model_path, config_path, None
vocoder_path, vocoder_config_path = None, None
# A local vocoder model will take precedence if already specified in __init__
if model_item["model_type"] == "tts_models":
vocoder_path = self.vocoder_path
vocoder_config_path = self.vocoder_config_path
if vocoder_path is None or vocoder_config_path is None:
vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
return model_path, config_path, vocoder_path, vocoder_config_path, None

def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
"""Load one of the 🐸TTS models by name.
@@ -183,17 +185,24 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None
"""
self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)

def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None:
def load_vc_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
"""Load one of the voice conversion models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
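vocoder_name (str, optional): Name of the vocoder model to use. If None, the model's default vocoder (if any) is downloaded. Defaults to None.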
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, model_dir = self.download_model_by_name(model_name)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name, vocoder_name
)
self.voice_converter = Synthesizer(
vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu
vc_checkpoint=model_path,
vc_config=config_path,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
model_dir=model_dir,
use_cuda=gpu,
)

def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
@@ -208,7 +217,9 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] =
self.synthesizer = None
self.model_name = model_name

model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name, vocoder_name
)

# init synthesizer
# None values are fetched from the model
@@ -217,8 +228,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] =
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=self.vocoder_path,
vocoder_config=self.vocoder_config_path,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint=self.encoder_path,
encoder_config=self.encoder_config_path,
model_dir=model_dir,
@@ -377,7 +388,7 @@ def tts_to_file(
def voice_conversion(
self,
source_wav: str,
target_wav: str,
target_wav: Union[str, list[str]],
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
@@ -395,7 +406,7 @@ def voice_conversion_to_file(
def voice_conversion_to_file(
self,
source_wav: str,
target_wav: str,
target_wav: Union[str, list[str]],
file_path: str = "output.wav",
pipe_out=None,
) -> str:
@@ -418,8 +429,9 @@ def voice_conversion_to_file(
def tts_with_vc(
self,
text: str,
*,
language: Optional[str] = None,
speaker_wav: Optional[str] = None,
speaker_wav: Union[str, list[str]],
speaker: Optional[str] = None,
split_sentences: bool = True,
):
@@ -460,8 +472,9 @@ def tts_with_vc(
def tts_with_vc_to_file(
self,
text: str,
*,
language: Optional[str] = None,
speaker_wav: Optional[str] = None,
speaker_wav: Union[str, list[str]],
file_path: str = "output.wav",
speaker: Optional[str] = None,
split_sentences: bool = True,
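Taken together, the `TTS/api.py` changes above mean that `target_wav`/`speaker_wav` may be lists of files and that the arguments after `text` in `tts_with_vc()` and `tts_with_vc_to_file()` are keyword-only. A minimal sketch of the updated calling convention (model name and paths are illustrative):

```python
from TTS.api import TTS

tts = TTS("tts_models/de/thorsten/tacotron2-DDC")

# After this change, everything past `text` must be passed by keyword,
# and speaker_wav may be a single file or a list of reference clips.
tts.tts_with_vc_to_file(
    "Ich bin eine Testnachricht.",
    speaker_wav=["speaker_clip_1.wav", "speaker_clip_2.wav"],
    file_path="cloned_output.wav",
)
```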
5 changes: 3 additions & 2 deletions TTS/bin/synthesize.py
@@ -275,13 +275,14 @@ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace:
"--source_wav",
type=str,
default=None,
help="Original audio file to convert in the voice of the target_wav",
help="Original audio file to convert into the voice of the target_wav",
)
parser.add_argument(
"--target_wav",
type=str,
nargs="*",
default=None,
help="Target audio file to convert in the voice of the source_wav",
help="Audio file(s) of the target voice into which to convert the source_wav",
)

parser.add_argument(
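For the CLI, `nargs="*"` makes `--target_wav` collect any number of reference files into a list, mirroring the `Union[str, list[str]]` API change. A standalone argparse sketch (not the project's full parser) of the parsed result:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--target_wav",
    type=str,
    nargs="*",
    default=None,
    help="Audio file(s) of the target voice into which to convert the source_wav",
)

# Multiple reference clips parse into a list of paths.
args = parser.parse_args(["--target_wav", "ref1.wav", "ref2.wav"])
print(args.target_wav)  # ['ref1.wav', 'ref2.wav']
```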
4 changes: 4 additions & 0 deletions TTS/model.py
@@ -64,3 +64,7 @@ def load_checkpoint(
It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False.
"""
...

@property
def device(self) -> torch.device:
return next(self.parameters()).device
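The `device` property added to the base model replaces the identical per-model copies removed in the files below. A self-contained sketch of the pattern (class names here are illustrative, not from the codebase):

```python
import torch
from torch import nn


class BaseTTSModel(nn.Module):
    @property
    def device(self) -> torch.device:
        # Derive the device from the first parameter, so subclasses need no copy of this property.
        return next(self.parameters()).device


class TinyModel(BaseTTSModel):
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(8, 8)


model = TinyModel()
print(model.device)  # cpu, until the model is moved with model.to("cuda")
```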
4 changes: 0 additions & 4 deletions TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -197,10 +197,6 @@ def __init__(self, config: Coqpit):
mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate
)

@property
def device(self):
return next(self.parameters()).device

def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens):
"""
Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
4 changes: 0 additions & 4 deletions TTS/tts/models/bark.py
@@ -43,10 +43,6 @@ def __init__(
self.encodec = EncodecModel.encodec_model_24khz()
self.encodec.set_target_bandwidth(6.0)

@property
def device(self):
return next(self.parameters()).device

def load_bark_models(self):
self.semantic_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"
4 changes: 0 additions & 4 deletions TTS/tts/models/delightful_tts.py
@@ -438,10 +438,6 @@ def __init__(
periods=self.config.vocoder.periods_discriminator,
)

@property
def device(self):
return next(self.parameters()).device

@property
def energy_scaler(self):
return self.acoustic_model.energy_scaler
4 changes: 0 additions & 4 deletions TTS/tts/models/vits.py
@@ -565,10 +565,6 @@ def __init__(
use_spectral_norm=self.args.use_spectral_norm_disriminator,
)

@property
def device(self):
return next(self.parameters()).device

def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model.
4 changes: 0 additions & 4 deletions TTS/tts/models/xtts.py
@@ -239,10 +239,6 @@ def init_models(self):
cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer,
)

@property
def device(self):
return next(self.parameters()).device

@torch.inference_mode()
def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6):
"""Compute the conditioning latents for the GPT model from the given audio.
1 change: 1 addition & 0 deletions TTS/utils/generic_utils.py
@@ -31,6 +31,7 @@ def to_camel(text):
text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
text = text.replace("Tts", "TTS")
text = text.replace("vc", "VC")
text = text.replace("Knn", "KNN")
return text


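The new `Knn` → `KNN` replacement lets the snake_case name `knnvc_config` resolve to the `KNNVCConfig` class name. A quick sketch of the full helper; the leading `capitalize()` step is not visible in this hunk and is assumed from the surrounding code:

```python
import re


def to_camel(text: str) -> str:
    text = text.capitalize()  # assumed first step, outside the visible hunk
    text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
    text = text.replace("Tts", "TTS")
    text = text.replace("vc", "VC")
    text = text.replace("Knn", "KNN")
    return text


print(to_camel("knnvc_config"))  # KNNVCConfig
```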
13 changes: 10 additions & 3 deletions TTS/utils/manage.py
@@ -15,6 +15,7 @@
from typing_extensions import Required

from TTS.config import load_config, read_json_with_comments
from TTS.vc.configs.knnvc_config import KNNVCConfig

logger = logging.getLogger(__name__)

@@ -267,9 +268,9 @@ def set_model_url(model_item: ModelItem) -> ModelItem:
model_item["model_url"] = model_item["github_rls_url"]
elif "hf_url" in model_item:
model_item["model_url"] = model_item["hf_url"]
elif "fairseq" in model_item["model_name"]:
elif "fairseq" in model_item.get("model_name", ""):
model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/"
elif "xtts" in model_item["model_name"]:
elif "xtts" in model_item.get("model_name", ""):
model_item["model_url"] = "https://huggingface.co/coqui/"
return model_item

@@ -367,6 +368,9 @@ def create_dir_and_download_model(self, model_name: str, model_item: ModelItem,
logger.exception("Failed to download the model file to %s", output_path)
rmtree(output_path)
raise e
checkpoints = list(Path(output_path).glob("*.pt*"))
if len(checkpoints) == 1:
checkpoints[0].rename(checkpoints[0].parent / "model.pth")
self.print_model_license(model_item=model_item)

def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None:
@@ -431,11 +435,14 @@ def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelIt
output_model_path = output_path
output_config_path = None
if (
model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name
model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name
): # TODO:This is stupid but don't care for now.
output_model_path, output_config_path = self._find_files(output_path)
else:
output_config_path = output_model_path / "config.json"
if model == "knnvc" and not output_config_path.exists():
knnvc_config = KNNVCConfig()
knnvc_config.save_json(output_config_path)
# update paths in the config.json
self._update_paths(output_path, output_config_path)
return output_model_path, output_config_path, model_item
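Two behaviours added in `TTS/utils/manage.py` above, shown as a standalone sketch (the download directory is illustrative): a single extracted `*.pt*` checkpoint is renamed to `model.pth`, and kNN-VC, which ships without a `config.json`, gets a default `KNNVCConfig` serialized next to it.

```python
from pathlib import Path

from TTS.vc.configs.knnvc_config import KNNVCConfig

output_path = Path("downloads/knnvc_model")  # illustrative location
output_path.mkdir(parents=True, exist_ok=True)

# If exactly one *.pt* checkpoint was extracted, rename it to the canonical model.pth
# (the comment below reports this step breaking fairseq models that expect G_100000.pth).
checkpoints = list(output_path.glob("*.pt*"))
if len(checkpoints) == 1:
    checkpoints[0].rename(checkpoints[0].parent / "model.pth")

# kNN-VC has no bundled config, so a default one is written on first download.
config_path = output_path / "config.json"
if not config_path.exists():
    KNNVCConfig().save_json(str(config_path))
```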
(Diffs for the remaining changed files are not shown.)

1 comment on commit 205eed3

@DrewThomasson

This merge is causing issues with fairseq.

For example, when eng.tar.gz is downloaded and extracted, G_100000.pth is renamed to model.pth.

This causes issues, as you might expect, because the fairseq engine looks for the G_100000.pth file and can't find it.
