diff --git a/README.md b/README.md
index 71b29ae2c7..80c84b5a5e 100644
--- a/README.md
+++ b/README.md
@@ -98,6 +98,7 @@ repository are also still a useful source of information.
 
 ### Voice Conversion
 - [FreeVC](https://arxiv.org/abs/2210.15418)
+- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419)
 - [OpenVoice](https://arxiv.org/abs/2312.01479)
 
 ### Others
diff --git a/TTS/.models.json b/TTS/.models.json
index 36654d0555..05c88bef43 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -787,6 +787,22 @@
                     "license": "apache 2.0"
                 }
             },
+            "librispeech100": {
+                "wavlm-hifigan": {
+                    "description": "HiFiGAN vocoder for WavLM features from kNN-VC",
+                    "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip",
+                    "commit": "cfba7e0",
+                    "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
+                    "license": "MIT"
+                },
+                "wavlm-hifigan_prematched": {
+                    "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC",
+                    "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip",
+                    "commit": "cfba7e0",
+                    "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
+                    "license": "MIT"
+                }
+            },
             "ljspeech": {
                 "multiband-melgan": {
                     "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
@@ -927,18 +943,27 @@
                 "freevc24": {
                     "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
                     "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
+                    "default_vocoder": null,
                     "author": "Jing-Yi Li @OlaWod",
                     "license": "MIT",
                     "commit": null
                 }
             },
             "multi-dataset": {
+                "knnvc": {
+                    "description": "kNN-VC model from https://github.com/bshall/knn-vc",
+                    "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched",
+                    "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
+                    "license": "MIT",
+                    "commit": null
+                },
                 "openvoice_v1": {
                     "hf_url": [
                         "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json",
                         "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth"
                     ],
                     "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
+                    "default_vocoder": null,
                     "author": "MyShell.ai",
                     "license": "MIT",
                     "commit": null
@@ -949,6 +974,7 @@
                         "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth"
                     ],
                     "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
+                    "default_vocoder": null,
                     "author": "MyShell.ai",
                     "license": "MIT",
                     "commit": null
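Aside (not part of the patch): given the nesting shown above, the new registry entries should resolve to the model ids `voice_conversion_models/multilingual/multi-dataset/knnvc` and `vocoder_models/en/librispeech100/wavlm-hifigan_prematched` (the latter string is also used verbatim as kNN-VC's `default_vocoder`). A minimal sketch of fetching them through the existing `ModelManager`, assuming those ids:

```python
from TTS.utils.manage import ModelManager

manager = ModelManager()

# kNN-VC itself ships no checkpoint; per the download_model() change below, a default
# KNNVCConfig is written out when the downloaded folder contains no config.json.
model_path, config_path, model_item = manager.download_model(
    "voice_conversion_models/multilingual/multi-dataset/knnvc"
)

# The registry points the model at the prematched WavLM HiFiGAN vocoder.
vocoder_path, vocoder_config_path, _ = manager.download_model(
    "vocoder_models/en/librispeech100/wavlm-hifigan_prematched"
)
```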
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py
index 54bb5ba825..77566c3f6a 100644
--- a/TTS/utils/generic_utils.py
+++ b/TTS/utils/generic_utils.py
@@ -31,6 +31,7 @@ def to_camel(text):
     text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
     text = text.replace("Tts", "TTS")
     text = text.replace("vc", "VC")
+    text = text.replace("Knn", "KNN")
     return text
 
 
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index d7d4deab9d..5dff1b84c8 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -15,6 +15,7 @@
 from typing_extensions import Required
 
 from TTS.config import load_config, read_json_with_comments
+from TTS.vc.configs.knnvc_config import KNNVCConfig
 
 logger = logging.getLogger(__name__)
 
@@ -267,9 +268,9 @@ def set_model_url(model_item: ModelItem) -> ModelItem:
        model_item["model_url"] = model_item["github_rls_url"]
    elif "hf_url" in model_item:
        model_item["model_url"] = model_item["hf_url"]
-    elif "fairseq" in model_item["model_name"]:
+    elif "fairseq" in model_item.get("model_name", ""):
        model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/"
-    elif "xtts" in model_item["model_name"]:
+    elif "xtts" in model_item.get("model_name", ""):
        model_item["model_url"] = "https://huggingface.co/coqui/"
    return model_item
 
@@ -367,6 +368,9 @@ def create_dir_and_download_model(self, model_name: str, model_item: ModelItem,
            logger.exception("Failed to download the model file to %s", output_path)
            rmtree(output_path)
            raise e
+        checkpoints = list(Path(output_path).glob("*.pt*"))
+        if len(checkpoints) == 1:
+            checkpoints[0].rename(checkpoints[0].parent / "model.pth")
        self.print_model_license(model_item=model_item)
 
    def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None:
@@ -431,11 +435,14 @@ def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelIt
        output_model_path = output_path
        output_config_path = None
        if (
-            model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name
+            model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name
        ):  # TODO:This is stupid but don't care for now.
            output_model_path, output_config_path = self._find_files(output_path)
        else:
            output_config_path = output_model_path / "config.json"
+            if model == "knnvc" and not output_config_path.exists():
+                knnvc_config = KNNVCConfig()
+                knnvc_config.save_json(output_config_path)
        # update paths in the config.json
        self._update_paths(output_path, output_config_path)
        return output_model_path, output_config_path, model_item
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 517cb7d2b2..0c445c7088 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -139,7 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N
         """
         # pylint: disable=global-statement
         self.vc_config = load_config(vc_config_path)
-        self.output_sample_rate = self.vc_config.audio["output_sample_rate"]
+        self.output_sample_rate = self.vc_config.audio.get(
+            "output_sample_rate", self.vc_config.audio.get("sample_rate", None)
+        )
         self.vc_model = setup_vc_model(config=self.vc_config)
         self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint)
         if use_cuda:
diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py
new file mode 100644
index 0000000000..7728ea0a9b
--- /dev/null
+++ b/TTS/vc/configs/knnvc_config.py
@@ -0,0 +1,59 @@
+from dataclasses import dataclass, field
+
+from coqpit import Coqpit
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.vc.configs.shared_configs import BaseVCConfig
+
+
+@dataclass
+class KNNVCAudioConfig(BaseAudioConfig):
+    """Audio configuration.
+
+    Args:
+        sample_rate (int):
+            The sampling rate of the input waveform.
+    """
+
+    sample_rate: int = field(default=16000)
+
+
+@dataclass
+class KNNVCArgs(Coqpit):
+    """Model arguments.
+
+    Args:
+        ssl_dim (int):
+            The dimension of the self-supervised learning embedding.
+    """
+
+    ssl_dim: int = field(default=1024)
+
+
+@dataclass
+class KNNVCConfig(BaseVCConfig):
+    """Parameters.
+
+    Args:
+        model (str):
+            Model name. Do not change unless you know what you are doing.
+
+        model_args (KNNVCArgs):
+            Model architecture arguments. Defaults to `KNNVCArgs()`.
+
+        audio (KNNVCAudioConfig):
+            Audio processing configuration. Defaults to `KNNVCAudioConfig()`.
+
+        wavlm_layer (int):
+            WavLM layer to use for feature extraction.
+
+        topk (int):
+            k in the kNN -- the number of nearest neighbors to average over
+    """
+
+    model: str = "knnvc"
+    model_args: KNNVCArgs = field(default_factory=KNNVCArgs)
+    audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig)
+
+    wavlm_layer: int = 6
+    topk: int = 4
diff --git a/TTS/vc/layers/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py
index 62f7e74aaf..d9c3858f89 100644
--- a/TTS/vc/layers/freevc/wavlm/__init__.py
+++ b/TTS/vc/layers/freevc/wavlm/__init__.py
@@ -13,7 +13,7 @@
 model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt"
 
 
-def get_wavlm(device="cpu"):
+def get_wavlm(device="cpu") -> WavLM:
     """Download the model and return the model object."""
     output_path = get_user_data_dir("tts")
diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py
index 41677363c7..8151a0445e 100644
--- a/TTS/vc/models/__init__.py
+++ b/TTS/vc/models/__init__.py
@@ -1,7 +1,10 @@
 import importlib
 import logging
 import re
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
+
+from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.vc.models.base_vc import BaseVC
 
 logger = logging.getLogger(__name__)
 
@@ -9,7 +12,11 @@ def setup_model(config: BaseVCConfig) -> BaseVC:
     logger.info("Using model: %s", config.model)
     # fetch the right model implementation.
-    if "model" in config and config["model"].lower() == "freevc":
+    if config["model"].lower() == "freevc":
         MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC
-    model = MyModel.init_from_config(config)
-    return model
+    elif config["model"].lower() == "knnvc":
+        MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC
+    else:
+        msg = f"Model {config.model} does not exist!"
+        raise ValueError(msg)
+    return MyModel.init_from_config(config)
diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py
new file mode 100644
index 0000000000..2fb069fc86
--- /dev/null
+++ b/TTS/vc/models/knnvc.py
@@ -0,0 +1,182 @@
+import logging
+import os
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import torchaudio
+from coqpit import Coqpit
+from typing_extensions import TypeAlias
+
+from TTS.vc.configs.knnvc_config import KNNVCConfig
+from TTS.vc.layers.freevc.wavlm import get_wavlm
+from TTS.vc.models.base_vc import BaseVC
+
+logger = logging.getLogger(__name__)
+
+PathOrTensor: TypeAlias = Union[str, os.PathLike[Any], torch.Tensor]
+
+
+class KNNVC(BaseVC):
+    """
+    Paper::
+        https://arxiv.org/abs/2305.18975
+
+    Paper Abstract::
+        Any-to-any voice conversion aims to transform source speech
+        into a target voice with just a few examples of the target speaker as a
+        reference. Recent methods produce convincing conversions, but at the cost of
+        increased complexity -- making results difficult to reproduce and build on.
+        Instead, we keep it simple. We propose k-nearest neighbors voice conversion
+        (kNN-VC): a straightforward yet effective method for any-to-any conversion.
+        First, we extract self-supervised representations of the source and reference
+        speech. To convert to the target speaker, we replace each frame of the source
+        representation with its nearest neighbor in the reference. Finally, a pretrained
+        vocoder synthesizes audio from the converted representation. Objective and
+        subjective evaluations show that kNN-VC improves speaker similarity with similar
+        intelligibility scores to existing methods.
+
+    Samples::
+        https://bshall.github.io/knn-vc
+
+    Original code::
+        https://github.com/bshall/knn-vc
+
+    Examples:
+        >>> from TTS.vc.configs.knnvc_config import KNNVCConfig
+        >>> from TTS.vc.models.knnvc import KNNVC
+        >>> config = KNNVCConfig()
+        >>> model = KNNVC(config)
+    """
+
+    def __init__(self, config: Coqpit):
+        super().__init__(config)
+        self.ssl_dim = self.args.ssl_dim
+        self.wavlm = get_wavlm()
+
+    @staticmethod
+    def init_from_config(config: KNNVCConfig) -> "KNNVC":
+        return KNNVC(config)
+
+    @torch.inference_mode()
+    def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor:
+        """Return features for the given waveform with output shape (seq_len, dim).
+
+        Optionally perform VAD trimming on start/end with `vad_trigger_level`.
+        """
+        # load audio
+        if isinstance(audio, torch.Tensor):
+            x: torch.Tensor = audio
+            sr = self.config.audio.sample_rate
+            if x.dim() == 1:
+                x = x[None]
+        else:
+            x, sr = torchaudio.load(audio, normalize=True)
+
+        if not sr == self.config.audio.sample_rate:
+            logger.info(f"Resampling {sr} to {self.config.audio.sample_rate} in {audio}")
+            x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate)
+            sr = self.config.audio.sample_rate
+
+        # trim silence from front and back
+        if vad_trigger_level > 1e-3:
+            transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level)
+            x_front_trim = transform(x)
+            waveform_reversed = torch.flip(x_front_trim, (-1,))
+            waveform_reversed_front_trim = transform(waveform_reversed)
+            x = torch.flip(waveform_reversed_front_trim, (-1,))
+
+        # extract the representation of each layer
+        wav_input_16khz = x.to(self.device)
+        features = self.wavlm.extract_features(
+            wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False
+        )[0]
+        return features.squeeze(0)
+
+    def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor:
+        """Get concatenated wavlm features for the matching set using all waveforms in `wavs`.
+
+        Wavs are specified as either a list of paths or list of loaded waveform tensors of
+        shape (channels, T), assumed to be of 16kHz sample rate.
+        """
+        feats = []
+        for p in wavs:
+            feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level))
+
+        feats = torch.concat(feats, dim=0).cpu()
+        return feats
+
+    @staticmethod
+    def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor:
+        """Like torch.cdist, but fixed dim=-1 and for cosine distance."""
+        source_norms = torch.norm(source_feats, p=2, dim=-1)
+        matching_norms = torch.norm(matching_pool, p=2, dim=-1)
+        dotprod = (
+            -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2)
+            + source_norms[:, None] ** 2
+            + matching_norms[None] ** 2
+        )
+        dotprod /= 2
+
+        dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None]))
+        return dists
+
+    @torch.inference_mode()
+    def match(
+        self,
+        query_seq: torch.Tensor,
+        matching_set: torch.Tensor,
+        synth_set: Optional[torch.Tensor] = None,
+        topk: Optional[int] = None,
+        target_duration: Optional[float] = None,
+    ) -> torch.Tensor:
+        """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching
+        with k=`topk`.
+
+        Args:
+            `query_seq`: Tensor (N1, dim) of the input/source query features.
+            `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm.
+            `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. We use the matching set to assign
+                each query vector to a vector in the matching set, and then use the corresponding vector from
+                the synth set during HiFiGAN synthesis.
+                By default, and for best performance, this should be identical to the matching set.
+            `topk`: k in the kNN -- the number of nearest neighbors to average over.
+            `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds.
+
+        Returns:
+            - converted features (1, N, dim)
+        """
+        if topk is None:
+            topk = self.config.topk
+        synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device)
+        matching_set = matching_set.to(self.device)
+        query_seq = query_seq.to(self.device)
+
+        if target_duration is not None:
+            target_samples = int(target_duration * self.config.audio.sample_rate)
+            scale_factor = (target_samples / self.hop_length) / query_seq.shape[0]  # n_targ_feats / n_input_feats
+            query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T
+
+        dists = self.fast_cosine_dist(query_seq, matching_set)
+        best = dists.topk(k=topk, largest=False, dim=-1)
+        out_feats = synth_set[best.indices].mean(dim=1)
+        return out_feats.unsqueeze(0)
+
+    def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: Union[str, os.PathLike[Any]]) -> None:
+        """kNN-VC does not use checkpoints."""
+
+    def forward(self) -> None: ...
+    def inference(self) -> None: ...
+
+    @torch.inference_mode()
+    def voice_conversion(
+        self,
+        source: PathOrTensor,
+        target: Union[PathOrTensor, list[PathOrTensor]],
+        topk: Optional[int] = None,
+    ) -> torch.Tensor:
+        if not isinstance(target, list):
+            target = [target]
+        source_features = self.get_features(source)
+        matching_set = self.get_matching_set(target)
+        return self.match(source_features, matching_set, topk=topk)
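Aside (not part of the patch): `fast_cosine_dist` above recovers pairwise dot products from `torch.cdist` via the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2*(a.b), then converts them to cosine distances 1 - a.b / (||a|| ||b||). A small self-contained check of that equivalence, illustrative only (shapes made up):

```python
import torch
import torch.nn.functional as F


def naive_cosine_dist(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Direct (N1, N2) cosine distance matrix for comparison.
    return 1 - F.normalize(a, dim=-1) @ F.normalize(b, dim=-1).T


def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor:
    # Same computation as KNNVC.fast_cosine_dist in the patch.
    source_norms = torch.norm(source_feats, p=2, dim=-1)
    matching_norms = torch.norm(matching_pool, p=2, dim=-1)
    dotprod = (
        -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2)
        + source_norms[:, None] ** 2
        + matching_norms[None] ** 2
    ) / 2
    return 1 - dotprod / (source_norms[:, None] * matching_norms[None])


query, pool = torch.randn(5, 1024), torch.randn(7, 1024)
assert torch.allclose(fast_cosine_dist(query, pool), naive_cosine_dist(query, pool), atol=1e-4)
```

At inference time, `match()` then simply averages the `topk` closest matching-set frames for each query frame before handing the result to the vocoder.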
diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py
index 9a102f0c89..60dde496b2 100644
--- a/TTS/vocoder/configs/hifigan_config.py
+++ b/TTS/vocoder/configs/hifigan_config.py
@@ -5,7 +5,7 @@
 
 @dataclass
 class HifiganConfig(BaseGANVocoderConfig):
-    """Defines parameters for FullBand MelGAN vocoder.
+    """Defines parameters for HifiGAN vocoder.
 
     Example:
 
diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py
index 6a7efe3977..481d234a54 100644
--- a/TTS/vocoder/models/__init__.py
+++ b/TTS/vocoder/models/__init__.py
@@ -40,7 +40,8 @@ def setup_generator(c: BaseGANVocoderConfig):
     MyModel = getattr(MyModel, to_camel(c.generator_model))
     # this is to preserve the Wavernn class name (instead of Wavernn)
     if c.generator_model.lower() in "hifigan_generator":
-        model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params)
+        c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"])
+        model = MyModel(out_channels=1, **c.generator_model_params)
     elif c.generator_model.lower() in "melgan_generator":
         model = MyModel(
             in_channels=c.audio["num_mels"],
@@ -106,7 +107,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
     MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower()))
     if c.discriminator_model in "hifigan_discriminator":
         model = MyModel()
-    if c.discriminator_model in "random_window_discriminator":
+    elif c.discriminator_model in "random_window_discriminator":
         model = MyModel(
             cond_channels=c.audio["num_mels"],
             hop_length=c.audio["hop_length"],
@@ -115,7 +116,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"],
             window_sizes=c.discriminator_model_params["window_sizes"],
         )
-    if c.discriminator_model in "melgan_multiscale_discriminator":
+    elif c.discriminator_model in "melgan_multiscale_discriminator":
         model = MyModel(
             in_channels=1,
             out_channels=1,
@@ -124,7 +125,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             max_channels=c.discriminator_model_params["max_channels"],
             downsample_factors=c.discriminator_model_params["downsample_factors"],
         )
-    if c.discriminator_model == "residual_parallel_wavegan_discriminator":
+    elif c.discriminator_model == "residual_parallel_wavegan_discriminator":
         model = MyModel(
             in_channels=1,
             out_channels=1,
@@ -139,7 +140,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             nonlinear_activation="LeakyReLU",
             nonlinear_activation_params={"negative_slope": 0.2},
         )
-    if c.discriminator_model == "parallel_wavegan_discriminator":
+    elif c.discriminator_model == "parallel_wavegan_discriminator":
         model = MyModel(
             in_channels=1,
             out_channels=1,
@@ -151,6 +152,8 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             nonlinear_activation_params={"negative_slope": 0.2},
             bias=True,
         )
-    if c.discriminator_model == "univnet_discriminator":
+    elif c.discriminator_model == "univnet_discriminator":
         model = MyModel()
+    else:
+        raise NotImplementedError(f"Model {c.discriminator_model} not implemented!")
     return model
diff --git a/hubconf.py b/hubconf.py
index 6e10928265..b49c9d6bcc 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,4 +1,14 @@
-dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"]
+dependencies = [
+    "torch",
+    "gdown",
+    "pysbd",
+    "gruut",
+    "anyascii",
+    "pypinyin",
+    "coqpit-config",
+    "mecab-python3",
+    "unidic-lite",
+]
 import torch
 
 from TTS.utils.manage import ModelManager
@@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us
 
 
 if __name__ == "__main__":
-    synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github")
+    synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github")
     synthesizer.tts("This is a test!")
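Aside (not part of the patch): the `setup_generator` change above lets a HiFiGAN vocoder declare its own `in_channels` instead of always inheriting `num_mels`; the WavLM-feature vocoders registered in `.models.json` need 1024 input channels (cf. `ssl_dim` in `KNNVCArgs`). A hypothetical sketch, assuming the remaining generator parameters can stay at `HifiganConfig`'s defaults:

```python
from TTS.vocoder.configs.hifigan_config import HifiganConfig
from TTS.vocoder.models import setup_generator

config = HifiganConfig()
# Hypothetical: drive the generator with 1024-dim WavLM features instead of mel frames.
# Before this change, in_channels was always overwritten with c.audio["num_mels"].
config.generator_model_params["in_channels"] = 1024
generator = setup_generator(config)
```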
diff --git a/pyproject.toml b/pyproject.toml
index 71db481ac4..bdeb5d761c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,6 +70,7 @@ dependencies = [
     "pyyaml>=6.0",
     "fsspec[http]>=2023.6.0",
     "packaging>=23.1",
+    "typing_extensions>=4.10",
     # Inference
     "pysbd>=0.3.4",
     # Training
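End-to-end usage sketch (not part of the patch): the model id below follows the registry nesting added above, and the call assumes the existing `TTS` Python API for voice conversion (as already used for FreeVC):

```python
from TTS.api import TTS

# Downloads WavLM plus the registered default vocoder; kNN-VC itself is checkpoint-free.
tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc")
tts.voice_conversion_to_file(
    source_wav="source.wav",    # speech whose content is kept
    target_wav="target.wav",    # reference speech of the target speaker
    file_path="converted.wav",
)
```

When working with the model directly, `KNNVC.voice_conversion()` also accepts a list of reference wavs for the matching set, per its signature above.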