Merge pull request #256 from idiap/knnvc
Add kNN-VC
eginhard authored Jan 15, 2025
2 parents 309c31c + 240aae4 commit 205eed3
Showing 32 changed files with 534 additions and 129 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -98,6 +98,7 @@ repository are also still a useful source of information.

### Voice Conversion
- [FreeVC](https://arxiv.org/abs/2210.15418)
- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419)
- [OpenVoice](https://arxiv.org/abs/2312.01479)

### Others
@@ -234,7 +235,7 @@ tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)

#### Voice conversion (VC)

Converting the voice in `source_wav` to the voice of `target_wav`
Converting the voice in `source_wav` to the voice of `target_wav`:

```python
tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda")
@@ -246,9 +247,13 @@ tts.voice_conversion_to_file(
```

Other available voice conversion models:
- `voice_conversion_models/multilingual/multi-dataset/knnvc`
- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1`
- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2`

For more details, see the
[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html).
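A minimal sketch (paths are illustrative) of how the new kNN-VC entry could be used with the same API as the FreeVC example above; passing a list of target clips relies on the `target_wav: Union[str, list[str]]` change in `TTS/api.py` further down:

```python
from TTS.api import TTS

tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc").to("cuda")
tts.voice_conversion_to_file(
    source_wav="my/source.wav",
    # target_wav accepts a single path or a list of reference clips of the target voice.
    target_wav=["my/target_1.wav", "my/target_2.wav"],
    file_path="output.wav",
)
```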

#### Voice cloning by combining single speaker TTS model with the default VC model

This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is
26 changes: 26 additions & 0 deletions TTS/.models.json
@@ -787,6 +787,22 @@
"license": "apache 2.0"
}
},
"librispeech100": {
"wavlm-hifigan": {
"description": "HiFiGAN vocoder for WavLM features from kNN-VC",
"github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip",
"commit": "cfba7e0",
"author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
"license": "MIT"
},
"wavlm-hifigan_prematched": {
"description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC",
"github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip",
"commit": "cfba7e0",
"author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
"license": "MIT"
}
},
"ljspeech": {
"multiband-melgan": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
@@ -927,18 +943,27 @@
"freevc24": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"default_vocoder": null,
"author": "Jing-Yi Li @OlaWod",
"license": "MIT",
"commit": null
}
},
"multi-dataset": {
"knnvc": {
"description": "kNN-VC model from https://github.com/bshall/knn-vc",
"default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched",
"author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
"license": "MIT",
"commit": null
},
"openvoice_v1": {
"hf_url": [
"https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json",
"https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth"
],
"description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
"default_vocoder": null,
"author": "MyShell.ai",
"license": "MIT",
"commit": null
@@ -949,6 +974,7 @@
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth"
],
"description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
"default_vocoder": null,
"author": "MyShell.ai",
"license": "MIT",
"commit": null
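To check that the registry entries above are picked up, the public API can list them; this sketch assumes `TTS.list_models()` is callable as a static method, as its signature in `TTS/api.py` below suggests:

```python
from TTS.api import TTS

# Print the newly registered kNN-VC model and its WavLM HiFiGAN vocoders.
for name in TTS.list_models():
    if "knnvc" in name or "wavlm-hifigan" in name:
        print(name)
```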
59 changes: 36 additions & 23 deletions TTS/api.py
@@ -4,7 +4,7 @@
import tempfile
import warnings
from pathlib import Path
from typing import Optional
from typing import Optional, Union

from torch import nn

@@ -77,8 +77,8 @@ def __init__(
super().__init__()
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar)
self.config = load_config(config_path) if config_path else None
self.synthesizer = None
self.voice_converter = None
self.synthesizer: Optional[Synthesizer] = None
self.voice_converter: Optional[Synthesizer] = None
self.model_name = ""

self.vocoder_path = vocoder_path
@@ -95,7 +95,7 @@ def __init__(
if "tts_models" in model_name:
self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu=gpu)
self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu)
# To allow just TTS("xtts")
else:
self.load_model_by_name(model_name, vocoder_name, gpu=gpu)
@@ -157,22 +157,24 @@ def list_models() -> list[str]:

def download_model_by_name(
self, model_name: str, vocoder_name: Optional[str] = None
) -> tuple[Optional[Path], Optional[Path], Optional[Path]]:
) -> tuple[Optional[Path], Optional[Path], Optional[Path], Optional[Path], Optional[Path]]:
model_path, config_path, model_item = self.manager.download_model(model_name)
if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, model_path
return None, None, None, None, model_path
if model_item.get("default_vocoder") is None:
return model_path, config_path, None
return model_path, config_path, None, None, None
if vocoder_name is None:
vocoder_name = model_item["default_vocoder"]
vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
# A local vocoder model will take precedence if specified via vocoder_path
if self.vocoder_path is None or self.vocoder_config_path is None:
self.vocoder_path = vocoder_path
self.vocoder_config_path = vocoder_config_path
return model_path, config_path, None
vocoder_path, vocoder_config_path = None, None
# A local vocoder model will take precedence if already specified in __init__
if model_item["model_type"] == "tts_models":
vocoder_path = self.vocoder_path
vocoder_config_path = self.vocoder_config_path
if vocoder_path is None or vocoder_config_path is None:
vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
return model_path, config_path, vocoder_path, vocoder_config_path, None

def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
"""Load one of the 🐸TTS models by name.
@@ -183,17 +185,24 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None
"""
self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)

def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None:
def load_vc_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
"""Load one of the voice conversion models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
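vocoder_name (str, optional): Name of the vocoder model to use. If None, the model's default vocoder (if any) is downloaded. Defaults to None.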
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, model_dir = self.download_model_by_name(model_name)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name, vocoder_name
)
self.voice_converter = Synthesizer(
vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu
vc_checkpoint=model_path,
vc_config=config_path,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
model_dir=model_dir,
use_cuda=gpu,
)

def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
@@ -208,7 +217,9 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] =
self.synthesizer = None
self.model_name = model_name

model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name, vocoder_name
)

# init synthesizer
# None values are fetched from the model
@@ -217,8 +228,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] =
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=self.vocoder_path,
vocoder_config=self.vocoder_config_path,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint=self.encoder_path,
encoder_config=self.encoder_config_path,
model_dir=model_dir,
@@ -377,7 +388,7 @@ def tts_to_file(
def voice_conversion(
self,
source_wav: str,
target_wav: str,
target_wav: Union[str, list[str]],
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
@@ -395,7 +406,7 @@ def voice_conversion_to_file(
def voice_conversion_to_file(
self,
source_wav: str,
target_wav: str,
target_wav: Union[str, list[str]],
file_path: str = "output.wav",
pipe_out=None,
) -> str:
@@ -418,8 +429,9 @@ def voice_conversion_to_file(
def tts_with_vc(
self,
text: str,
*,
language: Optional[str] = None,
speaker_wav: Optional[str] = None,
speaker_wav: Union[str, list[str]],
speaker: Optional[str] = None,
split_sentences: bool = True,
):
@@ -460,8 +472,9 @@ def tts_with_vc(
def tts_with_vc_to_file(
self,
text: str,
*,
language: Optional[str] = None,
speaker_wav: Optional[str] = None,
speaker_wav: Union[str, list[str]],
file_path: str = "output.wav",
speaker: Optional[str] = None,
split_sentences: bool = True,
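Taken together, the `TTS/api.py` changes above mean that `target_wav`/`speaker_wav` may be lists of files and that the arguments after `text` in `tts_with_vc()` and `tts_with_vc_to_file()` are keyword-only. A minimal sketch of the updated calling convention (model name and paths are illustrative):

```python
from TTS.api import TTS

tts = TTS("tts_models/de/thorsten/tacotron2-DDC")

# After this change, everything past `text` must be passed by keyword,
# and speaker_wav may be a single file or a list of reference clips.
tts.tts_with_vc_to_file(
    "Ich bin eine Testnachricht.",
    speaker_wav=["speaker_clip_1.wav", "speaker_clip_2.wav"],
    file_path="cloned_output.wav",
)
```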
5 changes: 3 additions & 2 deletions TTS/bin/synthesize.py
@@ -275,13 +275,14 @@ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace:
"--source_wav",
type=str,
default=None,
help="Original audio file to convert in the voice of the target_wav",
help="Original audio file to convert into the voice of the target_wav",
)
parser.add_argument(
"--target_wav",
type=str,
nargs="*",
default=None,
help="Target audio file to convert in the voice of the source_wav",
help="Audio file(s) of the target voice into which to convert the source_wav",
)

parser.add_argument(
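For the CLI, `nargs="*"` makes `--target_wav` collect any number of reference files into a list, mirroring the `Union[str, list[str]]` API change. A standalone argparse sketch (not the project's full parser) of the parsed result:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--target_wav",
    type=str,
    nargs="*",
    default=None,
    help="Audio file(s) of the target voice into which to convert the source_wav",
)

# Multiple reference clips parse into a list of paths.
args = parser.parse_args(["--target_wav", "ref1.wav", "ref2.wav"])
print(args.target_wav)  # ['ref1.wav', 'ref2.wav']
```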
4 changes: 4 additions & 0 deletions TTS/model.py
@@ -64,3 +64,7 @@ def load_checkpoint(
It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False.
"""
...

@property
def device(self) -> torch.device:
return next(self.parameters()).device
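The `device` property added to the base model replaces the identical per-model copies removed in the files below. A self-contained sketch of the pattern (class names here are illustrative, not from the codebase):

```python
import torch
from torch import nn


class BaseTTSModel(nn.Module):
    @property
    def device(self) -> torch.device:
        # Derive the device from the first parameter, so subclasses need no copy of this property.
        return next(self.parameters()).device


class TinyModel(BaseTTSModel):
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(8, 8)


model = TinyModel()
print(model.device)  # cpu, until the model is moved with model.to("cuda")
```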
4 changes: 0 additions & 4 deletions TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -197,10 +197,6 @@ def __init__(self, config: Coqpit):
mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate
)

@property
def device(self):
return next(self.parameters()).device

def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens):
"""
Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
4 changes: 0 additions & 4 deletions TTS/tts/models/bark.py
@@ -43,10 +43,6 @@ def __init__(
self.encodec = EncodecModel.encodec_model_24khz()
self.encodec.set_target_bandwidth(6.0)

@property
def device(self):
return next(self.parameters()).device

def load_bark_models(self):
self.semantic_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"
4 changes: 0 additions & 4 deletions TTS/tts/models/delightful_tts.py
@@ -438,10 +438,6 @@ def __init__(
periods=self.config.vocoder.periods_discriminator,
)

@property
def device(self):
return next(self.parameters()).device

@property
def energy_scaler(self):
return self.acoustic_model.energy_scaler
4 changes: 0 additions & 4 deletions TTS/tts/models/vits.py
@@ -565,10 +565,6 @@ def __init__(
use_spectral_norm=self.args.use_spectral_norm_disriminator,
)

@property
def device(self):
return next(self.parameters()).device

def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model.
4 changes: 0 additions & 4 deletions TTS/tts/models/xtts.py
@@ -239,10 +239,6 @@ def init_models(self):
cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer,
)

@property
def device(self):
return next(self.parameters()).device

@torch.inference_mode()
def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6):
"""Compute the conditioning latents for the GPT model from the given audio.
1 change: 1 addition & 0 deletions TTS/utils/generic_utils.py
@@ -31,6 +31,7 @@ def to_camel(text):
text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
text = text.replace("Tts", "TTS")
text = text.replace("vc", "VC")
text = text.replace("Knn", "KNN")
return text


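The new `Knn` → `KNN` replacement lets the snake_case name `knnvc_config` resolve to the `KNNVCConfig` class name. A quick sketch of the full helper; the leading `capitalize()` step is not visible in this hunk and is assumed from the surrounding code:

```python
import re


def to_camel(text: str) -> str:
    text = text.capitalize()  # assumed first step, outside the visible hunk
    text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
    text = text.replace("Tts", "TTS")
    text = text.replace("vc", "VC")
    text = text.replace("Knn", "KNN")
    return text


print(to_camel("knnvc_config"))  # KNNVCConfig
```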
13 changes: 10 additions & 3 deletions TTS/utils/manage.py
@@ -15,6 +15,7 @@
from typing_extensions import Required

from TTS.config import load_config, read_json_with_comments
from TTS.vc.configs.knnvc_config import KNNVCConfig

logger = logging.getLogger(__name__)

@@ -267,9 +268,9 @@ def set_model_url(model_item: ModelItem) -> ModelItem:
model_item["model_url"] = model_item["github_rls_url"]
elif "hf_url" in model_item:
model_item["model_url"] = model_item["hf_url"]
elif "fairseq" in model_item["model_name"]:
elif "fairseq" in model_item.get("model_name", ""):
model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/"
elif "xtts" in model_item["model_name"]:
elif "xtts" in model_item.get("model_name", ""):
model_item["model_url"] = "https://huggingface.co/coqui/"
return model_item

@@ -367,6 +368,9 @@ def create_dir_and_download_model(self, model_name: str, model_item: ModelItem,
logger.exception("Failed to download the model file to %s", output_path)
rmtree(output_path)
raise e
checkpoints = list(Path(output_path).glob("*.pt*"))
if len(checkpoints) == 1:
checkpoints[0].rename(checkpoints[0].parent / "model.pth")
self.print_model_license(model_item=model_item)

def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None:
@@ -431,11 +435,14 @@ def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelIt
output_model_path = output_path
output_config_path = None
if (
model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name
model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name
): # TODO:This is stupid but don't care for now.
output_model_path, output_config_path = self._find_files(output_path)
else:
output_config_path = output_model_path / "config.json"
if model == "knnvc" and not output_config_path.exists():
knnvc_config = KNNVCConfig()
knnvc_config.save_json(output_config_path)
# update paths in the config.json
self._update_paths(output_path, output_config_path)
return output_model_path, output_config_path, model_item
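Two behaviours added in `TTS/utils/manage.py` above, shown as a standalone sketch (the download directory is illustrative): a single extracted `*.pt*` checkpoint is renamed to `model.pth`, and kNN-VC, which ships without a `config.json`, gets a default `KNNVCConfig` serialized next to it.

```python
from pathlib import Path

from TTS.vc.configs.knnvc_config import KNNVCConfig

output_path = Path("downloads/knnvc_model")  # illustrative location
output_path.mkdir(parents=True, exist_ok=True)

# If exactly one *.pt* checkpoint was extracted, rename it to the canonical model.pth
# (the comment below reports this step breaking fairseq models that expect G_100000.pth).
checkpoints = list(output_path.glob("*.pt*"))
if len(checkpoints) == 1:
    checkpoints[0].rename(checkpoints[0].parent / "model.pth")

# kNN-VC has no bundled config, so a default one is written on first download.
config_path = output_path / "config.json"
if not config_path.exists():
    KNNVCConfig().save_json(str(config_path))
```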
(Diffs for the remaining changed files are not shown.)

1 comment on commit 205eed3

@DrewThomasson

This merge is causing issues with fairseq.

For example, when eng.tar.gz is downloaded and extracted, G_100000.pth is renamed to model.pth.

This causes issues, as you might expect, because the fairseq engine looks for the G_100000.pth file and can't find it.
