diff --git a/README.md b/README.md
index 71b29ae2c7..80c84b5a5e 100644
--- a/README.md
+++ b/README.md
@@ -98,6 +98,7 @@ repository are also still a useful source of information.
 
 ### Voice Conversion
 - [FreeVC](https://arxiv.org/abs/2210.15418)
+- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419)
 - [OpenVoice](https://arxiv.org/abs/2312.01479)
 
 ### Others
diff --git a/TTS/.models.json b/TTS/.models.json
index 36654d0555..05c88bef43 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -787,6 +787,22 @@
                     "license": "apache 2.0"
                 }
             },
+            "librispeech100": {
+                "wavlm-hifigan": {
+                    "description": "HiFiGAN vocoder for WavLM features from kNN-VC",
+                    "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip",
+                    "commit": "cfba7e0",
+                    "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
+                    "license": "MIT"
+                },
+                "wavlm-hifigan_prematched": {
+                    "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC",
+                    "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip",
+                    "commit": "cfba7e0",
+                    "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
+                    "license": "MIT"
+                }
+            },
             "ljspeech": {
                 "multiband-melgan": {
                     "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
@@ -927,18 +943,27 @@
                 "freevc24": {
                     "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
                     "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
+                    "default_vocoder": null,
                     "author": "Jing-Yi Li @OlaWod",
                     "license": "MIT",
                     "commit": null
                 }
             },
             "multi-dataset": {
+                "knnvc": {
+                    "description": "kNN-VC model from https://github.com/bshall/knn-vc",
+                    "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched",
+                    "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5",
+                    "license": "MIT",
+                    "commit": null
+                },
                 "openvoice_v1": {
                     "hf_url": [
                         "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json",
                         "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth"
                     ],
                     "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
+                    "default_vocoder": null,
                     "author": "MyShell.ai",
                     "license": "MIT",
                     "commit": null
@@ -949,6 +974,7 @@
                         "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth"
                     ],
                     "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
+                    "default_vocoder": null,
                     "author": "MyShell.ai",
                     "license": "MIT",
                     "commit": null
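Aside (not part of the patch): given the nesting shown above, the new registry entries should resolve to the model ids `voice_conversion_models/multilingual/multi-dataset/knnvc` and `vocoder_models/en/librispeech100/wavlm-hifigan_prematched` (the latter string is also used verbatim as kNN-VC's `default_vocoder`). A minimal sketch of fetching them through the existing `ModelManager`, assuming those ids:

```python
from TTS.utils.manage import ModelManager

manager = ModelManager()

# kNN-VC itself ships no checkpoint; per the download_model() change below, a default
# KNNVCConfig is written out when the downloaded folder contains no config.json.
model_path, config_path, model_item = manager.download_model(
    "voice_conversion_models/multilingual/multi-dataset/knnvc"
)

# The registry points the model at the prematched WavLM HiFiGAN vocoder.
vocoder_path, vocoder_config_path, _ = manager.download_model(
    "vocoder_models/en/librispeech100/wavlm-hifigan_prematched"
)
```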
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py
index 54bb5ba825..77566c3f6a 100644
--- a/TTS/utils/generic_utils.py
+++ b/TTS/utils/generic_utils.py
@@ -31,6 +31,7 @@ def to_camel(text):
     text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
     text = text.replace("Tts", "TTS")
     text = text.replace("vc", "VC")
+    text = text.replace("Knn", "KNN")
     return text
 
 
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index d7d4deab9d..5dff1b84c8 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -15,6 +15,7 @@
 from typing_extensions import Required
 
 from TTS.config import load_config, read_json_with_comments
+from TTS.vc.configs.knnvc_config import KNNVCConfig
 
 logger = logging.getLogger(__name__)
 
@@ -267,9 +268,9 @@ def set_model_url(model_item: ModelItem) -> ModelItem:
        model_item["model_url"] = model_item["github_rls_url"]
    elif "hf_url" in model_item:
        model_item["model_url"] = model_item["hf_url"]
-    elif "fairseq" in model_item["model_name"]:
+    elif "fairseq" in model_item.get("model_name", ""):
        model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/"
-    elif "xtts" in model_item["model_name"]:
+    elif "xtts" in model_item.get("model_name", ""):
        model_item["model_url"] = "https://huggingface.co/coqui/"
    return model_item
 
@@ -367,6 +368,9 @@ def create_dir_and_download_model(self, model_name: str, model_item: ModelItem,
            logger.exception("Failed to download the model file to %s", output_path)
            rmtree(output_path)
            raise e
+        checkpoints = list(Path(output_path).glob("*.pt*"))
+        if len(checkpoints) == 1:
+            checkpoints[0].rename(checkpoints[0].parent / "model.pth")
        self.print_model_license(model_item=model_item)
 
    def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None:
@@ -431,11 +435,14 @@ def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelIt
        output_model_path = output_path
        output_config_path = None
        if (
-            model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name
+            model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name
        ):  # TODO:This is stupid but don't care for now.
            output_model_path, output_config_path = self._find_files(output_path)
        else:
            output_config_path = output_model_path / "config.json"
+            if model == "knnvc" and not output_config_path.exists():
+                knnvc_config = KNNVCConfig()
+                knnvc_config.save_json(output_config_path)
        # update paths in the config.json
        self._update_paths(output_path, output_config_path)
        return output_model_path, output_config_path, model_item
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 517cb7d2b2..0c445c7088 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -139,7 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N
         """
         # pylint: disable=global-statement
         self.vc_config = load_config(vc_config_path)
-        self.output_sample_rate = self.vc_config.audio["output_sample_rate"]
+        self.output_sample_rate = self.vc_config.audio.get(
+            "output_sample_rate", self.vc_config.audio.get("sample_rate", None)
+        )
         self.vc_model = setup_vc_model(config=self.vc_config)
         self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint)
         if use_cuda:
diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py
new file mode 100644
index 0000000000..7728ea0a9b
--- /dev/null
+++ b/TTS/vc/configs/knnvc_config.py
@@ -0,0 +1,59 @@
+from dataclasses import dataclass, field
+
+from coqpit import Coqpit
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.vc.configs.shared_configs import BaseVCConfig
+
+
+@dataclass
+class KNNVCAudioConfig(BaseAudioConfig):
+    """Audio configuration.
+
+    Args:
+        sample_rate (int):
+            The sampling rate of the input waveform.
+    """
+
+    sample_rate: int = field(default=16000)
+
+
+@dataclass
+class KNNVCArgs(Coqpit):
+    """Model arguments.
+
+    Args:
+        ssl_dim (int):
+            The dimension of the self-supervised learning embedding.
+    """
+
+    ssl_dim: int = field(default=1024)
+
+
+@dataclass
+class KNNVCConfig(BaseVCConfig):
+    """Parameters.
+
+    Args:
+        model (str):
+            Model name. Do not change unless you know what you are doing.
+
+        model_args (KNNVCArgs):
+            Model architecture arguments. Defaults to `KNNVCArgs()`.
+
+        audio (KNNVCAudioConfig):
+            Audio processing configuration. Defaults to `KNNVCAudioConfig()`.
+
+        wavlm_layer (int):
+            WavLM layer to use for feature extraction.
+
+        topk (int):
+            k in the kNN -- the number of nearest neighbors to average over
+    """
+
+    model: str = "knnvc"
+    model_args: KNNVCArgs = field(default_factory=KNNVCArgs)
+    audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig)
+
+    wavlm_layer: int = 6
+    topk: int = 4
diff --git a/TTS/vc/layers/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py
index 62f7e74aaf..d9c3858f89 100644
--- a/TTS/vc/layers/freevc/wavlm/__init__.py
+++ b/TTS/vc/layers/freevc/wavlm/__init__.py
@@ -13,7 +13,7 @@
 model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt"
 
 
-def get_wavlm(device="cpu"):
+def get_wavlm(device="cpu") -> WavLM:
     """Download the model and return the model object."""
     output_path = get_user_data_dir("tts")
diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py
index 41677363c7..8151a0445e 100644
--- a/TTS/vc/models/__init__.py
+++ b/TTS/vc/models/__init__.py
@@ -1,7 +1,10 @@
 import importlib
 import logging
 import re
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
+
+from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.vc.models.base_vc import BaseVC
 
 logger = logging.getLogger(__name__)
 
@@ -9,7 +12,11 @@ def setup_model(config: BaseVCConfig) -> BaseVC:
     logger.info("Using model: %s", config.model)
     # fetch the right model implementation.
-    if "model" in config and config["model"].lower() == "freevc":
+    if config["model"].lower() == "freevc":
         MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC
-    model = MyModel.init_from_config(config)
-    return model
+    elif config["model"].lower() == "knnvc":
+        MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC
+    else:
+        msg = f"Model {config.model} does not exist!"
+        raise ValueError(msg)
+    return MyModel.init_from_config(config)
diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py
new file mode 100644
index 0000000000..2fb069fc86
--- /dev/null
+++ b/TTS/vc/models/knnvc.py
@@ -0,0 +1,182 @@
+import logging
+import os
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import torchaudio
+from coqpit import Coqpit
+from typing_extensions import TypeAlias
+
+from TTS.vc.configs.knnvc_config import KNNVCConfig
+from TTS.vc.layers.freevc.wavlm import get_wavlm
+from TTS.vc.models.base_vc import BaseVC
+
+logger = logging.getLogger(__name__)
+
+PathOrTensor: TypeAlias = Union[str, os.PathLike[Any], torch.Tensor]
+
+
+class KNNVC(BaseVC):
+    """
+    Paper::
+        https://arxiv.org/abs/2305.18975
+
+    Paper Abstract::
+        Any-to-any voice conversion aims to transform source speech
+        into a target voice with just a few examples of the target speaker as a
+        reference. Recent methods produce convincing conversions, but at the cost of
+        increased complexity -- making results difficult to reproduce and build on.
+        Instead, we keep it simple. We propose k-nearest neighbors voice conversion
+        (kNN-VC): a straightforward yet effective method for any-to-any conversion.
+        First, we extract self-supervised representations of the source and reference
+        speech. To convert to the target speaker, we replace each frame of the source
+        representation with its nearest neighbor in the reference. Finally, a pretrained
+        vocoder synthesizes audio from the converted representation. Objective and
+        subjective evaluations show that kNN-VC improves speaker similarity with similar
+        intelligibility scores to existing methods.
+
+    Samples::
+        https://bshall.github.io/knn-vc
+
+    Original code::
+        https://github.com/bshall/knn-vc
+
+    Examples:
+        >>> from TTS.vc.configs.knnvc_config import KNNVCConfig
+        >>> from TTS.vc.models.knnvc import KNNVC
+        >>> config = KNNVCConfig()
+        >>> model = KNNVC(config)
+    """
+
+    def __init__(self, config: Coqpit):
+        super().__init__(config)
+        self.ssl_dim = self.args.ssl_dim
+        self.wavlm = get_wavlm()
+
+    @staticmethod
+    def init_from_config(config: KNNVCConfig) -> "KNNVC":
+        return KNNVC(config)
+
+    @torch.inference_mode()
+    def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor:
+        """Return features for the given waveform with output shape (seq_len, dim).
+
+        Optionally perform VAD trimming on start/end with `vad_trigger_level`.
+        """
+        # load audio
+        if isinstance(audio, torch.Tensor):
+            x: torch.Tensor = audio
+            sr = self.config.audio.sample_rate
+            if x.dim() == 1:
+                x = x[None]
+        else:
+            x, sr = torchaudio.load(audio, normalize=True)
+
+        if not sr == self.config.audio.sample_rate:
+            logger.info(f"Resampling {sr} to {self.config.audio.sample_rate} in {audio}")
+            x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate)
+            sr = self.config.audio.sample_rate
+
+        # trim silence from front and back
+        if vad_trigger_level > 1e-3:
+            transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level)
+            x_front_trim = transform(x)
+            waveform_reversed = torch.flip(x_front_trim, (-1,))
+            waveform_reversed_front_trim = transform(waveform_reversed)
+            x = torch.flip(waveform_reversed_front_trim, (-1,))
+
+        # extract the representation of each layer
+        wav_input_16khz = x.to(self.device)
+        features = self.wavlm.extract_features(
+            wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False
+        )[0]
+        return features.squeeze(0)
+
+    def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor:
+        """Get concatenated wavlm features for the matching set using all waveforms in `wavs`.
+
+        Wavs are specified as either a list of paths or list of loaded waveform tensors of
+        shape (channels, T), assumed to be of 16kHz sample rate.
+        """
+        feats = []
+        for p in wavs:
+            feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level))
+
+        feats = torch.concat(feats, dim=0).cpu()
+        return feats
+
+    @staticmethod
+    def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor:
+        """Like torch.cdist, but fixed dim=-1 and for cosine distance."""
+        source_norms = torch.norm(source_feats, p=2, dim=-1)
+        matching_norms = torch.norm(matching_pool, p=2, dim=-1)
+        dotprod = (
+            -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2)
+            + source_norms[:, None] ** 2
+            + matching_norms[None] ** 2
+        )
+        dotprod /= 2
+
+        dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None]))
+        return dists
+
+    @torch.inference_mode()
+    def match(
+        self,
+        query_seq: torch.Tensor,
+        matching_set: torch.Tensor,
+        synth_set: Optional[torch.Tensor] = None,
+        topk: Optional[int] = None,
+        target_duration: Optional[float] = None,
+    ) -> torch.Tensor:
+        """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching
+        with k=`topk`.
+
+        Args:
+            `query_seq`: Tensor (N1, dim) of the input/source query features.
+            `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm.
+            `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. We use the matching set to assign
+                each query vector to a vector in the matching set, and then use the corresponding vector from
+                the synth set during HiFiGAN synthesis.
+                By default, and for best performance, this should be identical to the matching set.
+            `topk`: k in the kNN -- the number of nearest neighbors to average over.
+            `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds.
+
+        Returns:
+            - converted features (1, N, dim)
+        """
+        if topk is None:
+            topk = self.config.topk
+        synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device)
+        matching_set = matching_set.to(self.device)
+        query_seq = query_seq.to(self.device)
+
+        if target_duration is not None:
+            target_samples = int(target_duration * self.config.audio.sample_rate)
+            scale_factor = (target_samples / self.hop_length) / query_seq.shape[0]  # n_targ_feats / n_input_feats
+            query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T
+
+        dists = self.fast_cosine_dist(query_seq, matching_set)
+        best = dists.topk(k=topk, largest=False, dim=-1)
+        out_feats = synth_set[best.indices].mean(dim=1)
+        return out_feats.unsqueeze(0)
+
+    def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: Union[str, os.PathLike[Any]]) -> None:
+        """kNN-VC does not use checkpoints."""
+
+    def forward(self) -> None: ...
+    def inference(self) -> None: ...
+
+    @torch.inference_mode()
+    def voice_conversion(
+        self,
+        source: PathOrTensor,
+        target: Union[PathOrTensor, list[PathOrTensor]],
+        topk: Optional[int] = None,
+    ) -> torch.Tensor:
+        if not isinstance(target, list):
+            target = [target]
+        source_features = self.get_features(source)
+        matching_set = self.get_matching_set(target)
+        return self.match(source_features, matching_set, topk=topk)
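Aside (not part of the patch): `fast_cosine_dist` above recovers pairwise dot products from `torch.cdist` via the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2*(a.b), then converts them to cosine distances 1 - a.b / (||a|| ||b||). A small self-contained check of that equivalence, illustrative only (shapes made up):

```python
import torch
import torch.nn.functional as F


def naive_cosine_dist(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Direct (N1, N2) cosine distance matrix for comparison.
    return 1 - F.normalize(a, dim=-1) @ F.normalize(b, dim=-1).T


def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor:
    # Same computation as KNNVC.fast_cosine_dist in the patch.
    source_norms = torch.norm(source_feats, p=2, dim=-1)
    matching_norms = torch.norm(matching_pool, p=2, dim=-1)
    dotprod = (
        -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2)
        + source_norms[:, None] ** 2
        + matching_norms[None] ** 2
    ) / 2
    return 1 - dotprod / (source_norms[:, None] * matching_norms[None])


query, pool = torch.randn(5, 1024), torch.randn(7, 1024)
assert torch.allclose(fast_cosine_dist(query, pool), naive_cosine_dist(query, pool), atol=1e-4)
```

At inference time, `match()` then simply averages the `topk` closest matching-set frames for each query frame before handing the result to the vocoder.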
diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py
index 9a102f0c89..60dde496b2 100644
--- a/TTS/vocoder/configs/hifigan_config.py
+++ b/TTS/vocoder/configs/hifigan_config.py
@@ -5,7 +5,7 @@
 
 @dataclass
 class HifiganConfig(BaseGANVocoderConfig):
-    """Defines parameters for FullBand MelGAN vocoder.
+    """Defines parameters for HifiGAN vocoder.
 
     Example:
 
diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py
index 6a7efe3977..481d234a54 100644
--- a/TTS/vocoder/models/__init__.py
+++ b/TTS/vocoder/models/__init__.py
@@ -40,7 +40,8 @@ def setup_generator(c: BaseGANVocoderConfig):
     MyModel = getattr(MyModel, to_camel(c.generator_model))
     # this is to preserve the Wavernn class name (instead of Wavernn)
     if c.generator_model.lower() in "hifigan_generator":
-        model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params)
+        c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"])
+        model = MyModel(out_channels=1, **c.generator_model_params)
     elif c.generator_model.lower() in "melgan_generator":
         model = MyModel(
             in_channels=c.audio["num_mels"],
@@ -106,7 +107,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
     MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower()))
     if c.discriminator_model in "hifigan_discriminator":
         model = MyModel()
-    if c.discriminator_model in "random_window_discriminator":
+    elif c.discriminator_model in "random_window_discriminator":
         model = MyModel(
             cond_channels=c.audio["num_mels"],
             hop_length=c.audio["hop_length"],
@@ -115,7 +116,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"],
             window_sizes=c.discriminator_model_params["window_sizes"],
         )
-    if c.discriminator_model in "melgan_multiscale_discriminator":
+    elif c.discriminator_model in "melgan_multiscale_discriminator":
         model = MyModel(
             in_channels=1,
             out_channels=1,
@@ -124,7 +125,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             max_channels=c.discriminator_model_params["max_channels"],
             downsample_factors=c.discriminator_model_params["downsample_factors"],
         )
-    if c.discriminator_model == "residual_parallel_wavegan_discriminator":
+    elif c.discriminator_model == "residual_parallel_wavegan_discriminator":
         model = MyModel(
             in_channels=1,
             out_channels=1,
@@ -139,7 +140,7 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             nonlinear_activation="LeakyReLU",
             nonlinear_activation_params={"negative_slope": 0.2},
         )
-    if c.discriminator_model == "parallel_wavegan_discriminator":
+    elif c.discriminator_model == "parallel_wavegan_discriminator":
         model = MyModel(
             in_channels=1,
             out_channels=1,
@@ -151,6 +152,8 @@ def setup_discriminator(c: BaseGANVocoderConfig):
             nonlinear_activation_params={"negative_slope": 0.2},
             bias=True,
         )
-    if c.discriminator_model == "univnet_discriminator":
+    elif c.discriminator_model == "univnet_discriminator":
         model = MyModel()
+    else:
+        raise NotImplementedError(f"Model {c.discriminator_model} not implemented!")
     return model
diff --git a/hubconf.py b/hubconf.py
index 6e10928265..b49c9d6bcc 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,4 +1,14 @@
-dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"]
+dependencies = [
+    "torch",
+    "gdown",
+    "pysbd",
+    "gruut",
+    "anyascii",
+    "pypinyin",
+    "coqpit-config",
+    "mecab-python3",
+    "unidic-lite",
+]
 import torch
 
 from TTS.utils.manage import ModelManager
@@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us
 
 
 if __name__ == "__main__":
-    synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github")
+    synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github")
     synthesizer.tts("This is a test!")
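Aside (not part of the patch): the `setup_generator` change above lets a HiFiGAN vocoder declare its own `in_channels` instead of always inheriting `num_mels`; the WavLM-feature vocoders registered in `.models.json` need 1024 input channels (cf. `ssl_dim` in `KNNVCArgs`). A hypothetical sketch, assuming the remaining generator parameters can stay at `HifiganConfig`'s defaults:

```python
from TTS.vocoder.configs.hifigan_config import HifiganConfig
from TTS.vocoder.models import setup_generator

config = HifiganConfig()
# Hypothetical: drive the generator with 1024-dim WavLM features instead of mel frames.
# Before this change, in_channels was always overwritten with c.audio["num_mels"].
config.generator_model_params["in_channels"] = 1024
generator = setup_generator(config)
```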
diff --git a/pyproject.toml b/pyproject.toml
index 71db481ac4..bdeb5d761c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,6 +70,7 @@ dependencies = [
     "pyyaml>=6.0",
     "fsspec[http]>=2023.6.0",
     "packaging>=23.1",
+    "typing_extensions>=4.10",
     # Inference
     "pysbd>=0.3.4",
     # Training
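End-to-end usage sketch (not part of the patch): the model id below follows the registry nesting added above, and the call assumes the existing `TTS` Python API for voice conversion (as already used for FreeVC):

```python
from TTS.api import TTS

# Downloads WavLM plus the registered default vocoder; kNN-VC itself is checkpoint-free.
tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc")
tts.voice_conversion_to_file(
    source_wav="source.wav",    # speech whose content is kept
    target_wav="target.wav",    # reference speech of the target speaker
    file_path="converted.wav",
)
```

When working with the model directly, `KNNVC.voice_conversion()` also accepts a list of reference wavs for the matching set, per its signature above.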