Merge pull request #132 from sensein/110-review-and-test-voice-cloning-task

Enhancing voice cloning
sensein · Aug 16, 2024 · 7b501fa · 7b501fa
2 parents 1a8f24b + a72b51b
commit 7b501fa
Show file tree

Hide file tree

Showing 7 changed files with 352 additions and 31 deletions.
diff --git a/src/senselab/audio/tasks/preprocessing/preprocessing.py b/src/senselab/audio/tasks/preprocessing/preprocessing.py
@@ -110,6 +110,9 @@ def chunk_audios(data: List[Tuple[Audio, Tuple[float, float]]]) -> List[Audio]:
 
     Returns:
         List of Audios that have been chunked based on the provided timestamps
+
+    Todo:
+        Do we really need both chunk_audios and extract_segments?
     """
     chunked_audios = []
 

diff --git a/src/senselab/audio/tasks/voice_cloning/__init__.py b/src/senselab/audio/tasks/voice_cloning/__init__.py
@@ -1 +1,3 @@
-"""This module provides the API for the senselab voice cloning task."""
+""".. include:: ./doc.md"""  # noqa: D415
+
+from .api import clone_voices  # noqa: F401
diff --git a/src/senselab/audio/tasks/voice_cloning/api.py b/src/senselab/audio/tasks/voice_cloning/api.py
@@ -2,8 +2,6 @@
 
 from typing import Any, List, Optional
 
-import pydra
-
 from senselab.audio.data_structures.audio import Audio
 from senselab.audio.tasks.voice_cloning.knnvc import KNNVC
 from senselab.utils.data_structures.device import DeviceType
@@ -17,27 +15,51 @@ def clone_voices(
     device: Optional[DeviceType] = None,
     **kwargs: Any,  # noqa:ANN401
 ) -> List[Audio]:
-    """Clones voices from source audios to target audios using the given model."""
+    """Clones voices from source audios to target audios using the given model.
+
+    This function performs pairwise voice cloning: the speech in each audio sample of the
+    `source_audios` list is re-synthesized with the voice of the corresponding audio
+    sample in the `target_audios` list. The resulting list contains audio samples that
+    preserve the content of the original source audio but with the voice replaced by the
+    voice from the corresponding target audio.
+
+    Args:
+        source_audios (List[Audio]): A list of audio samples whose voices will be "replaced"
+            by the voices from the corresponding target audio samples. The content
+            (e.g., words) will remain the same, but the voice sounds like the target.
+        target_audios (List[Audio]): A list of audio samples whose voices will be extracted
+            and used to replace the voices in the corresponding source audio samples.
+        model (SenselabModel, optional): The model to use for voice cloning. Currently,
+            only KNNVC (K-Nearest Neighbors Voice Conversion) is supported, encapsulated
+            by the `TorchModel` class. `TorchModel` is a child class of `SenselabModel`
+            and specifies the model and revision for cloning. Defaults to
+            `TorchModel(path_or_uri="bshall/knn-vc", revision="master")`.
+        device (Optional[DeviceType], optional): The device to run the model on (e.g., CPU or GPU).
+            Defaults to None.
+        **kwargs: Additional keyword arguments for model-specific parameters that will
+            be passed directly to the underlying model's voice cloning method.
+
+    Returns:
+        List[Audio]: A list of audio samples with cloned voices from the corresponding source and target audios.
+
+    Raises:
+        ValueError: If the lengths of `source_audios` and `target_audios` do not match.
+        NotImplementedError: If the specified model is not supported. Currently, only KNNVC is supported.
+
+    Examples:
+        >>> source_audios = [Audio.from_filepath("source1.wav"), Audio.from_filepath("source2.wav")]
+        >>> target_audios = [Audio.from_filepath("target1.wav"), Audio.from_filepath("target2.wav")]
+        >>> cloned_audios = clone_voices(source_audios, target_audios)
+
+    Todo:
+        Add logging with timestamps.
+    """
     if len(source_audios) != len(target_audios):
-        raise ValueError("Source and target audios must have the same length.")
+        raise ValueError("The list of source and target audios must have the same length.")
 
     if isinstance(model, TorchModel) and model.path_or_uri == "bshall/knn-vc":
-        topk = kwargs.get("topk", 4)
-        if not isinstance(topk, int):
-            raise ValueError("topk must be an integer.")
-        prematched_vocoder = kwargs.get("prematched_vocoder", True)
-        if not isinstance(prematched_vocoder, bool):
-            raise ValueError("prematched_vocoder must be a boolean.")
         return KNNVC.clone_voices_with_knn_vc(
-            source_audios=source_audios,
-            target_audios=target_audios,
-            model=model,
-            prematched_vocoder=prematched_vocoder,
-            topk=topk,
-            device=device,
+            source_audios=source_audios, target_audios=target_audios, model=model, device=device, **kwargs
         )
     else:
         raise NotImplementedError("Only KNNVC is supported for now.")
-
-
-clone_voices_pt = pydra.mark.task(clone_voices)
diff --git a/src/senselab/audio/tasks/voice_cloning/doc.md b/src/senselab/audio/tasks/voice_cloning/doc.md
@@ -0,0 +1,32 @@
+# Voice cloning
+
+
+<button class="tutorial-button" onclick="window.location.href='https://github.com/sensein/senselab/blob/main/tutorials/voice_cloning.ipynb'">Tutorial</button>
+
+
+## Task Overview
+
+Any-to-any voice cloning aims to convert source speech into a target speaker's voice, using just one or a few examples of the target speaker's voice as references. Traditional voice conversion systems attempt to separate the speaker's identity from the speech content, so that the speaker information can be replaced to convert the voice to that of a target speaker. However, learning such disentangled representations is complex and poses significant challenges.
+
+
+## Models
+We have explored several models for voice cloning:
+- [SpeechT5](https://huggingface.co/microsoft/speecht5_vc) (not included in ```senselab``` because it did not meet our expectations),
+- [FreeVC](https://github.com/OlaWod/FreeVC) (planned for inclusion in ```senselab``` soon),
+- [KNNVC](https://github.com/bshall/knn-vc) (already included in ```senselab```).
+
+
+## Evaluation
+### Metrics
+
+Objective evaluation involves comparing voice cloning outputs across different downstream tasks:
+
+- Using an automatic speaker verification tool to determine if the original speaker, the target speaker, and the cloned speaker can be distinguished from each other.
+- Ensuring the intelligibility of speech content using an automatic speech recognition system to verify that the content remains unchanged.
+- Assessing the preservation of the original speech's emotion after voice cloning.
+- ...more...
+
+
+### Benchmark
+
+Recent efforts to enhance privacy in speech technology include the [VoicePrivacy initiative](https://arxiv.org/pdf/2005.01387), which has been active since 2020, focusing on developing and benchmarking anonymization methods. Despite these efforts, achieving perfect privacy remains a challenge (see [here](https://www.voiceprivacychallenge.org/vp2022/docs/VoicePrivacy_2022_Challenge___Natalia_Tomashenko.pdf) for more details).
diff --git a/src/senselab/audio/tasks/voice_cloning/knnvc.py b/src/senselab/audio/tasks/voice_cloning/knnvc.py
@@ -21,8 +21,18 @@ def _get_knnvc_pipeline(
         prematched_vocoder: bool,
         topk: int,
         device: Optional[DeviceType] = None,
-    ) -> Any:  # noqa: ANN401
-        """Get or create a KNNVC pipeline."""
+    ) -> Any:  # noqa:ANN401
+        """Get or create a KNNVC pipeline.
+
+        Args:
+            model (TorchModel): The Torch model to use for the KNNVC pipeline.
+            prematched_vocoder (bool): Flag indicating whether to use a pre-matched vocoder.
+            topk (int): The number of top matches to consider.
+            device (Optional[DeviceType]): The device to run the pipeline on.
+
+        Returns:
+            Any: The KNNVC pipeline.
+        """
         key = f"{model.path_or_uri}-{model.revision}-{prematched_vocoder}-{topk}-{device}"
         if key not in cls._pipelines:
             device, _ = _select_device_and_dtype(
@@ -49,21 +59,45 @@ def clone_voices_with_knn_vc(
         topk: int = 4,
         device: Optional[DeviceType] = None,
     ) -> List[Audio]:
-        """Clones voices from source audios to target audios using KNNVC."""
+        """Clone voices from source audios to target audios using KNNVC.
+
+        Args:
+            source_audios (List[Audio]): List of source audio objects.
+            target_audios (List[Audio]): List of target audio objects.
+            model (TorchModel, optional): The Torch model to use for the KNNVC pipeline.
+                Defaults to TorchModel(path_or_uri="bshall/knn-vc", revision="master").
+            prematched_vocoder (bool, optional): Flag indicating whether to use a pre-matched vocoder. Defaults to True.
+            topk (int, optional): The number of top matches to consider. Defaults to 4.
+            device (Optional[DeviceType], optional): The device to run the pipeline on. Defaults to None.
+
+        Returns:
+            List[Audio]: List of cloned audio objects.
+
+        Raises:
+            ValueError: If the audio files are not mono or if the sampling rates are not supported.
+        """
+        if not isinstance(prematched_vocoder, bool):
+            raise TypeError("prematched_vocoder must be a boolean.")
+
         knn_vc = cls._get_knnvc_pipeline(model=model, prematched_vocoder=prematched_vocoder, topk=topk, device=device)
 
         cloned_audios = []
         for source_audio, target_audio in zip(source_audios, target_audios):
             if source_audio.waveform.shape[0] > 1 or target_audio.waveform.shape[0] > 1:
-                raise ValueError("Only mono audio files are supported.")
+                raise ValueError(
+                    "Only mono audio files are supported."
+                    f"Offending audios: source_audio={source_audio}, target_audio={target_audio}"
+                )
             source_sampling_rate = source_audio.sampling_rate
             target_sampling_rate = target_audio.sampling_rate
             # 16kHz is the only supported sampling rate for KNNVC
             supported_sampling_rate = 16000
             if source_sampling_rate != supported_sampling_rate or target_sampling_rate != supported_sampling_rate:
                 raise ValueError(
-                    f"Sampling rates {source_sampling_rate} and/or {target_sampling_rate} are not supported."
+                    f"Sampling rates for the source audio ({source_sampling_rate}) "
+                    f"and/or the target audio ({target_sampling_rate}) are not supported."
                     f"Only {supported_sampling_rate} sampling rate is supported."
+                    f"Offending audios: source_audio={source_audio}, target_audio={target_audio}"
                 )
 
             source_waveform = source_audio.waveform

diff --git a/src/tests/audio/tasks/voice_cloning_test.py b/src/tests/audio/tasks/voice_cloning_test.py
@@ -20,7 +20,7 @@ def test_clone_voices_length_mismatch(resampled_mono_audio_sample: Audio, torch_
         source_audios = [resampled_mono_audio_sample]
         target_audios = [resampled_mono_audio_sample, resampled_mono_audio_sample]
 
-        with pytest.raises(ValueError, match="Source and target audios must have the same length."):
+        with pytest.raises(ValueError, match="The list of source and target audios must have the same length"):
             clone_voices(
                 source_audios=source_audios, target_audios=target_audios, model=torch_model, device=DeviceType.CPU
             )
@@ -30,7 +30,7 @@ def test_clone_voices_invalid_topk(resampled_mono_audio_sample: Audio, torch_mod
         source_audios = [resampled_mono_audio_sample]
         target_audios = [resampled_mono_audio_sample]
 
-        with pytest.raises(ValueError, match="topk must be an integer."):
+        with pytest.raises(TypeError, match="argument 'k' must be int, not str"):
             clone_voices(
                 source_audios=source_audios,
                 target_audios=target_audios,
@@ -46,7 +46,7 @@ def test_clone_voices_invalid_prematched_vocoder(
         source_audios = [resampled_mono_audio_sample]
         target_audios = [resampled_mono_audio_sample]
 
-        with pytest.raises(ValueError, match="prematched_vocoder must be a boolean."):
+        with pytest.raises(TypeError, match="prematched_vocoder must be a boolean."):
             clone_voices(
                 source_audios=source_audios,
                 target_audios=target_audios,
@@ -57,8 +57,8 @@ def test_clone_voices_invalid_prematched_vocoder(
 
     def test_clone_voices_valid_input(resampled_mono_audio_sample: Audio, torch_model: TorchModel) -> None:
         """Test cloning voices with valid input."""
-        source_audios = [resampled_mono_audio_sample]
-        target_audios = [resampled_mono_audio_sample]
+        source_audios = [resampled_mono_audio_sample, resampled_mono_audio_sample]
+        target_audios = [resampled_mono_audio_sample, resampled_mono_audio_sample]
 
         try:
             cloned_output = clone_voices(
@@ -70,7 +70,7 @@ def test_clone_voices_valid_input(resampled_mono_audio_sample: Audio, torch_mode
                 prematched_vocoder=False,
             )
             assert isinstance(cloned_output, list), "Output must be a list."
-            assert len(cloned_output) == 1, "Output list should contain exactly one audio sample."
+            assert len(cloned_output) == 2, "Output list should contain exactly two audio samples."
             assert isinstance(cloned_output[0], Audio), "Each item in the output list should be an instance of Audio."
             source_duration = source_audios[0].waveform.shape[1]
             cloned_duration = cloned_output[0].waveform.shape[1]

diff --git a/tutorials/voice_cloning.ipynb b/tutorials/voice_cloning.ipynb