Skip to content

Commit

Permalink
Merge pull request #132 from sensein/110-review-and-test-voice-cloning-task
Browse files Browse the repository at this point in the history

Enhancing voice cloning
  • Loading branch information
fabiocat93 authored Aug 16, 2024
2 parents 1a8f24b + a72b51b commit 7b501fa
Show file tree
Hide file tree
Showing 7 changed files with 352 additions and 31 deletions.
3 changes: 3 additions & 0 deletions src/senselab/audio/tasks/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ def chunk_audios(data: List[Tuple[Audio, Tuple[float, float]]]) -> List[Audio]:
Returns:
List of Audios that have been chunked based on the provided timestamps
Todo:
Do we really need both chunk_audios and extract_segments?
"""
chunked_audios = []

Expand Down
4 changes: 3 additions & 1 deletion src/senselab/audio/tasks/voice_cloning/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
"""This module provides the API for the senselab voice cloning task."""
""".. include:: ./doc.md""" # noqa: D415

from .api import clone_voices # noqa: F401
60 changes: 41 additions & 19 deletions src/senselab/audio/tasks/voice_cloning/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

from typing import Any, List, Optional

import pydra

from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.voice_cloning.knnvc import KNNVC
from senselab.utils.data_structures.device import DeviceType
Expand All @@ -17,27 +15,51 @@ def clone_voices(
device: Optional[DeviceType] = None,
**kwargs: Any, # noqa:ANN401
) -> List[Audio]:
"""Clones voices from source audios to target audios using the given model."""
"""Clones voices from source audios to target audios using the given model.
This function performs pairwise voice cloning, where the voice from each audio sample
in the `source_audios` list is transformed into the corresponding audio
sample in the `target_audios` list. The resulting list contains audio samples that
preserve the content of the original source audio but with the voice replaced by the
voice from the corresponding target audio.
Args:
source_audios (List[Audio]): A list of audio samples whose voices will be "replaced"
by the voices from the corresponding target audio samples. The content
(e.g., words) will remain the same, but the voice sounds like the target.
target_audios (List[Audio]): A list of audio samples whose voices will be extracted
and used to replace the voices in the corresponding source audio samples.
model (SenselabModel, optional): The model to use for voice cloning. Currently,
only KNNVC (K-Nearest Neighbors Voice Conversion) is supported, encapsulated
by the `TorchModel` class. `TorchModel` is a child class of `SenselabModel`
and specifies the model and revision for cloning. Defaults to
`TorchModel(path_or_uri="bshall/knn-vc", revision="master")`.
device (Optional[DeviceType], optional): The device to run the model on (e.g., CPU or GPU).
Defaults to None.
**kwargs: Additional keyword arguments for model-specific parameters that will
be passed directly to the underlying model's voice cloning method.
Returns:
List[Audio]: A list of audio samples with cloned voices from the corresponding source and target audios.
Raises:
ValueError: If the lengths of `source_audios` and `target_audios` do not match.
NotImplementedError: If the specified model is not supported. Currently, only KNNVC is supported.
Examples:
>>> source_audios = [Audio.from_filepath("source1.wav"), Audio.from_filepath("source2.wav")]
>>> target_audios = [Audio.from_filepath("target1.wav"), Audio.from_filepath("target2.wav")]
>>> cloned_audios = clone_voices(source_audios, target_audios)
Todo:
Add logging with timestamps.
"""
if len(source_audios) != len(target_audios):
raise ValueError("Source and target audios must have the same length.")
raise ValueError("The list of source and target audios must have the same length.")

if isinstance(model, TorchModel) and model.path_or_uri == "bshall/knn-vc":
topk = kwargs.get("topk", 4)
if not isinstance(topk, int):
raise ValueError("topk must be an integer.")
prematched_vocoder = kwargs.get("prematched_vocoder", True)
if not isinstance(prematched_vocoder, bool):
raise ValueError("prematched_vocoder must be a boolean.")
return KNNVC.clone_voices_with_knn_vc(
source_audios=source_audios,
target_audios=target_audios,
model=model,
prematched_vocoder=prematched_vocoder,
topk=topk,
device=device,
source_audios=source_audios, target_audios=target_audios, model=model, device=device, **kwargs
)
else:
raise NotImplementedError("Only KNNVC is supported for now.")


clone_voices_pt = pydra.mark.task(clone_voices)
32 changes: 32 additions & 0 deletions src/senselab/audio/tasks/voice_cloning/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Voice cloning


<button class="tutorial-button" onclick="window.location.href='https://github.com/sensein/senselab/blob/main/tutorials/voice_cloning.ipynb'">Tutorial</button>


## Task Overview

Any-to-any voice cloning aims to transform source speech so that it sounds like a target speaker, using just one or a few examples of the target speaker's voice as references. Traditional voice conversion systems attempt to separate the speaker's identity from the speech content, so that the speaker information can be replaced with that of the target speaker. However, learning such disentangled representations is complex and poses significant challenges.


## Models
We have explored several models for voice cloning:
- [SpeechT5](https://huggingface.co/microsoft/speecht5_vc) (not included in `senselab`, as it did not meet our expectations),
- [FreeVC](https://github.com/OlaWod/FreeVC) (planned to be included in `senselab` soon),
- [KNNVC](https://github.com/bshall/knn-vc) (already included in `senselab`).


## Evaluation
### Metrics

Objective evaluation involves comparing voice cloning outputs across different downstream tasks:

- Using an automatic speaker verification tool to determine if the original speaker, the target speaker, and the cloned speaker can be distinguished from each other.
- Ensuring the intelligibility of speech content using an automatic speech recognition system to verify that the content remains unchanged.
- Assessing the preservation of the original speech's emotion after voice cloning.
- Additional metrics may be added as the evaluation is extended.


### Benchmark

Recent efforts to enhance privacy in speech technology include the [VoicePrivacy initiative](https://arxiv.org/pdf/2005.01387), which has been active since 2020 and focuses on developing and benchmarking anonymization methods. Despite these efforts, achieving perfect privacy remains a challenge (see [here](https://www.voiceprivacychallenge.org/vp2022/docs/VoicePrivacy_2022_Challenge___Natalia_Tomashenko.pdf) for more details).
44 changes: 39 additions & 5 deletions src/senselab/audio/tasks/voice_cloning/knnvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,18 @@ def _get_knnvc_pipeline(
prematched_vocoder: bool,
topk: int,
device: Optional[DeviceType] = None,
) -> Any: # noqa: ANN401
"""Get or create a KNNVC pipeline."""
) -> Any: # noqa:ANN401
"""Get or create a KNNVC pipeline.
Args:
model (TorchModel): The Torch model to use for the KNNVC pipeline.
prematched_vocoder (bool): Flag indicating whether to use a pre-matched vocoder.
topk (int): The number of top matches to consider.
device (Optional[DeviceType]): The device to run the pipeline on.
Returns:
Any: The KNNVC pipeline.
"""
key = f"{model.path_or_uri}-{model.revision}-{prematched_vocoder}-{topk}-{device}"
if key not in cls._pipelines:
device, _ = _select_device_and_dtype(
Expand All @@ -49,21 +59,45 @@ def clone_voices_with_knn_vc(
topk: int = 4,
device: Optional[DeviceType] = None,
) -> List[Audio]:
"""Clones voices from source audios to target audios using KNNVC."""
"""Clone voices from source audios to target audios using KNNVC.
Args:
source_audios (List[Audio]): List of source audio objects.
target_audios (List[Audio]): List of target audio objects.
model (TorchModel, optional): The Torch model to use for the KNNVC pipeline.
Defaults to TorchModel(path_or_uri="bshall/knn-vc", revision="master").
prematched_vocoder (bool, optional): Flag indicating whether to use a pre-matched vocoder. Defaults to True.
topk (int, optional): The number of top matches to consider. Defaults to 4.
device (Optional[DeviceType], optional): The device to run the pipeline on. Defaults to None.
Returns:
List[Audio]: List of cloned audio objects.
Raises:
ValueError: If the audio files are not mono or if the sampling rates are not supported.
"""
if not isinstance(prematched_vocoder, bool):
raise TypeError("prematched_vocoder must be a boolean.")

knn_vc = cls._get_knnvc_pipeline(model=model, prematched_vocoder=prematched_vocoder, topk=topk, device=device)

cloned_audios = []
for source_audio, target_audio in zip(source_audios, target_audios):
if source_audio.waveform.shape[0] > 1 or target_audio.waveform.shape[0] > 1:
raise ValueError("Only mono audio files are supported.")
raise ValueError(
"Only mono audio files are supported."
f"Offending audios: source_audio={source_audio}, target_audio={target_audio}"
)
source_sampling_rate = source_audio.sampling_rate
target_sampling_rate = target_audio.sampling_rate
# 16kHz is the only supported sampling rate for KNNVC
supported_sampling_rate = 16000
if source_sampling_rate != supported_sampling_rate or target_sampling_rate != supported_sampling_rate:
raise ValueError(
f"Sampling rates {source_sampling_rate} and/or {target_sampling_rate} are not supported."
f"Sampling rates for the source audio ({source_sampling_rate}) "
f"and/or the target audio ({target_sampling_rate}) are not supported."
f"Only {supported_sampling_rate} sampling rate is supported."
f"Offending audios: source_audio={source_audio}, target_audio={target_audio}"
)

source_waveform = source_audio.waveform
Expand Down
12 changes: 6 additions & 6 deletions src/tests/audio/tasks/voice_cloning_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_clone_voices_length_mismatch(resampled_mono_audio_sample: Audio, torch_
source_audios = [resampled_mono_audio_sample]
target_audios = [resampled_mono_audio_sample, resampled_mono_audio_sample]

with pytest.raises(ValueError, match="Source and target audios must have the same length."):
with pytest.raises(ValueError, match="The list of source and target audios must have the same length"):
clone_voices(
source_audios=source_audios, target_audios=target_audios, model=torch_model, device=DeviceType.CPU
)
Expand All @@ -30,7 +30,7 @@ def test_clone_voices_invalid_topk(resampled_mono_audio_sample: Audio, torch_mod
source_audios = [resampled_mono_audio_sample]
target_audios = [resampled_mono_audio_sample]

with pytest.raises(ValueError, match="topk must be an integer."):
with pytest.raises(TypeError, match="argument 'k' must be int, not str"):
clone_voices(
source_audios=source_audios,
target_audios=target_audios,
Expand All @@ -46,7 +46,7 @@ def test_clone_voices_invalid_prematched_vocoder(
source_audios = [resampled_mono_audio_sample]
target_audios = [resampled_mono_audio_sample]

with pytest.raises(ValueError, match="prematched_vocoder must be a boolean."):
with pytest.raises(TypeError, match="prematched_vocoder must be a boolean."):
clone_voices(
source_audios=source_audios,
target_audios=target_audios,
Expand All @@ -57,8 +57,8 @@ def test_clone_voices_invalid_prematched_vocoder(

def test_clone_voices_valid_input(resampled_mono_audio_sample: Audio, torch_model: TorchModel) -> None:
"""Test cloning voices with valid input."""
source_audios = [resampled_mono_audio_sample]
target_audios = [resampled_mono_audio_sample]
source_audios = [resampled_mono_audio_sample, resampled_mono_audio_sample]
target_audios = [resampled_mono_audio_sample, resampled_mono_audio_sample]

try:
cloned_output = clone_voices(
Expand All @@ -70,7 +70,7 @@ def test_clone_voices_valid_input(resampled_mono_audio_sample: Audio, torch_mode
prematched_vocoder=False,
)
assert isinstance(cloned_output, list), "Output must be a list."
assert len(cloned_output) == 1, "Output list should contain exactly one audio sample."
assert len(cloned_output) == 2, "Output list should contain exactly two audio samples."
assert isinstance(cloned_output[0], Audio), "Each item in the output list should be an instance of Audio."
source_duration = source_audios[0].waveform.shape[1]
cloned_duration = cloned_output[0].waveform.shape[1]
Expand Down
228 changes: 228 additions & 0 deletions tutorials/voice_cloning.ipynb

Large diffs are not rendered by default.

0 comments on commit 7b501fa

Please sign in to comment.