feat(api): support setting speed (#316)

XTTS accepts a `speed` parameter, but the API methods previously declared their own `speed` argument, which captured the value and prevented it from being passed through to the model. Removing this non-functional `speed` parameter from the API lets the value pass via `kwargs` to models that support it.
idiap · Feb 25, 2025 · b20533e · b20533e
1 parent 382b418
commit b20533e
Showing 1 changed file with 3 additions and 13 deletions.
diff --git a/TTS/api.py b/TTS/api.py
@@ -264,7 +264,6 @@ def _check_arguments(
         language: str | None = None,
         speaker_wav: str | None = None,
         emotion: str | None = None,
-        speed: float | None = None,
         **kwargs,
     ) -> None:
         """Check if the arguments are valid for the model."""
@@ -277,8 +276,8 @@ def _check_arguments(
             raise ValueError("Model is not multi-speaker but `speaker` is provided.")
         if not self.is_multi_lingual and language is not None:
             raise ValueError("Model is not multi-lingual but `language` is provided.")
-        if emotion is not None and speed is not None:
-            raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
+        if emotion is not None:
+            raise ValueError("Emotion can only be used with Coqui Studio models. Which is discontinued.")
 
     def tts(
         self,
@@ -287,7 +286,6 @@ def tts(
         language: str | None = None,
         speaker_wav: str | None = None,
         emotion: str | None = None,
-        speed: float | None = None,
         split_sentences: bool = True,
         **kwargs,
     ):
@@ -306,19 +304,14 @@ def tts(
                 Defaults to None.
             emotion (str, optional):
                 Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
-            speed (float, optional):
-                Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
-                Defaults to None.
             split_sentences (bool, optional):
                 Split text into sentences, synthesize them separately and concatenate the file audio.
                 Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
                 applicable to the 🐸TTS models. Defaults to True.
             kwargs (dict, optional):
                 Additional arguments for the model.
         """
-        self._check_arguments(
-            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
-        )
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, **kwargs)
         wav = self.synthesizer.tts(
             text=text,
             speaker_name=speaker,
@@ -336,7 +329,6 @@ def tts_to_file(
         language: str | None = None,
         speaker_wav: str | None = None,
         emotion: str | None = None,
-        speed: float = 1.0,
         pipe_out=None,
         file_path: str = "output.wav",
         split_sentences: bool = True,
@@ -358,8 +350,6 @@ def tts_to_file(
                 Defaults to None.
             emotion (str, optional):
                 Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
-            speed (float, optional):
-                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
             pipe_out (BytesIO, optional):
                 Flag to stdout the generated TTS wav file for shell pipe.
             file_path (str, optional):