Add speed parameter for faster/slower speech

See sidharthrajaram#6
dialohq · Jan 23, 2024 · 454709b
1 parent 9279ef0
commit 454709b
Showing 1 changed file with 4 additions and 2 deletions.
diff --git a/src/styletts2/tts.py b/src/styletts2/tts.py
@@ -361,7 +361,8 @@ def long_inference_segment(self,
                                t=0.7,
                                diffusion_steps=5,
                                embedding_scale=1,
-                               phonemize=True):
+                               phonemize=True,
+                               speed=1.0,):
         """
         Performs inference for segment of longform text; see long_inference()
         :param text: Input text
@@ -373,6 +374,7 @@ def long_inference_segment(self,
         :param diffusion_steps: The more the steps, the more diverse the samples are, with the cost of speed.
         :param embedding_scale: Higher scale means style is more conditional to the input text and hence more emotional.
         :param phonemize: Phonemize text? If not, expects that text is already phonemized
+        :param speed: Speech rate multiplier; predicted durations are divided by this value (higher = faster, default: 1.0). Expected to be > 0.
         :return: audio data as a Numpy array
         """
         if phonemize:
@@ -423,7 +425,7 @@ def long_inference_segment(self,
             x, _ = self.model.predictor.lstm(d)
             duration = self.model.predictor.duration_proj(x)
 
-            duration = torch.sigmoid(duration).sum(axis=-1)
+            duration = torch.sigmoid(duration).sum(axis=-1) / speed
             pred_dur = torch.round(duration.squeeze()).clamp(min=1)