From 454709b1aacf8346d139333ca67926ad0fa62a8b Mon Sep 17 00:00:00 2001
From: Daniel Quernheim
Date: Tue, 23 Jan 2024 17:22:51 +0000
Subject: [PATCH] Add speed parameter for faster/slower speech

See https://github.com/sidharthrajaram/StyleTTS2/issues/6
---
 src/styletts2/tts.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/styletts2/tts.py b/src/styletts2/tts.py
index e6eb78d..0f8e82b 100644
--- a/src/styletts2/tts.py
+++ b/src/styletts2/tts.py
@@ -361,7 +361,8 @@ def long_inference_segment(self,
                                 t=0.7,
                                 diffusion_steps=5,
                                 embedding_scale=1,
-                                phonemize=True):
+                                phonemize=True,
+                                speed=1.0,):
         """
         Performs inference for segment of longform text; see long_inference()
         :param text: Input text
@@ -373,6 +374,7 @@ def long_inference_segment(self,
         :param diffusion_steps: The more the steps, the more diverse the samples are, with the cost of speed.
         :param embedding_scale: Higher scale means style is more conditional to the input text and hence more emotional.
         :param phonemize: Phonemize text? If not, expects that text is already phonemized
+        :param speed: Speech rate (higher = faster, default: 1.0)
         :return: audio data as a Numpy array
         """
         if phonemize:
@@ -423,7 +425,7 @@ def long_inference_segment(self,
             x, _ = self.model.predictor.lstm(d)
             duration = self.model.predictor.duration_proj(x)
 
-            duration = torch.sigmoid(duration).sum(axis=-1)
+            duration = torch.sigmoid(duration).sum(axis=-1) / speed
             pred_dur = torch.round(duration.squeeze()).clamp(min=1)
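
Note (commentary, not part of the patch): the changed line divides the summed
sigmoid duration predictions by the speed factor before they are rounded into
per-phoneme frame counts, so speed > 1.0 shortens the synthesized audio and
speed < 1.0 lengthens it. A minimal standalone sketch of that effect, using
made-up duration logits rather than the model's real predictor output:

    import torch

    # Placeholder duration logits for 5 phonemes, shaped like the predictor
    # output (batch, phonemes, duration_bins); in the real model these come
    # from self.model.predictor.duration_proj(x).
    duration_logits = torch.randn(1, 5, 50)

    for speed in (0.8, 1.0, 1.25):
        # Same computation as the patched line: summed sigmoid durations,
        # divided by the speed factor, then rounded with a floor of 1 frame.
        duration = torch.sigmoid(duration_logits).sum(axis=-1) / speed
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
        # Higher speed -> fewer total frames -> faster speech.
        print(f"speed={speed}: total frames = {int(pred_dur.sum())}")

In the package itself the value would be passed as
long_inference_segment(..., speed=1.2); whether the public long_inference()
wrapper also forwards the new keyword is not covered by this hunk.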