From 454709b1aacf8346d139333ca67926ad0fa62a8b Mon Sep 17 00:00:00 2001
From: Daniel Quernheim
Date: Tue, 23 Jan 2024 17:22:51 +0000
Subject: [PATCH] Add speed parameter for faster/slower speech

See https://github.com/sidharthrajaram/StyleTTS2/issues/6
---
 src/styletts2/tts.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/styletts2/tts.py b/src/styletts2/tts.py
index e6eb78d..0f8e82b 100644
--- a/src/styletts2/tts.py
+++ b/src/styletts2/tts.py
@@ -361,7 +361,8 @@ def long_inference_segment(self,
                                 t=0.7,
                                 diffusion_steps=5,
                                 embedding_scale=1,
-                                phonemize=True):
+                                phonemize=True,
+                                speed=1.0,):
         """
         Performs inference for segment of longform text; see long_inference()
         :param text: Input text
@@ -373,6 +374,7 @@ def long_inference_segment(self,
         :param diffusion_steps: The more the steps, the more diverse the samples are, with the cost of speed.
         :param embedding_scale: Higher scale means style is more conditional to the input text and hence more emotional.
         :param phonemize: Phonemize text? If not, expects that text is already phonemized
+        :param speed: Speech rate (higher = faster, default: 1.0)
         :return: audio data as a Numpy array
         """
         if phonemize:
@@ -423,7 +425,7 @@ def long_inference_segment(self,
             x, _ = self.model.predictor.lstm(d)
             duration = self.model.predictor.duration_proj(x)
 
-            duration = torch.sigmoid(duration).sum(axis=-1)
+            duration = torch.sigmoid(duration).sum(axis=-1) / speed
             pred_dur = torch.round(duration.squeeze()).clamp(min=1)
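
Note (commentary, not part of the patch): the changed line divides the summed
sigmoid duration predictions by the speed factor before they are rounded into
per-phoneme frame counts, so speed > 1.0 shortens the synthesized audio and
speed < 1.0 lengthens it. A minimal standalone sketch of that effect, using
made-up duration logits rather than the model's real predictor output:

    import torch

    # Placeholder duration logits for 5 phonemes, shaped like the predictor
    # output (batch, phonemes, duration_bins); in the real model these come
    # from self.model.predictor.duration_proj(x).
    duration_logits = torch.randn(1, 5, 50)

    for speed in (0.8, 1.0, 1.25):
        # Same computation as the patched line: summed sigmoid durations,
        # divided by the speed factor, then rounded with a floor of 1 frame.
        duration = torch.sigmoid(duration_logits).sum(axis=-1) / speed
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
        # Higher speed -> fewer total frames -> faster speech.
        print(f"speed={speed}: total frames = {int(pred_dur.sum())}")

In the package itself the value would be passed as
long_inference_segment(..., speed=1.2); whether the public long_inference()
wrapper also forwards the new keyword is not covered by this hunk.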