Skip to content

Commit

Permalink
Add speed parameter for faster/slower speech
Browse files Browse the repository at this point in the history
  • Loading branch information
quernd committed Jan 23, 2024
1 parent 9279ef0 commit 454709b
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions src/styletts2/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,8 @@ def long_inference_segment(self,
t=0.7,
diffusion_steps=5,
embedding_scale=1,
-phonemize=True):
+phonemize=True,
+speed=1.0,):
"""
Performs inference for segment of longform text; see long_inference()
:param text: Input text
Expand All @@ -373,6 +374,7 @@ def long_inference_segment(self,
:param diffusion_steps: The more the steps, the more diverse the samples are, with the cost of speed.
:param embedding_scale: Higher scale means style is more conditional to the input text and hence more emotional.
:param phonemize: Phonemize text? If not, expects that text is already phonemized
+:param speed: Speech rate (higher = faster, default: 1.0)
:return: audio data as a Numpy array
"""
if phonemize:
Expand Down Expand Up @@ -423,7 +425,7 @@ def long_inference_segment(self,
x, _ = self.model.predictor.lstm(d)
duration = self.model.predictor.duration_proj(x)

-duration = torch.sigmoid(duration).sum(axis=-1)
+duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration.squeeze()).clamp(min=1)


Expand Down

0 comments on commit 454709b

Please sign in to comment.