Implemented the tuning parameters into the CLI #530

Open · wants to merge 3 commits into main
5 changes: 5 additions & 0 deletions README.md
@@ -87,6 +87,11 @@ This script allows you to speak a single phrase with one or more voices.
python tortoise/do_tts.py --text "I'm going to speak this" --voice random --preset fast
```

List available parameters:
```shell
python tortoise/do_tts.py --help
```

### read.py

This script provides tools for reading large amounts of text.
3 changes: 3 additions & 0 deletions tortoise/api.py
@@ -329,6 +329,9 @@ def tts_with_preset(self, text, preset='fast', **kwargs):
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
}
# Filter out 'None' parameters.
kwargs = {k: v for k, v in kwargs.items() if v is not None}

settings.update(presets[preset])
settings.update(kwargs) # allow overriding of preset settings with kwargs
return self.tts(text, **settings)
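
To see why the None-filter matters, here is a minimal, self-contained sketch (the `resolve_settings` helper is hypothetical; the only preset values used are the `standard`/`high_quality` ones visible in the hunk above): flags the caller never set arrive as `None` and fall back to the preset, while explicitly set flags win.

```python
# Hypothetical stand-in for the preset/kwargs merging done in tts_with_preset().
def resolve_settings(preset='standard', **kwargs):
    presets = {
        'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
        'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
    }
    settings = dict(presets[preset])
    # Filter out 'None' parameters, exactly as the change above does.
    overrides = {k: v for k, v in kwargs.items() if v is not None}
    settings.update(overrides)  # explicit kwargs override the preset
    return settings

# Only --diffusion_iterations was given on the CLI; the other flag stayed at its None default.
print(resolve_settings('standard', diffusion_iterations=120, num_autoregressive_samples=None))
# {'num_autoregressive_samples': 256, 'diffusion_iterations': 120}
```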
46 changes: 31 additions & 15 deletions tortoise/do_tts.py
@@ -9,19 +9,28 @@

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--text', type=str, help='Text to speak.', default="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.")
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
parser.add_argument('--use_deepspeed', type=str, help='Which voice preset to use.', default=False)
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice.', default=3)
parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None)
parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.'
'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)', default=.0)
parser.add_argument('--text', type=str, default="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.", help='Text to speak.')
parser.add_argument('--voice', type=str, default='random', help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) Use the & character to join two voices together. Use a comma to perform inference on multiple voices.')
parser.add_argument('--preset', type=str, default='fast', help='Which voice preset to use.')
parser.add_argument('--use_deepspeed', type=str, default=False, help='Whether to use DeepSpeed for inference.')
parser.add_argument('--output_path', type=str, default='results/', help='Where to store outputs.')
parser.add_argument('--model_dir', type=str, default=MODELS_DIR, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this should only be specified if you have custom checkpoints.')
parser.add_argument('--candidates', type=int, default=3, help='How many output candidates to produce per-voice.')
parser.add_argument('--seed', type=int, default=None, help='Random seed which can be used to reproduce results.')
parser.add_argument('--produce_debug_state', type=bool, default=True, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.')
parser.add_argument('--cvvp_amount', type=float, default=.0, help='How much the CVVP model should influence the output. Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)')
parser.add_argument('--models-dir', type=str, default=MODELS_DIR, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to ~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints.')
parser.add_argument('--num_autoregressive_samples', type=int, default=None, help='Number of samples taken from the autoregressive model, all of which are filtered using CLVP. As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great".')
parser.add_argument('--temperature', type=float, default=None, help='The softmax temperature of the autoregressive model.')
parser.add_argument('--length_penalty', type=float, default=None, help='A length penalty applied to the autoregressive decoder. Higher settings cause the model to produce more terse outputs.')
parser.add_argument('--repetition_penalty', type=float, default=None, help='A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence of long silences or "uhhhhhhs", etc.')
parser.add_argument('--top_p', type=float, default=None, help='P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs.')
parser.add_argument('--max_mel_tokens', type=int, default=None, help='Restricts the output length. 1 to 600. Each unit is 1/20 of a second.')
parser.add_argument('--diffusion_iterations', type=int, default=None, help='Number of diffusion steps to perform. More steps means the network has more chances to iteratively refine the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, however.')
parser.add_argument('--cond_free', type=bool, default=None, help='Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and dramatically improves realism.')
parser.add_argument('--cond_free_k', type=float, default=None, help='Knob that determines how to balance the conditioning-free signal with the conditioning-present signal. [0,inf]. As cond_free_k increases, the output becomes dominated by the conditioning-free signal. Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k')
parser.add_argument('--diffusion_temperature', type=float, default=None, help='Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 are the "mean" prediction of the diffusion network and will sound bland and smeared. ')

args = parser.parse_args()
os.makedirs(args.output_path, exist_ok=True)
tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
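
A short, self-contained illustration (not from the PR) of the None-default pattern used above: argparse leaves any flag the user did not pass at its default, so the tuning options reach `tts_with_preset()` as `None` and are filtered out there rather than clobbering the preset.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--preset', type=str, default='fast')
parser.add_argument('--temperature', type=float, default=None)
parser.add_argument('--diffusion_iterations', type=int, default=None)

args = parser.parse_args(['--diffusion_iterations', '120'])
print(args.temperature)           # None -> filtered out, the preset value is kept
print(args.diffusion_iterations)  # 120  -> overrides the preset
```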
@@ -34,8 +43,15 @@
voice_sel = [selected_voice]
voice_samples, conditioning_latents = load_voices(voice_sel)

gen, dbg_state = tts.tts_with_preset(args.text, k=args.candidates, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
preset=args.preset, use_deterministic_seed=args.seed, return_deterministic_state=True, cvvp_amount=args.cvvp_amount)
#,seed="1234567", diffusion_temperature="0.2", top_p="1"
gen, dbg_state = tts.tts_with_preset(
args.text, k=args.candidates, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
preset=args.preset, use_deterministic_seed=args.seed, return_deterministic_state=True, cvvp_amount=args.cvvp_amount,
num_autoregressive_samples=args.num_autoregressive_samples, temperature=args.temperature,
length_penalty=args.length_penalty, repetition_penalty=args.repetition_penalty,
top_p=args.top_p, max_mel_tokens=args.max_mel_tokens, diffusion_iterations=args.diffusion_iterations,
cond_free=args.cond_free, cond_free_k=args.cond_free_k, diffusion_temperature=args.diffusion_temperature
)
if isinstance(gen, list):
for j, g in enumerate(gen):
torchaudio.save(os.path.join(args.output_path, f'{selected_voice}_{k}_{j}.wav'), g.squeeze(0).cpu(), 24000)
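
One optional tightening, not part of this PR: since every tuning flag follows the same pattern, the call site could collect them by name and splat them into `tts_with_preset`, so a new flag added to the parser flows through without editing the call. A sketch, assuming the argument names from the diff (`tuning_kwargs` is a hypothetical helper):

```python
# Hypothetical helper, not in the PR: collect the tuning flags from the parsed
# argparse.Namespace so they can be passed as **kwargs to tts_with_preset().
TUNING_KEYS = (
    'num_autoregressive_samples', 'temperature', 'length_penalty',
    'repetition_penalty', 'top_p', 'max_mel_tokens', 'diffusion_iterations',
    'cond_free', 'cond_free_k', 'diffusion_temperature',
)

def tuning_kwargs(args):
    """Map each tuning flag to its parsed value; None values are dropped in tts_with_preset."""
    return {k: getattr(args, k) for k in TUNING_KEYS}

# Inside the voice loop above, the long explicit argument list could then become:
# gen, dbg_state = tts.tts_with_preset(
#     args.text, k=args.candidates, voice_samples=voice_samples,
#     conditioning_latents=conditioning_latents, preset=args.preset,
#     use_deterministic_seed=args.seed, return_deterministic_state=True,
#     cvvp_amount=args.cvvp_amount, **tuning_kwargs(args))
```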