diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py
index bcb2670a6..52b7e1817 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py
@@ -51,17 +51,7 @@ def __init__(self, hparams, create_waveform_fn, device):
         else:
             self.evaluators = {}

-        bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {})
-        if bulk_evaluators:
-            self.bulk_evaluators = {
-                key: evaluator_f()
-                for key, evaluator_f in bulk_evaluators.items()
-                if key in self.enabled_evaluators
-            }
-        else:
-            self.bulk_evaluators = {}
-
-        if not self.evaluators and not self.bulk_evaluators:
+        if not self.evaluators:
             logger.warn(
                 "No evaluators were defined - this run will produce samples only"
             )
@@ -98,9 +88,7 @@ def on_evaluate_start(self, stage, epoch):
         self.create_reports()
         self.modules.model.show_inference_progress = False
         self.item_ids = []
-        details_keys = list(self.evaluators.keys()) + list(
-            self.bulk_evaluators.keys()
-        )
+        details_keys = list(self.evaluators.keys())
         self.details = {evaluator_key: [] for evaluator_key in details_keys}
         self.sample_text = []
         self.sample_file_names = []
@@ -141,7 +129,6 @@ def on_evaluate_end(self):
         dataset : speechbrain.dataio.dataset.DynamicItemDataset
             a dataset
         """
-        self.evaluate_bulk()
         self.write_summary()
         logger.info("Evaluation done")

@@ -182,19 +169,6 @@ def get_report_columns(self, evaluator_key):
                 wavs_ref=bogus_wavs,
                 length_ref=bogus_length,
             )
-        else:
-            bogus_file_name = self.output_folder / "bogus.wav"
-            evaluator = self.bulk_evaluators[evaluator_key]
-            sb.dataio.dataio.write_audio(
-                str(bogus_file_name),
-                bogus_wavs[0].cpu(),
-                samplerate=self.hparams.model_sample_rate,
-            )
-            result = evaluator.evaluate_files(
-                file_names=[bogus_file_name],
-                text=["BOGUS"],
-                file_names_ref=[bogus_file_name],
-            )

         return ["uttid"] + list(result.details.keys())

@@ -228,19 +202,6 @@ def evaluate_batch(self, batch):
             self.write_result(evaluator_key, batch.uttid, details)
             self.details[evaluator_key].extend(details)

-    def evaluate_bulk(self):
-        """Runs all configured bulk evaluators, which evaluate a directory
-        of files - rather than one file at a time"""
-        for evaluator_key, evaluator in self.bulk_evaluators.items():
-            result = evaluator.evaluate_files(
-                file_names=self.sample_file_names,
-                text=self.sample_text,
-                file_names_ref=self.ref_file_names,
-            )
-            self.details[evaluator_key].append(result.details)
-            details = undo_batch(result.details)
-            self.write_result(evaluator_key, self.item_ids, details)
-
     def write_result(self, evaluator_key, uttid, details):
         """Outputs the result details to the report for the specified
         evaluator
diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml
index bdf6c0f75..dcdc6d920 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml
@@ -1,50 +1,56 @@
+# ############################################################################
+# Evaluation Hyperparameters
+# Common to old models, appended to main hyperparameters
+#
+# Authors: Artem Ploujnikov
+# ############################################################################
+
+eval_enabled: True
 eval_sample_rate: 16000
 eval_samples: null
 eval_interval: 1
 eval_asr_type: whisper
-eval_asr_source: !apply:speechbrain.utils.hparams.choice
-    value: !ref <eval_asr_type>
-    choices:
-        encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech
-        whisper:
openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null -utmos_batch_size: 8 -utmos_model_path: ./utmos -utmos_ckpt_name: epoch=3-step=7459.ckpt -utmos_ckpt_path: !ref / -utmos_use_python: True -utmos_script: predict.py - - -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: + utmos: !ref asr: !ref -bulk_evaluators: - utmos: !name:eval.UTMOSSpeechEvaluator - model_path: !ref - output_folder: !ref - ckpt_path: !ref - batch_size: !ref - script: !ref - use_python: !ref - tmp_folder: !ref - eval_summary: asr: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] utmos: descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + +eval_threshold: + dwer_max: 90.0 + +eval_threshold_set: + utmos: 0.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index cd4f338bc..d49afdf29 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -8,18 +8,23 @@ experiment_name: tokotron/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" + +# Model type +representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/dac +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +34,27 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +token_model_kwargs: + n_quantizers: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -61,7 +77,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -85,8 +101,8 @@ model_bitrate: 8kbps # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -94,24 +110,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref -# Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -# Token model (pretrained) -token_model: !new:Tokotron.DACFeatureExtractor - dac: !ref - n_quantizers: !ref # Dataloader options train_dataloader_opts: @@ -143,20 +148,13 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -165,6 +163,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -178,7 +177,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -198,15 +197,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + modules: model: !ref - dac: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -226,10 +233,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: 
!new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f8a0ee622..af723f6c9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -4,22 +4,24 @@ # ############################################################################ experiment_name: tokotron/discrete_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm - -output_folder: !ref results/tokotron/// +representation_mode: discrete +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/discrete- +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -36,36 +38,39 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref -token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice value: !ref choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -80,7 +85,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 @@ -91,7 +96,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -107,12 +112,6 @@ eos_mode: gate decoder_mode: autoregressive scale_factor: 4 -# Beam Search-specific parameters -min_decode_ratio: 1.0 
-max_decode_ratio: 10.0 -beam_size: 5 - - # Feature parameters sample_rate: 22050 model_sample_rate: 16000 @@ -122,8 +121,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -131,15 +130,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -159,16 +156,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref output_all_hiddens: True - - -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -181,58 +168,42 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: - SSL_layers: !ref - -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - + SSL_layers: !ref ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 +vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False audio_emb_lr: 0.00001 audio_emb_weight_decay: 0.001 text_num_tokens: 39 @@ -247,14 +218,9 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref - audio_num_tokens: !ref + audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref @@ -273,20 +239,25 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref 
representation_mode: discrete +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref -# define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -302,7 +273,7 @@ compute_cost: !new:Tokotron.TokotronLoss representation_mode: discrete -lr_annealing: !new:Tokotron.TargetedNoamScheduler +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler lr_initial: [!ref , !ref ] n_warmup_steps: !ref param_group: 0 @@ -314,10 +285,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index f5e82c309..1c54128b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -8,18 +8,21 @@ experiment_name: tokotron/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/encodec +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +32,23 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -53,6 +63,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 @@ -60,7 +71,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -80,8 +91,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -89,20 +100,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -133,20 +137,13 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -155,6 +152,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -168,7 +166,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -188,15 +186,24 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -216,10 +223,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 
lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..505460dfa --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 2048 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +flatten: false +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 103d584ed..0ff172529 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -8,18 +8,23 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" + +# Model type +representation_mode: discrete # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/st +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +34,24 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -53,6 +66,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index @@ -61,7 +75,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -81,8 +95,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -90,7 +104,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -98,14 +112,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref # Token model (pretrained) -speech_tokenizer: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: !ref - save_path: !ref - -token_model: !new:Tokotron.SpeechTokenizerFeatureExtractor - speech_tokenizer: !ref - codebooks: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -136,20 +142,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 
-enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -157,7 +155,8 @@ activation: !name:torch.nn.GELU audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False -audio_emb_pretrained: True +audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -166,12 +165,13 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: false bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -191,15 +191,19 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -219,10 +223,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..f0ab3d9c1 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -0,0 +1,258 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/sqcodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +transform_audio: !name:model.sq_codec.tokens_to_ternary + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 19683 +audio_emb_size: 36 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 4 +ternary_num_digits: 9 +ternary_num_positions: !ref * +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + audio_emb: !ref + out_proj: !ref + multihead_input: False + inference: !ref + +inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference + gate_offset: !ref + gate_threshold: !ref + tokens_per_step: !ref + bos_idx: !ref + audio_token_shift: 0 + max_steps: !ref + representation_mode: !ref + transform_audio: !name:model.sq_codec.tokens_to_ternary + feed_audio: !name:model.sq_codec.ternary_logits_to_tokens + +audio_emb: !new:torch.nn.Identity + +out_proj: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + seq_cost: !name:model.sq_codec.ternary_loss + multihead_output: False + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..d3bf9c770 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 4096 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py deleted file mode 120000 index 08621a288..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/preparation.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3dddf48dc..ec1845d36 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,17 +22,19 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding, clean_padding_ from speechbrain.utils.distributed import run_on_main -from preparation import add_prepared_features -from audio_tokens import ( + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from model.Tokotron import ( get_silence_token, use_silence_padding, feature_pad_to, -) -from Tokotron import RepresentationMode -from evaluate import TokotronEvaluator - + RepresentationMode, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 logger = logging.getLogger(__name__) @@ -59,6 +61,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -77,11 +82,13 @@ def compute_forward(self, batch, stage): """ batch = batch.to(self.device) tokens, tokens_length = batch.tokens - audio, audio_length = batch.audio_bos + features = self.prepare_features(batch) + audio, audio_length, _, _ = features emb = None if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} + audio = self.transform_audio(audio) predictions = self.modules.model( input_tokens=tokens, input_length=tokens_length, @@ -90,7 +97,69 @@ def compute_forward(self, batch, stage): emb=emb, ) - return predictions + return predictions, features + + def prepare_features(self, batch): + """Prepares features, depending on the configuration + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation + + Returns + ------- + audio_bos : torch.Tensor + Audio features, with BOS + audio_bos_length : torch.Tensor + Relative lengths of the audio features, with BOS + audio_tgt : torch.Tensor + Target audio features (for loss computation) + audio_tgt_length : torch.Tensor + Relative lengths of the target audio features + """ + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + if self.audio_token_offsets is not None: + audio_bos = torch.cat( + [ + audio_bos[:, : self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width :] + - self.audio_token_offsets, + ], + dim=1, + ) + clean_padding_(audio_bos, audio_bos_length) + audio_tgt = audio_tgt - self.audio_token_offsets + clean_padding_(audio_tgt, audio_tgt_length) + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim + ) + audio_bos = torch.concatenate([bos, audio], dim=1) + audio_bos_length = audio_length 
* audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length + + def get_token_offsets(self): + """Computes token offsets for tokenizers that require them""" + token_offsets = None + if self.hparams.audio_token_offsets: + token_offsets = ( + torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + ) + * self.hparams.audio_num_tokens + )[None, None, :] + return token_offsets @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -140,24 +209,27 @@ def compute_objectives(self, predictions, batch, stage): A one-element tensor used for backpropagating the gradient. """ batch = batch.to(self.device) - audio, audio_length = batch.audio_pad + predictions, features = predictions + _, _, audio_tgt, audio_tgt_length = features + + audio_tgt = self.transform_audio(audio_tgt) loss_details = self.hparams.compute_cost( predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, ) self.loss_metric.append( batch.uttid, predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, reduction="batch", ) - return loss_details.loss + return loss_details.loss.contiguous() def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -195,15 +267,23 @@ def on_stage_start(self, stage, epoch): self.use_spk_emb = getattr(self.hparams, "use_spk_emb", False) self.is_evaluating = False - if stage == sb.Stage.VALID: - if self.is_eval_epoch(epoch): + if self.hparams.eval_enabled: + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - else: - logger.info("No evaluation on epoch %d", epoch) - elif stage == sb.Stage.TEST: - self.evaluator.on_evaluate_start(stage, epoch) - self.is_evaluating = True + + self.audio_token_offsets = self.get_token_offsets() + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + self.transform_audio = getattr(self.hparams, "transform_audio", torch.nn.Identity()) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -225,6 +305,13 @@ def on_stage_end(self, stage, stage_loss, epoch): if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + # End evaluation and report stats + eval_summary_stats = {} + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary_stats = self.get_summary_stats() + stage_stats.update(eval_summary_stats) + # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -243,13 +330,62 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) - # Save the current checkpoint and delete previous checkpoints. + # Save the current checkpoint and delete previous checkpoints. 
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() + def get_summary_stats(self): + """Retrieves the stats that needs to be reported on every trial + in the train log, as indicated in eval_summary_log in eval.yaml + + Returns + ------- + eval_summary_stats : dict + A dict with stats""" + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + self._check_threshold(eval_summary_stats) + return eval_summary_stats + + def _check_threshold(self, eval_summary_stats): + """Checks threshold values for the defined stats and terminates + the trials if the parameters are not met. This is necessary because + some metrics produce bogus high values when the speech samples + do not contain any speech at all (e.g. UTMOS can be above 3 for + silence). + + Classic usage: dWER > 0.9 - treat the whole run as "garbage", set + UTMOS to 0 + + Arguments + --------- + eval_summary_stats : dict + Summary statistics + """ + for key, threshold_value in self.hparams.eval_threshold.items(): + key, threshold_type = key.split("_") + value = eval_summary_stats[key] + if threshold_type == "min": + meets = value >= threshold_value + elif threshold_type == "max": + meets = value <= threshold_value + else: + raise ValueError( + f"Invalid threshold definition: {key}, check eval_threshold" + ) + if not meets: + eval_summary_stats["broken"] = True + for key, value in self.hparams.eval_threshold_set.items(): + eval_summary_stats[key] = value def fit_batch(self, batch): """Fit one batch, override to do multiple updates. 
@@ -281,11 +417,7 @@ def fit_batch(self, batch): def init_optimizers(self): """Custom optimizer initialization """ - representation_mode = getattr( - self.hparams, "representation_mode", RepresentationMode.DISCRETE - ) - representation_mode = RepresentationMode(representation_mode) - if representation_mode == RepresentationMode.CONTINUOUS: + if self.representation_mode == RepresentationMode.CONTINUOUS: audio_emb_params = self.modules.model.decoder.audio_emb.parameters() audio_emb_params_set = set(audio_emb_params) model_params = [ @@ -323,7 +455,19 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - raise NotImplementedError() + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + with torch.no_grad(): + if self.audio_token_offsets is not None: + audio = clean_padding(audio + self.audio_token_offsets, length) + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -368,9 +512,7 @@ def dataio_prepare(hparams): the token used for silence """ - representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) + representation_mode = RepresentationMode(hparams["representation_mode"]) # Define datasets from json data manifest file # Define datasets sorted by ascending lengths for efficiency @@ -407,7 +549,7 @@ def audio_ref_pipeline(wav): Arguments --------- - wav : str + wav : strÆ’num_ The file path Returns @@ -421,50 +563,50 @@ def audio_ref_pipeline(wav): use_silence_padding = hparams.get("use_silence_padding", True) if representation_mode == RepresentationMode.DISCRETE: - layers_key = "token_model_layers" - model_key = "token_model" - audio_features = "audio_tokens" + model_key = "tokenizer" else: - layers_key = "ssl_model_layers" model_key = "ssl_model" - audio_features = "audio_ssl" - audio_tokens_per_step = ( - len(hparams[layers_key]) - if layers_key in hparams - else hparams["audio_tokens_per_step"] - ) - if use_silence_padding: - silence_token, silence_emb = get_silence_token( + audio_tokens_per_step = hparams["audio_tokens_per_step"] + if ( + use_silence_padding + and representation_mode == RepresentationMode.DISCRETE + ): + silence_token = get_silence_token( hparams[model_key], - extract_emb=representation_mode == RepresentationMode.CONTINUOUS, - model_kwargs=hparams.get("token_model_kwargs"), + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ) ) + if silence_token.dim() == 2: + silence_token = silence_token.squeeze(-1) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) * hparams["eos_index"] ) - silence_token = silence_token.cpu() - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) + silence_padding = silence_token.cpu() + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) - if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = 
audio_bos_prefix.unsqueeze(-1).repeat( - 1, 1, hparams["audio_dim"] - ) - @sb.utils.data_pipeline.takes(audio_features) + tokens_loader = hparams.get("tokens_loader") + if "speech_model_layers" in hparams: + tokens_loader_kwargs = { + "num_codebooks": get_selected_layer_indexes(hparams) + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + + @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") - def audio_pipeline(audio): - audio = torch.from_numpy(audio) + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -480,21 +622,20 @@ def audio_pipeline(audio): ] init_sequence_encoder(hparams) - use_spk_emb = hparams.get("use_spk_emb", False) - prepared_features = [audio_features] output_keys = [ "uttid", "tokens", - "audio_pad", - "audio_bos", "label_norm_eval", ] - if use_spk_emb: - prepared_features.append("spk_emb") - output_keys.append("spk_emb") + if representation_mode == RepresentationMode.DISCRETE: + output_keys += [ + "audio_pad", + "audio_bos", + ] + else: + output_keys.append("sig") eval_output_keys = [*output_keys, "sig"] - for dataset in data_info: if dataset == "train": dataset_output_keys = output_keys @@ -508,13 +649,6 @@ def audio_pipeline(audio): output_keys=dataset_output_keys, ) - add_prepared_features( - dataset=dynamic_dataset, - save_path=Path(hparams["prepare_save_folder"]) / "features", - id_key="uttid", - features=prepared_features, - ) - datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -539,51 +673,16 @@ def audio_pipeline(audio): raise NotImplementedError( "sorting must be random, ascending or descending" ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) - datasets["sample"] = select_sample(hparams, datasets) return datasets, silence_padding -def select_sample(hparams, datasets): - """Selects a sample of files for sample generation, freezing the sample if - requested to persist across multiple experiments - - Arguments - --------- - hparams : dict - experiment hyperparameters - datasets : dict - a dictionary of datasets - - Returns - ------- - dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset - the sample dataset - """ - sample_path = hparams.get("sample_path") - dataset = None - if sample_path is not None: - sample_path = Path(sample_path) - if sample_path.exists(): - with open(sample_path, "r") as sample_file: - data_ids = [line.strip() for line in sample_file] - dataset = FilteredSortedDynamicItemDataset( - datasets["valid"], data_ids - ) - - if dataset is None: - dataset = ( - datasets["valid"] - .batch_shuffle(1) - .filtered_sorted(select_n=hparams["num_audio_samples"]) - ) - if sample_path is not None: - with open(sample_path, "w") as sample_file: - for data_id in dataset.data_ids: - print(data_id, file=sample_file) - return dataset - - def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -611,6 +710,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") 
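+    # Map each selected layer to its index in the available-layer list, e.g.
+    # speech_model_layers [3, 12] with available layers [1, 3, 7, 12, 18, 23] -> [1, 3].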
+ if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. characters or phonemes) listed one per line @@ -625,7 +740,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] @@ -667,17 +785,23 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, @@ -699,7 +823,7 @@ def apply_overfit_test(hparams, dataset): ) -def run_experiment(brain_cls): +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -712,6 +836,8 @@ def run_experiment(brain_cls): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file @@ -736,40 +862,23 @@ def run_experiment(brain_cls): from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. 
- representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) - audio_features = ( - "audio_tokens" - if representation_mode == RepresentationMode.DISCRETE - else "audio_ssl" - ) - extract_features = [audio_features] - if hparams.get("use_spk_emb", False): - extract_features.append("spk_emb") - if not hparams["skip_prep"]: - with hparams["freezer"]: - run_on_main( - prepare_ljspeech, - kwargs={ - "data_folder": hparams["data_folder"], - "save_folder": hparams["prepare_save_folder"], - "splits": hparams["splits"], - "split_ratio": hparams["split_ratio"], - "seed": hparams["seed"], - "extract_features": extract_features, - "extract_features_opts": hparams["extract_features_opts"], - "extract_phonemes": hparams["input"] == "phonemes", - "model_name": "tokotron", - "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], - "frozen_split_path": hparams.get("frozen_split_path"), - "device": run_opts.get("device", "cpu"), - }, - ) + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) # We can now directly create the datasets for training, valid, and test datasets, silence_padding = dataio_prepare(hparams) @@ -779,39 +888,65 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, run_opts=run_opts, checkpointer=hparams["checkpointer"], ) - tts_brain.sample_data = datasets["sample"] # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be # stopped at any point, and will be resumed on next call. 
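+    # Below, dataloader options are built for train/valid/test; for discrete
+    # representations the collate function pads the audio keys with the silence
+    # token, so padded positions correspond to silence rather than arbitrary codes.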
+ + dataloader_opts = [ + hparams[f"{key}_dataloader_opts"] for key in ["train", "valid", "test"] + ] + representation_mode = RepresentationMode(hparams["representation_mode"]) + if representation_mode == RepresentationMode.DISCRETE: + dataloader_opts = [ + use_silence_padding(opts, silence_padding, audio_keys) + for opts in dataloader_opts + ] + ( + train_dataloader_opts, + valid_dataloader_opts, + test_dataloader_opts, + ) = dataloader_opts + tts_brain.fit( tts_brain.hparams.epoch_counter, datasets["train"], datasets["valid"], - train_loader_kwargs=use_silence_padding( - hparams["train_dataloader_opts"], silence_padding, audio_keys - ), - valid_loader_kwargs=use_silence_padding( - hparams["valid_dataloader_opts"], silence_padding, audio_keys - ), + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], - min_key="loss", - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + eval_kwargs = {} + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + if test_key: + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) + # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index f3495eaca..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py deleted file mode 100644 index d0bc9f4f7..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 
-"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.dac.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.dac.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index f9fc764cd..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.audio_num_tokens - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - 
run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py deleted file mode 100644 index 2168f970d..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index bc51db78c..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py new file mode 100644 index 000000000..6c2dd1c8d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -0,0 +1,359 @@ +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The 
device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + self.attention = [] + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + 
length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : 
the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. + + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..08587ce23 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -0,0 +1,42 @@ +eval_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_asr_type: whisper +eval_asr_source: openai/whisper-small +evaluations: utmos,asr +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +evaluators: + utmos: !ref + asr: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..8dc6209f7 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,241 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speechtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + 
+input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 +flatten: false + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..abe15d14f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,269 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +output_folder: !ref results/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + 
shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1000 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 +flatten: false + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref / + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..cae286efd --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,240 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save 
+train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 
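+# EnCodec (24 kHz) uses 1024-entry codebooks; at the 6 kbps bandwidth configured
+# below it emits 8 codebooks per frame, hence audio_tokens_per_step: 8.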
+vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..5aae5e0db --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,243 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/espnet-encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +freeze_lm_head: True + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..edae05d51 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,237 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
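+# Note on the AR/NAR curriculum (implemented in apply_curriculum in train.py,
+# part of this patch): number_of_epochs_ar and number_of_epochs_nar below select
+# which head is trained. Purely as an illustration, number_of_epochs_ar: 10 with
+# number_of_epochs_nar: 40 would train only the autoregressive head for epochs
+# 1-10, only the non-autoregressive head for epochs 11-50, and both afterwards;
+# with both left at null, as here, both heads are trained from the first epoch.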
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 +text_num_tokens: 39 +phn_num_tokens: 
52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false +bandwidth: 6 + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..3560eaf69 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,233 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speechtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
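+# Note: with use_token_offsets: True, train.py shifts every codebook into its own
+# id range before building the prompt, i.e. token t of (0-based) codebook k maps
+# to audio_token_shift + k * vocab_size + t. Illustratively, with text input
+# (shift 43) and vocab_size 1024, token 100 of codebook 3 becomes
+# 43 + 3 * 1024 + 100 = 3215.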
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + 
choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..bb3f07562 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,282 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/sqcodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
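+# Note: the prompt pipelines in train.py assemble each training sequence roughly as
+#   prefix = BOS + text/phoneme tokens + EOT
+#            (+ a random speaker prompt and an EOP marker when multispeaker_pretrain
+#             is enabled, as in this config)
+#   prompt = prefix + BOS + audio tokens shifted by audio_token_shift + EOS
+# and the training loss is only computed on positions past the prefix.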
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null +multispeaker_pretrain: True + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: ["", "", "", ""] + False: ["", "", ""] + +special_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: 5 + False: 4 +spk_prompt_length: 150 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer 
+d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 512 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.5 +vocab_size: 19683 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !ref * 2 + +audio_token_shift: 19683 + +audio_tokens_per_step: 4 +flatten: true +ternary_num_digits: 10 +pred_mode: ternary +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + emb: !ref + lm_head: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + d_hidden: !ref + num_positions: !ref * + tokens: null + +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + num_positions: !ref + tokens: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..03d7e433b --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,241 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref 
results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +freeze_lm_head: False + +####################### Model parameters 
########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 4096 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: False +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py new file mode 120000 index 000000000..2f703273c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py @@ -0,0 +1 @@ +../../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py new file mode 100644 index 000000000..9d56ef5bf --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -0,0 +1,1066 @@ +#!/usr/bin/env/python3 +"""Recipe for training VALL-E + +Based on ESPNET VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import ( + clean_padding, + length_to_mask, + write_audio, +) +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 + +logger = logging.getLogger(__name__) + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + else: + nar_track = None + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. 
+ """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + batch_size, prompt_max_len, _ = prompt.shape + batch_idx = torch.arange(batch_size, device=prompt.device) + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( + ids=batch.uttid, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, + mask=mask, + reduction="batch", + ) + + loss = torch.mean(torch.stack(loss_components)) + return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.compute_loss_stats, batch_eval=True, + ) + self.apply_curriculum() + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + self.transform_audio = getattr(self.hparams, "transform_audio", None) + + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. 
Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, wav=wav, length=audio_length, stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + else: + eval_summary_stats = {} + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. 
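+            # With the defaults in the accompanying hparams (ckpt_key: dwer,
+            # ckpt_key_kind: min, ckpt_keep: 2) the call below expands to
+            # save_and_keep_only(..., num_to_keep=2, min_keys=["dwer"]), so only
+            # the two checkpoints with the lowest dWER are kept.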
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] + if result[0] + else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + offsets = self.offsets + if self.hparams.flip_layers: + offsets = offsets.flip(2) + audio = (audio - self.hparams.audio_token_shift - offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample.detach().cpu(), self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phonemes"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline_spk(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + spk_prompt = torch.randint( + 0, + hparams["vocab_size"], + (hparams["spk_prompt_length"], num_tracks) + ) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), 
num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=num_codebooks + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [sig_pipeline, text_pipeline, tokens_pipeline] + if hparams.get("multispeaker_pretrain"): + dynamic_items.append(prompt_pipeline_spk) + else: + dynamic_items.append(prompt_pipeline) + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length", + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
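+    # "length" here is the total prompt length in tokens (text prefix, audio and
+    # markers) as yielded by the prompt pipelines above, so length-based sorting
+    # groups utterances with similar overall sequence lengths together.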
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) + return encoder + + +def get_selected_layer_indexes(available_layers, selected_layers): + """Finds the layers of selected layers + + Arguments + --------- + available_layers : list + The available layers + selected_layers : list + The selected layers + + Returns + ------- + layer_idx : list + The layer indexes + """ + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, dataset_valid, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
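+    # For reference, a typical launch of this recipe looks roughly like the
+    # following (the paths and run name are illustrative placeholders only):
+    #   python train.py hparams/train_mimi.yaml --data_folder=/path/to/LJSpeech \
+    #       --cached_data_folder=/path/to/cache --tokens_folder=/path/to/tokens \
+    #       --run_name=baseline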
+    tts_brain.fit(
+        tts_brain.hparams.epoch_counter,
+        datasets["train"],
+        datasets["valid"],
+        train_loader_kwargs=hparams["train_dataloader_opts"],
+        valid_loader_kwargs=hparams["valid_dataloader_opts"],
+    )
+
+    # Load best checkpoint for evaluation
+    if hparams["testing"]:
+        test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json"
+        if test_summary_file.exists():
+            logging.info("Test run already completed: %s", test_summary_file)
+        else:
+            test_key_kind = hparams["test_key_kind"]
+            test_key = hparams["test_key"]
+            eval_kwargs = {
+                f"{test_key_kind}_key": test_key
+            }
+            tts_brain.evaluate(
+                test_set=datasets["test"],
+                test_loader_kwargs=hparams["test_dataloader_opts"],
+                **eval_kwargs
+            )
diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py
new file mode 100644
index 000000000..556d8a9d0
--- /dev/null
+++ b/benchmarks/DASB/LJSpeech/extraction/extract.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Recipe for extracting discrete audio tokens from LJSpeech.
+
+Authors
+ * Jarod Duret 2024
+"""
+
+import os
+import sys
+import logging
+import pathlib as pl
+import speechbrain as sb
+from speechbrain.dataio.dataset import DynamicItemDataset
+from speechbrain.utils.distributed import run_on_main
+from hyperpyyaml import load_hyperpyyaml
+
+base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+sys.path.append(base_dir)
+
+
+logger = logging.getLogger(__name__)
+
+
+if __name__ == "__main__":
+    # CLI:
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    # Dataset prep (parsing LJSpeech)
+    from ljspeech_prepare import prepare_ljspeech  # noqa
+
+    # multi-gpu (ddp) save data preparation
+    run_on_main(
+        prepare_ljspeech,
+        kwargs={
+            "data_folder": hparams["data_folder"],
+            "save_folder": hparams["output_folder"],
+            "splits": hparams["splits"],
+            "split_ratio": hparams["split_ratio"],
+            "seed": hparams["seed"],
+            "frozen_split_path": hparams.get("frozen_split_path"),
+            "device": run_opts.get("device", "cpu"),
+        },
+    )
+
+    tokens_extractor = hparams["tokens_extractor"]
+    data_folder = hparams["data_folder"]
+    datasets = []
+    for split in ["train", "valid", "test"]:
+        json_path = hparams[f"{split}_json"]
+        name = pl.Path(json_path).stem
+        dataset = sb.dataio.dataset.DynamicItemDataset.from_json(
+            json_path=json_path, replacements={"data_root": data_folder},
+        )
+        datasets.append(dataset)
+
+    # Merge the splits so tokens are extracted for the whole corpus at once
+    merged_data = {
+        key: value
+        for dataset in datasets
+        for key, value in dataset.data.items()
+    }
+    merged_dataset = DynamicItemDataset(merged_data)
+
+    save_folder = pl.Path(hparams["save_folder"])
+    logger.info("Extracting dataset tokens ...")
+    tokens_extractor.extract_tokens(
+        merged_dataset,
+        hparams["num_codebooks"],
+        (save_folder / "ljspeech").as_posix(),
+    )
+
+    if hparams["save_embedding"]:
+        save_folder = pl.Path(hparams["save_folder"])
+        logger.info("Saving embeddings ...")
+        tokens_extractor.save_pretrained_embeddings(
+            (save_folder / "embeddings").as_posix(),
+            vocab_size=hparams["vocab_size"],
+            num_codebooks=hparams["num_codebooks"],
+        )
diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml
new file mode 100644
index
000000000..b90054db6 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..d50cb85ef --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | 
LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..869d1c503 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # 
Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..c03ffa936 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..c534bef0f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/mimi +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options 
+dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..d036e05a3 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,53 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..28c7c9be9 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,56 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: 
ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..a23c29e59 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavtokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py new file mode 120000 index 000000000..2de5a21a8 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py @@ -0,0 +1 @@ +../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index e88b92eb6..416c63010 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -13,13 +13,11 @@ import json import random import logging -from types import SimpleNamespace import torch import torchaudio import numpy as np import tgt import re -import speechbrain as sb from tqdm import tqdm from pathlib import Path from speechbrain.utils.data_utils import download_file @@ -27,10 +25,6 @@ from speechbrain.inference.text import GraphemeToPhoneme from unidecode import unidecode from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations -from speechbrain.dataio.batch import PaddedData -from speechbrain.dataio.dataset import DynamicItemDataset -from preparation import FeatureExtractor -from torchaudio.functional import resample logger = logging.getLogger(__name__) @@ 
-59,8 +53,6 @@ def prepare_ljspeech( pitch_max_f0=400, skip_prep=False, use_custom_cleaner=False, - extract_features=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", skip_ignore_folders=False, @@ -179,7 +171,7 @@ def prepare_ljspeech( os.makedirs(duration_folder) # extract pitch for both Fastspeech2 and FastSpeech2WithAligner models - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: pitch_folder = os.path.join(data_folder, "pitch") if not os.path.exists(pitch_folder): os.makedirs(pitch_folder) @@ -200,22 +192,11 @@ def prepare_ljspeech( data_folder, splits, split_ratio, frozen_split_path ) - extract_features_context = None - extract_features_folder = None - if extract_features: - extract_features_context = get_context( - extract_features=extract_features, - extract_features_opts=extract_features_opts or {}, - device=device, - ) - extract_features_folder = Path(save_folder) / "features" - if "train" in splits: prepare_json( model_name, data_split["train"], save_json_train, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -226,10 +207,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -239,7 +216,6 @@ def prepare_ljspeech( model_name, data_split["valid"], save_json_valid, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -250,10 +226,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -263,7 +235,6 @@ def prepare_ljspeech( model_name, data_split["test"], save_json_test, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -274,10 +245,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -421,7 +388,6 @@ def prepare_json( model_name, seg_lst, json_file, - data_folder, wavs_folder, csv_reader, phoneme_alignments_folder, @@ -432,10 +398,6 @@ def prepare_json( pitch_min_f0, pitch_max_f0, use_custom_cleaner=False, - extract_features=None, - extract_features_context=None, - extract_features_folder=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", device="cpu", @@ -471,14 +433,8 @@ def prepare_json( Max f0 for pitch computation use_custom_cleaner : bool If True, uses custom cleaner defined for this recipe - extract_features : list, optional - If specified, feature extraction will be performed - extract_features_context : types.SimpleNamespace, optional - Context for feature extraction (pretrained models, etc) - extract_features_folder : path-like, optional - The folder where extracted features will be saved - extract_features_opts : dict, optional - Options for feature extraction + extract_phonemes : bool + Whether to extract phonemes g2p_src : str The name of the HuggingFace Hub to use for the Grapheme-to-Phoneme model or the path to it @@ -495,12 +451,12 @@ def prepare_json( extract_phonemes = True if extract_phonemes: logger.info( - "Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while." + "Computing phonemes for LJSpeech labels using SpeechBrain f This may take a while." 
) g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} ) - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: logger.info( "Computing pitch as required for FastSpeech2. This may take a while." ) @@ -649,19 +605,6 @@ def prepare_json( # Updates data for the utterance json_dict[id].update({"phonemes": phonemes}) - # Feature Extraction - if extract_features: - extract_features_folder.mkdir(exist_ok=True) - prepare_features( - data=json_dict, - data_folder=data_folder, - save_path=extract_features_folder, - features=extract_features, - context=extract_features_context, - options=extract_features_opts, - device=device, - ) - # Writing the dictionary to the json file with open(json_file, mode="w") as json_f: json.dump(json_dict, json_f, indent=2) @@ -838,146 +781,3 @@ def custom_clean(text, model_name): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text - - -INLINE_FEATURES = ["audio_ssl_len"] - - -def prepare_features( - data, data_folder, save_path, features, context, options=None, device="cpu" -): - """Performs feature extraction - - Arguments - --------- - data: dict - a preprocessed dataset - data_folder : str - the data folder - save_folder : str - the folder where features will be saved - context : dict - context data - features: list - the list of feature extractions to be performed - """ - dataset = DynamicItemDataset(data) - feature_extractor = FeatureExtractor( - save_path=save_path, - src_keys=["sig"], - id_key="uttid", - dataloader_opts=options.get("dataloader_opts", {}), - device=device, - ) - token_model_kwargs = options.get("token_model_kwargs", {}) - ssl_layers = options.get("ssl_model_layers") or options.get( - "token_model_layers" - ) - - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - """Load the audio signal. 
""" - wav = wav.replace("{data_root}", data_folder) - sig = sb.dataio.dataio.read_audio(wav) - - yield sig - - dataset.add_dynamic_item(audio_pipeline) - - @sb.utils.data_pipeline.takes("sig") - @sb.utils.data_pipeline.provides("sig_resampled") - def resample_pipeline(sig): - sig_data = resample( - waveform=sig.data, - orig_freq=options["sample_rate"], - new_freq=options["model_sample_rate"], - ) - return PaddedData(sig_data, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_tokens", "audio_emb") - def token_pipeline(sig): - with torch.no_grad(): - result = context.token_model( - sig.data, sig.lengths, **token_model_kwargs - ) - # TODO: Clean this up - if torch.is_tensor(result): - tokens = result - # Note: Dummy embedding - meaning embeddings are not available - emb = torch.zeros((len(sig.data), 1, 1), device=sig.data.device) - else: - tokens, emb = result[:2] - tokens = tokens.int() - if tokens.dim() < 3: - tokens = tokens.unsqueeze(-1) - yield PaddedData(tokens, sig.lengths) - yield PaddedData(emb, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("spk_emb") - def spk_emb_pipeline(sig): - mel_spec = context.spk_emb_model.mel_spectogram(audio=sig.data) - return context.spk_emb_model.encode_mel_spectrogram_batch( - mel_spec, sig.lengths - ) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_ssl", "audio_ssl_len") - def ssl_pipeline(sig): - ssl_raw = context.ssl_model(sig.data, sig.lengths) - ssl = ssl_raw[ssl_layers].permute(1, 2, 0, 3) - yield PaddedData(ssl, sig.lengths) - yield (sig.lengths * ssl.size(1)).tolist() - - dynamic_items = [ - resample_pipeline, - token_pipeline, - ssl_pipeline, - spk_emb_pipeline, - ] - for dynamic_item in dynamic_items: - feature_extractor.add_dynamic_item(dynamic_item) - - feature_keys = [key for key in features if key not in INLINE_FEATURES] - inline_keys = [key for key in features if key in INLINE_FEATURES] - feature_extractor.set_output_features(feature_keys, inline_keys=inline_keys) - feature_extractor.extract(dataset, data) - - -def get_context(extract_features, extract_features_opts, device): - """ - Gets the context (pretrained models, etc) for feature extraction - - Arguments - --------- - extract_features : list - A list of features to extract - Available features: - audio_tokens - raw tokens - audio_emb - embeddings from the model - extract_features_opts : dict - Options for feature extraction - device : str|torch.Device - The device on which extraction will be run - - Returns - -------- - context: SimpleNamespace - The context object - """ - context = {} - if ( - any(key in extract_features for key in ["audio_tokens", "audio_emb"]) - and "token_model" in extract_features_opts - ): - context["token_model"] = extract_features_opts["token_model"].to(device) - if "audio_ssl" in extract_features: - context["ssl_model"] = extract_features_opts["ssl_model"].to(device) - if "spk_emb" in extract_features: - context["spk_emb_model"] = extract_features_opts["spk_emb_model"]( - run_opts={"device": device} - ) - - return SimpleNamespace(**context) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml index f9720b170..7871d6212 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -47,7 +47,7 @@ save_embedding: False tokenizer: 
!new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index 976614a3d..9a8b754eb 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -47,7 +47,7 @@ vocab_size: 4096 # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py similarity index 100% rename from benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py rename to benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py new file mode 120000 index 000000000..4b3f08ebb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py @@ -0,0 +1 @@ +../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py new file mode 120000 index 000000000..d65702b6c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py @@ -0,0 +1 @@ +../../../utils/data.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py new file mode 120000 index 000000000..0ca6d4644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py @@ -0,0 +1 @@ +../../../utils/eval.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py new file mode 100644 index 000000000..aa7ee2c4b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -0,0 +1,494 @@ +"""Evaluates a checkpoint using an MOS estimation tool + +Authors +* Artem Ploujnikov 2024 +""" + +# TODO: There are too many evaluation scripts. 
Refactor to extract common
+# features
+
+import speechbrain as sb
+import json
+import logging
+import csv
+import torch
+import torchaudio
+import string
+import re
+from pathlib import Path
+from types import SimpleNamespace
+from torch.nn import ModuleDict
+from data import undo_batch
+from eval import vocoder_to_device
+from torch.utils.flop_counter import FlopCounterMode
+from contextlib import nullcontext
+
+
+logger = logging.getLogger(__name__)
+
+
+class TokotronEvaluator:
+    """An evaluator class for the TTS model
+
+    Arguments
+    ---------
+    hparams: dict
+        hyperparameters (as a dictionary)
+    create_waveform_fn : callable
+        the function used to turn predicted audio tokens into a waveform
+    device : str | torch.device
+        the device
+    """
+
+    def __init__(self, hparams, create_waveform_fn, device):
+        self.hparams = SimpleNamespace(**hparams)
+        self.create_waveform_fn = create_waveform_fn
+        self.device = device
+        modules = self.hparams.modules
+        self.modules = ModuleDict(modules).to(self.device)
+        self.spk_emb_model = self.hparams.spk_emb_model(
+            run_opts={"device": device}
+        )
+        self.modules.model.vocoder = None
+        self.enabled_evaluators = set(self.hparams.evaluations.split(","))
+        evaluators = hparams.get("evaluators", {})
+        if evaluators:
+            # Instantiate only the evaluators enabled via the `evaluations` hparam
+            self.evaluators = {
+                key: evaluator_f(run_opts={"device": device})
+                for key, evaluator_f in evaluators.items()
+                if key in self.enabled_evaluators
+            }
+        else:
+            self.evaluators = {}
+
+        if not self.evaluators:
+            logger.warning(
+                "No evaluators were defined - this run will produce samples only"
+            )
+
+        self.attention = []
+
+    def on_evaluate_start(self, stage, epoch):
+        """Invoked when evaluation starts
+
+        Arguments
+        ---------
+        stage : sb.Stage
+            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+        epoch : int
+            The currently-starting epoch. This is passed
+            `None` during the test stage.
+        """
+        self.stage = stage
+        self.epoch = epoch
+        self.output_folder = self.get_output_folder(stage, epoch)
+        self.samples_folder = self.output_folder / "samples"
+        self.samples_folder.mkdir(parents=True, exist_ok=True)
+        logger.info(
+            "Starting evaluation, results will be saved in %s",
+            self.output_folder,
+        )
+        self.create_reports()
+        self.modules.model.show_inference_progress = False
+        self.item_ids = []
+        details_keys = list(self.evaluators.keys())
+        self.details = {evaluator_key: [] for evaluator_key in details_keys}
+        self.sample_text = []
+        self.sample_file_names = []
+        self.ref_file_names = []
+        if hasattr(self.modules, "vocoder"):
+            vocoder_to_device(self.modules.vocoder, self.device)
+
+    def get_output_folder(self, stage, epoch):
+        """Computes the output folder of evaluation results
+        for the specified stage and epoch.
+
+        If the folder does not exist, it will be created.
+
+        Arguments
+        ---------
+        stage : sb.Stage
+            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+        epoch : int
+            The currently-starting epoch. This is passed
+            `None` during the test stage.
+
+        Returns
+        -------
+        output_folder : pathlib.Path
+            the folder where evaluation results for this stage/epoch are saved
+        """
+        output_folder = (
+            Path(self.hparams.output_folder) / "eval" / stage.name.lower()
+        )
+        if epoch is not None:
+            output_folder = output_folder / str(epoch)
+        output_folder.mkdir(parents=True, exist_ok=True)
+        return output_folder
+
+    def on_evaluate_end(self):
+        """Invoked when evaluation ends - writes the summary report
+ """ + self.write_summary() + logger.info("Evaluation done") + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + if self.hparams.eval_perf: + self.perf_file = open(self.output_folder / "perf.csv", "w") + self.perf_writer = csv.DictWriter( + self.perf_file, + [ + "uttid", + "infer_flops", + "steps", + "infer_flops_per_step", + "vocoder_flops", + "total_flops", + "total_flops_per_step", + ], + ) + self.perf_writer.writeheader() + + def infer(self, tokens, tokens_length, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + infer_out = self.modules.model.infer( + input_tokens=tokens, input_length=tokens_length, emb=emb + ) + if self.hparams.eval_perf: + steps = (infer_out.length * infer_out.audio.size(1)).sum().item() + total_flops = flop_counter.get_total_flops() + stats = { + "infer_flops": total_flops, + "steps": steps, + "infer_flops_per_step": total_flops / steps, + } + return infer_out, stats + + def vocoder(self, infer_out, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + wav = self.create_waveform_fn( + infer_out.audio, length=infer_out.length, emb=emb + ) + if wav.dim() > 2: + wav = wav.squeeze(1) + + if self.hparams.eval_perf: + flops = flop_counter.get_total_flops() + stats = {"vocoder_flops": flops} + return wav, stats + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + if evaluator_key in self.evaluators: + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def evaluate_batch(self, batch): + """Runs evaluation on a single batch of speech + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch to be evaluated""" + with torch.no_grad(): + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = 
self.device + audio_resampled = torchaudio.functional.resample( + batch.sig.data, + self.hparams.sample_rate, + self.hparams.model_sample_rate, + ) + mel_spec = self.spk_emb_model.mel_spectogram(audio=audio_resampled) + spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, batch.sig.lengths + ).squeeze(1) + infer_out, perf_stats = self.infer( + tokens=tokens, tokens_length=tokens_length, emb={"spk": spk_emb} + ) + wav, vocoder_stats = self.vocoder(infer_out, spk_emb) + perf_stats.update(vocoder_stats) + length = infer_out.length + if wav.dim() > 2: + wav = wav.squeeze(1) + + self.save_samples(batch, wav, infer_out.length) + self.item_ids.extend(batch.uttid) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=batch.label_norm_eval, + wavs_ref=batch.sig.data, + length_ref=batch.sig.lengths, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, batch.uttid, details) + self.details[evaluator_key].extend(details) + + if self.hparams.eval_perf: + perf_stats.update(vocoder_stats) + perf_stats["total_flops"] = ( + perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + ) + perf_stats["total_flops_per_step"] = ( + perf_stats["total_flops"] / perf_stats["steps"] + ) + self.write_perf_stats(batch.uttid, perf_stats) + + def write_result(self, evaluator_key, uttid, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + batch : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(uttid, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def save_samples(self, batch, wav, length): + """Saves the samples generated by the TTS system + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch being evaluated + wav : torch.Tensor + the waveform + length: torch.Tensor + relative lengths + """ + wav_length_abs = (length * wav.size(1)).int() + for item_id, infer_wav, wav_length in zip( + batch.uttid, wav, wav_length_abs + ): + file_name = str(self.samples_folder / f"{item_id}_pred.wav") + infer_wav_cut = infer_wav[: wav_length.item()].cpu() + sb.dataio.dataio.write_audio( + file_name, + infer_wav_cut, + samplerate=self.hparams.model_sample_rate, + ) + self.sample_file_names.append(file_name) + + def write_summary(self): + """Outputs summarized statistics""" + summary = self.compute_summary() + file_name = self.output_folder / "summary.json" + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def write_perf_stats(self, uttid, details): + self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) + self.perf_file.flush() + + def compute_summary(self): + """Computes the summarized statistics""" + return { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings 
to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +@sb.utils.data_pipeline.takes("label_norm") +@sb.utils.data_pipeline.provides("label_norm_eval") +def label_norm_pipeline(label): + """Normalizes labels for ASR comparison, converting to uppercase and removing + punctuation + + Arguments + --------- + label : str + The unnormalized label + + Returns + ------- + result : str + The normalized label + """ + label = label.upper() + label = RE_PUNCTUATION.sub("", label) + return label + + +@sb.utils.data_pipeline.takes("wav") +@sb.utils.data_pipeline.provides("sig") +def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : str + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml new file mode 100644 index 000000000..b39b11009 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml new file mode 100644 index 000000000..01c818370 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -0,0 +1,292 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
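+# NOTE: as in the LJSpeech recipe in this patch, the training script is expected
+# to run the test evaluation when `testing` is True and to skip it when a previous
+# run has already written eval/test/summary.json under the output folder.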
+ +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
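+# NOTE: the gate_loss_* values above parameterize the gate loss ramp; they are
+# consumed by `gate_offset` (distance_diff_loss_ramp) and by the TokotronLoss
+# `compute_cost` defined further down in this file.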
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + 
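+# NOTE: for the supported DAC model_type / model_bitrate / codebook combinations,
+# see the comment table in the extraction config added in this patch
+# (LJSpeech/extraction/hparams/dac.yaml).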
+modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..4efa9f75c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -0,0 +1,341 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
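+# NOTE: `tokens_folder` should point at the output of the matching token
+# extraction recipe (see the extraction hparams configs added in this patch);
+# the TokensLoader below reads the pre-extracted tokens from this path, so
+# extraction must be run before training.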
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +select_layers: null +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: 
!ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml new file mode 100644 index 000000000..e45794171 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -0,0 +1,294 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
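As in the discrete SSL file above, `tokens_folder` points at codec tokens extracted in a separate pass, and the `utils.tokens.TokensLoader` declared just below reads them one utterance at a time; the `audio_pipeline` in `train.py` later in this patch calls it as sketched here. The utterance id and the (time x codebooks) layout are illustrative assumptions:

    from utils.tokens import TokensLoader  # recipe-local helper from this benchmark

    tokens_loader = TokensLoader(data_path=hparams["tokens_folder"])  # hparams as loaded above
    # Token matrix for one utterance, truncated to the first two EnCodec codebooks.
    audio = tokens_loader.tokens_by_uttid("some_utterance_id", num_codebooks=2)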
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref 
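`input_num_tokens` above resolves from the `input` key: 39 character symbols (`char_en.txt`) or 52 ARPABET phonemes (`arpabet.txt`), which the `TextEncoder` declared as `label_encoder` maps to integer ids. A small sketch of the encoding step, assuming the token list file has already been loaded into the encoder (presumably what `init_sequence_encoder` in `train.py` takes care of):

    label_encoder = hparams["label_encoder"]  # speechbrain.dataio.encoder.TextEncoder
    # After the symbols from token_list_file have been added to the encoder:
    token_ids = label_encoder.encode_sequence_torch(list("PRINTING"))  # 1-D tensor of ids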
+audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..e45794171 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -0,0 +1,294 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
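For the two EnCodec-based recipes (the file above and the ESPnet-EnCodec variant that starts here), `bandwidth: 1.5` and `audio_tokens_per_step: 2` are mutually consistent: the 24 kHz EnCodec model produces about 75 frames per second and each 1024-entry codebook costs 10 bits per frame, figures that come from the EnCodec release rather than from this patch. A quick consistency check:

    frame_rate_hz = 75       # facebook/encodec_24khz frame rate
    bits_per_codebook = 10   # log2(1024) entries per codebook
    bandwidth_bps = 1.5e3
    codebooks = bandwidth_bps / (frame_rate_hz * bits_per_codebook)
    assert round(codebooks) == 2   # matches audio_tokens_per_step: 2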
+ +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
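`guided_attention_weight: 50.0` and `guided_attention_sigma: 0.5` above configure the Tacotron-style guided-attention regularizer that nudges the text-to-audio alignment towards the diagonal. The exact form lives inside `TokotronLoss`, so the classic formulation below is only a reference sketch:

    import torch

    def guided_attention_mask(text_len, audio_len, sigma=0.5):
        # Penalty grows as attention strays from the diagonal n/N ~ t/T.
        n = torch.arange(text_len).float() / text_len
        t = torch.arange(audio_len).float() / audio_len
        return 1.0 - torch.exp(-((n[None, :] - t[:, None]) ** 2) / (2 * sigma ** 2))

    # loss_term ~ guided_attention_weight * (attention_weights * mask).mean()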
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + 
guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..156e05b02 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,281 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
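This new `train_mimi.yaml` keeps the same template; further down it points `model_hub` at `kyutai/mimi`, uses 8 codebooks with a 2048-entry vocabulary, and decodes through the benchmark's `utils.tokenizer_interface.MimiTokenizer` wrapper. Constructing that wrapper from these hyperparameters might look like the sketch below; the constructor argument names mirror the YAML, while the decode call is the `tokens_to_sig` path that `create_waveform` in `train.py` uses, and everything else is illustrative:

    from utils.tokenizer_interface import MimiTokenizer  # recipe-local wrapper

    tokenizer = MimiTokenizer(
        source="kyutai/mimi",
        save_path=hparams["pretrained_model_save_folder"],
        num_codebooks=8,                       # audio_tokens_per_step
    )
    wav = tokenizer.tokens_to_sig(tokens)      # decode path used by create_waveform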
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: 
"pretrained" + dim: 192 + vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml similarity index 68% rename from benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml rename to benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ac80bdac0..ffb68f2a5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -2,61 +2,52 @@ # Model: Tokenized TTS (WhisperSpeech-inspired) # Authors: Artem Ploujnikov # ############################################################################ - -experiment_name: tokotron/continuous_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made + seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/// +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - -# Model type -ssl_model_type: wavlm -representation_mode: continuous +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/continuous- +prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--continuous +representation_mode: discrete +vocoder_model_name: vocos vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 -freeze_ssl_model: True -ssl_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - -g2p_src: speechbrain/soundchoice-g2p -ssl_model_layers: [1, 3, 7, 12, 18, 23] +freeze_token_model: True +token_model_src: "fnlp/SpeechTokenizer" +g2p_src: flexthink/soundchoice-g2p token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_takes_spk_emb: False spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -66,8 +57,11 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 50 +number_of_epochs: 1000 +reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -87,7 +81,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -113,8 +107,8 @@ beam_size: 5 # Feature parameters sample_rate: 22050 model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref +max_audio_length: 5000 +infer_max_audio_length: 1000 debug_infer_max_audio_length: 10 # Label encoder @@ -128,7 +122,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -138,29 +132,11 @@ use_silence_padding: True # Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref 
- freeze: !ref - output_all_hiddens: True - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa + # Dataloader options train_dataloader_opts: batch_size: !ref @@ -171,7 +147,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -193,9 +169,9 @@ sample_dataloader_opts: extract_features_opts: dataloader_opts: - batch_size: !ref - ssl_model: !ref - ssl_model_layers: !ref + batch_size: !ref + num_workers: !ref + token_model: !ref sample_rate: !ref model_sample_rate: !ref spk_emb_model: !ref @@ -205,19 +181,20 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 +audio_num_tokens: 1024 audio_emb_size: 128 -audio_emb_freeze: False +audio_emb_freeze: True audio_emb_pretrained: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -229,17 +206,22 @@ audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref -vocoder: !apply:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref @@ -247,6 +229,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- target_dropout: !ref activation: !ref attention_type: !ref + vocoder: !ref gate_threshold: !ref gate_offset: !ref audio_emb_size: !ref @@ -255,21 +238,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref - representation_mode: continuous + emb: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -282,13 +267,11 @@ compute_cost: !new:Tokotron.TokotronLoss eos_width: !ref audio_tokens_per_step: !ref audio_token_shift: !ref - representation_mode: continuous -lr_annealing: !new:Tokotron.TargetedNoamScheduler - lr_initial: [!ref , !ref ] +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref n_warmup_steps: !ref - param_group: 0 checkpointer: 
!new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref @@ -297,10 +280,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..f4f745716 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,281 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
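As in the other recipes, decoding stops via a learned gate rather than a dedicated EOS token (`eos_mode: gate` just below, with `gate_threshold: 0.5` above). Conceptually, the decoder emits a stop logit per frame and inference halts once its probability crosses the threshold; the real criterion is implemented inside `TokotronTransformerModel`, so the following is only a schematic:

    import torch

    def should_stop(gate_logits, gate_threshold=0.5):
        # gate_logits: (batch, time) raw stop scores for the frames decoded so far
        gate_prob = torch.sigmoid(gate_logits[:, -1])
        return gate_prob > gate_threshold   # per-item stop decision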
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + 
audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py new file mode 100644 index 000000000..7d99c5c7d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -0,0 +1,1031 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import math +import torch +import sys +from functools import partial +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding_ +from speechbrain.utils.distributed import run_on_main +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from model.Tokotron import ( + RepresentationMode, + get_silence_repr, + get_silence_token, + use_silence_padding, + feature_pad_to, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class TokotronBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluator = TokotronEvaluator( + hparams=hparams, + create_waveform_fn=self.create_waveform, + device=self.device, + ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + with torch.no_grad(): + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + clean_padding_(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + features = self.prepare_features(batch) + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb, + ) = features + + predictions = self.modules.model( + input_tokens=tokens, + input_length=tokens_length, + audio=audio_bos, + audio_length=audio_bos_length, + emb={"spk": spk_emb}, + ) + + return predictions, features + + def prepare_features(self, batch): + if self.hparams.spk_emb_shuffle: + wav, wav_length = batch.spk_emb_random_match + else: + wav, wav_length = batch.sig + spk_emb = self._compute_spk(wav, wav_length).squeeze(1) + + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim + ) + audio_bos = torch.concatenate([bos, audio], dim=1) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb + + def _compute_spk(self, wav, wav_length): + mel_spec = self.spk_emb_model.mel_spectogram(wav.squeeze(1)) + spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, wav_length + ) + return spk_emb_pred + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. 
+ batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + predictions, features = predictions + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb, + ) = features + + loss_details = self.hparams.compute_cost( + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + ) + self.loss_metric.append( + batch.uttid, + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + reduction="batch", + ) + return loss_details.loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + if hasattr(self.modules, "vocoder") and hasattr( + self.modules.vocoder, "model" + ): + self.modules.vocoder.model.device = self.device + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + if ( + self.hparams.audio_emb_pretrained + and epoch == 1 + and stage == sb.Stage.TRAIN + ): + # TODO: Clean this up + if hasattr(self.hparams.token_model, "vocabulary"): + vocabulary = self.hparams.token_model.vocabulary + elif hasattr(self.hparams.token_model, "vocabularies"): + vocabulary = torch.stack( + [ + torch.from_numpy(voc) + for voc in self.hparams.token_model.vocabularies + ] + ) + self.modules.model.init_audio_emb(vocabulary) + # Load the compression model only if compression is enables + pretrained_run_opts = {"device": self.device} + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts=pretrained_run_opts + ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) + # If speaker embedding shuffling is enabled, re-initialize them for the + # epoch + if self.hparams.spk_emb_shuffle: + stage_key = stage.name.lower() + self.resample_fn[stage_key](epoch=epoch) + + # Reset the learning rate - if supported. This is useful when fine-tuning + # a model pre-trained on another dataset + if ( + stage == sb.Stage.TRAIN + and self.hparams.reset_annealing_epoch is not None + and epoch is not None + and epoch == self.hparams.reset_annealing_epoch + ): + self.hparams.lr_annealing.n_steps = 0 + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. 
If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + checkpoint = self.checkpointer.recover_if_possible() + if not checkpoint: + self.check_init() + self._ckpt_recovered = True + + def check_init(self): + init_from = getattr(self.hparams, "init_from", None) + if init_from is not None: + logger.info("Initializing with pre-trained weights from %s", init_from) + init_from_path = Path(init_from) + model_path = init_from_path / "model.ckpt" + with open(model_path, "rb") as model_file: + model_state_dict = torch.load(model_file, map_location=self.device) + tgt_state_dict = self.modules.model.state_dict() + ignore_keys = [] + for k, v in model_state_dict.items(): + if k in tgt_state_dict and tgt_state_dict[k].shape != v.shape: + logger.warning("Ignoring shape mismatch for %s", k) + ignore_keys.append(k) + for k in ignore_keys: + del model_state_dict[k] + self.modules.model.load_state_dict(model_state_dict, strict=False) + logger.info("Successfully initialized with pre-trained weights from %s", init_from) + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + self.evaluator.evaluate_batch(batch) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. 
+ loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + representation_mode = RepresentationMode( + hparams.get("representation_mode", RepresentationMode.DISCRETE) + ) + + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : strÆ’num_ + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label.upper() + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + use_silence_padding = hparams.get("use_silence_padding", True) + if "token_model_layers" in hparams: + audio_tokens_per_step = len(hparams["token_model_layers"]) + else: + audio_tokens_per_step = hparams["audio_tokens_per_step"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) + if use_silence_padding: + if representation_mode == RepresentationMode.DISCRETE: + silence_padding = get_silence_token( + hparams["tokenizer"], + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ) + ) + else: + silence_padding = get_silence_repr(hparams["ssl_model"],) + else: + silence_padding = ( + torch.ones(audio_tokens_per_step, dtype=torch.int64) + * hparams["eos_index"] + ) + + silence_padding = silence_padding.cpu() + if layer_idx: + silence_padding = silence_padding[layer_idx] + else: + silence_padding = silence_padding[:audio_tokens_per_step] + silence_padding_len = int(math.ceil(hparams["silence_padding"])) + bos_width = hparams.get("bos_width", 1) + audio_bos_prefix = ( + torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] + ) + if representation_mode == RepresentationMode.CONTINUOUS: + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( + 1, 1, hparams["audio_dim"] + ) + + tokens_loader = hparams.get("tokens_loader") + if layer_idx is not None: + tokens_loader_kwargs = { + "num_codebooks": layer_idx + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + + @sb.utils.data_pipeline.takes("uttid") + @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) + audio_pad = feature_pad_to( + audio, len(audio) + silence_padding_len, silence_padding + ) + yield audio_pad + audio_bos = torch.cat([audio_bos_prefix, audio_pad], dim=0) + yield audio_bos + + def spk_emb_random_match(uttid, dataset, spk_sample): + # Sample a speaker-matched embedding + selected_idx = spk_sample[uttid] + + # Retrieve the 
embedding value from the dataset + with dataset.output_keys_as(["sig"]): + spk_emb = dataset[selected_idx]["sig"] + return spk_emb + + dynamic_items = [ + text_pipeline, + tokens_pipeline, + audio_ref_pipeline, + audio_pipeline, + ] + output_keys = [ + "uttid", + "tokens", + "audio_pad", + "audio_bos", + "sig", + "spk_emb_random_match", + ] + + init_sequence_encoder(hparams) + + resample_fn = {} + for dataset in data_info: + dataset_output_keys = ( + output_keys + if dataset == "train" + else output_keys + ["label_norm_eval"] + ) + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + if hparams["spk_emb_shuffle"]: + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_emb_random_match_pipeline = partial( + spk_emb_random_match, + spk_sample=spk_sample, + dataset=dynamic_dataset.filtered_sorted(), + ) + dynamic_dataset.add_dynamic_item( + func=spk_emb_random_match_pipeline, + takes=["uttid"], + provides=["spk_emb_random_match"], + ) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={"phn": lambda value: value} + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def select_sample(hparams, datasets): + """Selects a sample of files for sample generation, freezing the sample if + requested to persist across multiple experiments + + Arguments + --------- + hparams : dict + experiment hyperparameters + datasets : dict + a dictionary of datasets + + Returns + ------- + dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset + the sample dataset + """ + sample_path = hparams.get("sample_path") + dataset = None + if sample_path is not None: + sample_path = Path(sample_path) + if sample_path.exists(): + with open(sample_path, "r") as sample_file: + data_ids = [line.strip() for line in sample_file] + dataset = FilteredSortedDynamicItemDataset( + datasets["valid"], data_ids + ) + + if dataset is None: + dataset = ( + datasets["valid"] + .batch_shuffle(1) + .filtered_sorted(select_n=hparams["num_audio_samples"]) + ) + if sample_path is not None: + with open(sample_path, "w") as sample_file: + for data_id in dataset.data_ids: + print(data_id, file=sample_file) + return dataset + + +def group_by_speaker(dataset, hparams): + """Groups 
utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> int dictionary with a list of utterance indexes + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_idx = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_idx: + spk_idx[spk_id] = [] + spk_idx[spk_id].append(idx) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_idx[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_idx, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def get_selected_layer_indexes(hparams): + """Finds the indexes of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + 
hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + (datasets, silence_padding, resample_fn) = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_pad", "audio_bos"] + + # Trainer initialization + tts_brain = TokotronBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + tts_brain.sample_data = datasets["sample"] + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=use_silence_padding( + hparams["train_dataloader_opts"], silence_padding, audio_keys + ), + valid_loader_kwargs=use_silence_padding( + hparams["valid_dataloader_opts"], silence_padding, audio_keys + ), + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py new file mode 100644 index 000000000..9fd6da808 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -0,0 +1,357 @@ +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + 
+        self.enabled_evaluators = set(self.hparams.evaluations.split(","))
+        evaluators = hparams.evaluators
+        if evaluators:
+            self.evaluators = {
+                key: evaluator_f(run_opts={"device": device})
+                for key, evaluator_f in evaluators.items()
+                if key in self.enabled_evaluators
+            }
+        else:
+            self.evaluators = {}
+
+        if not self.evaluators:
+            logger.warning(
+                "No evaluators were defined - this run will produce samples only"
+            )
+
+    def on_evaluation_start(self, output_folder="eval"):
+        """Invoked at the beginning of the evaluation cycle.
+
+        Arguments
+        ---------
+        output_folder : str | path-like
+            The folder to which results will be output
+
+        """
+        logger.info("Starting evaluation")
+        output_folder = Path(output_folder)
+        self.output_folder = (
+            output_folder
+            if output_folder.is_absolute()
+            else self.hparams.output_folder / output_folder
+        )
+        self.output_folder.mkdir(parents=True, exist_ok=True)
+
+        self.files = []
+        details_keys = list(self.evaluators.keys())
+        self.details = {evaluator_key: [] for evaluator_key in details_keys}
+        self.read_reports()
+        self.create_reports()
+        self.item_ids = []
+
+    def on_evaluation_end(self):
+        """Invoked at the end of the evaluation cycle. Writes the summary
+        for the completed run.
+        """
+        logger.info("Ending evaluation")
+        self.write_summary()
+
+    def create_reports(self):
+        """Creates report files and report writers"""
+        self.report_files = {}
+        self.report_writers = {}
+        for evaluator_key in self.enabled_evaluators:
+            columns = self.get_report_columns(evaluator_key)
+            file_name = self.output_folder / f"{evaluator_key}.csv"
+            self.files.append(file_name)
+            resume = file_name.exists() and file_name.stat().st_size > 0
+            report_file = open(file_name, "a+")
+            self.report_files[evaluator_key] = report_file
+            writer = csv.DictWriter(report_file, columns)
+            if not resume:
+                writer.writeheader()
+            self.report_writers[evaluator_key] = writer
+
+    def read_reports(self):
+        """Reads the reports from a previous run, if any, so that evaluation
+        can be resumed"""
+        for evaluator_key in self.enabled_evaluators:
+            file_name = self.output_folder / f"{evaluator_key}.csv"
+            if file_name.exists():
+                logger.info("%s exists, reading", file_name)
+                with open(file_name) as report_file:
+                    reader = csv.DictReader(report_file)
+                    for row in reader:
+                        del row["uttid"]
+                        row = {
+                            key: handle_number(value)
+                            for key, value in row.items()
+                        }
+                        self.details[evaluator_key].append(row)
+
+    def get_tracker_file_name(self):
+        """Determines the file name of the tracker file"""
+        suffix = (
+            f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else ""
+        )
+        file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt"
+        return self.output_folder / file_name
+
+    def get_report_columns(self, evaluator_key):
+        """Returns the columns for the specified evaluator
+
+        Arguments
+        ---------
+        evaluator_key : str
+            the identifier of the evaluator
+
+        Returns
+        -------
+        columns : list[str]
+            a list of column headers
+        """
+        bogus_wavs = torch.randn(2, 10000, device=self.device)
+        bogus_length = torch.tensor([1.0, 1.0], device=self.device)
+        evaluator = self.evaluators[evaluator_key]
+        result = evaluator.evaluate(
+            wavs=bogus_wavs,
+            length=bogus_length,
+            text=["BOGUS"] * len(bogus_wavs),
+            wavs_ref=bogus_wavs,
+            length_ref=bogus_length,
+        )
+
+        return ["uttid"] + list(result.details.keys())
+
+    def append(self, ids, wav, length, text, wav_ref, length_ref):
+        """Appends evaluation results for a batch of items
+
+        Arguments
+        ---------
+        ids : list
+            Utterance IDs
+        wav : torch.Tensor
+            Synthesized waveforms
+        length : torch.Tensor
+            Relative lengths of the synthesized
waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + 
stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. + + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..129cf9337 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:utils.eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..b7579f092 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,256 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/dac + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
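+# The folder above holds audio tokens pre-extracted with the DAC codec; the
+# TokensLoader below reads them back per utterance at training time instead of
+# re-encoding the waveforms.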
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 +flatten: false + +# Model Settings +model_type: 24khz +model_bitrate: 8kbps + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + 
qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..e7e4657aa --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,297 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +representation_mode: discrete +output_folder: !ref results/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
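A note for readers new to these recipes: every `!PLACEHOLDER` entry in this file (`run_name`, `data_folder`, `cached_data_folder`, `tokens_folder`) must be overridden when the YAML is resolved, which the train script does through `load_hyperpyyaml`. Below is a minimal sketch of that call; the paths and run name are hypothetical, loading eagerly instantiates the `!new:` entries (including the SSL model), and the real train script additionally appends `eval.yaml` to the stream before loading.

```python
# Minimal sketch, assuming the recipe's local packages (utils, model) are importable.
from hyperpyyaml import load_hyperpyyaml

overrides = {
    "run_name": "discrete_ssl_baseline",      # hypothetical run name
    "data_folder": "/corpora/LibriTTS",       # hypothetical paths
    "cached_data_folder": "/scratch/dasb_cache",
    "tokens_folder": "/scratch/libritts_tokens",
}
with open("hparams/train_discrete_ssl.yaml") as fin:
    hparams = load_hyperpyyaml(fin.read(), overrides, overrides_must_match=True)

# After loading, the resolved hyperparameters are plain Python objects.
print(hparams["output_folder"])
```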
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +progress_folder: !ref /progress +progress_current: !ref /current +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True + +# Model Settings +ssl_model_type: wavlm +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True + +# Speaker Embeddings +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True 
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1000 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 +flatten: false + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref / + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + + +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler + lr_initial: [!ref , !ref ] + n_warmup_steps: !ref + param_group: 0 + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..c35aaa4f9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,256 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
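+# EnCodec at 24 kHz with a 6 kbps bandwidth (see "bandwidth" under Model
+# Settings below) produces 8 codebooks per frame, which is why this file sets
+# audio_tokens_per_step to 8.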
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + +# Model Settings +model_hub: facebook/encodec_24khz +bandwidth: 6 + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref 
+ qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: !ref + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..efd408469 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,260 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
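+# The Model Settings below pin a specific ESPnet commit together with the
+# checkpoint and config paths inside the espnet/libritts_encodec_24k model hub
+# entry, so the codec used for token extraction stays reproducible.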
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + +# Model Settings +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +freeze_lm_head: True + +############################## models ################################ + +model: 
!new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_fairseq_hubert.yaml new file mode 100644 index 000000000..d51b43ea3 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_fairseq_hubert.yaml @@ -0,0 +1,268 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/fairseq_hubert +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +representation_mode: discrete +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
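One detail that is easy to miss is how `test_key` / `test_key_kind` (defined below) are consumed: the kind simply decides whether the key is passed as a `min_key` or `max_key` when the best checkpoint is selected for the final test run, mirroring the end of the train script above; presumably `ckpt_key` / `ckpt_key_kind` follow the same pattern when checkpoints are saved. A tiny illustration, with the brain and dataset objects assumed:

```python
# Sketch of the pattern at the end of the train script; values are this file's defaults.
test_key = "dwer"       # lower dWER is better...
test_key_kind = "min"   # ...so it is passed as a min_key
eval_kwargs = {f"{test_key_kind}_key": test_key}  # -> {"min_key": "dwer"}
# tts_brain.evaluate(test_set=datasets["test"], **eval_kwargs)
```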
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +progress_folder: !ref /progress +progress_current: !ref /current +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True + +# Model Settings +model_path: !ref /fairseq-hubert +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +speech_model_layer: 11 +vocoder_dense_model_name: "mhubert-base-25hz" +vocoder_quantizer_model_name: "kmeans" +vocoder_vocab_size: 500 +flip_layers: False +use_token_offsets: True + +# Speaker Embeddings +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + 
batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 500 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: false + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name + dense_model_name: !ref + quantizer_model_name: !ref + vocab_size: !ref + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + + +lr_annealing: !new:model.Tokotron.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..7b61a18a7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,252 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
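Note on the vocabulary layout: with use_token_offsets enabled, the recipe appears to pack the language-model vocabulary as [special tokens][text or phoneme tokens][one shifted range per codec track], which is what the model_vocab_size and audio_token_shift choices above compute and what get_offsets() in train.py implements. A small sketch of that arithmetic, using the Mimi values from the file below (vocab_size 2048, 8 codebooks); the exact layout is inferred from the code, so treat it as an assumption:

import torch

special_num_tokens = 5      # pad, bos, eos, eot, eop (indices 0..4 above)
text_num_tokens = 39        # character inputs
vocab_size = 2048           # per-codebook codec vocabulary (Mimi)
audio_tokens_per_step = 8   # codebooks per frame

# Assumed layout: specials first, then text tokens, then per-track audio ranges
audio_token_shift = special_num_tokens + text_num_tokens
model_vocab_size = audio_token_shift + vocab_size * audio_tokens_per_step

# Track k occupies [shift + k * vocab_size, shift + (k + 1) * vocab_size)
offsets = torch.arange(audio_tokens_per_step) * vocab_size

codec = torch.randint(0, vocab_size, (2, 10, audio_tokens_per_step))
lm_ids = codec + audio_token_shift + offsets       # codec ids -> LM vocabulary
restored = lm_ids - audio_token_shift - offsets    # LM vocabulary -> codec ids

assert int(lm_ids.max()) < model_vocab_size
assert torch.equal(restored, codec)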
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: 
--num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: kyutai/mimi + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..b6f699cf9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,252 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
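Note on the two-stage curriculum: number_of_epochs_ar and number_of_epochs_nar (null above, i.e. joint training throughout) let the recipe first train only the autoregressive head and then only the non-autoregressive head; with a single or flattened track the model is purely autoregressive. A standalone sketch of that schedule, mirroring apply_curriculum() further down in train.py (the helper name and sample values are illustrative):

def curriculum(epoch, epochs_ar, epochs_nar, tracks, flatten=False):
    """Return (train_ar, train_nar) flags for a given epoch."""
    train_ar, train_nar = True, True
    if tracks == 1 or flatten:
        train_nar = False                       # single track: AR only
    elif epochs_ar is not None and epoch <= epochs_ar:
        train_nar = False                       # phase 1: AR head only
    elif epochs_nar is not None and epoch <= (epochs_ar or 0) + epochs_nar:
        train_ar = False                        # phase 2: NAR head only
    return train_ar, train_nar

# Hypothetical schedule: 10 AR-only epochs, then 20 NAR-only epochs
assert curriculum(5, 10, 20, tracks=8) == (True, False)
assert curriculum(15, 10, 20, tracks=8) == (False, True)
assert curriculum(40, 10, 20, tracks=8) == (True, True)      # joint afterwards
assert curriculum(3, None, None, tracks=1) == (True, False)  # flattened/AR-only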
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 
9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + +# Model Settings +model_hub: fnlp/SpeechTokenizer + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: !ref # Only the 24kHz version supports mono audio + save_path: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..b123ca67b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,290 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 1 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # 
@orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + + phonemes: !ref + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 4 +flatten: true +ternary_num_digits: 14 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !new:model.custom_model.MultitrackPredictionHead + d_model: !ref + num_tracks: !ref + vocab_size: !ref + +logits_to_probs: !new:torch.nn.Identity + +ternary_emb_shift: 4 +ternary_emb_hybrid: false + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + shift: !ref + shift_cutoff: !ref + hybrid: !ref + hybrid_cutoff: !ref + hybrid_size: !ref * + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + true: !ref * * 2 + false: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec_ternary.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec_ternary.yaml new file mode 100644 index 000000000..3a22065d7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec_ternary.yaml @@ -0,0 +1,292 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before 
objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 1 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + 
padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !ref + + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 4 +flatten: true +ternary_num_digits: 10 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + d_hidden: !ref + num_positions: !ref * + +logits_to_probs: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + +ternary_emb_shift: 4 +ternary_emb_hybrid: false + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + shift: !ref + shift_cutoff: !ref + hybrid: !ref + hybrid_cutoff: !ref + hybrid_size: !ref * + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + true: !ref * * 2 + false: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.sq_codec.ternary_loss + targets_type: tokens + num_positions: !ref + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..b63fe0d24 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,256 @@ +# 
############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: 
!name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 4096 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: false + +# Model Settings +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py new file mode 100644 index 000000000..f37ae5b40 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -0,0 +1,1353 @@ +#!/usr/bin/env/python3 +"""Recipe for training VALL-E + +Based on 
ESPNET VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import ( + clean_padding, + length_to_mask, + write_audio, +) +from speechbrain.dataio.dataloader import LoopedLoader +from speechbrain.utils.data_utils import pad_right_to +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from functools import partial +from torch.utils.data import DataLoader +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 + +logger = logging.getLogger(__name__) + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = None + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. 
+ stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + batch_size, prompt_max_len, _ = prompt.shape + batch_idx = torch.arange(batch_size, device=prompt.device) + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] + + loss_ar = self.hparams.compute_cost( + logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( + ids=batch.uttid, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, + mask=mask, + reduction="batch", + ) + + loss = torch.mean(torch.stack(loss_components)) + return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + + if hasattr(hparams, "speech_model_layers"): + self.layer_idx = get_selected_layer_indexes( + hparams.available_speech_model_layers, + hparams.speech_model_layers + ) + else: + self.layer_idx = None + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.compute_loss_stats, batch_eval=True, + ) + self.apply_curriculum() + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + dataset = stage.name.lower() + self.resample_fn[dataset](epoch=epoch or 0) + + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. 
+ self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, wav=wav, length=audio_length, stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + eval_summary_stats = {} + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr, **eval_summary_stats}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } + # Save the current checkpoint and delete previous checkpoints. 
+ self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + self._pad_inferred_sample(result) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to( + sample, + (min_length, tracks), + )[0] + return sample + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample.cpu(), self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + def fit( + self, + epoch_counter, + train_set, + valid_set=None, + progressbar=None, + train_loader_kwargs={}, + valid_loader_kwargs={}, + ): + """Iterate epochs and datasets to improve objective. + + Relies on the existence of multiple functions that can (or should) be + overridden. The following methods are used and expected to have a + certain behavior: + + * ``fit_batch()`` + * ``evaluate_batch()`` + * ``update_average()`` + + If the initialization was done with distributed_count > 0 and the + distributed_backend is ddp, this will generally handle multiprocess + logic, like splitting the training data into subsets for each device and + only saving a checkpoint on the main process. + + Arguments + --------- + epoch_counter : iterable + Each call should return an integer indicating the epoch count. + train_set : Dataset, DataLoader + A set of data to use for training. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + valid_set : Dataset, DataLoader + A set of data to use for validation. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + progressbar : bool + Whether to display the progress of each epoch in a progressbar. + train_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the train_loader + (if train_set is a Dataset, not DataLoader). + E.G. batch_size, num_workers. + DataLoader kwargs are all valid. + valid_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the valid_loader + (if valid_set is a Dataset, not DataLoader). + E.g., batch_size, num_workers. + DataLoader kwargs are all valid. + + Returns + ------- + None + """ + if self.test_only: + logger.info( + "Test only mode, skipping training and validation stages." 
+ ) + return + if not ( + isinstance(train_set, DataLoader) + or isinstance(train_set, LoopedLoader) + ): + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) + self.on_fit_start() + epoch = self.hparams.epoch_counter.current + if epoch < self.hparams.number_of_epochs: + valid_set = sample_dataset( + dataset=valid_set, + count=self.hparams.valid_inter_data_count, + seed=self.hparams.seed + ) + + valid_set = self.make_dataloader( + valid_set, + stage=sb.Stage.VALID, + ckpt_prefix=None, + **valid_loader_kwargs, + ) + + if progressbar is None: + progressbar = not self.noprogressbar + + # Only show progressbar if requested and main_process + enable = progressbar and sb.utils.distributed.if_main_process() + + # Iterate epochs + for epoch in epoch_counter: + self._fit_train(train_set=train_set, epoch=epoch, enable=enable) + self._fit_valid(valid_set=valid_set, epoch=epoch, enable=enable) + + # Debug mode only runs a few epochs + if ( + self.debug + and epoch == self.debug_epochs + or self._optimizer_step_limit_exceeded + ): + break + + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + spk_prompt_length = hparams["spk_prompt_length"] + + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + def spk_prompt(uttid, spk_sample): + # Sample a speaker-matched embedding + selected_uttid = spk_sample[uttid] + audio = tokens_loader.tokens_by_uttid( + selected_uttid, num_codebooks=num_codebooks + ) + if audio.size(0) 
> spk_prompt_length: + offset = torch.randint(0, audio.size(0), (1,)).item() + else: + offset = 0 + # Retrieve the embedding value from the dataset + audio_spk_prompt, _ = pad_right_to( + audio[offset : offset + spk_prompt_length], + (spk_prompt_length, audio.size(1)), + ) + return audio_spk_prompt + + @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline(id, tokens, spk_prompt): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=num_codebooks + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, sig_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length", + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + resample_fn = {} + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_prompt_pipeline = partial(spk_prompt, spk_sample=spk_sample,) + dynamic_dataset.add_dynamic_item( + func=spk_prompt_pipeline, takes=["uttid"], provides=["spk_prompt"], + ) + dynamic_dataset.add_dynamic_item(prompt_pipeline) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + if hparams["input"] == "phonemes": + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_test={"has_alignments": lambda value: value} + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
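+    # Illustrative note (added for clarity, not part of the original recipe):
+    # with a toy split whose "length" values are [3, 1, 2],
+    # filtered_sorted(sort_key="length") below reorders the items so their
+    # lengths become [1, 2, 3], letting each batch group similarly sized
+    # utterances and keeping zero-padding small.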
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + return datasets, resample_fn + + +def sample_dataset(dataset, count, seed): + """Selects a sample of the specified dataset in a + stable manner, returning the same sample on each call + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + count : int + The number of items to select + seed : int + The seed to be used + """ + if len(dataset) < count: + return dataset + generator = torch.Generator() + generator.manual_seed(seed) + indexes = torch.randperm(len(dataset)).tolist()[:count] + data_ids = [ + dataset.data_ids[idx] + for idx in indexes + ] + return FilteredSortedDynamicItemDataset( + dataset, + data_ids, + ) + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> str with a list of utterance IDs + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_uttid = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id", "uttid"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_uttid: + spk_uttid[spk_id] = [] + spk_uttid[spk_id].append(item["uttid"]) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_uttid[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_uttid, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + +def 
+def init_sequence_encoder(hparams):
+    """Initializes the sequence encoder from hyperparameters. The encoder
+    instance is read from the `label_encoder` key, the token list from
+    `token_list_file` and the special tokens from `special_tokens`.
+
+    Arguments
+    ---------
+    hparams: dict
+        parsed hyperparameters
+
+    Returns
+    -------
+    encoder: speechbrain.dataio.encoder.TextEncoder
+        an encoder instance"""
+    encoder = hparams["label_encoder"]
+    token_list_file_name = hparams["token_list_file"]
+    tokens = read_token_list(token_list_file_name)
+    encoder.add_unk()
+    for token in hparams["special_tokens"]:
+        token_key = token.replace("<", "").replace(">", "")
+        token_index = hparams[f"{token_key}_index"]
+        encoder.insert_label(token, token_index)
+
+    encoder.update_from_iterable(tokens, sequence_input=False)
+    encoder.expect_len(len(tokens) + hparams["special_num_tokens"])
+    return encoder
+
+
+def get_selected_layer_indexes(available_layers, selected_layers):
+    """Finds the indexes of the selected layers within the list of
+    available layers
+
+    Arguments
+    ---------
+    available_layers : list
+        The available layers
+    selected_layers : list
+        The selected layers
+
+    Returns
+    -------
+    layer_idx : list
+        The layer indexes
+    """
+    if not (selected_layers and available_layers):
+        return None
+    layer_idx = [available_layers.index(layer) for layer in selected_layers]
+    return layer_idx
+
+
+def read_token_list(file_name):
+    """Reads a simple text file with tokens (e.g. characters or phonemes) listed
+    one per line
+
+    Arguments
+    ---------
+    file_name: str
+        the file name
+
+    Returns
+    -------
+    result: list
+        a list of tokens
+    """
+    file_name = Path(file_name)
+    if not file_name.is_absolute():
+        file_name = Path(__file__).parent / "hparams" / file_name
+    if not file_name.exists():
+        raise ValueError(f"Token file {file_name} not found")
+    with open(file_name) as token_file:
+        return [line.strip("\r\n") for line in token_file if line]
+
+
+def apply_overfit_test(hparams, dataset):
+    """Helper for applying an overfit test conditionally based
+    on hyperparameters:
+
+    `overfit_test`: whether or not to apply an overfit test
+    `overfit_test_sample_count`: the number of samples to use from the
+    original dataset
+    `overfit_test_epoch_data_count`: the number of samples per epoch
+
+    The function will accept datasets, (train, valid, test) tuples
+    or dictionaries of the form:
+    {"train": dataset1, "valid": dataset2, "test": dataset3}
+
+    If a tuple or dictionary is used, the training dataset will be of length
+    overfit_test_epoch_data_count, whereas the evaluation dataset will be of
+    length overfit_test_sample_count.
+
+    Arguments
+    ---------
+    hparams: dict
+        parsed hyperparameters
+    dataset: DynamicItemDataset|tuple|dict
+        One of the following
+        a dataset
+        a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3})
+        a (train, valid, test) tuple of datasets
+
+    Returns
+    -------
+    result: DynamicItemDataset|tuple|dict
+        a dataset or collection of datasets suitable for
+        an overfitting test - in the same format as the
+        dataset argument (single dataset, dictionary, or tuple)
+    """
+    if hparams["overfit_test"]:
+        if isinstance(dataset, tuple):
+            dataset_train, dataset_valid, _ = dataset
+            dataset_train = apply_overfit_test(hparams, dataset_train)
+            dataset_eval = dataset_train.filtered_sorted(
+                select_n=hparams["overfit_test_sample_count"]
+            )
+            dataset_eval.set_output_keys(
+                list(dataset_valid.pipeline.output_mapping.keys())
+            )
+            result = dataset_train, dataset_eval, dataset_eval
+        elif isinstance(dataset, dict):
+            dataset_train = apply_overfit_test(hparams, dataset["train"])
+            dataset_eval = dataset_train.filtered_sorted(
+                select_n=hparams["overfit_test_sample_count"]
+            )
+            dataset_eval.set_output_keys(
+                list(dataset["valid"].pipeline.output_mapping.keys())
+            )
+
+            result = {
+                "train": dataset_train,
+                "valid": dataset_eval,
+                "test": dataset_eval,
+                "sample": dataset_eval,
+            }
+        else:
+            result = dataset.overfit_test(
+                hparams["overfit_test_sample_count"],
+                hparams["overfit_test_epoch_data_count"],
+            )
+    else:
+        result = dataset
+    return result
+
+
+def select_eval_subset(dataset, hparams, key="eval_subset"):
+    """Selects a subset of the dataset provided, if specified.
+    The selection is controlled by a hyperparameter (named
+    "eval_subset" by default) pointing to a file that lists the IDs
+    of the data items on which evaluation will take place, one per line
+
+    Arguments
+    ---------
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        A dataset
+    hparams : dict
+        parsed hyperparameters
+    key : str
+        The hyperparameter key holding the path to the subset file
+
+    Returns
+    -------
+    subset : dataset
+        The dataset, filtered down if applicable
+    """
+    eval_subset_path = hparams.get(key)
+    if eval_subset_path is not None:
+        eval_subset_path = Path(eval_subset_path)
+        if not eval_subset_path.exists():
+            raise ValueError(f"eval_subset {eval_subset_path} does not exist")
+        with open(eval_subset_path) as eval_subset_file:
+            eval_subset_ids = [line.strip() for line in eval_subset_file]
+        # Use a set for fast membership checks
+        existing_ids = set(dataset.data_ids)
+        eval_subset_ids = [
+            uttid for uttid in eval_subset_ids if uttid in existing_ids
+        ]
+        if not eval_subset_ids:
+            raise ValueError(
+                f"{eval_subset_path}: no items found in the dataset"
+            )
+        subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids)
+    else:
+        subset = dataset
+    return subset
+
+
+def undo_padding_tensor(batch, lengths):
+    """Produces Python lists given a batch of sentences with
+    their corresponding relative lengths.
+
+    Arguments
+    ---------
+    batch : torch.Tensor
+        Batch of sentences gathered in a batch.
+    lengths : torch.Tensor
+        Relative length of each sentence in the batch.
+
+    Returns
+    -------
+    as_list : list
+        A python list of unpadded tensors, one per batch element.
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Data preparation, to be run on only one process. + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "alignments_folder": hparams.get("alignments_folder"), + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets, resample_fn = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
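+    # Illustrative note (assumption, added for clarity): because the epoch
+    # counter, model and optimizer are all registered with the Checkpointer,
+    # simply re-running this script after an interruption restores their
+    # state before `fit()` continues from the last checkpoint.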
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py new file mode 100644 index 000000000..328fbe868 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -0,0 +1,90 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. + +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech + from libritts_prepare import prepare_libritts # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "train_split": hparams["train_splits"], + "valid_split": hparams["dev_splits"], + "test_split": hparams["test_splits"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": hparams["test_json"], + "sample_rate": hparams["sample_rate"], + "skip_prep": hparams["skip_prep"], + "max_valid_size": None, + "skip_resample": hparams["skip_resample"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "libritts").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + 
vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml new file mode 100644 index 000000000..836503717 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..6ae14c87c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,103 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# 
|------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False +skip_resample: False + + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..188b38a6d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options 
+dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False +skip_resample: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..a0542b189 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,67 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/fairseq_hubert.yaml new file mode 100644 index 000000000..1f7e87fa9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/fairseq_hubert.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: FairSEQ HuBERT +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 
+# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/fairseq-hubert +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +model_path: !PLACEHOLDER +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +vocab_size: 500 +save_embedding: False +skip_resample: False + + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +layer: 11 +num_codebooks: 1 +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: null + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..dc026cc55 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git 
a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..8d3a9aa27 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,55 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False +skip_resample: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..3d9792bbb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec +skip_resample: False + + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: 
!new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..bfd802740 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,61 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 +skip_resample: False + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py new file mode 120000 index 000000000..39f1a78c2 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py @@ -0,0 +1 @@ +../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py new file mode 100644 index 000000000..52594eaf9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -0,0 +1,504 @@ +""" +LibriTTS data preparation + +Authors + * Pradnya Kandarkar 2022 +""" + +import json +import os +import random + +import torch +import torchaudio +import re +from tqdm import tqdm + +from speechbrain.inference.text import GraphemeToPhoneme +from speechbrain.utils.data_utils import get_all_files +from speechbrain.utils.logger import get_logger +from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations +from pathlib import Path + +logger = get_logger(__name__) +LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def prepare_libritts( + data_folder, + save_json_train, + save_json_valid, + save_json_test, + sample_rate, + split_ratio=[80, 10, 10], + libritts_subsets=None, + train_split=None, + valid_split=None, + test_split=None, + seed=1234, + model_name=None, + 
max_valid_size=500, + alignments_folder=None, + skip_prep=False, + skip_resample=False, +): + """ + Prepares the json files for the LibriTTS dataset. + Downloads the dataset if it is not found in the `data_folder` as expected. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored. + save_json_train : str + Path where the train data specification file will be saved. + save_json_valid : str + Path where the validation data specification file will be saved. + save_json_test : str + Path where the test data specification file will be saved. + sample_rate : int + The sample rate to be used for the dataset + split_ratio : list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + libritts_subsets: list + List of librispeech subsets to use (e.g., dev-clean, train-clean-100, ...) for the experiment. + This parameter will be ignored if explicit data splits are provided. + Explicit data splits parameters: "train_split", "valid_split", "test_split" + train_split : list + List of librispeech subsets to use (e.g.,train-clean-100, train-clean-360) for the experiment training stage. + valid_split : list + List of librispeech subsets to use (e.g., dev-clean) for the experiment validation stage. + test_split : list + List of librispeech subsets to use (e.g., test-clean) for the experiment testing stage. + seed : int + Seed value + model_name : str + Model name (used to prepare additional model specific data) + alignments_path : None + The path to alignments files + skip_prep: Bool + If True, skip preparation. + skip_resample: bool + If True, audio will not be resampled + + Returns + ------- + None + """ + + if skip_prep: + return + + # Setting the seed value + random.seed(seed) + + # Checks if this phase is already done (if so, skips it) + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed in previous run, skipping.") + return + + logger.info( + f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}" + ) + + # If specific splits are provided, creates data manifest files accordingly + if train_split: + wav_list = prepare_split(data_folder, train_split) + create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + if valid_split: + wav_list = prepare_split(data_folder, valid_split) + # TODO add better way to speedup evaluation + if max_valid_size is not None and len(wav_list) > max_valid_size: + wav_list = random.sample(wav_list, max_valid_size) + create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + if test_split: + wav_list = prepare_split(data_folder, test_split) + create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed.") + return + + # If specific splits are not provided, and a list of subsets if provided, creates train, valid, test splits + # Creates data manifest files according to the data splits + if libritts_subsets: + wav_list = prepare_split(data_folder, libritts_subsets) + # Random split the signal list into train, valid, and test sets. 
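+        # Illustrative note (added for clarity, not part of the original
+        # recipe): with the default split_ratio=[80, 10, 10] and, say, 1000
+        # collected wav files, split_sets below assigns roughly 800/100/100
+        # files to train/valid/test.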
+        data_split = split_sets(wav_list, split_ratio)
+        # Creating json files
+        create_json(
+            data_split["train"], save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample
+        )
+        create_json(
+            data_split["valid"], save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample
+        )
+        create_json(
+            data_split["test"], save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample
+        )
+
+
+def prepare_split(data_folder, split_list):
+    """
+    Processes the provided list of LibriTTS subsets and creates a list of all
+    the .wav files present in the subsets. The subsets are expected to be
+    available locally; if a subset (or its archive) is missing, the script
+    exits with an informative message.
+
+    Arguments
+    ---------
+    data_folder : str
+        Path to the folder where the LibriTTS dataset is stored
+    split_list : list
+        List of LibriTTS subsets to process (e.g., dev-clean, train-clean-100, ...)
+
+    Returns
+    -------
+    wav_list : list
+        List of all .wav files to be processed
+    """
+    extension = [".wav"]  # The expected extension for audio files
+    wav_list = list()  # Stores all audio file paths for the dataset
+
+    # For every subset of the dataset, checks that the data is available
+    for subset_name in split_list:
+        subset_folder = os.path.join(data_folder, subset_name)
+        subset_archive = os.path.join(subset_folder, subset_name + ".tar.gz")
+
+        if not check_folders(subset_folder):
+            logger.info(
+                f"No data found for {subset_name}. Checking for an archive file."
+            )
+            if not os.path.isfile(subset_archive):
+                logger.info(
+                    f"No archive file found for {subset_name}. "
+                    "Please download and extract the subset, then re-run."
+                )
+                quit()
+        # Collects all files matching the provided extension
+        wav_list.extend(get_all_files(subset_folder, match_and=extension))
+
+    return wav_list
+
+
+def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None, skip_resample=False):
+    """
+    Creates the json file given a list of wav files.
+
+    Arguments
+    ---------
+    wav_list : list of str
+        The list of wav files.
+    json_file : str
+        The path of the output json file
+    sample_rate : int
+        The sample rate to be used for the dataset
+    data_folder : str
+        The path to LibriTTS
+    alignments_folder : str
+        The path to LibriTTS alignments
+    model_name : str
+        Model name (used to prepare additional model specific data)
+    skip_resample : bool
+        If True, skips resampling - useful when large temporary
+        storage is unavailable.
+    """
+
+    # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments
+    if model_name == "Tacotron2":
+        logger.info(
+            "Computing phonemes for labels using SpeechBrain G2P. This may take a while."
+ ) + g2p = GraphemeToPhoneme.from_hparams( + "speechbrain/soundchoice-g2p", run_opts={"device": DEVICE} + ) + else: + g2p = None + + json_dict = {} + + # Processes all the wav files in the list + for wav_file in tqdm(wav_list): + # Reads the signal + signal, sig_sr = torchaudio.load(wav_file) + duration = signal.shape[1] / sig_sr + + # TODO add better way to filter short utterances + if duration < 1.0: + continue + + # Manipulates path to get relative path and uttid + path_parts = wav_file.split(os.path.sep) + uttid, _ = os.path.splitext(path_parts[-1]) + # relative_path = os.path.join("{data_root}", *path_parts[-4:]) + + # Gets the path for the text files and extracts the input text + normalized_text_path = os.path.join( + "/", *path_parts[:-1], uttid + ".normalized.txt" + ) + try: + with open(normalized_text_path, encoding="utf-8") as f: + normalized_text = f.read() + if normalized_text.__contains__("{"): + normalized_text = normalized_text.replace("{", "") + if normalized_text.__contains__("}"): + normalized_text = normalized_text.replace("}", "") + except FileNotFoundError: + print(f"Warning: The file {normalized_text_path} does not exist.") + continue + + # Resamples the audio file if required + if sig_sr != sample_rate and not skip_resample: + resampled_signal = torchaudio.functional.resample( + signal, sig_sr, sample_rate + ) + os.unlink(wav_file) + torchaudio.save(wav_file, resampled_signal, sample_rate=sample_rate) + + # Gets the speaker-id from the utterance-id + spk_id = uttid.split("_")[0] + + # Creates an entry for the utterance + json_dict[uttid] = { + "uttid": uttid, + "wav": wav_file, + "duration": duration, + "spk_id": spk_id, + "label": normalized_text, + "segment": True if "train" in json_file else False, + } + if alignments_folder is not None: + alignments_file_name = get_alignment_path(data_folder, alignments_folder, wav_file) + alignments = parse_alignments(alignments_file_name) + json_dict[uttid].update(alignments) + + # Characters are used for Tacotron2, phonemes may be needed for other models + if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: + # Computes phoneme labels using SpeechBrain G2P and keeps the punctuations + phonemes = _g2p_keep_punctuations(g2p, normalized_text) + json_dict[uttid].update({"label_phoneme": phonemes}) + + # Writes the dictionary to the json file + with open(json_file, mode="w", encoding="utf-8") as json_f: + json.dump(json_dict, json_f, indent=2) + + logger.info(f"{json_file} successfully created!") + + +def get_alignment_path(data_folder, alignments_folder, file_name): + """Returns the path in the LibriSpeech-Alignments dataset + corresponding to the specified file path in LibriSpeech + + Arguments + --------- + data_folder: str + the path to LibriSpeech + alignments_folder: str + the path to LibriSpeech-Alignments + file_name: str + the file name within LibriSpeech + + Returns + ------- + file_name: str + the alignment file path + """ + file_name = Path(file_name) + data_folder = Path(data_folder) + if file_name.parts[0] == "{data_root}": + file_name_rel = file_name.relative_to("{data_root}") + else: + file_name_rel = file_name.relative_to(data_folder) + data_slice = file_name_rel.parts[0] + + textgrid_folder = file_name_rel.relative_to(Path(data_slice) / "LibriTTS" / data_slice).parent.parent + textgrid_file_name = f"{file_name_rel.stem}.TextGrid" + textgrid_path = Path(alignments_folder) / data_slice / textgrid_folder / textgrid_file_name + + return textgrid_path + + +def skip(*filenames): + """ + Detects if the 
data preparation has been already done. + If the preparation has been done, we can skip it. + + Arguments + --------- + *filenames : tuple + Set of filenames to check for existence. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + for filename in filenames: + if isinstance(filename, list): + if any(not os.path.isfile(item) for item in filename): + return False + else: + if not os.path.isfile(filename): + return False + return True + + +def split_sets(wav_list, split_ratio): + """Randomly splits the wav list into training, validation, and test lists. + + Arguments + --------- + wav_list : list + list of all the signals in the dataset + split_ratio: list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + + Returns + ------- + dictionary containing train, valid, and test splits. + """ + # Random shuffles the list + random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} + splits = ["train", "valid"] + + for i, split in enumerate(splits): + n_snts = int(tot_snts * split_ratio[i] / tot_split) + data_split[split] = wav_list[0:n_snts] + del wav_list[0:n_snts] + data_split["test"] = wav_list + + return data_split + + +def check_folders(*folders): + """Returns False if any passed folder does not exist.""" + for folder in folders: + if not os.path.exists(folder): + return False + return True + +def parse_alignments(file_name): + """Parses a given LibriSpeech-Alignments TextGrid file and + converts the results to the desired format (to be used in JSON + metadata) + + Arguments + --------- + file_name : path-like + the file name of the TextGrid file + + Returns + ------- + details: dict + the metadata details + """ + try: + import textgrids + except ImportError: + logger.error( + "Parsing LibriSpeech-alignments requires the" + "praat-textgrids package" + ) + raise + if not file_name.exists(): + return { + "has_alignments": False, + "phn": [], + "phn_stress": [], + "phn_start": [], + "phn_end": [], + "phn_count": 0, + "wrd": [], + "wrd_start": [], + "wrd_end": [], + "wrd_count": 0, + "unk_count": None + } + + text_grid = textgrids.TextGrid() + text_grid.read(file_name) + word_intervals = [ + {**word, "label": word["label"].upper()} + for word in text_grid.interval_tier_to_array("words") + ] + phn_intervals = text_grid.interval_tier_to_array("phones") + details = {} + details.update(intervals_to_dict(word_intervals, "wrd")) + phn = intervals_to_dict(phn_intervals, "phn") + phn_stress = phn["phn"] + phn_nostress = remove_stress_marks(phn_stress) + phn["phn"] = phn_nostress + phn["phn_stress"] = phn_stress + details.update(phn) + details["unk_count"] = sum(wrd == "" for wrd in details["wrd"]) + details["has_alignments"] = True + + return details + + +INTERVAL_MAP = [("label", ""), ("begin", "_start"), ("end", "_end")] +INTERVAL_EMPTY_LABELS = {"", "sil", "sp", "spn"} + + +def intervals_to_dict(intervals, prefix): + """ + Converts a parsed list of intervals from PRAAT TextGrid + to a learning-friendly array + + Arguments + --------- + intervals: list + A list of raw TextGrid intervals, as returned by + TextGrid.interval_tier_to_array + prefix: str + the prefix to add + + Returns + ------- + result: dict + A dictionary of the form + { + "{prefix}": , + "{prefix}_start": , + "{prefix}_end": , + "{prefix}_count: + } + + 
""" + # Remove meaningless labels + intervals_clean = [ + interval + for interval in intervals + if interval["label"] not in INTERVAL_EMPTY_LABELS + ] + result = { + f"{prefix}{suffix}": [interval[key] for interval in intervals_clean] + for key, suffix in INTERVAL_MAP + } + # This will map space labels to a single one + result[f"{prefix}_count"] = len(intervals_clean) + return result + + +RE_STRESS_MARK = re.compile(r"\d$") + + +def remove_stress_marks(phn): + """Removes stress marks from a phoneme annotation + + Arguments + --------- + phn: list + a list of phoneme annotations with or without stress marks + + Returns + ------- + result: list + a list of phoneme annotations without stress marks + """ + return [RE_STRESS_MARK.sub("", item) for item in phn] diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 5238beacd..6a2de5859 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -20,13 +20,13 @@ PositionalEncoding as TransformerPositionalEncoding, get_lookahead_mask, ) +from speechbrain.dataio.batch import PaddedBatch +from speechbrain.utils.data_utils import batch_pad_right from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss +from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, nll_loss from speechbrain.dataio.dataio import length_to_mask -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.decoders.seq2seq import S2STransformerBeamSearcher from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -157,8 +157,10 @@ def __init__( show_inference_progress=True, audio_token_shift=0, multihead_input=True, + multihead_output=True, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, + out_proj=None, ): super().__init__() self.num_tokens = num_tokens @@ -182,9 +184,11 @@ def __init__( if self.representation_mode == RepresentationMode.DISCRETE else audio_dim ) - self.out_proj = Linear( - input_size=d_model, n_neurons=self.out_dim * tokens_per_step, - ) + if out_proj is None: + out_proj = Linear( + input_size=d_model, n_neurons=self.out_dim * tokens_per_step, + ) + self.out_proj = out_proj self.gate = Linear(input_size=d_model, n_neurons=1) if audio_emb is None: if self.representation_mode == RepresentationMode.DISCRETE: @@ -222,6 +226,7 @@ def __init__( self.multihead_input = multihead_input self.d_model = d_model self.d_model_sqrt = math.sqrt(d_model) + self.multihead_output = multihead_output def decode( self, @@ -371,16 +376,17 @@ def forward( pos_embs_src, ) lin_out = self.out_proj(dec_out) - batch_size, audio_max_len, num_tokens = lin_out.shape - lin_out_heads = lin_out.reshape( - batch_size, - audio_max_len, - self.tokens_per_step, - num_tokens // self.tokens_per_step, - ) + if self.multihead_output: + batch_size, audio_max_len, num_tokens = lin_out.shape + lin_out = lin_out.reshape( + batch_size, + audio_max_len, + self.tokens_per_step, + num_tokens // self.tokens_per_step, + ) gate_out = self.gate(dec_out).squeeze(-1) return TokotronDecoderOutput( - lin_out_heads, + lin_out, gate_out, dec_self_attn, dec_attn, @@ -439,6 +445,8 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, show_inference_progress=True, + transform_audio=None, + feed_audio=None ): super().__init__() self.decoder = None @@ -451,6 +459,10 
@@ def __init__( self.representation_mode = RepresentationMode(representation_mode) self.audio_dim = audio_dim self.show_inference_progress = show_inference_progress + if transform_audio is None: + transform_audio = nn.Identity() + self.transform_audio = transform_audio + self.feed_audio = feed_audio def bind(self, model): """Binds this inference implementation to a model @@ -522,6 +534,7 @@ def forward(self, enc_out, length, emb=None): steps_range = tqdm(steps_range, desc="Inference") for idx in steps_range: # One autoregressive step + audio = self.transform_audio(audio) step_out = self.decoder.forward( enc_out=enc_out, src_length=length, @@ -530,7 +543,9 @@ def forward(self, enc_out, length, emb=None): ) audio_out = step_out.out - if self.representation_mode == RepresentationMode.DISCRETE: + if self.feed_audio: + audio_out = self.feed_audio(audio_out) + elif self.representation_mode == RepresentationMode.DISCRETE: audio_out = audio_out.argmax(-1) # The model outputs predictions without BOS. Add the BOS back for the @@ -592,357 +607,6 @@ def forward(self, enc_out, length, emb=None): ) -class TokotronSearchWrapper(nn.Module): - """A wrapper class to facilitate seach-based inference. It takes care of re-interpreting - a multi-headed sequence as multiple samples, for compatibility, and for the retention - of attention tensors - - Arguments - --------- - decoder : TokotronTransformerDecoder - the Tokotron transformer decoder - """ - - def __init__(self, decoder): - super().__init__() - self.tokens_per_step = decoder.tokens_per_step - self.decoder = decoder - - def decode(self, memory, enc_states, enc_lens): - """Wraps the decode operation, will all the necessary - reshaping - - Arguments - --------- - memory : torch.Tensor - Characters predicted so far - enc_states : torch.Tensor - Encoder states - enc_lens : torch.Tensor - Encoder state lengths - """ - batch_size = enc_states.size(0) // self.tokens_per_step - _, mem_len = memory.shape - memory = memory.reshape( - self.tokens_per_step, batch_size, mem_len - ).permute(1, 2, 0) - dec_out, dec_self_attn, dec_attn = self.decoder.decode( - enc_out=enc_states[:batch_size], - src_length=enc_lens[:batch_size], - tgt=memory, - ) - self.dec_self_attn = dec_self_attn - self.dec_attn = dec_attn - return dec_out, dec_attn - - -class TokotronTransformerBeamSearcher(S2STransformerBeamSearcher): - """A slight modification of S2STransformerBeamSearcher that uses an - explicit number of tokens instead of trying to infer it from the - weights of the linear layer. This is needed because Tokotron is - multi-header and the final output layer outputs multiple output states - - Arguments - --------- - num_tokens : int - The number of audio tokens available - """ - - def __init__(self, num_tokens, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_tokens = num_tokens - - def set_n_out(self): - """Set the number of output tokens.""" - return self.num_tokens - - -class SearchLinearWrapper(nn.Module): - """A wrapper for the final linear layer of the Transformer. The goal is to - make it compatible with the SpeechBrain Beam Search implementation, which is - single-headed, by expanding multiple heads along the batch dimensions. 
- - Arguments - --------- - lin : torch.Tensor - A linear layer with an output feature dimensions of - (tokens_per_step x num_tokens) - tokens_per_step : int - the numer of tokens the model outputs for each - time step - """ - - def __init__(self, lin, tokens_per_step): - super().__init__() - self.lin = lin - self.tokens_per_step = tokens_per_step - - def forward(self, x): - """Performs a forward pass with all the required reshape operations - - Arguments - --------- - x : torch.Tensor - The decoder output - - Returns - ------- - result : torch.Tensor - The layer output, reshaped along the batch dimension - """ - x = self.lin(x) - batch_size, max_len, out_dim = x.shape - num_tokens = x.size(-1) // self.tokens_per_step - x = ( - # batch x tokens x length - x.transpose(2, 1) - # batch x heads x tokens x length - .view(batch_size, self.tokens_per_step, num_tokens, max_len) - # heads x batch x tokens x length - .transpose(0, 1) - # heads * batch x tokens x length - .reshape(self.tokens_per_step * batch_size, num_tokens, max_len) - # heads * batch x length x tokens - .transpose(1, 2) - ) - return x - - -class TokotronSearchInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - """ - - def __init__(self, audio_token_shift=1, **kwargs): - super().__init__() - self.search_kwargs = kwargs - self.audio_token_shift = audio_token_shift - self.decoder, self.search, self.tokens_per_step = None, None, None - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - decoder = model.decoder - self.tokens_per_step = decoder.tokens_per_step - self.decoder = TokotronSearchWrapper(decoder) - self.search = TokotronTransformerBeamSearcher( - modules=[ - self.decoder, - SearchLinearWrapper(decoder.out_proj, self.tokens_per_step), - ], - num_tokens=decoder.num_tokens + self.audio_token_shift, - **self.search_kwargs, - ) - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - device = enc_out.device - # The search does not support multiple heads. "Trick" it by expanding encoded - # representations along the batch dimension so that the beam searcher - # treats it as if they were separate, independent samples. 
- batch_size, max_len, enc_dim = enc_out.shape - enc_out_search = ( - enc_out.unsqueeze(0) - .expand(self.tokens_per_step, batch_size, max_len, enc_dim) - .reshape(self.tokens_per_step * batch_size, max_len, enc_dim) - ) - length_search = ( - length.unsqueeze(0) - .expand(self.tokens_per_step, batch_size) - .reshape(self.tokens_per_step * batch_size) - ) - hyps, audio_length, scores, log_probs = self.search( - enc_out_search, length_search - ) - tokens_batch = PaddedBatch( - [ - {"hyps": torch.tensor(item, device=enc_out.device)} - for item in hyps - ] - ).to(device) - - audio_tokens, length = tokens_batch.hyps - _, audio_max_len = audio_tokens.shape - audio_tokens = audio_tokens.reshape( - self.tokens_per_step, batch_size, audio_max_len - ).permute(1, 2, 0) - length = ( - length.reshape(self.tokens_per_step, batch_size).min(dim=0) - ).values - audio_tokens = audio_tokens - self.audio_token_shift - - return TokotronDecoderInfernceOutput( - audio_tokens=audio_tokens, - length=length, - dec_self_attn=self.decoder.dec_self_attn, - dec_attn=self.decoder.dec_attn, - alignments=get_alignments(self.decoder.dec_attn), - p_eos=None, - ) - - -class TokotronForwardInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - - Arguments - --------- - scale_factor : float - The scaling factor for encoder representations - gate_threshold : float - The threshold for gate activation - min_length : int - The minimum length for generating sequences, in tokens - """ - - def __init__( - self, - scale_factor=5.0, - gate_threshold=0.5, - min_length=16, - eos_mode=EosMode.GATE, - eos_index=0, - representation_mode=RepresentationMode.DISCRETE, - ): - super().__init__() - self.scale_factor = scale_factor - self.gate_threshold = gate_threshold - self.min_length = min_length - self.decoder = None - self.gate = None - self.eos_mode = EosMode(eos_mode) - self.eos_index = eos_index - self.representation_mode = RepresentationMode(representation_mode) - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - self.decoder = model.decoder - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - max_len = enc_out.size(1) - src_key_padding_mask = length_to_mask( - length * max_len, max_len, - ).logical_not() - tgt = scale(enc_out, self.scale_factor) - dec_out = self.decoder( - enc_out=enc_out, - tgt=tgt, - tgt_length=length, - src_length=length, - src_key_padding_mask=src_key_padding_mask, - pos_embs_src=None, - ) - if self.eos_mode == EosMode.GATE: - p_eos, eos = self.get_length_gate(dec_out) - else: - p_eos, eos = self.get_length_token(dec_out) - - infer_length_abs = eos.max(dim=1).indices - infer_length_abs_nonzero = infer_length_abs[infer_length_abs > 0] - if len(infer_length_abs_nonzero) > 0: - infer_length_max = infer_length_abs_nonzero.max() - else: - infer_length_max = 0 - if infer_length_max == 0: - infer_length_max = p_eos.size(1) - infer_length_abs = torch.where( - infer_length_abs == 0, infer_length_max, infer_length_abs - ) - infer_length_abs = infer_length_abs.clip(min=self.min_length) - infer_length = infer_length_abs / 
infer_length_max - - audio = dec_out.out[:, :infer_length_max].argmax(-1) - if self.representation_mode == RepresentationMode.DISCRETE: - audio = audio.argmax(-1) - return TokotronDecoderInfernceOutput( - audio=audio, - length=infer_length, - dec_self_attn=dec_out.dec_self_attn, - dec_attn=dec_out.dec_attn, - alignments=get_alignments(dec_out.dec_attn), - p_eos=p_eos, - ) - - def get_length_gate(self, dec_out): - """Infers lengths using the gate module - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - - Returns - ------- - p_eos : torch.Tensor - EOS probabilities (as estimated by the gate) - eos : torch.Tensor - a Boolean tensor where positions indicate whether - the gate has activated - """ - p_eos = dec_out.gate_out.sigmoid() - eos = p_eos > self.gate_threshold - return p_eos, eos - - def get_length_token(self, dec_out): - """Infers lengths using an EOS token - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - eos : torch.Tensor - A Boolean tensor indicating whether EOS has been reached - """ - p_seq = dec_out.out[:, :, 0].softmax(dim=-1) - p_eos = p_seq[:, :, self.eos_index].softmax(-1) - eos = p_seq.argmax(dim=-1) == self.eos_index - return p_eos, eos - - class TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens @@ -1052,11 +716,13 @@ def __init__( eos_mode=EosMode.GATE, inference=None, audio_token_shift=0, - decoder_mode=DecoderMode.AUTOREGRESSIVE, scale_factor=5.0, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, emb=None, + audio_emb=None, + out_proj=None, + multihead_input=True ): super().__init__() self.in_emb = Embedding( @@ -1075,11 +741,6 @@ def __init__( activation=activation, normalize_before=True, ) - self.decoder_mode = DecoderMode(decoder_mode) - audio_emb = None - if self.decoder_mode == DecoderMode.FORWARD: - audio_emb = nn.Identity() - audio_emb_size = d_model self.decoder = TokotronTransformerDecoder( num_tokens=audio_num_tokens + self.audio_token_shift, tokens_per_step=audio_tokens_per_step, @@ -1099,9 +760,11 @@ def __init__( gate_threshold=gate_threshold, gate_offset=gate_offset, audio_token_shift=audio_token_shift, - multihead_input=self.decoder_mode == DecoderMode.AUTOREGRESSIVE, + multihead_input=multihead_input, + multihead_output=out_proj is None, representation_mode=representation_mode, audio_dim=audio_dim, + out_proj=out_proj, ) self.bos_idx = bos_idx self.attention_type = attention_type @@ -1255,17 +918,11 @@ def forward( src_key_padding_mask=src_key_padding_mask, pos_embs=pos_embs_encoder, ) - if self.decoder_mode == DecoderMode.AUTOREGRESSIVE: - tgt = audio - tgt_length = audio_length - else: - tgt = scale(enc_out, self.scale_factor) - tgt_length = input_length enc_out = self.add_emb(enc_out, emb) dec_out = self.decoder( enc_out=enc_out, - tgt=tgt, - tgt_length=tgt_length, + tgt=audio, + tgt_length=audio_length, src_length=input_length, src_key_padding_mask=src_key_padding_mask, pos_embs_src=pos_embs_encoder, @@ -1569,6 +1226,7 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_clip_min=-10.0, audio_clip_max=10.0, + multihead_output=True, ): super().__init__() self.guided_attention_weight = guided_attention_weight @@ -1597,6 +1255,7 @@ def __init__( self.register_buffer("audio_eos", audio_eos) self.audio_clip_min = audio_clip_min self.audio_clip_max = audio_clip_max + self.multihead_output = multihead_output def forward( self, @@ -1629,9 +1288,12 @@ def forward( out = 
out.log_softmax(dim=-1) batch_size, out_len, heads, tok_dim = out.shape max_len = out_len - 1 - out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) - )[:, :max_len] + if self.multihead_output: + out_reshaped = ( + out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) + )[:, :max_len] + else: + out_reshaped = out if self.eos_mode == EosMode.TOKEN: # NOTE: Shift only the tokens, but not EOS padding_lengths = torch.ones(batch_size, device=audio.device) @@ -1645,7 +1307,10 @@ def forward( ) tok_len = audio.size(1) - if self.representation_mode == RepresentationMode.DISCRETE: + if not self.multihead_output: + audio_reshaped = audio + lengths_reshaped = audio_length + elif self.representation_mode == RepresentationMode.DISCRETE: audio_reshaped = audio.transpose(1, 2).reshape( batch_size * heads, max_len ) @@ -1664,18 +1329,21 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - lengths_reshaped = ( - audio_length.unsqueeze(-1) - .expand(batch_size, heads) - .reshape(batch_size * heads) - ) + if self.multihead_output: + lengths_reshaped = ( + audio_length.unsqueeze(-1) + .expand(batch_size, heads) + .reshape(batch_size * heads) + ) + else: + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, length=lengths_reshaped, reduction=reduction, ) - if reduction == "batch": + if reduction == "batch" and self.multihead_output: seq_loss = seq_loss.reshape(batch_size, heads).mean(-1) lengths_abs = audio_length * out_len @@ -2229,178 +1897,199 @@ def all_weights(self): return torch.stack([emb.weight for emb in self.emb]) -class DACFeatureExtractor(nn.Module): - """An adapter for feature extraction +def get_silence_token( + model, + sample_length=100000, + unsqueeze=False, + device=None, + num_codebooks=None, + +): + """Attempts to find out the silence tokens for a given model, + if applicable Arguments --------- - dac : DAC - a DAC model - """ - - def __init__(self, dac, n_quantizers): - super().__init__() - self.dac = dac - self.dac.eval() - self.n_quantizers = n_quantizers + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) + device : str | torch.Device + The device to use + num_codebooks : int | list + The number of codebooks or the codebooks to use - def encode(self, inputs, length): - """Encodes a raw audio sample using DAC + Returns + ------- + silence_tokens : torch.Tensor + The token(s) corresponding to silence - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths + silece_emb : torch.Tensor + The embedding(s) corresponding to silence - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers + """ + if device is None: + device = next(model.parameters()).device + + audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) + length = torch.ones(1, device=device) + model_training = model.training + model.eval() + tokens = model.sig_to_tokens(audio, length, num_codebooks=num_codebooks) + if model_training: + model.train() + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values + return silence_tokens + + +def 
get_silence_repr(model, sample_length=100000, device=None): + """Gets continuous silence - """ - if inputs.dim() < 3: - inputs = inputs.unsqueeze(1) - emb, codes, _, _, _ = self.dac.encode( - inputs, n_quantizers=self.n_quantizers - ) - emb.transpose_(1, 2) - codes.transpose_(1, 2) - max_len = emb.size(1) - mask = length_to_mask( - length * max_len, max_len, device=inputs.device - ).unsqueeze(-1) - return codes * mask, emb * mask + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + device : str | torch.Device + The device to use - def forward(self, inputs, length): - """Encodes a raw audio sample using DAC + Returns + ------- + silence : torch.Tensor + A silecnce tensor + """ + audio = torch.zeros(1, sample_length, device=device) + length = torch.ones(1, device=device) + audio_repr = model(audio, length) + silence = audio_repr.mean(dim=1)[0] + return silence - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers +def feature_pad_to(tensor, length, padding=None): + """Pads feature dimensions to the specified length with the specified padding, + assuming a (Batch x Length x Features..) tensor - """ - return self.encode(inputs, length) + Arguments + --------- + tensor : torch.Tensor + The tensor to be padded - def embeddings(self, tokens): - """Converts token indexes to vector embeddings + length : int + The length to which the tensor will be padded - Arguments - --------- - tokens : torch.Tensor - a (Batch x Length x Heads) tensor of token indexes + padding : torch.Tensor, optional + The padding tensor - if omitted, zero padding + will be used - Returns - ------- - emb : torch.Tensor - a (Batch x Length x Heads x Embedding) tensor - of raw vector embeddings from the model's - quantizer codebooks - """ - emb, _, _ = self.dac.quantizer.from_codes(tokens.transpose(1, 2).int()) - return emb.transpose(1, 2) + Returns + ------- + result : torch.Tensor + The padded tensor + """ + if padding is None: + padding = torch.zeros(tensor.shape[1:]) + padding = padding[None, ...].expand( + (length - tensor.size(0),) + tensor.shape[1:] + ) + return torch.cat([tensor, padding], dim=0) -class SpeechTokenizerFeatureExtractor(nn.Module): - """This lobe enables the integration of HuggingFace and SpeechBrain - pretrained SpeechTokenizer. +def batch_feature_pad(tensors, padding=None): + """Similar to batch_pad_right but pads with the specified padding, which + can be a vector or a tensor - Please, install speechtokenizer: - pip install speechtokenizer + Arguments + --------- + tensors : list + The list of tensors to be padded + padding : torch.Tensor + The padding tensor - Source paper: https://arxiv.org/abs/2308.16692 + Returns + ------- + result : torch.Tensor + the padded tensor + """ + lengths_abs = torch.tensor( + [len(item) for item in tensors], device=tensors[0].device + ) + max_length = lengths_abs.max() + data = torch.stack( + [feature_pad_to(item, max_length, padding) for item in tensors] + ) + lengths = lengths_abs / max_length + return data, lengths - The model can be used as a fixed Discrete feature extractor or can be finetuned. It - will download automatically the model from HuggingFace or use a local path. 
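# Editor's note: a minimal sketch (not part of the patch) showing how the
# silence-aware padding helpers introduced above behave. The import path is an
# assumption; it mirrors the "model." package prefix used elsewhere in this
# patch and may need adjusting to the actual benchmark layout.
import torch
from model.Tokotron import batch_feature_pad

# Two feature sequences of different lengths (Length x Features) and a
# constant vector to pad with (e.g. a silence embedding).
a = torch.zeros(3, 4)
b = torch.ones(5, 4)
silence = torch.full((4,), -1.0)

data, lengths = batch_feature_pad([a, b], padding=silence)
# data: (2, 5, 4) - the shorter item is padded with the silence vector
# lengths: tensor([0.6000, 1.0000]) - relative lengths, SpeechBrain-style
assert data.shape == (2, 5, 4)
assert torch.allclose(data[0, 3:], silence.expand(2, 4))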
+def token_collate_fn(examples, silence_token, token_keys): + """A customized collation function for audio tokens where + the specified silence token will be used as padding - instead of + zeros Arguments --------- - speech_tokenizer : speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - The speech tokenizer interface - codebooks : int, optional - The number of codebooks to use - if omitted, - """ + examples : list + A list of examples - def __init__(self, speech_tokenizer, codebooks=None): - super().__init__() - self.speech_tokenizer = speech_tokenizer - self.codebooks = codebooks - - def forward(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. + silence_token : torch.Tensor + The token(s) representing silence - Returns - ------- - tokens : torch.Tensor - A tensor of audio tokens - Shape: (N_q x Batch x Time) by default - (Batch x Time x N_q) if shape == compat + token_keys : list + The list of keys to which special padding will be applied - """ - return self.encode(wav, wav_lens) + Returns + ------- + result : speechbrain.dataio.batch.PaddedBatch + A padded batch + """ + token_tensor_ids = {id(examples[0][key]) for key in token_keys} + return PaddedBatch( + examples, + padding_func=_silence_padding, + padding_kwargs={ + "silence_token": silence_token, + "token_tensor_ids": token_tensor_ids, + }, + ) - def encode(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. +def _silence_padding(values, silence_token, token_tensor_ids): + return ( + batch_feature_pad(values, silence_token) + if id(values[0]) in token_tensor_ids + else batch_pad_right(values) + ) - Returns - ------- - tokens : torch.Tensor - A (Batch x Seq, N_q) tensor of audio tokens - """ - # Extract discrete codes from SpeechTokenizer - codes = self.speech_tokenizer.encode( - wav.unsqueeze(1), wav_lens - ) # codes: (n_q, B, T) - if self.codebooks is not None: - codes = codes[: self.codebooks] - codes = codes.permute(1, 2, 0) - return codes - - def decode(self, codes): - """Takes an input waveform and return its corresponding wav2vec encoding. +def use_silence_padding(dataloader_opts, silence_token, token_keys): + """Overrides the collation function to add silence padding to + audio token features - Arguments - --------- - tokens : torch.Tensor - A (N_q, Batch x Seq) tensor of audio tokens + Arguments + --------- + dataloder_opts : dict + Dataloader options + silence_token : torch.Tensor + The tensor to be used as silence padding + token_keys : torch.Tensor + The keys to apply silence padding to - Returns - ------- - wav : torch.Tensor (signal) - A batch of reconstructed audio signals. 
- """ - codes = codes.permute(2, 0, 1) - return self.speech_tokenizer.decode(codes) + Returns + ------- + dataloader_opts : dict + Updated data loader options + """ + return { + **dataloader_opts, + "collate_fn": partial( + token_collate_fn, silence_token=silence_token, token_keys=token_keys + ), + } \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 972d35c66..6f28ee44d 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,4 +1,7 @@ +import math import torch +from speechbrain.nnet.linear import Linear +from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens class AttentionMLP(torch.nn.Module): @@ -109,3 +112,154 @@ def forward(self, in_tokens): if self.proj_layer is not None: in_embs = self.proj_layer(in_embs) return in_embs + + +class TernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() + self.lin_hidden = Linear( + input_size=d_model, + n_neurons=d_hidden, + ) + self.act = torch.nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + bias=False + ) + + def forward(self, x, track=None): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + track : int + The track index (if applicable) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.norm(x) + x = self.lin_hidden(x) + x = self.act(x) + p = self.lin_p(x) + p = p.reshape(batch_size, max_len, self.num_positions, 3) + return p + + +class MultitrackPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts multiple + tracks of tokens simultaneously + + Arguments + --------- + d_model : int + The model dimension + num_tracks: int + The number of tracks + vocab_size : int + The vocabulary size + """ + def __init__(self, d_model, num_tracks, vocab_size): + super().__init__() + self.num_tracks = num_tracks + self.vocab_size = vocab_size + self.lin = Linear( + input_size=d_model, + n_neurons=num_tracks * vocab_size + ) + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the input + Returns + ------- + result : torch.Tensor + a result of shape (Batch x Length x Tracks x Tokens) + """ + batch_size, max_len, _ = x.shape + x = self.lin(x) + x = x.reshape(batch_size, max_len, self.num_tracks, self.vocab_size) + return x + + + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of ternary digits/positions + num_tokens : int + The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) + mode : str + "probability" : treats the outputs as a probability distribution + "argmax" : "hard" mode, only the top 
probability is used. Cannot be used with + top_k sampling with k > 1 + + """ + def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.num_tracks = num_tracks + self.chunk_size = chunk_size + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + batch_size, max_len, num_positions, _ = logits.shape + logits = logits.softmax(-1) + logits = logits.reshape(batch_size, max_len, self.num_tracks, 1, num_positions // self.num_tracks, 3) + chunks = logits.chunk( + dim=1, + chunks=math.ceil(logits.size(1) / self.chunk_size) + ) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + torch.ones_like(chunk) + ).prod(-1).log().sum(-1).exp() + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) + token_logits = torch.cat( + token_logits_chunks, + dim=1 + ) + return token_logits diff --git a/benchmarks/DASB/model/fairseq_hubert.py b/benchmarks/DASB/model/fairseq_hubert.py new file mode 100644 index 000000000..3f86b54c5 --- /dev/null +++ b/benchmarks/DASB/model/fairseq_hubert.py @@ -0,0 +1,80 @@ +import joblib +import torch +import torch.nn.functional as F +from speechbrain.utils.data_utils import batch_pad_right + + + +MIN_WAV_LEN = 720 +MODEL_SR = 16000 + + +class FairseqHuBERT(torch.nn.Module): + def __init__( + self, + feat_extractor_path, + layer, + km_path, + max_chunk=1600000, + vocoder=None, + ): + super().__init__() + import fairseq + + # Feature extractor + ( + model, + cfg, + task, + ) = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [feat_extractor_path] + ) + self.model = model[0] + self.task = task + self.layer = layer + self.max_chunk = max_chunk + # Quantizer + km_model = joblib.load(km_path) + self.C_np = km_model.cluster_centers_.transpose() + self.Cnorm_np = (self.C_np**2).sum(0, keepdims=True) + self.register_buffer("C", torch.from_numpy(self.C_np)) + self.register_buffer("Cnorm", torch.from_numpy(self.Cnorm_np)) + self.sample_rate = MODEL_SR + self.vocoder = vocoder + + def encode(self, x, wav_lens=None): + if self.task.cfg.normalize: + x = F.layer_norm(x, x.shape) + + feat = [] + for start in range(0, x.size(1), self.max_chunk): + x_chunk = x[:, start : start + self.max_chunk] + if x_chunk.size(1) < MIN_WAV_LEN: + continue + feat_chunk, _ = self.model.extract_features( + source=x_chunk, + padding_mask=None, + mask=False, + output_layer=self.layer, + ) + feat.append(feat_chunk) + feat = torch.cat(feat, 1).squeeze(0) + dist = ( + feat.pow(2).sum(-1, keepdim=True) + - 2 * torch.matmul(feat, self.C) + + self.Cnorm + ) + tokens = dist.argmin(dim=-1) + if tokens.dim() < 2: + tokens = tokens.unsqueeze(0) + return tokens + + def decode(self, tokens): + if self.vocoder is None: + raise ValueError("Vocoder is not set") + sig_items = [ + self.vocoder(item, dur_prediction=True) + for item in tokens + ] + sig, _ = batch_pad_right(sig_items) + return sig diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 0e1ffe3f8..92f7ee7d2 100644 --- 
a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -21,6 +21,8 @@ from torch.autograd import Function from torch.nn.utils import remove_weight_norm, weight_norm +from speechbrain.dataio.dataio import length_to_mask + class SQCodec(nn.Module): """ @@ -124,7 +126,7 @@ def build_codec_model(self, config): exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) device = next(iter(scalar_codec.parameters())).device - parameter_dict = torch.load(self.ckpt_path, map_location=device) + parameter_dict = torch.load(self.ckpt_path, map_location=device, weights_only=False) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -1281,6 +1283,109 @@ def forward(self, x): return x +class TernaryEmbedding(nn.Module): + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + num_digits : int + The number of ternary digits + shift : int + The number of digits to "shift" embeddings by. + This is needed when text and special tokens are concatenated + shift_cutoff : int + The shifted tokens + flat : bool + Where to enable "flat" embeddings (e.g. multiple codebooks "flattened") + """ + def __init__( + self, + num_digits, + shift=None, + shift_cutoff=None, + hybrid=False, + hybrid_cutoff=None, + hybrid_size=None, + flat=False): + super().__init__() + self.num_digits = num_digits + if hybrid: + shift = None + self.shift = shift + if shift_cutoff is None and shift: + shift_cutoff = 3**shift + self.shift_cutoff = shift_cutoff + if hybrid and not flat: + raise ValueError( + "Hybrid embeddings are currently supported" + "only for flattened mode") + self.flat = flat + self.hybrid = hybrid + self.hybrid_cutoff = hybrid_cutoff + if hybrid: + self.emb = torch.nn.Embedding(hybrid_cutoff + 1, hybrid_size) + torch.nn.init.uniform_(self.emb.weight, a=-1., b=1.) + + def forward(self, tokens): + """Computes the forward pass + + Arguments + --------- + tokens : torch.Tensor + the tokens + """ + squeeze = False + if tokens.dim() < 3: + squeeze = True + tokens = tokens.unsqueeze(-1) + batch_size, max_len, tracks = tokens.shape + tokens = self._shift(tokens) + if self.hybrid: + # Note: Yes, text tokens will be "floored" but + emb_tokens = (tokens - self.hybrid_cutoff).clip(0) + else: + emb_tokens = tokens + emb = tokens_to_ternary(emb_tokens, D=self.num_digits).float() + if self.hybrid: + emb = self._hybrid_emb(emb, tokens) + positions = emb.size(-1) + if self.flat: + emb = emb.unsqueeze(-2) + else: + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if squeeze: + emb = emb.squeeze(-2) + return emb + + def _hybrid_emb(self, emb, tokens): + batch_size, max_len, tracks = tokens.shape + hybrid_emb = torch.cat( + [ + self.emb(tokens[:, :, 0].clip(max=self.hybrid_cutoff)), + torch.where( + (tokens[:, :, 0] < self.hybrid_cutoff).unsqueeze(-1), + torch.ones(batch_size, max_len, emb.size(-1), device=tokens.device) * -1, + emb + ) + ], + dim=-1 + + ) + return hybrid_emb + + def _shift(self, tokens): + if not self.shift: + return tokens + shift_multiplier = 3**self.shift + shift_offset = shift_multiplier - 1 + tokens_shift = torch.where( + tokens < self.shift_cutoff, + tokens, + (tokens - self.shift_cutoff) * shift_multiplier + shift_offset + ) + return tokens_shift + + def decimal_to_ternary_matrix(decimals, D): """ Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. 
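# Editor's note: a small round-trip sketch (not part of the patch) for the
# decimal/ternary helpers that TernaryEmbedding above relies on;
# ternary_matrix_to_decimal_torch is added in the next hunk. The import path
# is an assumption, mirroring the "from model.sq_codec import ..." convention
# used elsewhere in this patch.
import torch
from model.sq_codec import (
    decimal_to_ternary_matrix,
    ternary_matrix_to_decimal_torch,
)

tokens = torch.tensor([[5, 17, 0]])  # (B=1, T=3), all indices < 3**9
# decimal_to_ternary_matrix modifies its input in place, hence the clone()
digits = decimal_to_ternary_matrix(tokens.clone(), D=9)  # (1, 9, 3), digits in {0, 1, 2}
# e.g. 5 = 2 * 3**0 + 1 * 3**1, so digits[0, :, 0] begins with [2, 1, 0, ...]
restored = ternary_matrix_to_decimal_torch(digits)  # (1, 3)
assert torch.equal(restored, tokens)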
@@ -1300,7 +1405,7 @@ def decimal_to_ternary_matrix(decimals, D): corresponds to a batch, and each column is represented as a ternary number. """ B, T = decimals.shape - ternary_matrix = torch.zeros((B, D, T), dtype=torch.long) + ternary_matrix = torch.zeros((B, D, T), dtype=torch.long, device=decimals.device) for pos in range(D): ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation decimals //= 3 # Floor division for next ternary digit @@ -1342,6 +1447,40 @@ def ternary_matrix_to_decimal(matrix): return decimals +def ternary_matrix_to_decimal_torch(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + def get_padding(kernel_size, dilation=1): """ Computes the padding size for a given kernel size and dilation. @@ -1359,3 +1498,123 @@ def get_padding(kernel_size, dilation=1): Calculated padding size. 
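+
+    Example
+    -------
+    >>> get_padding(kernel_size=7, dilation=1)
+    3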
""" return int((kernel_size * dilation - dilation) / 2) + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of codebooks + + Returns + ------- + result: torch.Tensor + the result (Batch x Length x codebooks) + """ + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal_torch(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits, n_codebook=4): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2), n_codebook=n_codebook) + return tokens + + +def tokens_to_ternary(tokens, D=9): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + D : int + The number of ternary digits + + Returns + ------- + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" + has_batch = tokens.dim() > 2 + if not has_batch: + tokens = tokens.unsqueeze(0) + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat([ + decimal_to_ternary_matrix(item, D=D) - 1 + for item in tokens + ], dim=1) + ternary_matrix = ternary_matrix.transpose(1, 2) + if not has_batch: + ternary_matrix = ternary_matrix[0] + return ternary_matrix + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 3) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + ternary = logits.argmax(-1) - 1 + return ternary + + +def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", num_positions=9, reduction="mean"): + if targets.dim() < 3: + targets = targets.unsqueeze(-1) + if targets_type == "tokens": + targets = tokens_to_ternary(targets.unsqueeze(-1), D=num_positions) + batch_size, max_len, positions = targets.shape + targets_cat = targets + 1 + predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() + loss = nn.functional.nll_loss( + predictions_loss, + targets_cat, + reduction="none" + ) + if length is not None: + mask = length_to_mask( + length * max_len, + max_len + ) + mask = mask.unsqueeze(-1) + if mask is not None: + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).sum(1).sum(0) / mask.sum() + elif reduction == "batch": + loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) + return loss diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py new file mode 100644 index 000000000..0ff414c68 --- /dev/null +++ b/benchmarks/DASB/model/valle.py @@ -0,0 +1,1234 @@ +"""An adaptation of ESPNET VALL-E +Originally by Jinchuan Tian + +https://github.com/espnet/espnet + +Authors + * Artem Ploujnikov 2024 (adaptation only) +""" + +# Copyright 2024 Jinchuan Tian +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# 
Implementation of Vall-E: https://arxiv.org/abs/2301.02111 + +import logging +import torch +import inspect +from typing import Tuple, Optional +from speechbrain.dataio.dataio import length_to_mask + +from torch import Tensor +from torch import nn +from torch.nn import functional as F +from dataclasses import dataclass + +from speechbrain.nnet.losses import reduce_loss, truncate + + +@dataclass +class SpeechLMInferenceOptions: + """Inference options + """ + + device: str = None + search_algo: str = "topk_sampling" + nbest: int = 1 + sampling_temperature: float = 1.0 + top_k: int = 20 + maxlenratio: float = 0.0 + minlenratio: float = 0.0 + eos: int = 5 + start: int = 1 + masks: torch.Tensor = None + nq: int = None + allow_invalid: bool = True + + +class ValleLM(nn.Module): + """The Vall-E TTS model (decoder-only transformer), adopted from + ESPNET2 + + Arguments + --------- + vocab_size : int + Dimention of vocabulary. + nq : int + Number of codes for each token / frame, usually for speech codec. + share_emb : bool + If true, share the embedding and lm_head weight. + qk_norm : bool + If true, apply LayerNorm to q and k in atention. + dropout : float + dropout rate for attention layers. + target_dropout : float + a separate dropout applied to targets only (may be + useful to mitigate autorgressive prediction instability) + att_unit: int + Dimention of Transformer attention. + head : int + Number of heads in Transformer attention. + ar_layer : int + Number of layers in AR Transformer. + nar_layer : int + Number of layers in NAR Transformer. + n_ctx : int + maximum context length of AR & NAR Transformer. + lm_head : torch.nn.Module, optional + an alternative LM head implementation head, an alternative + to the default Linear, useful for non-trivial codecs, + such as SQ-Codec + logits_to_probs : callable, optional + A module or a function that converts logits to token probabilities to + support top-K sampling + """ + + def __init__( + self, + vocab_size, + nq, + pad_id=0, + share_emb=True, + qk_norm=False, + dropout=0.0, + target_dropout=0.0, + att_unit=256, + head=2, + ar_layer=4, + nar_layer=4, + n_ctx=3000, + emb=None, + lm_head=None, + logits_to_probs=None, + ): + super().__init__() + if emb is None: + emb = torch.nn.Embedding(vocab_size, att_unit) + self.emb = emb + if lm_head is None: + lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + self.lm_head = lm_head + spec = inspect.getfullargspec(lm_head.forward) + self.lm_head_multitrack = "track" in spec.args + if logits_to_probs is None: + logits_to_probs = nn.Identity() + self.logits_to_probs = logits_to_probs + if share_emb: + self.lm_head.weight = self.emb.weight + + self.ar_decoder = TransformerDecoder( + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=ar_layer, + qk_norm=qk_norm, + dropout=dropout, + target_dropout=target_dropout + ) + if nq > 1: + # NOTE: An NAR encoder is not needed if there is only one track + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nq = nq + self.n_ctx = n_ctx + self.pad_id = pad_id + self._initialize() + + def forward( + self, + dec_seq, + dec_seq_lengths=None, + prefix_len=None, + conti_feats=None, + nar_level_idx=1, + predict_ar=True, + predict_nar=True, + ): + """Vall-E forward for training + + Arguments + --------- + dec_seq : torch.Tensor + Batch of decoder sequences (B, T, nq). + dec_seq_lengths : torch.Tensor + Lengths of batched decoder sequences (B,). 
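+            These are relative lengths (fractions of the padded length), in the
+            SpeechBrain convention; the forward pass rescales them by the padded
+            length before building the attention mask.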
+ enc_seq : torch.Tensor + Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. + enc_seq_lengths : torch.Tensor + Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len : torch.Tensor + Lengths of condition part in dec_seq (B,). + nar_level_idx : int + the index of the non-autoregressive level to train + predict_ar : bool + Whether to make an autoregressive prediction + predict_nar : bool + Whether to make a non-autoregressive prediction + + Returns + ------- + logits_ar : torch.Tensor + Autoregressive predictions + logits_nar : torch.Tensor + Non-autoregressive predictions + """ + + assert dec_seq.dim() == 3 + + dec_seq_emb = self.emb(dec_seq) # [B, T, nq, D] + dec_seq_emb, _ = install_continuous_features( + dec_seq_emb, None, conti_feats + ) + + # Auto-Regressive part + if predict_ar: + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) + + # Non-Auto-Regressive part + if predict_nar: + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + + # Logits + logits_ar, logits_nar = None, None + if predict_ar: + logits_ar = self.apply_lm_head(h_ar, 0) + if predict_nar: + logits_nar = self.apply_lm_head(h_nar, nar_level_idx + 1) + + return logits_ar, logits_nar + + def prepare_input(self, dec_seq_emb, prefix_len, level): + # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. + # This is because both prefix_mask and level_mask are broadcastable and will + # trigger user warning. + + # (1) level mask, [B, 1, nq, 1], True is to include + if isinstance(level, int): + level = torch.ones_like(dec_seq_emb[:, 0, 0, 0]) * level + level_mask = length_to_mask(level, self.nq).bool() + level_mask = ( + level_mask.unsqueeze(1).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (2) prefix mask, [B, T, 1, 1], True is the prefix + prefix_mask = length_to_mask( + prefix_len * dec_seq_emb.size(1), dec_seq_emb.size(1) + ).bool() + prefix_mask = ( + prefix_mask.unsqueeze(2).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (3) mask and then sum in nq-axis. + mask = torch.logical_or(level_mask, prefix_mask) + return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) + + @torch.no_grad() + def inference( + self, prefix, opts, enc_seq=None, suffix=None, + ): + """Vall-E Inference. + + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. 
+ + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens + """ + + # (1) initialization + cache = self.ar_decoder.init() + + # (2) auto-regressive prefix forward on first code layer + prefix = prefix.expand(opts.nbest, -1, -1) + if opts.search_algo == "teacher_force": + suffix = suffix.expand(opts.nbest, -1, -1) + prefix_emb = self.emb(prefix).sum(dim=2) # [B, T, D] + _ = self.ar_decoder(prefix_emb, kv_cache=cache) + + # (3) auto-regressive loop on first code layer + # (3.1) AR initialization + minlen = ( + int(prefix.size(1) * opts.minlenratio) + if opts.minlenratio > 0 + else 0 + ) + maxlen = int(prefix.size(1) * opts.maxlenratio) + if opts.search_algo == "teacher_force": + assert suffix is not None + minlen = suffix.size(1) + maxlen = suffix.size(1) + if maxlen + prefix.size(1) > self.n_ctx: + maxlen = self.n_ctx - prefix.size(1) + logging.info(f"maxlen={maxlen}, minlen={minlen}") + + generated = {"token": [], "score": []} + finish_idx = ( + torch.Tensor([-1]).expand(opts.nbest).long().to(opts.device) + ) + prev_tok = ( + torch.Tensor([opts.start]) + .tile(opts.nbest, 1) + .long() + .to(opts.device) + ) + modality_index = prev_tok.flatten() + mask = modality_index_to_mask(modality_index, opts) + tracks = prefix.size(-1) + is_flattened = opts.nq == 1 and tracks > 1 + if is_flattened: + prev_tok = prev_tok.expand(1, tracks) + mask_cache = [] + modality_tokens = torch.tensor( + list(opts.masks.keys()), device=prefix.device + ) + + for step in range(maxlen): + # (3.2) AR loop + if is_flattened: + prev_tok = prev_tok.unsqueeze(1) + prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] + h_ar = self.ar_decoder(prev_emb, kv_cache=cache) + logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] + if logits.dim() < 4: + logits = logits.unsqueeze(-2) + gen_tok, gen_score = logits_to_tokens( + logits, + opts, + mask, + allow_eos=step >= minlen, + nq_level=0, + ) + # [B, 1, 1] -> [B, 1] + gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) + + generated["token"].append(gen_tok) + generated["score"].append(gen_score) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, step : step + 1, 0] + else: + prev_tok = gen_tok # [B, 1] + + # (3.3) detect modality swtich + mask_cache.append(mask.clone()) + modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) + # Note: The ESPNET VALL-E had + # modality_change_mask = torch.logical_and( + # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + # ) + if torch.any(modality_change_mask): + modality_index = torch.where( + modality_change_mask, prev_tok[:, 0], modality_index, + ).flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] + mask = modality_index_to_mask(modality_index, opts) + logging.warning( + f"Step {step}: change modality index {modality_index}" + ) + + # (3.4) detect ended hypotheses. 
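+            # finish_idx records, per hypothesis, the step at which EOS was first
+            # emitted (-1 while still generating); once every entry is non-negative
+            # the autoregressive loop terminates early below.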
+ finish_idx = torch.where( + torch.logical_and(prev_tok[:, 0] == opts.eos, finish_idx == -1), + step, + finish_idx, + ) + + if torch.all(torch.ge(finish_idx, 0)): + break + + if step == maxlen - 1: + logging.warning( + f"Some examples cannot finish in {maxlen} steps: {finish_idx}" + f"Consider increasing the maxlenratio" + ) + + logging.info(f"Terminate at steps: {finish_idx.cpu().tolist()}") + + # (3.4) finalize auto-regressive + if opts.allow_invalid: + valid_idx = torch.arange(len(finish_idx), device=finish_idx.device) + finish_idx = torch.where(finish_idx == -1, step, finish_idx) + else: + valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] + if len(valid_idx) == 0: + self.ar_decoder.reset() + logging.warning(f"No valid examples. Return None") + return [], [] + elif len(valid_idx) < prefix.size(0): + logging.info(f"Only {len(valid_idx)} of {prefix.size(0)} are valid") + + finish_idx = finish_idx[valid_idx] + prefix_emb = prefix_emb[valid_idx] + if opts.search_algo == "teacher_force": + suffix = suffix[valid_idx] + gen_tokens_ar = torch.cat(generated["token"], dim=1)[ + valid_idx + ].unsqueeze( + 2 + ) # [B, T, 1] + gen_scores_ar = torch.cat(generated["score"], dim=1)[ + valid_idx + ].unsqueeze(2) + gen_tokens_ar = gen_tokens_ar[:, : finish_idx.max() + 1] # idx -> count + gen_scores_ar = gen_scores_ar[:, : finish_idx.max() + 1] + + self.ar_decoder.reset() + + # (4) non-auto-regressive loop on the remained code layers + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_token = torch.tensor( + [opts.start], device=prefix.device + )[None, None, :] + + # (4.2) NAR loop + if self.nq > 1: + start_emb = self.emb(start_token).squeeze().tile( + len(valid_idx), 1, 1 + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + for step in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + else: + gen_tokens = gen_tokens_ar + gen_scores = gen_scores_ar + + gen_tokens_list, gen_scores_list = [], [] + for b in range(len(valid_idx)): + item_finish_idx = finish_idx[b] + 
gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) + gen_scores_list.append(gen_scores[b][:item_finish_idx]) + return gen_tokens_list, gen_scores_list + + def apply_lm_head(self, x, track): + """Applies the language model head + + Arguments + --------- + """ + + if self.lm_head_multitrack: + result = self.lm_head(x, track) + else: + result = self.lm_head(x) + return result + + def _initialize(self): + for m in self.modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + if m.bias is not None: + torch.nn.init.zeros_(m.bias) + elif isinstance(m, torch.nn.Embedding): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + + +class ResidualAttentionBlock(nn.Module): + """A VALL-E residual attention block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of heads + cross_attention : bool + Whether to use cross-attention + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + """ + + def __init__( + self, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, + ): + super().__init__() + + self.attn = MultiHeadAttention( + n_state, n_head, causal=causal, qk_norm=qk_norm, dropout=dropout, + ) + self.attn_ln = LayerNorm(n_state) + self.attn_dropout = nn.Dropout(p=dropout) + + self.cross_attn = ( + MultiHeadAttention( + n_state, n_head, causal=False, qk_norm=qk_norm, dropout=dropout, + ) + if cross_attention + else None + ) + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + self.cross_attn_dropout = ( + nn.Dropout(p=dropout) if cross_attention else None + ) + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state) + ) + self.mlp_ln = LayerNorm(n_state) + self.mlp_dropout = nn.Dropout(p=dropout) + + def forward( + self, x, xa=None, mask=None, kv_cache=None, + ): + """The forward pass implementation + + Arguments + --------- + x : torch.Tensor + the feature tensor + xa : torch.Tensor + The tensor for cross-attention + mask : torch.Tensor + The attention mask to be applied + + """ + x = x + self.attn_dropout( + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x))) + return x + + +class TransformerDecoder(nn.Module): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + target_dropout : float + The target dropout probability + layer_class : type + The layer type to be used + """ + def __init__( + self, + n_ctx, + n_state, + n_head, + n_layer, + causal=True, + qk_norm=False, + dropout=0.0, + target_dropout=0.0, + layer_class=ResidualAttentionBlock, + ): + + super().__init__() + + self.pos_emb = nn.Embedding(n_ctx, n_state) + + self.blocks = nn.ModuleList( + [ + layer_class( + n_state=n_state, + n_head=n_head, + cross_attention=False, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + for _ in range(n_layer) + ] + ) + self.ln = LayerNorm(n_state) + self.target_dropout = nn.Dropout(target_dropout) + + self.causal = causal + self.kv_cache = None + + def forward( + self, x, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the feature tensor + mask : torch.Tensor + The attention mask to be applied + kv_cache : dict + The key/value cache (for inference) + + Returns + ------- + result : torch.Tensor + The decoder output + """ + if self.causal and mask is not None: + raise ValueError("Causal Transformer dones't allow mask") + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + tgt = self.target_dropout(x) + + for block in self.blocks: + x = block(x, tgt, mask=mask, kv_cache=kv_cache) + tgt = x + + x = self.ln(x) + return x + + def init(self): + """Initializes the key/value cache and the hooks to update it""" + self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) + return self.kv_cache + + def reset(self): + """Resets the key-value cache""" + for hook in self.hooks: + hook.remove() + self.kv_cache = None + + +class LayerNorm(nn.LayerNorm): + """A layer normalziation wrapper""" + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor to be normalized + + Returns + ------- + result : torch.Tensor + A normalzied tensor + """ + return super().forward(x.float()).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, + self.weight.to(x.dtype), + None if self.bias is None else self.bias.to(x.dtype), + ) + + +class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + """"The Vall-E Adaptive Residual Attention Block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of states + n_head : int + The number of attention heads + cross_attention : bool + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + + def __init__( + self, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, + ): + super(ResidualAttentionBlockAdaLN, self).__init__( + n_state=n_state, + n_head=n_head, + cross_attention=cross_attention, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.attn_ln = AdaLN(n_state) + self.mlp_ln = AdaLN(n_state) + + def forward( + self, x, level, xa=None, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + x = x + self.attn_dropout( + self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn( + self.cross_attn_ln(x, level), xa, kv_cache=kv_cache + ) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x, level))) + return x + + +class ValleNARDecoder(TransformerDecoder): + def __init__( + self, + n_level, + n_ctx, + n_state, + n_head, + n_layer, + causal=False, + qk_norm=False, + dropout=0.0, + layer_class=ResidualAttentionBlockAdaLN, + ): + """The VALL-E non-autoregressive decoder + + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ + super().__init__( + n_ctx=n_ctx, + n_state=n_state, + n_head=n_head, + n_layer=n_layer, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + layer_class=layer_class, + ) + + self.level_emb = nn.Embedding(n_level, n_state) + self.ln = AdaLN(n_state) + + def forward( + self, x, level, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + if self.causal and mask is not None: + raise ValueError("mask is not allowed when causal") + + level = self.level_emb(level) + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, level=level, mask=mask, kv_cache=kv_cache) + + x = self.ln(x, level) + return x + + +class MultiHeadAttention(nn.Module): + """A Multi-Head Attention implementation + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + + def __init__( + self, n_state, n_head, causal=False, qk_norm=False, dropout=0.0, + ): + super().__init__() + assert n_state % n_head == 0 + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + self.causal = causal + self.dropout = dropout + + self.qk_norm = qk_norm + if qk_norm: + self.q_norm = LayerNorm(n_state // n_head) + self.k_norm = LayerNorm(n_state // n_head) + + def forward( + self, x, xa=None, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. + k = kv_cache[self.key] + v = kv_cache[self.value] + + wv = self.qkv_attention(q, k, v, mask) + + return self.out(wv) + + def qkv_attention(self, q, k, v, mask=None): + """Computes scaled dot-product attention + + Arguments + --------- + q : torch.Tensor + The queries tensor + k : torch.Tensor + The keys tensor + v : torch.Tensor + The values tensor + mask : torch.Tensor, optional + The attention mask + + Returns + ------- + wv : torch.Tensor + The attention output + """ + if self.causal and mask is not None: + raise ValueError("mask is not allowed when the attention is causal") + + if self.causal and q.size(1) == k.size(1): + causal = True + else: + causal = False + + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + if self.qk_norm: + q = self.q_norm(q) + k = self.k_norm(k) + wv = ( + F.scaled_dot_product_attention( + q, k, v, mask, is_causal=causal, dropout_p=self.dropout if self.training else 0.0 + ) + .permute(0, 2, 1, 3) + .flatten(start_dim=2) + ) + + return wv + + +class AdaLN(nn.Module): + """Adaptive Layer Normalization, a Layer Norm implementation + that learns an affine transformation based on the level + embedding + + Arguments + --------- + n_state : int + The number of states + eps : float + The layer norm epsilon parameter""" + + def __init__(self, n_state, eps=1e-5): + super().__init__() + self.weight = nn.Linear(n_state, n_state, bias=False) + self.bias = nn.Linear(n_state, n_state, bias=False) + nn.init.constant_(self.weight.weight, 1.0) + nn.init.constant_(self.bias.weight, 0.0) + + self.n_state = n_state + self.eps = eps + + def forward(self, x, level_emb): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor + level_emb : torch.Tensor + The level embedding + """ + w = self.weight(level_emb).unsqueeze(1) + b = self.bias(level_emb).unsqueeze(1) + x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) + x = w * x + b + return x + + +def install_kv_cache_hook(model, cache): + """Sets up the key/value cache hook + + Arguments + --------- + model : torch.nn.Module + The model + cache : dict + The cache
content + + Returns + ------- + cache : dict + The cache dictionary (new or copied) + hooks : list + The installed forward hooks + """ + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache: + # save as-is, for the first token or cross attention + cache[module] = output + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: torch.nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + model.apply(install_hooks) + return cache, hooks + + +def logits_to_tokens( + logits, opts, mask, search_algo=None, allow_eos=True, nq_level=None, +): + """ + Select the generated tokens and their scores based on logits prediction. + + Arguments + --------- + logits : torch.Tensor + predicted logits, of size [B, T, nq, V] + opts : SpeechLMInferenceOptions + search options + mask : torch.Tensor + mask to specify valid tokens, of size [B, 1, nq, V] + search_algo : str + search algorithm + allow_eos : bool + whether to allow end-of-sentence prediction + nq_level : int, optional + if not None, only compute the specified codec level nq. + + Returns + ------- + gen_token_idx : torch.Tensor + The token indices + gen_token_score : torch.Tensor + The token scores + """ + + assert logits.dim() == 4 + search_algo = search_algo if search_algo is not None else opts.search_algo + neg_inf = torch.finfo(logits.dtype).min + + # (1) Apply mask + if nq_level is not None: + mask = mask[:, :, nq_level : nq_level + 1] + + if allow_eos: + mask = mask.clone() + mask[:, :, 0, opts.eos] = False + + logits.masked_fill_(mask, neg_inf) + + # (2) token selection + if search_algo in ["topk_sampling"]: + topk_values, topk_indices = torch.topk(logits, opts.top_k, dim=-1) + probs = torch.softmax(topk_values / opts.sampling_temperature, dim=-1) + inner_indices = torch.multinomial( + probs.flatten(end_dim=-2), num_samples=1 + ).view(probs[..., :1].size()) + gen_token_idx = torch.gather(topk_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["topp_sampling"]: + probs = torch.softmax(logits / opts.sampling_temperature, dim=-1) + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + accum_probs = torch.cumsum(sorted_probs, dim=-1) + clip_probs = torch.where(accum_probs <= opts.top_p, sorted_probs, 0.0) + # always keep at least one candidate no matter what value it is + if torch.any(clip_probs[..., 0] == 0.0): + clip_probs[..., 0] = sorted_probs[..., 0] + clip_probs = clip_probs / clip_probs.sum(dim=-1, keepdim=True) + inner_indices = torch.multinomial( + clip_probs.flatten(end_dim=-2), num_samples=1 + ).view(clip_probs[..., :1].size()) + gen_token_idx = torch.gather(sorted_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(clip_probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["greedy_search", "teacher_force"]: + probs = logits.softmax(dim=-1) + # select from probabilities so the score is a log-probability, consistent with the sampling branches + topk_values, topk_indices = torch.topk(probs, 1, dim=-1) + gen_token_idx = topk_indices[:, :, :, 0] + gen_token_score = topk_values[:, :, :, 0].log() + + else: + raise NotImplementedError(f"opts.search_algo={opts.search_algo}") + + return gen_token_idx, gen_token_score + + +@torch.no_grad() +def install_continuous_features( + dec_emb: torch.Tensor, +
enc_emb: Optional[torch.Tensor] = None, + conti_feats: Tuple = None, +): + if conti_feats is None: + return dec_emb, enc_emb + + assert dec_emb.size(0) == len(conti_feats) + if enc_emb is not None: + assert enc_emb.size(0) == len(conti_feats) + + for b, conti_feat in enumerate(conti_feats): + for conti_emb, start, end, part in conti_feat: + if part == "dec": + assert conti_emb.size(1) == dec_emb.size(2) + dec_emb[b, start:end] = conti_emb + else: + assert conti_emb.size(1) == enc_emb.size(2) + enc_emb[b, start:end] = conti_emb + + return dec_emb, enc_emb + + +def modality_index_to_mask( + modality_index: torch.Tensor, inference_opts: SpeechLMInferenceOptions, +): + assert modality_index.dim() == 1 + modality_index = modality_index.cpu().tolist() + mask = torch.stack( + [inference_opts.masks[idx] for idx in modality_index], dim=0 + ).unsqueeze( + 1 + ) # [B, 1, nq, V] + + return mask + + +def masked_nll_loss( + log_probabilities, targets, mask, allowed_len_diff=3, reduction="mean" +): + """Similar to the standard nll_loss from SpeechBrain + but applies a custom mask + + Arguments + --------- + log_probabilities : torch.Tensor + The probabilities after log has been applied. + Format is [batch, log_p] or [batch, frames, log_p]. + targets : torch.Tensor + The targets, of shape [batch] or [batch, frames]. + mask : torch.Tensor + The mask for loss calculation + allowed_len_diff : int + Length difference that will be tolerated before raising an exception. + reduction : str + Options are 'mean', 'batch', 'batchmean', 'sum'. + See pytorch for 'mean', 'sum'. The 'batch' option returns + one loss per item in the batch, 'batchmean' returns sum / batch size. + """ + log_probabilities, targets = truncate( + log_probabilities, targets, allowed_len_diff + ) + dims = [0, log_probabilities.dim() - 1] + list(range(1, log_probabilities.dim() - 1)) + log_probabilities = log_probabilities.permute(dims).contiguous() + loss = torch.nn.functional.nll_loss( + input=log_probabilities, target=targets.long(), reduction="none" + ) + while mask.dim() < loss.dim(): + mask = mask.unsqueeze(-1) + mask = mask.expand_as(loss) + loss *= mask + loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) + return loss.contiguous() diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh old mode 100644 new mode 100755 index e0f848aef..5dcd6b397 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -149,8 +149,13 @@ seed="${seed:-$RANDOM}" if [ "$rnd_dir" = True ]; then - rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) - output_folder="$output_folder/$rnd_dirname" + if [[ ! 
-z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi fi # Make sure the output_folder is created @@ -181,7 +186,7 @@ mkdir -p $cached_data_folder # Function to run the training experiment run_experiment() { -python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +eval python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ $additional_flags } @@ -201,4 +206,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh old mode 100644 new mode 100755 index 2ad1dddf3..1ee79c0cd --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -63,6 +63,7 @@ orion_db_type="PickledDB" exp_max_trials=50 store_all=True compress_exp=True +hparam_filter="" # Function to print argument descriptions and exit print_argument_descriptions() { @@ -202,12 +203,28 @@ while [[ $# -gt 0 ]]; do shift ;; + --hparam_filter) + hparam_filter="$2" + shift + shift + ;; + --help) print_argument_descriptions ;; -*|--*) - additional_flags+="$1 $2 " # store additional flags + name=$1 + value=$2 + if [[ "$name" =~ ^--eval_run_ ]]; then + name=$(echo $name | sed s/^--eval_run_/--/) + eval_run_additional_flags+="$name $value " + else + if [[ ! "$eval_run_additional_flags" =~ "$name " ]]; then + eval_run_additional_flags+="$name $value " + fi + additional_flags+="$name $value " # store additional flags + fi shift # past argument ;; @@ -271,6 +288,7 @@ echo "-------------------------------------" get_flag() { local file_path="$1" local pattern="$2" + local filter="$3" # Check if the file exists if [ ! -f "$file_path" ]; then @@ -279,7 +297,7 @@ get_flag() { fi # Use grep to find all lines containing the pattern and then extract the flags using sed - grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n' + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | grep "$filter" | tr -d '\n' } @@ -323,7 +341,7 @@ function extract_best_params() { step_id=1 hparams_step=$hparams pattern="@orion_step1:" -opt_flags=$(get_flag "$hparams_step" "$pattern") +opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") # Check if the string is empty and exit with an error if it is if [ -z "$opt_flags" ]; then @@ -365,7 +383,7 @@ while [ -n "$opt_flags" ]; do eval $orion_hunt_command # Compress the exp folder (if required) - if [ "$compress_exp" = True ]; then + if [ "$compress_exp" = True ] && [ ! 
-e "$output_folder_step/exp.tar.gz" ]; then tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" if [ -d "$output_folder_step/exp" ]; then rm -rf "$output_folder_step/exp" @@ -399,7 +417,7 @@ while [ -n "$opt_flags" ]; do pattern="@orion_step$step_id:" # update optimization flags pattern - opt_flags=$(get_flag "$hparams_step" "$pattern") + opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") done echo @@ -415,6 +433,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing True $additional_flags + --rnd_dir False --testing True $eval_run_additional_flags -echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file +echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 0df315b7e..e11046ade 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -144,6 +144,8 @@ def aggregate_metrics(prototype, metrics): # Report final metric to Orion # Remember: orion expects metrics to be minimized! - if eval_metric == "acc" or eval_metric == "f1": + if eval_metric in ["acc", "f1"]: final_metric = 1 - final_metric + elif eval_metric == "utmos": + final_metric = -final_metric report_objective(final_metric) diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py deleted file mode 100644 index 9dc4014c4..000000000 --- a/benchmarks/DASB/utils/audio_tokens.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Utilities for discrete audio token models - - -Authors - * Artem Ploujnikov 2023 -""" -import torch -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.utils.data_utils import batch_pad_right -from functools import partial - - -def get_silence_token( - model, - sample_length=100000, - extract_emb=True, - device=None, - model_kwargs=None, -): - """Attempts to find out the silence tokens for a given model, - if applicable - - Arguments - --------- - model : nn.Module - A discrete token model, taking (wav, lengths) as arguments - sample_length : int - The length of the sample - extract_emb : bool - Whether to extract embeddings - device : str | torch.Device - The device to use - model_kwargs : dict - Additional arguments to pass to the model - - Returns - ------- - silence_tokens : torch.Tensor - The token(s) corresponding to silence - - silece_emb : torch.Tensor - The embedding(s) corresponding to silence - - """ - if device is None: - device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} - - audio = torch.zeros(1, sample_length, device=device) - length = torch.ones(1, device=device) - result = model(audio, length, **model_kwargs) - tokens = result[0] - silence_tokens = tokens.squeeze(0).mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[0, :, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in 
enumerate(mode_idx)] - ) - return silence_tokens, silence_emb - - -def feature_pad_to(tensor, length, padding=None): - """Pads feature dimensions to the specified length with the specified padding, - assuming a (Batch x Length x Features..) tensor - - Arguments - --------- - tensor : torch.Tensor - The tensor to be padded - - length : int - The length to which the tensor will be padded - - padding : torch.Tensor, optional - The padding tensor - if omitted, zero padding - will be used - - Returns - ------- - result : torch.Tensor - The padded tensor - """ - if padding is None: - padding = torch.zeros(tensor.shape[1:]) - padding = padding[None, ...].expand( - (length - tensor.size(0),) + tensor.shape[1:] - ) - return torch.cat([tensor, padding], dim=0) - - -def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih - can be a vector or a tensor - - Arguments - --------- - tensors : list - The list of tensors to be padded - padding : torch.Tensor - The padding tensor - - Returns - ------- - result : torch.Tensor - the padded tensor - """ - lengths_abs = torch.tensor( - [len(item) for item in tensors], device=tensors[0].device - ) - max_length = lengths_abs.max() - data = torch.stack( - [feature_pad_to(item, max_length, padding) for item in tensors] - ) - lengths = lengths_abs / max_length - return data, lengths - - -def token_collate_fn(examples, silence_token, token_keys): - """A customized collation function for audio tokens where - the specified silence token will be used as padding - instead of - zeros - - Arguments - --------- - examples : list - A list of examples - - silence_token : torch.Tensor - The token(s) representing silence - - token_keys : list - The list of keys to which special padding will be applied - - Returns - ------- - result : speechbrain.dataio.batch.PaddedBatch - A padded batch - """ - token_tensor_ids = {id(examples[0][key]) for key in token_keys} - return PaddedBatch( - examples, - padding_func=_silence_padding, - padding_kwargs={ - "silence_token": silence_token, - "token_tensor_ids": token_tensor_ids, - }, - ) - - -def _silence_padding(values, silence_token, token_tensor_ids): - return ( - batch_feature_pad(values, silence_token) - if id(values[0]) in token_tensor_ids - else batch_pad_right(values) - ) - - -def use_silence_padding(dataloader_opts, silence_token, token_keys): - """Overrides the collation function to add silence padding to - audio token features - - Arguments - --------- - dataloder_opts : dict - Dataloader options - silence_token : torch.Tensor - The tensor to be used as silence padding - token_keys : torch.Tensor - The keys to apply silence padding to - - Returns - ------- - dataloader_opts : dict - Updated data loader options - """ - return { - **dataloader_opts, - "collate_fn": partial( - token_collate_fn, silence_token=silence_token, token_keys=token_keys - ), - } diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index c0e14f867..76f2a6c2f 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -7,32 +7,52 @@ """ from speechbrain.inference.interfaces import Pretrained -from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import length_to_mask from speechbrain.decoders.seq2seq 
import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats -from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.data_utils import pad_right_to +from speechbrain.utils.fetching import fetch from collections import namedtuple from pathlib import Path -import os +from torch import nn import torch import torchaudio import re import string import logging -import shutil -import shlex -import subprocess + logger = logging.getLogger(__name__) + +has_transformers = False +try: + from transformers import AutoModelForAudioXVector + + has_transformers = True +except ImportError: + logger.warning( + "transformers library not found - some evaluators may be disabled" + ) + + RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) +SAMPLE_RATE = 16000 +DEFAULT_ENCODER_HUB = "chaanks/wav2vec2-small" +DEFAULT_MODEL_URL = "https://huggingface.co/chaanks/UTMOS/resolve/main" +DEFAULT_MODEL_NAME = "utmos.ckpt" +DEFAULT_SAVE_DIR = "./pretrained_models" +DEFAULT_JUDGE_ID = 288 +DEFAULT_DOMAIN_ID = 0 + SpeechEvaluationResult = namedtuple( "SpeechEvaluationResult", ["score", "details"] ) @@ -217,77 +237,6 @@ def __call__(self, wavs, length): return self.mods.model(wavs, length) -class RegressionModelSpeechEvaluator(SpeechEvaluator): - """A speech evaluator that uses a regression model - that produces a quality score (e.g. SSL fine-tuning) - for a sample of speech - - Arguments - --------- - source : str - The source model path or HuggingFace hub name - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.model = SpeechEvaluationRegressionModel.from_hparams( - source, *args, **kwargs - ) - - def evaluate( - self, - wavs, - length, - text=None, - wavs_ref=None, - length_ref=None, - sample_rate=None, - sample_rate_ref=None, - ): - """Evaluates a batch of waveforms - - Arguments - --------- - Arguments - --------- - wavs: torch.Tensor - the waveforms to evaluate - - length: torch.Tensor - relative lengths (a 1-D tensor) - - text : list, optional - Ground truth text - - wavs_ref : torch.Tensor - the reference waveforms - - length_ref : torch.Tensor - the reference waveform lengths - - sample_rate : int, optional - The sample rate of the audio. If not provided, - the audio is assumed to be at the same sample - rate as the model - - sample_rate_ref : int, optional - The sample rate of the reference samples - - Returns - ------- - result : SpeechEvaluationResult - an aggregated speech evaluation result with a score - for each item - """ - wavs = self.resample(wavs, sample_rate) - scores = self.model(wavs, length) - while scores.dim() > 1 and scores.size(-1) == 1: - scores = scores.squeeze(-1) - return SpeechEvaluationResult(score=scores, details={"score": scores}) - - class ASRSpeechEvaluator(SpeechEvaluator): """A superclass for ASR speech evaluators""" @@ -401,105 +350,6 @@ def _replace_blanks(self, preds): return [" " if item == "" else item for item in preds] -class EncoderDecoderASRSpeechEvaluator(ASRSpeechEvaluator): - """A speech evaluator implementation based on ASR. 
- Computes the Word Error Rate (WER), Character Error Rate (CER) - and a few other metrics - - Arguments - --------- - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.asr = EncoderDecoderASR.from_hparams(source, *args, **kwargs) - self.device = next(self.asr.mods.parameters()).device - - def evaluate_samples(self, wavs, length, text, sample_rate): - wavs = self.resample(wavs, sample_rate) - if text is None: - raise ValueError("This evaluator requires ground-truth text") - predicted_words, scores, log_probs = self.transcribe_batch_with_details( - wavs, length - ) - ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) - wer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=wavs.device - ) - cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device - ) - prob_mean = log_probs.exp().mean(dim=-1) - return { - "wer": wer, - "cer": cer, - "beam_score": scores, - "prob_mean": prob_mean, - "pred": predicted_words, - "target": text, - } - - def transcribe_batch_with_details(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - predicted_words : list - The raw ASR predictions, fully decoded - best_scores : list - The best scores (from beam search) - best_log_probs : list - The best predicted log-probabilities (from beam search) - - - Returns - ------- - predicted_words : list - The predictions - - best_scores : torch.Tensor - The best scores (from beam search) - - best_log_probs : torch.Tensor - The best log-probabilities - - """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.asr.encode_batch(wavs, wav_lens) - ( - hyps, - best_lens, - best_scores, - best_log_probs, - ) = self.asr.mods.decoder(encoder_out, wav_lens) - predicted_words = [ - self.asr.tokenizer.decode_ids(token_seq) for token_seq in hyps - ] - return predicted_words, best_scores, best_log_probs - - def to(self, device): - """Transfers this module to the spcieifed device - - Arguments - --------- - device : str | torch.Device - the target device - """ - self.asr = self.asr.to(device) - return self - - class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): """A speech evaluator implementation based on Whisper ASR @@ -743,171 +593,320 @@ def evaluate_files(self, file_names, text=None, file_names_ref=None): raise NotImplementedError() -UTMOS_REPO = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo" +class UTMOSModel(nn.Module): + """The UTMOS model wrapper + + Arguments + --------- + source : str + The WavLM source + save_path : str | path-like + The path where the model will be saved + features_dim : int, optional + The features dimension + num_domains : int, optional + The number of domains + domain_dim : int, optional + The dimension of each domain + num_judges : int, optional + The number of "judges" + judge_dim : int, optional + The dimension of each judge + decoder_hidden_size : int, optional + The size of the decoder hidden state + multiplier : float, optional + The number that the raw model output is multiplied by + to compute the 
score + offset : float, optional + The number that (raw output * multiplier) will be added + to in order to get the score + """ + + def __init__( + self, + source, + save_path, + features_dim=768, + num_domains=3, + domain_dim=128, + num_judges=3000, + judge_dim=128, + decoder_hidden_size=512, + multiplier=2.0, + offset=3.0, + ): + super().__init__() + + self.ssl_encoder = Wav2Vec2( + source, + save_path, + freeze=True, + output_norm=False, + freeze_feature_extractor=True, + output_all_hiddens=False, + ) + + self.domain_embedding = nn.Embedding(num_domains, domain_dim) + self.judge_embedding = nn.Embedding(num_judges, judge_dim) + + self.decoder = nn.LSTM( + input_size=features_dim + domain_dim + judge_dim, + hidden_size=decoder_hidden_size, + num_layers=1, + batch_first=True, + bidirectional=True, + ) + + self.classifier = nn.Sequential( + nn.Linear(decoder_hidden_size * 2, 2048), + torch.nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(2048, 1), + ) + self.multiplier = multiplier + self.offset = offset + + def forward(self, wav, domain_id=None, judge_id=None): + """Computes the forward pass + + Arguments + --------- + wav : torch.Tensor + The raw waveforms + domain_id : torch.Tensor + The domain identifiers + judge_id : torch.Tensor + The judge identifier + + Returns + ------- + result : torch.Tensor + The predicted rating(s) + """ + + if domain_id is None: + domain_id = torch.zeros( + len(wav), dtype=torch.int, device=wav.device + ) + if judge_id is None: + judge_id = ( + torch.ones(len(wav), dtype=torch.int, device=wav.device) + * DEFAULT_JUDGE_ID + ) + + ssl_features = self.ssl_encoder(wav) + domain_emb = self.domain_embedding(domain_id) + judge_emb = self.judge_embedding(judge_id) + + domain_emb = domain_emb.unsqueeze(1).expand( + -1, ssl_features.size(1), -1 + ) + judge_emb = judge_emb.unsqueeze(1).expand(-1, ssl_features.size(1), -1) + concatenated_feature = torch.cat( + [ssl_features, domain_emb, judge_emb], dim=2 + ) + + decoder_output, _ = self.decoder(concatenated_feature) + pred = self.classifier(decoder_output) + return pred.mean(dim=1).squeeze(1) * self.multiplier + self.offset -class UTMOSSpeechEvaluator(BulkSpeechEvaluator): - """An evaluation wrapper for UTMOS + +class UTMOSSpeechEvaluator(SpeechEvaluator): + """The UTMOS speech evaluator wrapper Github: https://github.com/sarulab-speech/UTMOS22 HuggingFace: https://huggingface.co/spaces/sarulab-speech/UTMOS-demo + Arguments --------- - model_path : str | path-like - The path where the HuggingFace repository was extracted - output_folder : str | path-like - The folder where results will be output - ckpt_path : str | path-like - The path to the checkpoint to be used - script : str | path-like - The path to the evaluation script, defaults to the bundled - predict.py - python : str | path-like, optional - The path to the Python interpreter to be used, defaults to - "python". Depending on the environment, it might need to be - changed (e.g. to "python3" or an absolute path to the interpreter) - use_python : bool - Whether to launch the script using python. This flag will need to be - set to False in environments where running UTMOS requires a wrapper shell - script (e.g. to initialize a different Python virtual environment from - the one in which SpeechBrain is running) - tmp_folder : str | path-like, optional - The temporary folder where files will be copied for evaluation. If - omitted, it will be set to output_folder. This can be useful on - compute environments that provide fast local storage (e.g. 
certain - compute clusters) - repo : str - The repor + source : str, optional + The WavLM source + save_path : str | path-like, optional + The path where the model will be saved + model_name : str + The name of the model hub + model_url : str + The model URL (if applicable) + domain_id : int + The domain ID of the underlying model + judge_id : int + The judge ID to use (given UTMOS was trained as an ensemble + of judges) + run_opts: dict, optional + The run options + sample_rate : int + The sample rate of the underlying model """ def __init__( self, - model_path, - output_folder, - ckpt_path, - script="predict.py", - python="python", - use_python=True, - batch_size=8, - tmp_folder=None, - repo=UTMOS_REPO, + source=None, + save_path=None, + model_name=None, + model_url=None, + domain_id=None, + judge_id=None, + run_opts=None, + sample_rate=16000, ): - self.output_folder = Path(output_folder) - rand = torch.randint(1, 999999999, (1,)).item() - if tmp_folder is None: - tmp_folder = self.output_folder - else: - tmp_folder = Path(tmp_folder) - self.eval_path = (tmp_folder / f"eval_{rand}").absolute() - self.model_path = Path(model_path).absolute() - script = self.model_path / script - self.script = script - self.ckpt_path = Path(ckpt_path).absolute() - self.batch_size = batch_size - self.python = python - self.use_python = use_python - self.repo = repo - self.install() - - def install(self): - if self.model_path.exists(): - logger.info("UTMOS is already installed in %s", self.model_path) - return - logger.info( - "Attempting to install UTMOS from %s to %s", - self.repo, - self.model_path, - ) - cmd = shlex.join( - [ - "git", - "-C", - str(self.model_path.parent), - "clone", - self.repo, - str(self.model_path.name), - ] - ) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - logger.info("Repository clone successful, performing an LFS fetch") - cwd = Path.cwd() - try: - os.chdir(self.model_path) - cmd = shlex.join(["git", "lfs", "fetch"]) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - finally: - os.chdir(cwd) - if not self.ckpt_path.exists(): - raise ValueError("ckpt_path {ckpt_path} does not exist") - - def evaluate_files(self, file_names, text, file_names_ref=None): - """Evaluates multiple files + super().__init__(sample_rate=sample_rate) + self.model = UTMOSModel(source=source, save_path=save_path,) + if run_opts is not None: + device = run_opts.get("device") + if device: + self.model = self.model.to(device) + fetch(model_name, model_url, save_path) + model_path = Path(save_path) / model_name + state_dict = torch.load(model_path) + self.model.load_state_dict(state_dict) + self.model.eval() + + self.domain_id = domain_id + self.judge_id = judge_id + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + """Evaluates a batch of waveforms using UTMOS Arguments --------- - file_names : list - A list of files - - text : list - File transcripts (not required for all evaluators) - Not used in this evaluator - - file_names_ref : list, optional - A list of reference files / ground truths (if applicable) - Not used in this evaluator + wavs: torch.Tensor + the waveforms to evaluate + length: torch.Tensor + relative lengths (a 1-D tensor) + text : list, optional + Ground truth text. Ignored for UTMOS. + wavs_ref : torch.Tensor + the reference waveforms. Ignored for UTMOS. 
+ length_ref : torch.Tensor + the reference waveform lengths. Ignored for UTMOS. + sample_rate : int, optional + The sample rate of the audio. If not provided, + the audio is assumed to be at the same sample + rate as the model + sample_rate_ref : int, optional + The sample rate of the reference samples. Ignored for UTMOS. Returns ------- result : SpeechEvaluationResult - a consolidated evaluation result + an aggregated speech evaluation result with a score + for each item """ - current_path = os.getcwd() - try: - self.eval_path.mkdir(parents=True, exist_ok=True) - logger.info("Copying the files to '%s'", self.eval_path) - for file_name in file_names: - target_file_name = self.eval_path / Path(file_name).name - shutil.copy(file_name, target_file_name) - - logger.info("Running evaluation") - result_path = self.eval_path / "result.txt" - os.chdir(self.model_path) - cmd = [ - str(self.script), - "--mode", - "predict_dir", - "--bs", - str(self.batch_size), - "--inp_dir", - str(self.eval_path), - "--out_path", - str(result_path), - "--ckpt_path", - str(self.ckpt_path), - ] - if self.use_python: - cmd = [self.python] + cmd - - output = subprocess.check_output(cmd) - logger.info("Evaluation finished, output: %s", output) - file_names = [path.name for path in self.eval_path.glob("*.wav")] - with open(result_path) as result_path: - scores = [float(line.strip()) for line in result_path] - score_map = dict(zip(file_names, scores)) - scores_ordered = [ - score_map[Path(file_name).name] for file_name in file_names - ] - return SpeechEvaluationResult( - scores_ordered, {"utmos": scores_ordered} + wavs = self.resample(wavs, sample_rate=sample_rate) + domain_id, judge_id = None, None + if self.domain_id is not None: + domain_id = ( + torch.ones(len(wavs), device=wavs.device) * self.domain_id + ) + if self.judge_id is not None: + judge_id = torch.ones(len(wavs), device=wavs.device) * self.judge_id + + scores = self.model(wav=wavs, domain_id=domain_id, judge_id=judge_id) + return SpeechEvaluationResult(score=scores, details={"utmos": scores}) + + +class SpkSimWavLM(SpeechEvaluator): + """A speaker similarity evaluator based on WavLM / XVector + + Arguments + --------- + source : str + The model hub to use + savedir : str + The path where the model will be saved + model_sample_rate : int, optional + The sample rate to which all samples will be resampled + before being processed + """ + + def __init__( + self, + source, + savedir, + model_sample_rate=16000, + run_opts=None, + *args, + **kwargs, + ): + if not has_transformers: + raise ValueError( + "Unable to use the SpkSimWavLM evaluator because the " + "transformers library is not enabled" ) - finally: - os.chdir(current_path) - shutil.rmtree(self.eval_path) + if run_opts is None: + run_opts = {} + device = run_opts.get("device") + self.model = AutoModelForAudioXVector.from_pretrained( + source, cache_dir=savedir, *args, **kwargs + ) + if device is not None: + self.model = self.model.to(device) + + self.model.eval() + self.model_sample_rate = model_sample_rate + self.device = next(self.model.parameters()).device + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + # Resample + if sample_rate is not None: + wavs = torchaudio.functional.resample( + wavs, orig_freq=sample_rate, new_freq=self.model_sample_rate + ) + if sample_rate_ref is not None: + wavs_ref = torchaudio.functional.resample( + wavs_ref, + orig_freq=sample_rate_ref, + new_freq=self.model_sample_rate, + ) + 
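+        # Hypothesis and reference waveforms are padded to a shared maximum length and stacked into a single batch below, so the x-vector model needs only one forward pass; the absolute lengths are kept to build the attention mask.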
+ # Concatenate + batch_size, wavs_max_len = wavs.shape + _, wavs_ref_max_len = wavs_ref.shape + length_abs = length * wavs_max_len + length_ref_abs = length_ref * wavs_ref_max_len + max_len = max(wavs_max_len, wavs_ref_max_len) + wavs, _ = pad_right_to(wavs, (batch_size, max_len)) + wavs_ref, _ = pad_right_to(wavs_ref, (batch_size, max_len)) + audio = torch.cat([wavs, wavs_ref]) + + length_cat_abs = torch.cat([length_abs, length_ref_abs]) + # Attention mask + attention_mask = length_to_mask( + length_cat_abs.int() + ).long() # 0 for masked tokens + # Forward + with torch.inference_mode(): + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings + hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) + scores = torch.nn.functional.cosine_similarity( + hyp_embs, ref_embs, dim=-1 + ) + + return SpeechEvaluationResult(scores, {"score": scores}) def vocoder_to_device(vocoder, device): diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index be73fda74..a522aaecf 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -10,7 +10,9 @@ import sys import os import torch +import re from abc import ABC, abstractmethod +from pathlib import Path from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( DiscreteSSL, @@ -19,6 +21,17 @@ from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi +from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.fetching import fetch +from model.fairseq_hubert import FairseqHuBERT +from torch import nn +import logging +import shlex +import yaml + +logger = logging.getLogger(__name__) + + base_dir = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") @@ -499,7 +512,7 @@ def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - signal = self.decode(tokens.view(tokens.shape[0], -1), **kwargs) + signal = self.decode(tokens.reshape(tokens.shape[0], -1), **kwargs) return signal.squeeze(1) @torch.no_grad() @@ -513,3 +526,132 @@ def get_pretrained_embeddings( raise ValueError( "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." 
) + + +DEFAULT_ESPNET_REPO = "https://github.com/espnet/espnet" + + +class ESPNetEncodecInterface(BaseTokenizer, nn.Module): + """An interface for pretrained ESPNet Encodec implementations + + Arguments + --------- + source : str + The HuggingFace hub or local path from which the checkpoint and configuration are fetched + model_ckpt : str + The name of the checkpoint file within the source + model_config : str + The name of the configuration file within the source + save_path : str | path-like + The path where downloaded artifacts will be saved + sample_rate : int + The model sample rate + n_codebook : int + The number of codebooks to retain + espnet_repo : str + The ESPnet repository URL (cloned if espnet2 is not installed) + espnet_commit : str, optional + The ESPnet commit to check out + """ + + def __init__( + self, + source, + model_ckpt, + model_config, + save_path, + sample_rate=24000, + n_codebook=32, + espnet_repo=DEFAULT_ESPNET_REPO, + espnet_commit=None, + ): + super().__init__() + self.source = source + self.model_ckpt = model_ckpt + self.model_config = model_config + self.save_path = Path(save_path) + self.sample_rate = sample_rate + self.n_codebook = n_codebook + self.espnet_repo = espnet_repo + self.espnet_commit = espnet_commit + self._load() + + def _load(self): + self._load_espnet() + ckpt_file_name = fetch( + filename=self.model_ckpt, + source=self.source, + savedir=str(self.save_path), + save_filename=str(Path(self.model_ckpt).name) + ) + config_file_name = fetch( + filename=self.model_config, + source=self.source, + savedir=str(self.save_path), + save_filename="config.yaml" + ) + with open(config_file_name) as config_file: + config = yaml.safe_load(config_file) + from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + self.encodec = ESPNetEncodec(**config["codec_conf"]) + device = next(iter(self.encodec.parameters())).device + state_dict = torch.load(ckpt_file_name, map_location=device) + state_dict = { + re.sub(r"^codec\.", "", key): value + for key, value in state_dict.items() + } + self.encodec.load_state_dict(state_dict) + + def _load_espnet(self): + try: + import espnet2 + except ModuleNotFoundError: + self._download_espnet() + + def _download_espnet(self): + logger.info("espnet is not installed, installing") + espnet_path = self.save_path / "espnet" + if not espnet_path.exists(): + logger.info("Cloning %s into %s", self.espnet_repo, espnet_path) + cmd = shlex.join(["git", "clone", self.espnet_repo, str(espnet_path)]) + run_shell(cmd) + else: + logger.info("%s already exists", espnet_path) + if self.espnet_commit: + logger.info("Checking out %s", self.espnet_commit) + cmd = shlex.join(["git", "-C", str(espnet_path), "checkout", self.espnet_commit]) + run_shell(cmd) + logger.info("Installing") + cmd = shlex.join(["pip", "install", "-e", str(espnet_path)]) + run_shell(cmd) + logger.info("Installation completed") + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.encodec.eval() + if signal.dim() < 3: + signal = signal.unsqueeze(1) + tokens = self.encodec.encode(signal) + return tokens.permute(1, 2, 0)[:, :, :self.n_codebook] + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.encodec.eval() + tokens = tokens.permute(2, 0, 1) + signal = self.encodec.decode(tokens, **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + Extracting pretrained quantizer embeddings is not implemented + for the ESPNet Encodec interface. + """ + raise ValueError( + "Extracting pretrained quantizer embeddings is not supported by the ESPNet Encodec interface."
+ ) + + +class FairseqHuBERTTokenizer(FairseqHuBERT, BaseTokenizer): + def __init__(self, *args, **kwargs): + FairseqHuBERT.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + self.eval() + tokens = self.encode(signal) + tokens = tokens.unsqueeze(-1) + return tokens + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + return self.decode(tokens.permute(0, 2, 1)) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + raise NotImplementedError("Fairseq HuBERT does not support embeddings")
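A minimal usage sketch for the ESPNetEncodecInterface added above (the hub name and the checkpoint/config file names below are hypothetical placeholders; substitute those of the pretrained ESPnet codec actually being used, and make sure benchmarks/DASB is on the Python path so that utils.tokenizer_interface can be imported):

    import torch
    from utils.tokenizer_interface import ESPNetEncodecInterface

    tokenizer = ESPNetEncodecInterface(
        source="espnet/encodec-24khz",                       # hypothetical model hub
        model_ckpt="exp/codec/train.total_count.best.pth",   # hypothetical checkpoint name
        model_config="exp/codec/config.yaml",                # hypothetical config name
        save_path="./pretrained_models/espnet_encodec",
        sample_rate=24000,
        n_codebook=8,
    )
    audio = torch.randn(1, 24000)                # one second of audio at 24 kHz
    tokens = tokenizer.sig_to_tokens(audio)      # [batch, length, n_codebook]
    rebuilt = tokenizer.tokens_to_sig(tokens)    # decoded waveform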