diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py
index bcb2670a6..52b7e1817 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py
@@ -51,17 +51,7 @@ def __init__(self, hparams, create_waveform_fn, device):
         else:
             self.evaluators = {}

-        bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {})
-        if bulk_evaluators:
-            self.bulk_evaluators = {
-                key: evaluator_f()
-                for key, evaluator_f in bulk_evaluators.items()
-                if key in self.enabled_evaluators
-            }
-        else:
-            self.bulk_evaluators = {}
-
-        if not self.evaluators and not self.bulk_evaluators:
+        if not self.evaluators:
             logger.warn(
                 "No evaluators were defined - this run will produce samples only"
             )
@@ -98,9 +88,7 @@ def on_evaluate_start(self, stage, epoch):
         self.create_reports()
         self.modules.model.show_inference_progress = False
         self.item_ids = []
-        details_keys = list(self.evaluators.keys()) + list(
-            self.bulk_evaluators.keys()
-        )
+        details_keys = list(self.evaluators.keys())
         self.details = {evaluator_key: [] for evaluator_key in details_keys}
         self.sample_text = []
         self.sample_file_names = []
@@ -141,7 +129,6 @@ def on_evaluate_end(self):
         dataset : speechbrain.dataio.dataset.DynamicItemDataset
             a dataset
         """
-        self.evaluate_bulk()
         self.write_summary()
         logger.info("Evaluation done")

@@ -182,19 +169,6 @@ def get_report_columns(self, evaluator_key):
                 wavs_ref=bogus_wavs,
                 length_ref=bogus_length,
             )
-        else:
-            bogus_file_name = self.output_folder / "bogus.wav"
-            evaluator = self.bulk_evaluators[evaluator_key]
-            sb.dataio.dataio.write_audio(
-                str(bogus_file_name),
-                bogus_wavs[0].cpu(),
-                samplerate=self.hparams.model_sample_rate,
-            )
-            result = evaluator.evaluate_files(
-                file_names=[bogus_file_name],
-                text=["BOGUS"],
-                file_names_ref=[bogus_file_name],
-            )

         return ["uttid"] + list(result.details.keys())

@@ -228,19 +202,6 @@ def evaluate_batch(self, batch):
             self.write_result(evaluator_key, batch.uttid, details)
             self.details[evaluator_key].extend(details)

-    def evaluate_bulk(self):
-        """Runs all configured bulk evaluators, which evaluate a directory
-        of files - rather than one file at a time"""
-        for evaluator_key, evaluator in self.bulk_evaluators.items():
-            result = evaluator.evaluate_files(
-                file_names=self.sample_file_names,
-                text=self.sample_text,
-                file_names_ref=self.ref_file_names,
-            )
-            self.details[evaluator_key].append(result.details)
-            details = undo_batch(result.details)
-            self.write_result(evaluator_key, self.item_ids, details)
-
     def write_result(self, evaluator_key, uttid, details):
         """Outputs the result details to the report for the specified
         evaluator
diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml
index bdf6c0f75..dcdc6d920 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml
@@ -1,50 +1,56 @@
+# ############################################################################
+# Evaluation Hyperparameters
+# Common to old models, appended to main hyperparameters
+#
+# Authors: Artem Ploujnikov
+# ############################################################################
+
+eval_enabled: True
 eval_sample_rate: 16000
 eval_samples: null
 eval_interval: 1
 eval_asr_type: whisper
-eval_asr_source: !apply:speechbrain.utils.hparams.choice
-    value: !ref <eval_asr_type>
-    choices:
-        encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech
-        whisper:
openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null -utmos_batch_size: 8 -utmos_model_path: ./utmos -utmos_ckpt_name: epoch=3-step=7459.ckpt -utmos_ckpt_path: !ref / -utmos_use_python: True -utmos_script: predict.py - - -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: + utmos: !ref asr: !ref -bulk_evaluators: - utmos: !name:eval.UTMOSSpeechEvaluator - model_path: !ref - output_folder: !ref - ckpt_path: !ref - batch_size: !ref - script: !ref - use_python: !ref - tmp_folder: !ref - eval_summary: asr: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] utmos: descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + +eval_threshold: + dwer_max: 90.0 + +eval_threshold_set: + utmos: 0.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index cd4f338bc..d49afdf29 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -8,18 +8,23 @@ experiment_name: tokotron/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" + +# Model type +representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/dac +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +34,27 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +token_model_kwargs: + n_quantizers: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -61,7 +77,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -85,8 +101,8 @@ model_bitrate: 8kbps # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -94,24 +110,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref -# Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -# Token model (pretrained) -token_model: !new:Tokotron.DACFeatureExtractor - dac: !ref - n_quantizers: !ref # Dataloader options train_dataloader_opts: @@ -143,20 +148,13 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -165,6 +163,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -178,7 +177,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -198,15 +197,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + modules: model: !ref - dac: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -226,10 +233,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: 
!new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f8a0ee622..af723f6c9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -4,22 +4,24 @@ # ############################################################################ experiment_name: tokotron/discrete_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm - -output_folder: !ref results/tokotron/// +representation_mode: discrete +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/discrete- +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -36,36 +38,39 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref -token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice value: !ref choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -80,7 +85,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 @@ -91,7 +96,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -107,12 +112,6 @@ eos_mode: gate decoder_mode: autoregressive scale_factor: 4 -# Beam Search-specific parameters -min_decode_ratio: 1.0 
-max_decode_ratio: 10.0 -beam_size: 5 - - # Feature parameters sample_rate: 22050 model_sample_rate: 16000 @@ -122,8 +121,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -131,15 +130,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -159,16 +156,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref output_all_hiddens: True - - -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -181,58 +168,42 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: - SSL_layers: !ref - -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - + SSL_layers: !ref ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 +vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False audio_emb_lr: 0.00001 audio_emb_weight_decay: 0.001 text_num_tokens: 39 @@ -247,14 +218,9 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref - audio_num_tokens: !ref + audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref @@ -273,20 +239,25 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref 
representation_mode: discrete +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref -# define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -302,7 +273,7 @@ compute_cost: !new:Tokotron.TokotronLoss representation_mode: discrete -lr_annealing: !new:Tokotron.TargetedNoamScheduler +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler lr_initial: [!ref , !ref ] n_warmup_steps: !ref param_group: 0 @@ -314,10 +285,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index f5e82c309..1c54128b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -8,18 +8,21 @@ experiment_name: tokotron/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/encodec +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +32,23 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -53,6 +63,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 @@ -60,7 +71,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -80,8 +91,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -89,20 +100,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -133,20 +137,13 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -155,6 +152,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -168,7 +166,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -188,15 +186,24 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -216,10 +223,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 
lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..505460dfa --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 2048 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +flatten: false +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 103d584ed..0ff172529 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -8,18 +8,23 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" + +# Model type +representation_mode: discrete # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/st +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +34,24 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -53,6 +66,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index @@ -61,7 +75,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -81,8 +95,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -90,7 +104,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -98,14 +112,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref # Token model (pretrained) -speech_tokenizer: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: !ref - save_path: !ref - -token_model: !new:Tokotron.SpeechTokenizerFeatureExtractor - speech_tokenizer: !ref - codebooks: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -136,20 +142,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 
-enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -157,7 +155,8 @@ activation: !name:torch.nn.GELU audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False -audio_emb_pretrained: True +audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -166,12 +165,13 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: false bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -191,15 +191,19 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -219,10 +223,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..f0ab3d9c1 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -0,0 +1,258 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/sqcodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +transform_audio: !name:model.sq_codec.tokens_to_ternary + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 19683 +audio_emb_size: 36 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 4 +ternary_num_digits: 9 +ternary_num_positions: !ref * +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + audio_emb: !ref + out_proj: !ref + multihead_input: False + inference: !ref + +inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference + gate_offset: !ref + gate_threshold: !ref + tokens_per_step: !ref + bos_idx: !ref + audio_token_shift: 0 + max_steps: !ref + representation_mode: !ref + transform_audio: !name:model.sq_codec.tokens_to_ternary + feed_audio: !name:model.sq_codec.ternary_logits_to_tokens + +audio_emb: !new:torch.nn.Identity + +out_proj: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + seq_cost: !name:model.sq_codec.ternary_loss + multihead_output: False + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..d3bf9c770 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 4096 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py deleted file mode 120000 index 08621a288..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/preparation.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3dddf48dc..ec1845d36 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,17 +22,19 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding, clean_padding_ from speechbrain.utils.distributed import run_on_main -from preparation import add_prepared_features -from audio_tokens import ( + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from model.Tokotron import ( get_silence_token, use_silence_padding, feature_pad_to, -) -from Tokotron import RepresentationMode -from evaluate import TokotronEvaluator - + RepresentationMode, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 logger = logging.getLogger(__name__) @@ -59,6 +61,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -77,11 +82,13 @@ def compute_forward(self, batch, stage): """ batch = batch.to(self.device) tokens, tokens_length = batch.tokens - audio, audio_length = batch.audio_bos + features = self.prepare_features(batch) + audio, audio_length, _, _ = features emb = None if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} + audio = self.transform_audio(audio) predictions = self.modules.model( input_tokens=tokens, input_length=tokens_length, @@ -90,7 +97,69 @@ def compute_forward(self, batch, stage): emb=emb, ) - return predictions + return predictions, features + + def prepare_features(self, batch): + """Prepares features, depending on the configuration + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation + + Returns + ------- + audio_bos : torch.Tensor + Audio features, with BOS + audio_bos_length : torch.Tensor + Relative lengths of the audio features, with BOS + audio_tgt : torch.Tensor + Target audio features (for loss computation) + audio_tgt_length : torch.Tensor + Relative lengths of the target audio features + """ + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + if self.audio_token_offsets is not None: + audio_bos = torch.cat( + [ + audio_bos[:, : self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width :] + - self.audio_token_offsets, + ], + dim=1, + ) + clean_padding_(audio_bos, audio_bos_length) + audio_tgt = audio_tgt - self.audio_token_offsets + clean_padding_(audio_tgt, audio_tgt_length) + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim + ) + audio_bos = torch.concatenate([bos, audio], dim=1) + audio_bos_length = audio_length 
* audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length + + def get_token_offsets(self): + """Computes token offsets for tokenizers that require them""" + token_offsets = None + if self.hparams.audio_token_offsets: + token_offsets = ( + torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + ) + * self.hparams.audio_num_tokens + )[None, None, :] + return token_offsets @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -140,24 +209,27 @@ def compute_objectives(self, predictions, batch, stage): A one-element tensor used for backpropagating the gradient. """ batch = batch.to(self.device) - audio, audio_length = batch.audio_pad + predictions, features = predictions + _, _, audio_tgt, audio_tgt_length = features + + audio_tgt = self.transform_audio(audio_tgt) loss_details = self.hparams.compute_cost( predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, ) self.loss_metric.append( batch.uttid, predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, reduction="batch", ) - return loss_details.loss + return loss_details.loss.contiguous() def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -195,15 +267,23 @@ def on_stage_start(self, stage, epoch): self.use_spk_emb = getattr(self.hparams, "use_spk_emb", False) self.is_evaluating = False - if stage == sb.Stage.VALID: - if self.is_eval_epoch(epoch): + if self.hparams.eval_enabled: + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - else: - logger.info("No evaluation on epoch %d", epoch) - elif stage == sb.Stage.TEST: - self.evaluator.on_evaluate_start(stage, epoch) - self.is_evaluating = True + + self.audio_token_offsets = self.get_token_offsets() + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + self.transform_audio = getattr(self.hparams, "transform_audio", torch.nn.Identity()) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -225,6 +305,13 @@ def on_stage_end(self, stage, stage_loss, epoch): if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + # End evaluation and report stats + eval_summary_stats = {} + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary_stats = self.get_summary_stats() + stage_stats.update(eval_summary_stats) + # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -243,13 +330,62 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) - # Save the current checkpoint and delete previous checkpoints. + # Save the current checkpoint and delete previous checkpoints. 
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() + def get_summary_stats(self): + """Retrieves the stats that needs to be reported on every trial + in the train log, as indicated in eval_summary_log in eval.yaml + + Returns + ------- + eval_summary_stats : dict + A dict with stats""" + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + self._check_threshold(eval_summary_stats) + return eval_summary_stats + + def _check_threshold(self, eval_summary_stats): + """Checks threshold values for the defined stats and terminates + the trials if the parameters are not met. This is necessary because + some metrics produce bogus high values when the speech samples + do not contain any speech at all (e.g. UTMOS can be above 3 for + silence). + + Classic usage: dWER > 0.9 - treat the whole run as "garbage", set + UTMOS to 0 + + Arguments + --------- + eval_summary_stats : dict + Summary statistics + """ + for key, threshold_value in self.hparams.eval_threshold.items(): + key, threshold_type = key.split("_") + value = eval_summary_stats[key] + if threshold_type == "min": + meets = value >= threshold_value + elif threshold_type == "max": + meets = value <= threshold_value + else: + raise ValueError( + f"Invalid threshold definition: {key}, check eval_threshold" + ) + if not meets: + eval_summary_stats["broken"] = True + for key, value in self.hparams.eval_threshold_set.items(): + eval_summary_stats[key] = value def fit_batch(self, batch): """Fit one batch, override to do multiple updates. 
@@ -281,11 +417,7 @@ def fit_batch(self, batch): def init_optimizers(self): """Custom optimizer initialization """ - representation_mode = getattr( - self.hparams, "representation_mode", RepresentationMode.DISCRETE - ) - representation_mode = RepresentationMode(representation_mode) - if representation_mode == RepresentationMode.CONTINUOUS: + if self.representation_mode == RepresentationMode.CONTINUOUS: audio_emb_params = self.modules.model.decoder.audio_emb.parameters() audio_emb_params_set = set(audio_emb_params) model_params = [ @@ -323,7 +455,19 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - raise NotImplementedError() + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + with torch.no_grad(): + if self.audio_token_offsets is not None: + audio = clean_padding(audio + self.audio_token_offsets, length) + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -368,9 +512,7 @@ def dataio_prepare(hparams): the token used for silence """ - representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) + representation_mode = RepresentationMode(hparams["representation_mode"]) # Define datasets from json data manifest file # Define datasets sorted by ascending lengths for efficiency @@ -407,7 +549,7 @@ def audio_ref_pipeline(wav): Arguments --------- - wav : str + wav : strÆ’num_ The file path Returns @@ -421,50 +563,50 @@ def audio_ref_pipeline(wav): use_silence_padding = hparams.get("use_silence_padding", True) if representation_mode == RepresentationMode.DISCRETE: - layers_key = "token_model_layers" - model_key = "token_model" - audio_features = "audio_tokens" + model_key = "tokenizer" else: - layers_key = "ssl_model_layers" model_key = "ssl_model" - audio_features = "audio_ssl" - audio_tokens_per_step = ( - len(hparams[layers_key]) - if layers_key in hparams - else hparams["audio_tokens_per_step"] - ) - if use_silence_padding: - silence_token, silence_emb = get_silence_token( + audio_tokens_per_step = hparams["audio_tokens_per_step"] + if ( + use_silence_padding + and representation_mode == RepresentationMode.DISCRETE + ): + silence_token = get_silence_token( hparams[model_key], - extract_emb=representation_mode == RepresentationMode.CONTINUOUS, - model_kwargs=hparams.get("token_model_kwargs"), + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ) ) + if silence_token.dim() == 2: + silence_token = silence_token.squeeze(-1) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) * hparams["eos_index"] ) - silence_token = silence_token.cpu() - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) + silence_padding = silence_token.cpu() + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) - if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = 
audio_bos_prefix.unsqueeze(-1).repeat( - 1, 1, hparams["audio_dim"] - ) - @sb.utils.data_pipeline.takes(audio_features) + tokens_loader = hparams.get("tokens_loader") + if "speech_model_layers" in hparams: + tokens_loader_kwargs = { + "num_codebooks": get_selected_layer_indexes(hparams) + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + + @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") - def audio_pipeline(audio): - audio = torch.from_numpy(audio) + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -480,21 +622,20 @@ def audio_pipeline(audio): ] init_sequence_encoder(hparams) - use_spk_emb = hparams.get("use_spk_emb", False) - prepared_features = [audio_features] output_keys = [ "uttid", "tokens", - "audio_pad", - "audio_bos", "label_norm_eval", ] - if use_spk_emb: - prepared_features.append("spk_emb") - output_keys.append("spk_emb") + if representation_mode == RepresentationMode.DISCRETE: + output_keys += [ + "audio_pad", + "audio_bos", + ] + else: + output_keys.append("sig") eval_output_keys = [*output_keys, "sig"] - for dataset in data_info: if dataset == "train": dataset_output_keys = output_keys @@ -508,13 +649,6 @@ def audio_pipeline(audio): output_keys=dataset_output_keys, ) - add_prepared_features( - dataset=dynamic_dataset, - save_path=Path(hparams["prepare_save_folder"]) / "features", - id_key="uttid", - features=prepared_features, - ) - datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -539,51 +673,16 @@ def audio_pipeline(audio): raise NotImplementedError( "sorting must be random, ascending or descending" ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) - datasets["sample"] = select_sample(hparams, datasets) return datasets, silence_padding -def select_sample(hparams, datasets): - """Selects a sample of files for sample generation, freezing the sample if - requested to persist across multiple experiments - - Arguments - --------- - hparams : dict - experiment hyperparameters - datasets : dict - a dictionary of datasets - - Returns - ------- - dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset - the sample dataset - """ - sample_path = hparams.get("sample_path") - dataset = None - if sample_path is not None: - sample_path = Path(sample_path) - if sample_path.exists(): - with open(sample_path, "r") as sample_file: - data_ids = [line.strip() for line in sample_file] - dataset = FilteredSortedDynamicItemDataset( - datasets["valid"], data_ids - ) - - if dataset is None: - dataset = ( - datasets["valid"] - .batch_shuffle(1) - .filtered_sorted(select_n=hparams["num_audio_samples"]) - ) - if sample_path is not None: - with open(sample_path, "w") as sample_file: - for data_id in dataset.data_ids: - print(data_id, file=sample_file) - return dataset - - def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -611,6 +710,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") 
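+    # Map each selected layer to its index in the available-layer list, e.g.
+    # speech_model_layers [3, 12] with available layers [1, 3, 7, 12, 18, 23] -> [1, 3].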
+ if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. characters or phonemes) listed one per line @@ -625,7 +740,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] @@ -667,17 +785,23 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, @@ -699,7 +823,7 @@ def apply_overfit_test(hparams, dataset): ) -def run_experiment(brain_cls): +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -712,6 +836,8 @@ def run_experiment(brain_cls): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file @@ -736,40 +862,23 @@ def run_experiment(brain_cls): from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. 
- representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) - audio_features = ( - "audio_tokens" - if representation_mode == RepresentationMode.DISCRETE - else "audio_ssl" - ) - extract_features = [audio_features] - if hparams.get("use_spk_emb", False): - extract_features.append("spk_emb") - if not hparams["skip_prep"]: - with hparams["freezer"]: - run_on_main( - prepare_ljspeech, - kwargs={ - "data_folder": hparams["data_folder"], - "save_folder": hparams["prepare_save_folder"], - "splits": hparams["splits"], - "split_ratio": hparams["split_ratio"], - "seed": hparams["seed"], - "extract_features": extract_features, - "extract_features_opts": hparams["extract_features_opts"], - "extract_phonemes": hparams["input"] == "phonemes", - "model_name": "tokotron", - "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], - "frozen_split_path": hparams.get("frozen_split_path"), - "device": run_opts.get("device", "cpu"), - }, - ) + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) # We can now directly create the datasets for training, valid, and test datasets, silence_padding = dataio_prepare(hparams) @@ -779,39 +888,65 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, run_opts=run_opts, checkpointer=hparams["checkpointer"], ) - tts_brain.sample_data = datasets["sample"] # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be # stopped at any point, and will be resumed on next call. 
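+    # Below, dataloader options are built for train/valid/test; for discrete
+    # representations the collate function pads the audio keys with the silence
+    # token, so padded positions correspond to silence rather than arbitrary codes.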
+ + dataloader_opts = [ + hparams[f"{key}_dataloader_opts"] for key in ["train", "valid", "test"] + ] + representation_mode = RepresentationMode(hparams["representation_mode"]) + if representation_mode == RepresentationMode.DISCRETE: + dataloader_opts = [ + use_silence_padding(opts, silence_padding, audio_keys) + for opts in dataloader_opts + ] + ( + train_dataloader_opts, + valid_dataloader_opts, + test_dataloader_opts, + ) = dataloader_opts + tts_brain.fit( tts_brain.hparams.epoch_counter, datasets["train"], datasets["valid"], - train_loader_kwargs=use_silence_padding( - hparams["train_dataloader_opts"], silence_padding, audio_keys - ), - valid_loader_kwargs=use_silence_padding( - hparams["valid_dataloader_opts"], silence_padding, audio_keys - ), + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], - min_key="loss", - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + eval_kwargs = {} + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + if test_key: + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) + # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index f3495eaca..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py deleted file mode 100644 index d0bc9f4f7..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 
-"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.dac.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.dac.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index f9fc764cd..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.audio_num_tokens - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - 
run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py deleted file mode 100644 index 2168f970d..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index bc51db78c..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py new file mode 100644 index 000000000..6c2dd1c8d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -0,0 +1,359 @@ +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The 
device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + self.attention = [] + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + 
length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : 
the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. + + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..08587ce23 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -0,0 +1,42 @@ +eval_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_asr_type: whisper +eval_asr_source: openai/whisper-small +evaluations: utmos,asr +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +evaluators: + utmos: !ref + asr: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..8dc6209f7 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,241 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speechtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + 
+input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 +flatten: false + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..abe15d14f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,269 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +output_folder: !ref results/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + 
shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1000 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 +flatten: false + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref / + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..cae286efd --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,240 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save 
+train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 
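+# EnCodec (24 kHz) uses 1024-entry codebooks; at the 6 kbps bandwidth configured
+# below it emits 8 codebooks per frame, hence audio_tokens_per_step: 8.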
+vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..5aae5e0db --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,243 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/espnet-encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +freeze_lm_head: True + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..edae05d51 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,237 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
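+# Note on the AR/NAR curriculum (implemented in apply_curriculum in train.py,
+# part of this patch): number_of_epochs_ar and number_of_epochs_nar below select
+# which head is trained. Purely as an illustration, number_of_epochs_ar: 10 with
+# number_of_epochs_nar: 40 would train only the autoregressive head for epochs
+# 1-10, only the non-autoregressive head for epochs 11-50, and both afterwards;
+# with both left at null, as here, both heads are trained from the first epoch.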
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 +text_num_tokens: 39 +phn_num_tokens: 
52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false +bandwidth: 6 + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..3560eaf69 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,233 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speechtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
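+# Note: with use_token_offsets: True, train.py shifts every codebook into its own
+# id range before building the prompt, i.e. token t of (0-based) codebook k maps
+# to audio_token_shift + k * vocab_size + t. Illustratively, with text input
+# (shift 43) and vocab_size 1024, token 100 of codebook 3 becomes
+# 43 + 3 * 1024 + 100 = 3215.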
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + 
choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..bb3f07562 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,282 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/sqcodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
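+# Note: the prompt pipelines in train.py assemble each training sequence roughly as
+#   prefix = BOS + text/phoneme tokens + EOT
+#            (+ a random speaker prompt and an EOP marker when multispeaker_pretrain
+#             is enabled, as in this config)
+#   prompt = prefix + BOS + audio tokens shifted by audio_token_shift + EOS
+# and the training loss is only computed on positions past the prefix.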
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null +multispeaker_pretrain: True + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: ["", "", "", ""] + False: ["", "", ""] + +special_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: 5 + False: 4 +spk_prompt_length: 150 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer 
+d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 512 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.5 +vocab_size: 19683 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !ref * 2 + +audio_token_shift: 19683 + +audio_tokens_per_step: 4 +flatten: true +ternary_num_digits: 10 +pred_mode: ternary +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + emb: !ref + lm_head: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + d_hidden: !ref + num_positions: !ref * + tokens: null + +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + num_positions: !ref + tokens: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..03d7e433b --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,241 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref 
results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +freeze_lm_head: False + +####################### Model parameters 
########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 4096 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: False +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py new file mode 120000 index 000000000..2f703273c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py @@ -0,0 +1 @@ +../../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py new file mode 100644 index 000000000..9d56ef5bf --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -0,0 +1,1066 @@ +#!/usr/bin/env/python3 +"""Recipe for training VALL-E + +Based on ESPNET VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import ( + clean_padding, + length_to_mask, + write_audio, +) +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 + +logger = logging.getLogger(__name__) + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + else: + nar_track = None + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. 
+ """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + batch_size, prompt_max_len, _ = prompt.shape + batch_idx = torch.arange(batch_size, device=prompt.device) + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( + ids=batch.uttid, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, + mask=mask, + reduction="batch", + ) + + loss = torch.mean(torch.stack(loss_components)) + return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.compute_loss_stats, batch_eval=True, + ) + self.apply_curriculum() + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + self.transform_audio = getattr(self.hparams, "transform_audio", None) + + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. 
Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, wav=wav, length=audio_length, stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + else: + eval_summary_stats = {} + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. 
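+            # With the defaults in the accompanying hparams (ckpt_key: dwer,
+            # ckpt_key_kind: min, ckpt_keep: 2) the call below expands to
+            # save_and_keep_only(..., num_to_keep=2, min_keys=["dwer"]), so only
+            # the two checkpoints with the lowest dWER are kept.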
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] + if result[0] + else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + offsets = self.offsets + if self.hparams.flip_layers: + offsets = offsets.flip(2) + audio = (audio - self.hparams.audio_token_shift - offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample.detach().cpu(), self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phonemes"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline_spk(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + spk_prompt = torch.randint( + 0, + hparams["vocab_size"], + (hparams["spk_prompt_length"], num_tracks) + ) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), 
num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=num_codebooks + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [sig_pipeline, text_pipeline, tokens_pipeline] + if hparams.get("multispeaker_pretrain"): + dynamic_items.append(prompt_pipeline_spk) + else: + dynamic_items.append(prompt_pipeline) + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length", + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
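+    # "length" here is the total prompt length in tokens (text prefix, audio and
+    # markers) as yielded by the prompt pipelines above, so length-based sorting
+    # groups utterances with similar overall sequence lengths together.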
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) + return encoder + + +def get_selected_layer_indexes(available_layers, selected_layers): + """Finds the layers of selected layers + + Arguments + --------- + available_layers : list + The available layers + selected_layers : list + The selected layers + + Returns + ------- + layer_idx : list + The layer indexes + """ + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, dataset_valid, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
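+    # For reference, a typical launch of this recipe looks roughly like the
+    # following (the paths and run name are illustrative placeholders only):
+    #   python train.py hparams/train_mimi.yaml --data_folder=/path/to/LJSpeech \
+    #       --cached_data_folder=/path/to/cache --tokens_folder=/path/to/tokens \
+    #       --run_name=baseline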
+    tts_brain.fit(
+        tts_brain.hparams.epoch_counter,
+        datasets["train"],
+        datasets["valid"],
+        train_loader_kwargs=hparams["train_dataloader_opts"],
+        valid_loader_kwargs=hparams["valid_dataloader_opts"],
+    )
+
+    # Load best checkpoint for evaluation
+    if hparams["testing"]:
+        test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json"
+        if test_summary_file.exists():
+            logging.info("Test run already completed: %s", test_summary_file)
+        else:
+            test_key_kind = hparams["test_key_kind"]
+            test_key = hparams["test_key"]
+            eval_kwargs = {
+                f"{test_key_kind}_key": test_key
+            }
+            tts_brain.evaluate(
+                test_set=datasets["test"],
+                test_loader_kwargs=hparams["test_dataloader_opts"],
+                **eval_kwargs
+            )
diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py
new file mode 100644
index 000000000..556d8a9d0
--- /dev/null
+++ b/benchmarks/DASB/LJSpeech/extraction/extract.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Recipe for extracting discrete audio tokens from LJSpeech.
+
+Authors
+ * Jarod Duret 2024
+"""
+
+import os
+import sys
+import logging
+import pathlib as pl
+import speechbrain as sb
+from speechbrain.dataio.dataset import DynamicItemDataset
+from speechbrain.utils.distributed import run_on_main
+from hyperpyyaml import load_hyperpyyaml
+
+base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+sys.path.append(base_dir)
+
+
+logger = logging.getLogger(__name__)
+
+
+if __name__ == "__main__":
+    # CLI:
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    # Dataset prep (parsing LJSpeech)
+    from ljspeech_prepare import prepare_ljspeech  # noqa
+
+    # multi-gpu (ddp) save data preparation
+    run_on_main(
+        prepare_ljspeech,
+        kwargs={
+            "data_folder": hparams["data_folder"],
+            "save_folder": hparams["output_folder"],
+            "splits": hparams["splits"],
+            "split_ratio": hparams["split_ratio"],
+            "seed": hparams["seed"],
+            "frozen_split_path": hparams.get("frozen_split_path"),
+            "device": run_opts.get("device", "cpu"),
+        },
+    )
+
+    tokens_extractor = hparams["tokens_extractor"]
+    data_folder = hparams["data_folder"]
+    datasets = []
+    for split in ["train", "valid", "test"]:
+        json_path = hparams[f"{split}_json"]
+        name = pl.Path(json_path).stem
+        dataset = sb.dataio.dataset.DynamicItemDataset.from_json(
+            json_path=json_path, replacements={"data_root": data_folder},
+        )
+        datasets.append(dataset)
+
+    # Merge the splits so tokens are extracted for the whole corpus at once
+    merged_data = {
+        key: value
+        for dataset in datasets
+        for key, value in dataset.data.items()
+    }
+    merged_dataset = DynamicItemDataset(merged_data)
+
+    save_folder = pl.Path(hparams["save_folder"])
+    logger.info("Extracting dataset tokens ...")
+    tokens_extractor.extract_tokens(
+        merged_dataset,
+        hparams["num_codebooks"],
+        (save_folder / "ljspeech").as_posix(),
+    )
+
+    if hparams["save_embedding"]:
+        save_folder = pl.Path(hparams["save_folder"])
+        logger.info("Saving embeddings ...")
+        tokens_extractor.save_pretrained_embeddings(
+            (save_folder / "embeddings").as_posix(),
+            vocab_size=hparams["vocab_size"],
+            num_codebooks=hparams["num_codebooks"],
+        )
diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml
new file mode 100644
index
000000000..b90054db6 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..d50cb85ef --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | 
LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..869d1c503 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # 
Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..c03ffa936 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..c534bef0f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/mimi +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options 
+dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..d036e05a3 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,53 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..28c7c9be9 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,56 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: 
ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..a23c29e59 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavtokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py new file mode 120000 index 000000000..2de5a21a8 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py @@ -0,0 +1 @@ +../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index e88b92eb6..416c63010 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -13,13 +13,11 @@ import json import random import logging -from types import SimpleNamespace import torch import torchaudio import numpy as np import tgt import re -import speechbrain as sb from tqdm import tqdm from pathlib import Path from speechbrain.utils.data_utils import download_file @@ -27,10 +25,6 @@ from speechbrain.inference.text import GraphemeToPhoneme from unidecode import unidecode from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations -from speechbrain.dataio.batch import PaddedData -from speechbrain.dataio.dataset import DynamicItemDataset -from preparation import FeatureExtractor -from torchaudio.functional import resample logger = logging.getLogger(__name__) @@ 
-59,8 +53,6 @@ def prepare_ljspeech( pitch_max_f0=400, skip_prep=False, use_custom_cleaner=False, - extract_features=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", skip_ignore_folders=False, @@ -179,7 +171,7 @@ def prepare_ljspeech( os.makedirs(duration_folder) # extract pitch for both Fastspeech2 and FastSpeech2WithAligner models - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: pitch_folder = os.path.join(data_folder, "pitch") if not os.path.exists(pitch_folder): os.makedirs(pitch_folder) @@ -200,22 +192,11 @@ def prepare_ljspeech( data_folder, splits, split_ratio, frozen_split_path ) - extract_features_context = None - extract_features_folder = None - if extract_features: - extract_features_context = get_context( - extract_features=extract_features, - extract_features_opts=extract_features_opts or {}, - device=device, - ) - extract_features_folder = Path(save_folder) / "features" - if "train" in splits: prepare_json( model_name, data_split["train"], save_json_train, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -226,10 +207,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -239,7 +216,6 @@ def prepare_ljspeech( model_name, data_split["valid"], save_json_valid, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -250,10 +226,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -263,7 +235,6 @@ def prepare_ljspeech( model_name, data_split["test"], save_json_test, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -274,10 +245,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -421,7 +388,6 @@ def prepare_json( model_name, seg_lst, json_file, - data_folder, wavs_folder, csv_reader, phoneme_alignments_folder, @@ -432,10 +398,6 @@ def prepare_json( pitch_min_f0, pitch_max_f0, use_custom_cleaner=False, - extract_features=None, - extract_features_context=None, - extract_features_folder=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", device="cpu", @@ -471,14 +433,8 @@ def prepare_json( Max f0 for pitch computation use_custom_cleaner : bool If True, uses custom cleaner defined for this recipe - extract_features : list, optional - If specified, feature extraction will be performed - extract_features_context : types.SimpleNamespace, optional - Context for feature extraction (pretrained models, etc) - extract_features_folder : path-like, optional - The folder where extracted features will be saved - extract_features_opts : dict, optional - Options for feature extraction + extract_phonemes : bool + Whether to extract phonemes g2p_src : str The name of the HuggingFace Hub to use for the Grapheme-to-Phoneme model or the path to it @@ -495,12 +451,12 @@ def prepare_json( extract_phonemes = True if extract_phonemes: logger.info( - "Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while." + "Computing phonemes for LJSpeech labels using SpeechBrain f This may take a while." 
) g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} ) - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: logger.info( "Computing pitch as required for FastSpeech2. This may take a while." ) @@ -649,19 +605,6 @@ def prepare_json( # Updates data for the utterance json_dict[id].update({"phonemes": phonemes}) - # Feature Extraction - if extract_features: - extract_features_folder.mkdir(exist_ok=True) - prepare_features( - data=json_dict, - data_folder=data_folder, - save_path=extract_features_folder, - features=extract_features, - context=extract_features_context, - options=extract_features_opts, - device=device, - ) - # Writing the dictionary to the json file with open(json_file, mode="w") as json_f: json.dump(json_dict, json_f, indent=2) @@ -838,146 +781,3 @@ def custom_clean(text, model_name): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text - - -INLINE_FEATURES = ["audio_ssl_len"] - - -def prepare_features( - data, data_folder, save_path, features, context, options=None, device="cpu" -): - """Performs feature extraction - - Arguments - --------- - data: dict - a preprocessed dataset - data_folder : str - the data folder - save_folder : str - the folder where features will be saved - context : dict - context data - features: list - the list of feature extractions to be performed - """ - dataset = DynamicItemDataset(data) - feature_extractor = FeatureExtractor( - save_path=save_path, - src_keys=["sig"], - id_key="uttid", - dataloader_opts=options.get("dataloader_opts", {}), - device=device, - ) - token_model_kwargs = options.get("token_model_kwargs", {}) - ssl_layers = options.get("ssl_model_layers") or options.get( - "token_model_layers" - ) - - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - """Load the audio signal. 
""" - wav = wav.replace("{data_root}", data_folder) - sig = sb.dataio.dataio.read_audio(wav) - - yield sig - - dataset.add_dynamic_item(audio_pipeline) - - @sb.utils.data_pipeline.takes("sig") - @sb.utils.data_pipeline.provides("sig_resampled") - def resample_pipeline(sig): - sig_data = resample( - waveform=sig.data, - orig_freq=options["sample_rate"], - new_freq=options["model_sample_rate"], - ) - return PaddedData(sig_data, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_tokens", "audio_emb") - def token_pipeline(sig): - with torch.no_grad(): - result = context.token_model( - sig.data, sig.lengths, **token_model_kwargs - ) - # TODO: Clean this up - if torch.is_tensor(result): - tokens = result - # Note: Dummy embedding - meaning embeddings are not available - emb = torch.zeros((len(sig.data), 1, 1), device=sig.data.device) - else: - tokens, emb = result[:2] - tokens = tokens.int() - if tokens.dim() < 3: - tokens = tokens.unsqueeze(-1) - yield PaddedData(tokens, sig.lengths) - yield PaddedData(emb, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("spk_emb") - def spk_emb_pipeline(sig): - mel_spec = context.spk_emb_model.mel_spectogram(audio=sig.data) - return context.spk_emb_model.encode_mel_spectrogram_batch( - mel_spec, sig.lengths - ) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_ssl", "audio_ssl_len") - def ssl_pipeline(sig): - ssl_raw = context.ssl_model(sig.data, sig.lengths) - ssl = ssl_raw[ssl_layers].permute(1, 2, 0, 3) - yield PaddedData(ssl, sig.lengths) - yield (sig.lengths * ssl.size(1)).tolist() - - dynamic_items = [ - resample_pipeline, - token_pipeline, - ssl_pipeline, - spk_emb_pipeline, - ] - for dynamic_item in dynamic_items: - feature_extractor.add_dynamic_item(dynamic_item) - - feature_keys = [key for key in features if key not in INLINE_FEATURES] - inline_keys = [key for key in features if key in INLINE_FEATURES] - feature_extractor.set_output_features(feature_keys, inline_keys=inline_keys) - feature_extractor.extract(dataset, data) - - -def get_context(extract_features, extract_features_opts, device): - """ - Gets the context (pretrained models, etc) for feature extraction - - Arguments - --------- - extract_features : list - A list of features to extract - Available features: - audio_tokens - raw tokens - audio_emb - embeddings from the model - extract_features_opts : dict - Options for feature extraction - device : str|torch.Device - The device on which extraction will be run - - Returns - -------- - context: SimpleNamespace - The context object - """ - context = {} - if ( - any(key in extract_features for key in ["audio_tokens", "audio_emb"]) - and "token_model" in extract_features_opts - ): - context["token_model"] = extract_features_opts["token_model"].to(device) - if "audio_ssl" in extract_features: - context["ssl_model"] = extract_features_opts["ssl_model"].to(device) - if "spk_emb" in extract_features: - context["spk_emb_model"] = extract_features_opts["spk_emb_model"]( - run_opts={"device": device} - ) - - return SimpleNamespace(**context) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml index f9720b170..7871d6212 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -47,7 +47,7 @@ save_embedding: False tokenizer: 
!new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index 976614a3d..9a8b754eb 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -47,7 +47,7 @@ vocab_size: 4096 # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py similarity index 100% rename from benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py rename to benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py new file mode 120000 index 000000000..4b3f08ebb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py @@ -0,0 +1 @@ +../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py new file mode 120000 index 000000000..d65702b6c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py @@ -0,0 +1 @@ +../../../utils/data.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py new file mode 120000 index 000000000..0ca6d4644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py @@ -0,0 +1 @@ +../../../utils/eval.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py new file mode 100644 index 000000000..aa7ee2c4b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -0,0 +1,494 @@ +"""Evaluates a checkpoint using an MOS estimation tool + +Authors +* Artem Ploujnikov 2024 +""" + +# TODO: There are too many evaluation scripts. 
Refactor to extract common
+# features
+
+import speechbrain as sb
+import json
+import logging
+import csv
+import torch
+import torchaudio
+import string
+import re
+from pathlib import Path
+from types import SimpleNamespace
+from torch.nn import ModuleDict
+from data import undo_batch
+from eval import vocoder_to_device
+from torch.utils.flop_counter import FlopCounterMode
+from contextlib import nullcontext
+
+
+logger = logging.getLogger(__name__)
+
+
+class TokotronEvaluator:
+    """An evaluator class for the TTS model
+
+    Arguments
+    ---------
+    hparams: dict
+        hyperparameters (as a dictionary)
+    create_waveform_fn : callable
+        the function used to turn predicted audio tokens into a waveform
+    device : str | torch.device
+        the device
+    """
+
+    def __init__(self, hparams, create_waveform_fn, device):
+        self.hparams = SimpleNamespace(**hparams)
+        self.create_waveform_fn = create_waveform_fn
+        self.device = device
+        modules = self.hparams.modules
+        self.modules = ModuleDict(modules).to(self.device)
+        self.spk_emb_model = self.hparams.spk_emb_model(
+            run_opts={"device": device}
+        )
+        self.modules.model.vocoder = None
+        self.enabled_evaluators = set(self.hparams.evaluations.split(","))
+        evaluators = hparams.get("evaluators", {})
+        if evaluators:
+            # Instantiate only the evaluators enabled via the `evaluations` hparam
+            self.evaluators = {
+                key: evaluator_f(run_opts={"device": device})
+                for key, evaluator_f in evaluators.items()
+                if key in self.enabled_evaluators
+            }
+        else:
+            self.evaluators = {}
+
+        if not self.evaluators:
+            logger.warning(
+                "No evaluators were defined - this run will produce samples only"
+            )
+
+        self.attention = []
+
+    def on_evaluate_start(self, stage, epoch):
+        """Invoked when evaluation starts
+
+        Arguments
+        ---------
+        stage : sb.Stage
+            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+        epoch : int
+            The currently-starting epoch. This is passed
+            `None` during the test stage.
+        """
+        self.stage = stage
+        self.epoch = epoch
+        self.output_folder = self.get_output_folder(stage, epoch)
+        self.samples_folder = self.output_folder / "samples"
+        self.samples_folder.mkdir(parents=True, exist_ok=True)
+        logger.info(
+            "Starting evaluation, results will be saved in %s",
+            self.output_folder,
+        )
+        self.create_reports()
+        self.modules.model.show_inference_progress = False
+        self.item_ids = []
+        details_keys = list(self.evaluators.keys())
+        self.details = {evaluator_key: [] for evaluator_key in details_keys}
+        self.sample_text = []
+        self.sample_file_names = []
+        self.ref_file_names = []
+        if hasattr(self.modules, "vocoder"):
+            vocoder_to_device(self.modules.vocoder, self.device)
+
+    def get_output_folder(self, stage, epoch):
+        """Computes the output folder of evaluation results
+        for the specified stage and epoch.
+
+        If the folder does not exist, it will be created.
+
+        Arguments
+        ---------
+        stage : sb.Stage
+            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+        epoch : int
+            The currently-starting epoch. This is passed
+            `None` during the test stage.
+
+        Returns
+        -------
+        output_folder : pathlib.Path
+            the folder where evaluation results for this stage/epoch are saved
+        """
+        output_folder = (
+            Path(self.hparams.output_folder) / "eval" / stage.name.lower()
+        )
+        if epoch is not None:
+            output_folder = output_folder / str(epoch)
+        output_folder.mkdir(parents=True, exist_ok=True)
+        return output_folder
+
+    def on_evaluate_end(self):
+        """Invoked when evaluation ends - writes the summary report
+ """ + self.write_summary() + logger.info("Evaluation done") + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + if self.hparams.eval_perf: + self.perf_file = open(self.output_folder / "perf.csv", "w") + self.perf_writer = csv.DictWriter( + self.perf_file, + [ + "uttid", + "infer_flops", + "steps", + "infer_flops_per_step", + "vocoder_flops", + "total_flops", + "total_flops_per_step", + ], + ) + self.perf_writer.writeheader() + + def infer(self, tokens, tokens_length, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + infer_out = self.modules.model.infer( + input_tokens=tokens, input_length=tokens_length, emb=emb + ) + if self.hparams.eval_perf: + steps = (infer_out.length * infer_out.audio.size(1)).sum().item() + total_flops = flop_counter.get_total_flops() + stats = { + "infer_flops": total_flops, + "steps": steps, + "infer_flops_per_step": total_flops / steps, + } + return infer_out, stats + + def vocoder(self, infer_out, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + wav = self.create_waveform_fn( + infer_out.audio, length=infer_out.length, emb=emb + ) + if wav.dim() > 2: + wav = wav.squeeze(1) + + if self.hparams.eval_perf: + flops = flop_counter.get_total_flops() + stats = {"vocoder_flops": flops} + return wav, stats + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + if evaluator_key in self.evaluators: + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def evaluate_batch(self, batch): + """Runs evaluation on a single batch of speech + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch to be evaluated""" + with torch.no_grad(): + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = 
self.device + audio_resampled = torchaudio.functional.resample( + batch.sig.data, + self.hparams.sample_rate, + self.hparams.model_sample_rate, + ) + mel_spec = self.spk_emb_model.mel_spectogram(audio=audio_resampled) + spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, batch.sig.lengths + ).squeeze(1) + infer_out, perf_stats = self.infer( + tokens=tokens, tokens_length=tokens_length, emb={"spk": spk_emb} + ) + wav, vocoder_stats = self.vocoder(infer_out, spk_emb) + perf_stats.update(vocoder_stats) + length = infer_out.length + if wav.dim() > 2: + wav = wav.squeeze(1) + + self.save_samples(batch, wav, infer_out.length) + self.item_ids.extend(batch.uttid) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=batch.label_norm_eval, + wavs_ref=batch.sig.data, + length_ref=batch.sig.lengths, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, batch.uttid, details) + self.details[evaluator_key].extend(details) + + if self.hparams.eval_perf: + perf_stats.update(vocoder_stats) + perf_stats["total_flops"] = ( + perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + ) + perf_stats["total_flops_per_step"] = ( + perf_stats["total_flops"] / perf_stats["steps"] + ) + self.write_perf_stats(batch.uttid, perf_stats) + + def write_result(self, evaluator_key, uttid, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + batch : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(uttid, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def save_samples(self, batch, wav, length): + """Saves the samples generated by the TTS system + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch being evaluated + wav : torch.Tensor + the waveform + length: torch.Tensor + relative lengths + """ + wav_length_abs = (length * wav.size(1)).int() + for item_id, infer_wav, wav_length in zip( + batch.uttid, wav, wav_length_abs + ): + file_name = str(self.samples_folder / f"{item_id}_pred.wav") + infer_wav_cut = infer_wav[: wav_length.item()].cpu() + sb.dataio.dataio.write_audio( + file_name, + infer_wav_cut, + samplerate=self.hparams.model_sample_rate, + ) + self.sample_file_names.append(file_name) + + def write_summary(self): + """Outputs summarized statistics""" + summary = self.compute_summary() + file_name = self.output_folder / "summary.json" + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def write_perf_stats(self, uttid, details): + self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) + self.perf_file.flush() + + def compute_summary(self): + """Computes the summarized statistics""" + return { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings 
to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +@sb.utils.data_pipeline.takes("label_norm") +@sb.utils.data_pipeline.provides("label_norm_eval") +def label_norm_pipeline(label): + """Normalizes labels for ASR comparison, converting to uppercase and removing + punctuation + + Arguments + --------- + label : str + The unnormalized label + + Returns + ------- + result : str + The normalized label + """ + label = label.upper() + label = RE_PUNCTUATION.sub("", label) + return label + + +@sb.utils.data_pipeline.takes("wav") +@sb.utils.data_pipeline.provides("sig") +def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : str + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml new file mode 100644 index 000000000..b39b11009 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml new file mode 100644 index 000000000..01c818370 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -0,0 +1,292 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
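+# NOTE: as in the LJSpeech recipe in this patch, the training script is expected
+# to run the test evaluation when `testing` is True and to skip it when a previous
+# run has already written eval/test/summary.json under the output folder.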
+ +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
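+# NOTE: the gate_loss_* values above parameterize the gate loss ramp; they are
+# consumed by `gate_offset` (distance_diff_loss_ramp) and by the TokotronLoss
+# `compute_cost` defined further down in this file.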
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + 
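+# NOTE: for the supported DAC model_type / model_bitrate / codebook combinations,
+# see the comment table in the extraction config added in this patch
+# (LJSpeech/extraction/hparams/dac.yaml).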
+modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..4efa9f75c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -0,0 +1,341 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
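+# NOTE: `tokens_folder` should point at the output of the matching token
+# extraction recipe (see the extraction hparams configs added in this patch);
+# the TokensLoader below reads the pre-extracted tokens from this path, so
+# extraction must be run before training.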
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +select_layers: null +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: 
!ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml new file mode 100644 index 000000000..e45794171 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -0,0 +1,294 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
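As in the discrete SSL file above, `tokens_folder` points at codec tokens extracted in a separate pass, and the `utils.tokens.TokensLoader` declared just below reads them one utterance at a time; the `audio_pipeline` in `train.py` later in this patch calls it as sketched here. The utterance id and the (time x codebooks) layout are illustrative assumptions:

    from utils.tokens import TokensLoader  # recipe-local helper from this benchmark

    tokens_loader = TokensLoader(data_path=hparams["tokens_folder"])  # hparams as loaded above
    # Token matrix for one utterance, truncated to the first two EnCodec codebooks.
    audio = tokens_loader.tokens_by_uttid("some_utterance_id", num_codebooks=2)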
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref 
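`input_num_tokens` above resolves from the `input` key: 39 character symbols (`char_en.txt`) or 52 ARPABET phonemes (`arpabet.txt`), which the `TextEncoder` declared as `label_encoder` maps to integer ids. A small sketch of the encoding step, assuming the token list file has already been loaded into the encoder (presumably what `init_sequence_encoder` in `train.py` takes care of):

    label_encoder = hparams["label_encoder"]  # speechbrain.dataio.encoder.TextEncoder
    # After the symbols from token_list_file have been added to the encoder:
    token_ids = label_encoder.encode_sequence_torch(list("PRINTING"))  # 1-D tensor of ids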
+audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..e45794171 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -0,0 +1,294 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
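For the two EnCodec-based recipes (the file above and the ESPnet-EnCodec variant that starts here), `bandwidth: 1.5` and `audio_tokens_per_step: 2` are mutually consistent: the 24 kHz EnCodec model produces about 75 frames per second and each 1024-entry codebook costs 10 bits per frame, figures that come from the EnCodec release rather than from this patch. A quick consistency check:

    frame_rate_hz = 75       # facebook/encodec_24khz frame rate
    bits_per_codebook = 10   # log2(1024) entries per codebook
    bandwidth_bps = 1.5e3
    codebooks = bandwidth_bps / (frame_rate_hz * bits_per_codebook)
    assert round(codebooks) == 2   # matches audio_tokens_per_step: 2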
+ +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
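`guided_attention_weight: 50.0` and `guided_attention_sigma: 0.5` above configure the Tacotron-style guided-attention regularizer that nudges the text-to-audio alignment towards the diagonal. The exact form lives inside `TokotronLoss`, so the classic formulation below is only a reference sketch:

    import torch

    def guided_attention_mask(text_len, audio_len, sigma=0.5):
        # Penalty grows as attention strays from the diagonal n/N ~ t/T.
        n = torch.arange(text_len).float() / text_len
        t = torch.arange(audio_len).float() / audio_len
        return 1.0 - torch.exp(-((n[None, :] - t[:, None]) ** 2) / (2 * sigma ** 2))

    # loss_term ~ guided_attention_weight * (attention_weights * mask).mean()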
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + 
guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..156e05b02 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,281 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
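This new `train_mimi.yaml` keeps the same template; further down it points `model_hub` at `kyutai/mimi`, uses 8 codebooks with a 2048-entry vocabulary, and decodes through the benchmark's `utils.tokenizer_interface.MimiTokenizer` wrapper. Constructing that wrapper from these hyperparameters might look like the sketch below; the constructor argument names mirror the YAML, while the decode call is the `tokens_to_sig` path that `create_waveform` in `train.py` uses, and everything else is illustrative:

    from utils.tokenizer_interface import MimiTokenizer  # recipe-local wrapper

    tokenizer = MimiTokenizer(
        source="kyutai/mimi",
        save_path=hparams["pretrained_model_save_folder"],
        num_codebooks=8,                       # audio_tokens_per_step
    )
    wav = tokenizer.tokens_to_sig(tokens)      # decode path used by create_waveform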
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: 
"pretrained" + dim: 192 + vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml similarity index 68% rename from benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml rename to benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ac80bdac0..ffb68f2a5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -2,61 +2,52 @@ # Model: Tokenized TTS (WhisperSpeech-inspired) # Authors: Artem Ploujnikov # ############################################################################ - -experiment_name: tokotron/continuous_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made + seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/// +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - -# Model type -ssl_model_type: wavlm -representation_mode: continuous +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/continuous- +prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--continuous +representation_mode: discrete +vocoder_model_name: vocos vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 -freeze_ssl_model: True -ssl_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - -g2p_src: speechbrain/soundchoice-g2p -ssl_model_layers: [1, 3, 7, 12, 18, 23] +freeze_token_model: True +token_model_src: "fnlp/SpeechTokenizer" +g2p_src: flexthink/soundchoice-g2p token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_takes_spk_emb: False spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -66,8 +57,11 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 50 +number_of_epochs: 1000 +reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -87,7 +81,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -113,8 +107,8 @@ beam_size: 5 # Feature parameters sample_rate: 22050 model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref +max_audio_length: 5000 +infer_max_audio_length: 1000 debug_infer_max_audio_length: 10 # Label encoder @@ -128,7 +122,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -138,29 +132,11 @@ use_silence_padding: True # Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref 
- freeze: !ref - output_all_hiddens: True - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa + # Dataloader options train_dataloader_opts: batch_size: !ref @@ -171,7 +147,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -193,9 +169,9 @@ sample_dataloader_opts: extract_features_opts: dataloader_opts: - batch_size: !ref - ssl_model: !ref - ssl_model_layers: !ref + batch_size: !ref + num_workers: !ref + token_model: !ref sample_rate: !ref model_sample_rate: !ref spk_emb_model: !ref @@ -205,19 +181,20 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 +audio_num_tokens: 1024 audio_emb_size: 128 -audio_emb_freeze: False +audio_emb_freeze: True audio_emb_pretrained: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -229,17 +206,22 @@ audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref -vocoder: !apply:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref @@ -247,6 +229,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- target_dropout: !ref activation: !ref attention_type: !ref + vocoder: !ref gate_threshold: !ref gate_offset: !ref audio_emb_size: !ref @@ -255,21 +238,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref - representation_mode: continuous + emb: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -282,13 +267,11 @@ compute_cost: !new:Tokotron.TokotronLoss eos_width: !ref audio_tokens_per_step: !ref audio_token_shift: !ref - representation_mode: continuous -lr_annealing: !new:Tokotron.TargetedNoamScheduler - lr_initial: [!ref , !ref ] +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref n_warmup_steps: !ref - param_group: 0 checkpointer: 
!new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref @@ -297,10 +280,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..f4f745716 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,281 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
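As in the other recipes, decoding stops via a learned gate rather than a dedicated EOS token (`eos_mode: gate` just below, with `gate_threshold: 0.5` above). Conceptually, the decoder emits a stop logit per frame and inference halts once its probability crosses the threshold; the real criterion is implemented inside `TokotronTransformerModel`, so the following is only a schematic:

    import torch

    def should_stop(gate_logits, gate_threshold=0.5):
        # gate_logits: (batch, time) raw stop scores for the frames decoded so far
        gate_prob = torch.sigmoid(gate_logits[:, -1])
        return gate_prob > gate_threshold   # per-item stop decision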
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + 
audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py new file mode 100644 index 000000000..7d99c5c7d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -0,0 +1,1031 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import math +import torch +import sys +from functools import partial +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding_ +from speechbrain.utils.distributed import run_on_main +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from model.Tokotron import ( + RepresentationMode, + get_silence_repr, + get_silence_token, + use_silence_padding, + feature_pad_to, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class TokotronBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluator = TokotronEvaluator( + hparams=hparams, + create_waveform_fn=self.create_waveform, + device=self.device, + ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + with torch.no_grad(): + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + clean_padding_(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + features = self.prepare_features(batch) + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb, + ) = features + + predictions = self.modules.model( + input_tokens=tokens, + input_length=tokens_length, + audio=audio_bos, + audio_length=audio_bos_length, + emb={"spk": spk_emb}, + ) + + return predictions, features + + def prepare_features(self, batch): + if self.hparams.spk_emb_shuffle: + wav, wav_length = batch.spk_emb_random_match + else: + wav, wav_length = batch.sig + spk_emb = self._compute_spk(wav, wav_length).squeeze(1) + + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim + ) + audio_bos = torch.concatenate([bos, audio], dim=1) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb + + def _compute_spk(self, wav, wav_length): + mel_spec = self.spk_emb_model.mel_spectogram(wav.squeeze(1)) + spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, wav_length + ) + return spk_emb_pred + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. 
+ batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + predictions, features = predictions + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb, + ) = features + + loss_details = self.hparams.compute_cost( + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + ) + self.loss_metric.append( + batch.uttid, + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + reduction="batch", + ) + return loss_details.loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + if hasattr(self.modules, "vocoder") and hasattr( + self.modules.vocoder, "model" + ): + self.modules.vocoder.model.device = self.device + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + if ( + self.hparams.audio_emb_pretrained + and epoch == 1 + and stage == sb.Stage.TRAIN + ): + # TODO: Clean this up + if hasattr(self.hparams.token_model, "vocabulary"): + vocabulary = self.hparams.token_model.vocabulary + elif hasattr(self.hparams.token_model, "vocabularies"): + vocabulary = torch.stack( + [ + torch.from_numpy(voc) + for voc in self.hparams.token_model.vocabularies + ] + ) + self.modules.model.init_audio_emb(vocabulary) + # Load the compression model only if compression is enables + pretrained_run_opts = {"device": self.device} + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts=pretrained_run_opts + ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) + # If speaker embedding shuffling is enabled, re-initialize them for the + # epoch + if self.hparams.spk_emb_shuffle: + stage_key = stage.name.lower() + self.resample_fn[stage_key](epoch=epoch) + + # Reset the learning rate - if supported. This is useful when fine-tuning + # a model pre-trained on another dataset + if ( + stage == sb.Stage.TRAIN + and self.hparams.reset_annealing_epoch is not None + and epoch is not None + and epoch == self.hparams.reset_annealing_epoch + ): + self.hparams.lr_annealing.n_steps = 0 + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. 
If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + checkpoint = self.checkpointer.recover_if_possible() + if not checkpoint: + self.check_init() + self._ckpt_recovered = True + + def check_init(self): + init_from = getattr(self.hparams, "init_from", None) + if init_from is not None: + logger.info("Initializing with pre-trained weights from %s", init_from) + init_from_path = Path(init_from) + model_path = init_from_path / "model.ckpt" + with open(model_path, "rb") as model_file: + model_state_dict = torch.load(model_file, map_location=self.device) + tgt_state_dict = self.modules.model.state_dict() + ignore_keys = [] + for k, v in model_state_dict.items(): + if k in tgt_state_dict and tgt_state_dict[k].shape != v.shape: + logger.warning("Ignoring shape mismatch for %s", k) + ignore_keys.append(k) + for k in ignore_keys: + del model_state_dict[k] + self.modules.model.load_state_dict(model_state_dict, strict=False) + logger.info("Successfully initialized with pre-trained weights from %s", init_from) + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + self.evaluator.evaluate_batch(batch) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. 
+ loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + representation_mode = RepresentationMode( + hparams.get("representation_mode", RepresentationMode.DISCRETE) + ) + + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : strÆ’num_ + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label.upper() + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + use_silence_padding = hparams.get("use_silence_padding", True) + if "token_model_layers" in hparams: + audio_tokens_per_step = len(hparams["token_model_layers"]) + else: + audio_tokens_per_step = hparams["audio_tokens_per_step"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) + if use_silence_padding: + if representation_mode == RepresentationMode.DISCRETE: + silence_padding = get_silence_token( + hparams["tokenizer"], + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ) + ) + else: + silence_padding = get_silence_repr(hparams["ssl_model"],) + else: + silence_padding = ( + torch.ones(audio_tokens_per_step, dtype=torch.int64) + * hparams["eos_index"] + ) + + silence_padding = silence_padding.cpu() + if layer_idx: + silence_padding = silence_padding[layer_idx] + else: + silence_padding = silence_padding[:audio_tokens_per_step] + silence_padding_len = int(math.ceil(hparams["silence_padding"])) + bos_width = hparams.get("bos_width", 1) + audio_bos_prefix = ( + torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] + ) + if representation_mode == RepresentationMode.CONTINUOUS: + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( + 1, 1, hparams["audio_dim"] + ) + + tokens_loader = hparams.get("tokens_loader") + if layer_idx is not None: + tokens_loader_kwargs = { + "num_codebooks": layer_idx + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + + @sb.utils.data_pipeline.takes("uttid") + @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) + audio_pad = feature_pad_to( + audio, len(audio) + silence_padding_len, silence_padding + ) + yield audio_pad + audio_bos = torch.cat([audio_bos_prefix, audio_pad], dim=0) + yield audio_bos + + def spk_emb_random_match(uttid, dataset, spk_sample): + # Sample a speaker-matched embedding + selected_idx = spk_sample[uttid] + + # Retrieve the 
embedding value from the dataset + with dataset.output_keys_as(["sig"]): + spk_emb = dataset[selected_idx]["sig"] + return spk_emb + + dynamic_items = [ + text_pipeline, + tokens_pipeline, + audio_ref_pipeline, + audio_pipeline, + ] + output_keys = [ + "uttid", + "tokens", + "audio_pad", + "audio_bos", + "sig", + "spk_emb_random_match", + ] + + init_sequence_encoder(hparams) + + resample_fn = {} + for dataset in data_info: + dataset_output_keys = ( + output_keys + if dataset == "train" + else output_keys + ["label_norm_eval"] + ) + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + if hparams["spk_emb_shuffle"]: + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_emb_random_match_pipeline = partial( + spk_emb_random_match, + spk_sample=spk_sample, + dataset=dynamic_dataset.filtered_sorted(), + ) + dynamic_dataset.add_dynamic_item( + func=spk_emb_random_match_pipeline, + takes=["uttid"], + provides=["spk_emb_random_match"], + ) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={"phn": lambda value: value} + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def select_sample(hparams, datasets): + """Selects a sample of files for sample generation, freezing the sample if + requested to persist across multiple experiments + + Arguments + --------- + hparams : dict + experiment hyperparameters + datasets : dict + a dictionary of datasets + + Returns + ------- + dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset + the sample dataset + """ + sample_path = hparams.get("sample_path") + dataset = None + if sample_path is not None: + sample_path = Path(sample_path) + if sample_path.exists(): + with open(sample_path, "r") as sample_file: + data_ids = [line.strip() for line in sample_file] + dataset = FilteredSortedDynamicItemDataset( + datasets["valid"], data_ids + ) + + if dataset is None: + dataset = ( + datasets["valid"] + .batch_shuffle(1) + .filtered_sorted(select_n=hparams["num_audio_samples"]) + ) + if sample_path is not None: + with open(sample_path, "w") as sample_file: + for data_id in dataset.data_ids: + print(data_id, file=sample_file) + return dataset + + +def group_by_speaker(dataset, hparams): + """Groups 
utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> int dictionary with a list of utterance indexes + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_idx = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_idx: + spk_idx[spk_id] = [] + spk_idx[spk_id].append(idx) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_idx[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_idx, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def get_selected_layer_indexes(hparams): + """Finds the indexes of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + 
hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + (datasets, silence_padding, resample_fn) = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_pad", "audio_bos"] + + # Trainer initialization + tts_brain = TokotronBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + tts_brain.sample_data = datasets["sample"] + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=use_silence_padding( + hparams["train_dataloader_opts"], silence_padding, audio_keys + ), + valid_loader_kwargs=use_silence_padding( + hparams["valid_dataloader_opts"], silence_padding, audio_keys + ), + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py new file mode 100644 index 000000000..9fd6da808 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -0,0 +1,357 @@ +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + 
+        self.enabled_evaluators = set(self.hparams.evaluations.split(","))
+        evaluators = hparams.evaluators
+        if evaluators:
+            self.evaluators = {
+                key: evaluator_f(run_opts={"device": device})
+                for key, evaluator_f in evaluators.items()
+                if key in self.enabled_evaluators
+            }
+        else:
+            self.evaluators = {}
+
+        if not self.evaluators:
+            logger.warning(
+                "No evaluators were defined - this run will produce samples only"
+            )
+
+    def on_evaluation_start(self, output_folder="eval"):
+        """Invoked at the beginning of the evaluation cycle.
+
+        Arguments
+        ---------
+        output_folder : str | path-like
+            The folder to which results will be output
+
+        """
+        logger.info("Starting evaluation")
+        output_folder = Path(output_folder)
+        self.output_folder = (
+            output_folder
+            if output_folder.is_absolute()
+            else self.hparams.output_folder / output_folder
+        )
+        self.output_folder.mkdir(parents=True, exist_ok=True)
+
+        self.files = []
+        details_keys = list(self.evaluators.keys())
+        self.details = {evaluator_key: [] for evaluator_key in details_keys}
+        self.read_reports()
+        self.create_reports()
+        self.item_ids = []
+
+    def on_evaluation_end(self):
+        """Invoked at the end of the evaluation cycle. Writes the summary
+        for the completed run.
+        """
+        logger.info("Ending evaluation")
+        self.write_summary()
+
+    def create_reports(self):
+        """Creates report files and report writers"""
+        self.report_files = {}
+        self.report_writers = {}
+        for evaluator_key in self.enabled_evaluators:
+            columns = self.get_report_columns(evaluator_key)
+            file_name = self.output_folder / f"{evaluator_key}.csv"
+            self.files.append(file_name)
+            resume = file_name.exists() and file_name.stat().st_size > 0
+            report_file = open(file_name, "a+")
+            self.report_files[evaluator_key] = report_file
+            writer = csv.DictWriter(report_file, columns)
+            if not resume:
+                writer.writeheader()
+            self.report_writers[evaluator_key] = writer
+
+    def read_reports(self):
+        """Reads the reports from a previous run, if any, so that evaluation
+        can be resumed"""
+        for evaluator_key in self.enabled_evaluators:
+            file_name = self.output_folder / f"{evaluator_key}.csv"
+            if file_name.exists():
+                logger.info("%s exists, reading", file_name)
+                with open(file_name) as report_file:
+                    reader = csv.DictReader(report_file)
+                    for row in reader:
+                        del row["uttid"]
+                        row = {
+                            key: handle_number(value)
+                            for key, value in row.items()
+                        }
+                        self.details[evaluator_key].append(row)
+
+    def get_tracker_file_name(self):
+        """Determines the file name of the tracker file"""
+        suffix = (
+            f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else ""
+        )
+        file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt"
+        return self.output_folder / file_name
+
+    def get_report_columns(self, evaluator_key):
+        """Returns the columns for the specified evaluator
+
+        Arguments
+        ---------
+        evaluator_key : str
+            the identifier of the evaluator
+
+        Returns
+        -------
+        columns : list[str]
+            a list of column headers
+        """
+        bogus_wavs = torch.randn(2, 10000, device=self.device)
+        bogus_length = torch.tensor([1.0, 1.0], device=self.device)
+        evaluator = self.evaluators[evaluator_key]
+        result = evaluator.evaluate(
+            wavs=bogus_wavs,
+            length=bogus_length,
+            text=["BOGUS"] * len(bogus_wavs),
+            wavs_ref=bogus_wavs,
+            length_ref=bogus_length,
+        )
+
+        return ["uttid"] + list(result.details.keys())
+
+    def append(self, ids, wav, length, text, wav_ref, length_ref):
+        """Appends evaluation results for a batch of items
+
+        Arguments
+        ---------
+        ids : list
+            Utterance IDs
+        wav : torch.Tensor
+            Synthesized waveforms
+        length : torch.Tensor
+            Relative lengths of the synthesized
waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + 
stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. + + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..129cf9337 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:utils.eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..b7579f092 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,256 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/dac + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
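+# The folder above holds audio tokens pre-extracted with the DAC codec; the
+# TokensLoader below reads them back per utterance at training time instead of
+# re-encoding the waveforms.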
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 +flatten: false + +# Model Settings +model_type: 24khz +model_bitrate: 8kbps + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + 
qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..e7e4657aa --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,297 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +representation_mode: discrete +output_folder: !ref results/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
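A note for readers new to these recipes: every `!PLACEHOLDER` entry in this file (`run_name`, `data_folder`, `cached_data_folder`, `tokens_folder`) must be overridden when the YAML is resolved, which the train script does through `load_hyperpyyaml`. Below is a minimal sketch of that call; the paths and run name are hypothetical, loading eagerly instantiates the `!new:` entries (including the SSL model), and the real train script additionally appends `eval.yaml` to the stream before loading.

```python
# Minimal sketch, assuming the recipe's local packages (utils, model) are importable.
from hyperpyyaml import load_hyperpyyaml

overrides = {
    "run_name": "discrete_ssl_baseline",      # hypothetical run name
    "data_folder": "/corpora/LibriTTS",       # hypothetical paths
    "cached_data_folder": "/scratch/dasb_cache",
    "tokens_folder": "/scratch/libritts_tokens",
}
with open("hparams/train_discrete_ssl.yaml") as fin:
    hparams = load_hyperpyyaml(fin.read(), overrides, overrides_must_match=True)

# After loading, the resolved hyperparameters are plain Python objects.
print(hparams["output_folder"])
```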
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +progress_folder: !ref /progress +progress_current: !ref /current +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True + +# Model Settings +ssl_model_type: wavlm +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True + +# Speaker Embeddings +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True 
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1000 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 +flatten: false + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref / + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + + +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler + lr_initial: [!ref , !ref ] + n_warmup_steps: !ref + param_group: 0 + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..c35aaa4f9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,256 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
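+# EnCodec at 24 kHz with a 6 kbps bandwidth (see "bandwidth" under Model
+# Settings below) produces 8 codebooks per frame, which is why this file sets
+# audio_tokens_per_step to 8.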
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + +# Model Settings +model_hub: facebook/encodec_24khz +bandwidth: 6 + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref 
+ qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: !ref + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..efd408469 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,260 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
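+# The Model Settings below pin a specific ESPnet commit together with the
+# checkpoint and config paths inside the espnet/libritts_encodec_24k model hub
+# entry, so the codec used for token extraction stays reproducible.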
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + +# Model Settings +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +freeze_lm_head: True + +############################## models ################################ + +model: 
!new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_fairseq_hubert.yaml new file mode 100644 index 000000000..d51b43ea3 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_fairseq_hubert.yaml @@ -0,0 +1,268 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/fairseq_hubert +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +representation_mode: discrete +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
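One detail that is easy to miss is how `test_key` / `test_key_kind` (defined below) are consumed: the kind simply decides whether the key is passed as a `min_key` or `max_key` when the best checkpoint is selected for the final test run, mirroring the end of the train script above; presumably `ckpt_key` / `ckpt_key_kind` follow the same pattern when checkpoints are saved. A tiny illustration, with the brain and dataset objects assumed:

```python
# Sketch of the pattern at the end of the train script; values are this file's defaults.
test_key = "dwer"       # lower dWER is better...
test_key_kind = "min"   # ...so it is passed as a min_key
eval_kwargs = {f"{test_key_kind}_key": test_key}  # -> {"min_key": "dwer"}
# tts_brain.evaluate(test_set=datasets["test"], **eval_kwargs)
```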
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +progress_folder: !ref /progress +progress_current: !ref /current +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True + +# Model Settings +model_path: !ref /fairseq-hubert +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +speech_model_layer: 11 +vocoder_dense_model_name: "mhubert-base-25hz" +vocoder_quantizer_model_name: "kmeans" +vocoder_vocab_size: 500 +flip_layers: False +use_token_offsets: True + +# Speaker Embeddings +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + 
batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 500 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: false + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name + dense_model_name: !ref + quantizer_model_name: !ref + vocab_size: !ref + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + + +lr_annealing: !new:model.Tokotron.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..7b61a18a7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,252 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
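Note on the vocabulary layout: with use_token_offsets enabled, the recipe appears to pack the language-model vocabulary as [special tokens][text or phoneme tokens][one shifted range per codec track], which is what the model_vocab_size and audio_token_shift choices above compute and what get_offsets() in train.py implements. A small sketch of that arithmetic, using the Mimi values from the file below (vocab_size 2048, 8 codebooks); the exact layout is inferred from the code, so treat it as an assumption:

import torch

special_num_tokens = 5      # pad, bos, eos, eot, eop (indices 0..4 above)
text_num_tokens = 39        # character inputs
vocab_size = 2048           # per-codebook codec vocabulary (Mimi)
audio_tokens_per_step = 8   # codebooks per frame

# Assumed layout: specials first, then text tokens, then per-track audio ranges
audio_token_shift = special_num_tokens + text_num_tokens
model_vocab_size = audio_token_shift + vocab_size * audio_tokens_per_step

# Track k occupies [shift + k * vocab_size, shift + (k + 1) * vocab_size)
offsets = torch.arange(audio_tokens_per_step) * vocab_size

codec = torch.randint(0, vocab_size, (2, 10, audio_tokens_per_step))
lm_ids = codec + audio_token_shift + offsets       # codec ids -> LM vocabulary
restored = lm_ids - audio_token_shift - offsets    # LM vocabulary -> codec ids

assert int(lm_ids.max()) < model_vocab_size
assert torch.equal(restored, codec)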
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: 
--num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: kyutai/mimi + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..b6f699cf9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,252 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
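Note on the two-stage curriculum: number_of_epochs_ar and number_of_epochs_nar (null above, i.e. joint training throughout) let the recipe first train only the autoregressive head and then only the non-autoregressive head; with a single or flattened track the model is purely autoregressive. A standalone sketch of that schedule, mirroring apply_curriculum() further down in train.py (the helper name and sample values are illustrative):

def curriculum(epoch, epochs_ar, epochs_nar, tracks, flatten=False):
    """Return (train_ar, train_nar) flags for a given epoch."""
    train_ar, train_nar = True, True
    if tracks == 1 or flatten:
        train_nar = False                       # single track: AR only
    elif epochs_ar is not None and epoch <= epochs_ar:
        train_nar = False                       # phase 1: AR head only
    elif epochs_nar is not None and epoch <= (epochs_ar or 0) + epochs_nar:
        train_ar = False                        # phase 2: NAR head only
    return train_ar, train_nar

# Hypothetical schedule: 10 AR-only epochs, then 20 NAR-only epochs
assert curriculum(5, 10, 20, tracks=8) == (True, False)
assert curriculum(15, 10, 20, tracks=8) == (False, True)
assert curriculum(40, 10, 20, tracks=8) == (True, True)      # joint afterwards
assert curriculum(3, None, None, tracks=1) == (True, False)  # flattened/AR-only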
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 
9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: false + +# Model Settings +model_hub: fnlp/SpeechTokenizer + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: !ref # Only the 24kHz version supports mono audio + save_path: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..b123ca67b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,290 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 1 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # 
@orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + + phonemes: !ref + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 4 +flatten: true +ternary_num_digits: 14 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !new:model.custom_model.MultitrackPredictionHead + d_model: !ref + num_tracks: !ref + vocab_size: !ref + +logits_to_probs: !new:torch.nn.Identity + +ternary_emb_shift: 4 +ternary_emb_hybrid: false + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + shift: !ref + shift_cutoff: !ref + hybrid: !ref + hybrid_cutoff: !ref + hybrid_size: !ref * + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + true: !ref * * 2 + false: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec_ternary.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec_ternary.yaml new file mode 100644 index 000000000..3a22065d7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec_ternary.yaml @@ -0,0 +1,292 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before 
objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 1 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + 
padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !ref + + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 4 +flatten: true +ternary_num_digits: 10 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + d_hidden: !ref + num_positions: !ref * + +logits_to_probs: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + +ternary_emb_shift: 4 +ternary_emb_hybrid: false + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + shift: !ref + shift_cutoff: !ref + hybrid: !ref + hybrid_cutoff: !ref + hybrid_size: !ref * + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + true: !ref * * 2 + false: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.sq_codec.ternary_loss + targets_type: tokens + num_positions: !ref + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..b63fe0d24 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,256 @@ +# 
############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: 
!name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 4096 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: false + +# Model Settings +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py new file mode 100644 index 000000000..f37ae5b40 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -0,0 +1,1353 @@ +#!/usr/bin/env/python3 +"""Recipe for training VALL-E + +Based on 
ESPNET VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import ( + clean_padding, + length_to_mask, + write_audio, +) +from speechbrain.dataio.dataloader import LoopedLoader +from speechbrain.utils.data_utils import pad_right_to +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from functools import partial +from torch.utils.data import DataLoader +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 + +logger = logging.getLogger(__name__) + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = None + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. 
+ stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + batch_size, prompt_max_len, _ = prompt.shape + batch_idx = torch.arange(batch_size, device=prompt.device) + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] + + loss_ar = self.hparams.compute_cost( + logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( + ids=batch.uttid, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, + mask=mask, + reduction="batch", + ) + + loss = torch.mean(torch.stack(loss_components)) + return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + + if hasattr(hparams, "speech_model_layers"): + self.layer_idx = get_selected_layer_indexes( + hparams.available_speech_model_layers, + hparams.speech_model_layers + ) + else: + self.layer_idx = None + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.compute_loss_stats, batch_eval=True, + ) + self.apply_curriculum() + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + dataset = stage.name.lower() + self.resample_fn[dataset](epoch=epoch or 0) + + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. 
+ self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, wav=wav, length=audio_length, stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + eval_summary_stats = {} + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr, **eval_summary_stats}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } + # Save the current checkpoint and delete previous checkpoints. 
+ self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + self._pad_inferred_sample(result) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to( + sample, + (min_length, tracks), + )[0] + return sample + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample.cpu(), self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + def fit( + self, + epoch_counter, + train_set, + valid_set=None, + progressbar=None, + train_loader_kwargs={}, + valid_loader_kwargs={}, + ): + """Iterate epochs and datasets to improve objective. + + Relies on the existence of multiple functions that can (or should) be + overridden. The following methods are used and expected to have a + certain behavior: + + * ``fit_batch()`` + * ``evaluate_batch()`` + * ``update_average()`` + + If the initialization was done with distributed_count > 0 and the + distributed_backend is ddp, this will generally handle multiprocess + logic, like splitting the training data into subsets for each device and + only saving a checkpoint on the main process. + + Arguments + --------- + epoch_counter : iterable + Each call should return an integer indicating the epoch count. + train_set : Dataset, DataLoader + A set of data to use for training. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + valid_set : Dataset, DataLoader + A set of data to use for validation. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + progressbar : bool + Whether to display the progress of each epoch in a progressbar. + train_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the train_loader + (if train_set is a Dataset, not DataLoader). + E.G. batch_size, num_workers. + DataLoader kwargs are all valid. + valid_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the valid_loader + (if valid_set is a Dataset, not DataLoader). + E.g., batch_size, num_workers. + DataLoader kwargs are all valid. + + Returns + ------- + None + """ + if self.test_only: + logger.info( + "Test only mode, skipping training and validation stages." 
+ ) + return + if not ( + isinstance(train_set, DataLoader) + or isinstance(train_set, LoopedLoader) + ): + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) + self.on_fit_start() + epoch = self.hparams.epoch_counter.current + if epoch < self.hparams.number_of_epochs: + valid_set = sample_dataset( + dataset=valid_set, + count=self.hparams.valid_inter_data_count, + seed=self.hparams.seed + ) + + valid_set = self.make_dataloader( + valid_set, + stage=sb.Stage.VALID, + ckpt_prefix=None, + **valid_loader_kwargs, + ) + + if progressbar is None: + progressbar = not self.noprogressbar + + # Only show progressbar if requested and main_process + enable = progressbar and sb.utils.distributed.if_main_process() + + # Iterate epochs + for epoch in epoch_counter: + self._fit_train(train_set=train_set, epoch=epoch, enable=enable) + self._fit_valid(valid_set=valid_set, epoch=epoch, enable=enable) + + # Debug mode only runs a few epochs + if ( + self.debug + and epoch == self.debug_epochs + or self._optimizer_step_limit_exceeded + ): + break + + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + spk_prompt_length = hparams["spk_prompt_length"] + + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + def spk_prompt(uttid, spk_sample): + # Sample a speaker-matched embedding + selected_uttid = spk_sample[uttid] + audio = tokens_loader.tokens_by_uttid( + selected_uttid, num_codebooks=num_codebooks + ) + if audio.size(0) 
> spk_prompt_length: + offset = torch.randint(0, audio.size(0), (1,)).item() + else: + offset = 0 + # Retrieve the embedding value from the dataset + audio_spk_prompt, _ = pad_right_to( + audio[offset : offset + spk_prompt_length], + (spk_prompt_length, audio.size(1)), + ) + return audio_spk_prompt + + @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline(id, tokens, spk_prompt): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=num_codebooks + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, sig_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length", + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + resample_fn = {} + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_prompt_pipeline = partial(spk_prompt, spk_sample=spk_sample,) + dynamic_dataset.add_dynamic_item( + func=spk_prompt_pipeline, takes=["uttid"], provides=["spk_prompt"], + ) + dynamic_dataset.add_dynamic_item(prompt_pipeline) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + if hparams["input"] == "phonemes": + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_test={"has_alignments": lambda value: value} + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
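+    # Illustrative note (added for clarity, not part of the original recipe):
+    # with a toy split whose "length" values are [3, 1, 2],
+    # filtered_sorted(sort_key="length") below reorders the items so their
+    # lengths become [1, 2, 3], letting each batch group similarly sized
+    # utterances and keeping zero-padding small.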
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + return datasets, resample_fn + + +def sample_dataset(dataset, count, seed): + """Selects a sample of the specified dataset in a + stable manner, returning the same sample on each call + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + count : int + The number of items to select + seed : int + The seed to be used + """ + if len(dataset) < count: + return dataset + generator = torch.Generator() + generator.manual_seed(seed) + indexes = torch.randperm(len(dataset)).tolist()[:count] + data_ids = [ + dataset.data_ids[idx] + for idx in indexes + ] + return FilteredSortedDynamicItemDataset( + dataset, + data_ids, + ) + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> str with a list of utterance IDs + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_uttid = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id", "uttid"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_uttid: + spk_uttid[spk_id] = [] + spk_uttid[spk_id].append(item["uttid"]) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_uttid[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_uttid, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + +def 
+def init_sequence_encoder(hparams):
+    """Initializes the sequence encoder from hyperparameters. The encoder
+    instance is read from the `label_encoder` key, the token list from
+    `token_list_file` and the special tokens from `special_tokens`.
+
+    Arguments
+    ---------
+    hparams: dict
+        parsed hyperparameters
+
+    Returns
+    -------
+    encoder: speechbrain.dataio.encoder.TextEncoder
+        an encoder instance"""
+    encoder = hparams["label_encoder"]
+    token_list_file_name = hparams["token_list_file"]
+    tokens = read_token_list(token_list_file_name)
+    encoder.add_unk()
+    for token in hparams["special_tokens"]:
+        token_key = token.replace("<", "").replace(">", "")
+        token_index = hparams[f"{token_key}_index"]
+        encoder.insert_label(token, token_index)
+
+    encoder.update_from_iterable(tokens, sequence_input=False)
+    encoder.expect_len(len(tokens) + hparams["special_num_tokens"])
+    return encoder
+
+
+def get_selected_layer_indexes(available_layers, selected_layers):
+    """Finds the indexes of the selected layers within the list of
+    available layers
+
+    Arguments
+    ---------
+    available_layers : list
+        The available layers
+    selected_layers : list
+        The selected layers
+
+    Returns
+    -------
+    layer_idx : list
+        The layer indexes
+    """
+    if not (selected_layers and available_layers):
+        return None
+    layer_idx = [available_layers.index(layer) for layer in selected_layers]
+    return layer_idx
+
+
+def read_token_list(file_name):
+    """Reads a simple text file with tokens (e.g. characters or phonemes) listed
+    one per line
+
+    Arguments
+    ---------
+    file_name: str
+        the file name
+
+    Returns
+    -------
+    result: list
+        a list of tokens
+    """
+    file_name = Path(file_name)
+    if not file_name.is_absolute():
+        file_name = Path(__file__).parent / "hparams" / file_name
+    if not file_name.exists():
+        raise ValueError(f"Token file {file_name} not found")
+    with open(file_name) as token_file:
+        return [line.strip("\r\n") for line in token_file if line]
+
+
+def apply_overfit_test(hparams, dataset):
+    """Helper for applying an overfit test conditionally based
+    on hyperparameters:
+
+    `overfit_test`: whether or not to apply an overfit test
+    `overfit_test_sample_count`: the number of samples to use from the
+    original dataset
+    `overfit_test_epoch_data_count`: the number of samples per epoch
+
+    The function will accept datasets, (train, valid, test) tuples
+    or dictionaries of the form:
+    {"train": dataset1, "valid": dataset2, "test": dataset3}
+
+    If a tuple or dictionary is used, the training dataset will be of length
+    overfit_test_epoch_data_count, whereas the evaluation dataset will be of
+    length overfit_test_sample_count.
+
+    Arguments
+    ---------
+    hparams: dict
+        parsed hyperparameters
+    dataset: DynamicItemDataset|tuple|dict
+        One of the following
+        a dataset
+        a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3})
+        a (train, valid, test) tuple of datasets
+
+    Returns
+    -------
+    result: DynamicItemDataset|tuple|dict
+        a dataset or collection of datasets suitable for
+        an overfitting test - in the same format as the
+        dataset argument (single dataset, dictionary, or tuple)
+    """
+    if hparams["overfit_test"]:
+        if isinstance(dataset, tuple):
+            dataset_train, dataset_valid, _ = dataset
+            dataset_train = apply_overfit_test(hparams, dataset_train)
+            dataset_eval = dataset_train.filtered_sorted(
+                select_n=hparams["overfit_test_sample_count"]
+            )
+            dataset_eval.set_output_keys(
+                list(dataset_valid.pipeline.output_mapping.keys())
+            )
+            result = dataset_train, dataset_eval, dataset_eval
+        elif isinstance(dataset, dict):
+            dataset_train = apply_overfit_test(hparams, dataset["train"])
+            dataset_eval = dataset_train.filtered_sorted(
+                select_n=hparams["overfit_test_sample_count"]
+            )
+            dataset_eval.set_output_keys(
+                list(dataset["valid"].pipeline.output_mapping.keys())
+            )
+
+            result = {
+                "train": dataset_train,
+                "valid": dataset_eval,
+                "test": dataset_eval,
+                "sample": dataset_eval,
+            }
+        else:
+            result = dataset.overfit_test(
+                hparams["overfit_test_sample_count"],
+                hparams["overfit_test_epoch_data_count"],
+            )
+    else:
+        result = dataset
+    return result
+
+
+def select_eval_subset(dataset, hparams, key="eval_subset"):
+    """Selects a subset of the dataset provided, if specified.
+    The selection is controlled by a hyperparameter (named
+    "eval_subset" by default) pointing to a file that lists the IDs
+    of the data items on which evaluation will take place, one per line
+
+    Arguments
+    ---------
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        A dataset
+    hparams : dict
+        parsed hyperparameters
+    key : str
+        The hyperparameter key holding the path to the subset file
+
+    Returns
+    -------
+    subset : dataset
+        The dataset, filtered down if applicable
+    """
+    eval_subset_path = hparams.get(key)
+    if eval_subset_path is not None:
+        eval_subset_path = Path(eval_subset_path)
+        if not eval_subset_path.exists():
+            raise ValueError(f"eval_subset {eval_subset_path} does not exist")
+        with open(eval_subset_path) as eval_subset_file:
+            eval_subset_ids = [line.strip() for line in eval_subset_file]
+        # Use a set for fast membership checks
+        existing_ids = set(dataset.data_ids)
+        eval_subset_ids = [
+            uttid for uttid in eval_subset_ids if uttid in existing_ids
+        ]
+        if not eval_subset_ids:
+            raise ValueError(
+                f"{eval_subset_path}: no items found in the dataset"
+            )
+        subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids)
+    else:
+        subset = dataset
+    return subset
+
+
+def undo_padding_tensor(batch, lengths):
+    """Produces Python lists given a batch of sentences with
+    their corresponding relative lengths.
+
+    Arguments
+    ---------
+    batch : torch.Tensor
+        Batch of sentences gathered in a batch.
+    lengths : torch.Tensor
+        Relative length of each sentence in the batch.
+
+    Returns
+    -------
+    as_list : list
+        A python list of unpadded tensors, one per batch element.
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Data preparation, to be run on only one process. + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "alignments_folder": hparams.get("alignments_folder"), + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets, resample_fn = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
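+    # Illustrative note (assumption, added for clarity): because the epoch
+    # counter, model and optimizer are all registered with the Checkpointer,
+    # simply re-running this script after an interruption restores their
+    # state before `fit()` continues from the last checkpoint.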
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py new file mode 100644 index 000000000..328fbe868 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -0,0 +1,90 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. + +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech + from libritts_prepare import prepare_libritts # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "train_split": hparams["train_splits"], + "valid_split": hparams["dev_splits"], + "test_split": hparams["test_splits"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": hparams["test_json"], + "sample_rate": hparams["sample_rate"], + "skip_prep": hparams["skip_prep"], + "max_valid_size": None, + "skip_resample": hparams["skip_resample"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "libritts").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + 
vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml new file mode 100644 index 000000000..836503717 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..6ae14c87c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,103 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# 
|------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False +skip_resample: False + + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..188b38a6d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options 
+dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False +skip_resample: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..a0542b189 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,67 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/fairseq_hubert.yaml new file mode 100644 index 000000000..1f7e87fa9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/fairseq_hubert.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: FairSEQ HuBERT +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 
+# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/fairseq-hubert +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +model_path: !PLACEHOLDER +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +vocab_size: 500 +save_embedding: False +skip_resample: False + + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +layer: 11 +num_codebooks: 1 +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: null + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..dc026cc55 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git 
a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..8d3a9aa27 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,55 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False +skip_resample: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..3d9792bbb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec +skip_resample: False + + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: 
!new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..bfd802740 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,61 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 +skip_resample: False + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py new file mode 120000 index 000000000..39f1a78c2 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py @@ -0,0 +1 @@ +../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py new file mode 100644 index 000000000..52594eaf9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -0,0 +1,504 @@ +""" +LibriTTS data preparation + +Authors + * Pradnya Kandarkar 2022 +""" + +import json +import os +import random + +import torch +import torchaudio +import re +from tqdm import tqdm + +from speechbrain.inference.text import GraphemeToPhoneme +from speechbrain.utils.data_utils import get_all_files +from speechbrain.utils.logger import get_logger +from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations +from pathlib import Path + +logger = get_logger(__name__) +LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def prepare_libritts( + data_folder, + save_json_train, + save_json_valid, + save_json_test, + sample_rate, + split_ratio=[80, 10, 10], + libritts_subsets=None, + train_split=None, + valid_split=None, + test_split=None, + seed=1234, + model_name=None, + 
max_valid_size=500, + alignments_folder=None, + skip_prep=False, + skip_resample=False, +): + """ + Prepares the json files for the LibriTTS dataset. + Downloads the dataset if it is not found in the `data_folder` as expected. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored. + save_json_train : str + Path where the train data specification file will be saved. + save_json_valid : str + Path where the validation data specification file will be saved. + save_json_test : str + Path where the test data specification file will be saved. + sample_rate : int + The sample rate to be used for the dataset + split_ratio : list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + libritts_subsets: list + List of librispeech subsets to use (e.g., dev-clean, train-clean-100, ...) for the experiment. + This parameter will be ignored if explicit data splits are provided. + Explicit data splits parameters: "train_split", "valid_split", "test_split" + train_split : list + List of librispeech subsets to use (e.g.,train-clean-100, train-clean-360) for the experiment training stage. + valid_split : list + List of librispeech subsets to use (e.g., dev-clean) for the experiment validation stage. + test_split : list + List of librispeech subsets to use (e.g., test-clean) for the experiment testing stage. + seed : int + Seed value + model_name : str + Model name (used to prepare additional model specific data) + alignments_path : None + The path to alignments files + skip_prep: Bool + If True, skip preparation. + skip_resample: bool + If True, audio will not be resampled + + Returns + ------- + None + """ + + if skip_prep: + return + + # Setting the seed value + random.seed(seed) + + # Checks if this phase is already done (if so, skips it) + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed in previous run, skipping.") + return + + logger.info( + f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}" + ) + + # If specific splits are provided, creates data manifest files accordingly + if train_split: + wav_list = prepare_split(data_folder, train_split) + create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + if valid_split: + wav_list = prepare_split(data_folder, valid_split) + # TODO add better way to speedup evaluation + if max_valid_size is not None and len(wav_list) > max_valid_size: + wav_list = random.sample(wav_list, max_valid_size) + create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + if test_split: + wav_list = prepare_split(data_folder, test_split) + create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed.") + return + + # If specific splits are not provided, and a list of subsets if provided, creates train, valid, test splits + # Creates data manifest files according to the data splits + if libritts_subsets: + wav_list = prepare_split(data_folder, libritts_subsets) + # Random split the signal list into train, valid, and test sets. 
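+        # Illustrative note (added for clarity, not part of the original
+        # recipe): with the default split_ratio=[80, 10, 10] and, say, 1000
+        # collected wav files, split_sets below assigns roughly 800/100/100
+        # files to train/valid/test.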
+        data_split = split_sets(wav_list, split_ratio)
+        # Creating json files
+        create_json(
+            data_split["train"], save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample
+        )
+        create_json(
+            data_split["valid"], save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample
+        )
+        create_json(
+            data_split["test"], save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample
+        )
+
+
+def prepare_split(data_folder, split_list):
+    """
+    Processes the provided list of LibriTTS subsets and creates a list of all
+    the .wav files present in the subsets. The subsets are expected to be
+    available locally; if a subset (or its archive) is missing, the script
+    exits with an informative message.
+
+    Arguments
+    ---------
+    data_folder : str
+        Path to the folder where the LibriTTS dataset is stored
+    split_list : list
+        List of LibriTTS subsets to process (e.g., dev-clean, train-clean-100, ...)
+
+    Returns
+    -------
+    wav_list : list
+        List of all .wav files to be processed
+    """
+    extension = [".wav"]  # The expected extension for audio files
+    wav_list = list()  # Stores all audio file paths for the dataset
+
+    # For every subset of the dataset, checks that the data is available
+    for subset_name in split_list:
+        subset_folder = os.path.join(data_folder, subset_name)
+        subset_archive = os.path.join(subset_folder, subset_name + ".tar.gz")
+
+        if not check_folders(subset_folder):
+            logger.info(
+                f"No data found for {subset_name}. Checking for an archive file."
+            )
+            if not os.path.isfile(subset_archive):
+                logger.info(
+                    f"No archive file found for {subset_name}. "
+                    "Please download and extract the subset, then re-run."
+                )
+                quit()
+        # Collects all files matching the provided extension
+        wav_list.extend(get_all_files(subset_folder, match_and=extension))
+
+    return wav_list
+
+
+def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None, skip_resample=False):
+    """
+    Creates the json file given a list of wav files.
+
+    Arguments
+    ---------
+    wav_list : list of str
+        The list of wav files.
+    json_file : str
+        The path of the output json file
+    sample_rate : int
+        The sample rate to be used for the dataset
+    data_folder : str
+        The path to LibriTTS
+    alignments_folder : str
+        The path to LibriTTS alignments
+    model_name : str
+        Model name (used to prepare additional model specific data)
+    skip_resample : bool
+        If True, skips resampling - useful when large temporary
+        storage is unavailable.
+    """
+
+    # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments
+    if model_name == "Tacotron2":
+        logger.info(
+            "Computing phonemes for labels using SpeechBrain G2P. This may take a while."
+ ) + g2p = GraphemeToPhoneme.from_hparams( + "speechbrain/soundchoice-g2p", run_opts={"device": DEVICE} + ) + else: + g2p = None + + json_dict = {} + + # Processes all the wav files in the list + for wav_file in tqdm(wav_list): + # Reads the signal + signal, sig_sr = torchaudio.load(wav_file) + duration = signal.shape[1] / sig_sr + + # TODO add better way to filter short utterances + if duration < 1.0: + continue + + # Manipulates path to get relative path and uttid + path_parts = wav_file.split(os.path.sep) + uttid, _ = os.path.splitext(path_parts[-1]) + # relative_path = os.path.join("{data_root}", *path_parts[-4:]) + + # Gets the path for the text files and extracts the input text + normalized_text_path = os.path.join( + "/", *path_parts[:-1], uttid + ".normalized.txt" + ) + try: + with open(normalized_text_path, encoding="utf-8") as f: + normalized_text = f.read() + if normalized_text.__contains__("{"): + normalized_text = normalized_text.replace("{", "") + if normalized_text.__contains__("}"): + normalized_text = normalized_text.replace("}", "") + except FileNotFoundError: + print(f"Warning: The file {normalized_text_path} does not exist.") + continue + + # Resamples the audio file if required + if sig_sr != sample_rate and not skip_resample: + resampled_signal = torchaudio.functional.resample( + signal, sig_sr, sample_rate + ) + os.unlink(wav_file) + torchaudio.save(wav_file, resampled_signal, sample_rate=sample_rate) + + # Gets the speaker-id from the utterance-id + spk_id = uttid.split("_")[0] + + # Creates an entry for the utterance + json_dict[uttid] = { + "uttid": uttid, + "wav": wav_file, + "duration": duration, + "spk_id": spk_id, + "label": normalized_text, + "segment": True if "train" in json_file else False, + } + if alignments_folder is not None: + alignments_file_name = get_alignment_path(data_folder, alignments_folder, wav_file) + alignments = parse_alignments(alignments_file_name) + json_dict[uttid].update(alignments) + + # Characters are used for Tacotron2, phonemes may be needed for other models + if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: + # Computes phoneme labels using SpeechBrain G2P and keeps the punctuations + phonemes = _g2p_keep_punctuations(g2p, normalized_text) + json_dict[uttid].update({"label_phoneme": phonemes}) + + # Writes the dictionary to the json file + with open(json_file, mode="w", encoding="utf-8") as json_f: + json.dump(json_dict, json_f, indent=2) + + logger.info(f"{json_file} successfully created!") + + +def get_alignment_path(data_folder, alignments_folder, file_name): + """Returns the path in the LibriSpeech-Alignments dataset + corresponding to the specified file path in LibriSpeech + + Arguments + --------- + data_folder: str + the path to LibriSpeech + alignments_folder: str + the path to LibriSpeech-Alignments + file_name: str + the file name within LibriSpeech + + Returns + ------- + file_name: str + the alignment file path + """ + file_name = Path(file_name) + data_folder = Path(data_folder) + if file_name.parts[0] == "{data_root}": + file_name_rel = file_name.relative_to("{data_root}") + else: + file_name_rel = file_name.relative_to(data_folder) + data_slice = file_name_rel.parts[0] + + textgrid_folder = file_name_rel.relative_to(Path(data_slice) / "LibriTTS" / data_slice).parent.parent + textgrid_file_name = f"{file_name_rel.stem}.TextGrid" + textgrid_path = Path(alignments_folder) / data_slice / textgrid_folder / textgrid_file_name + + return textgrid_path + + +def skip(*filenames): + """ + Detects if the 
data preparation has been already done. + If the preparation has been done, we can skip it. + + Arguments + --------- + *filenames : tuple + Set of filenames to check for existence. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + for filename in filenames: + if isinstance(filename, list): + if any(not os.path.isfile(item) for item in filename): + return False + else: + if not os.path.isfile(filename): + return False + return True + + +def split_sets(wav_list, split_ratio): + """Randomly splits the wav list into training, validation, and test lists. + + Arguments + --------- + wav_list : list + list of all the signals in the dataset + split_ratio: list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + + Returns + ------- + dictionary containing train, valid, and test splits. + """ + # Random shuffles the list + random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} + splits = ["train", "valid"] + + for i, split in enumerate(splits): + n_snts = int(tot_snts * split_ratio[i] / tot_split) + data_split[split] = wav_list[0:n_snts] + del wav_list[0:n_snts] + data_split["test"] = wav_list + + return data_split + + +def check_folders(*folders): + """Returns False if any passed folder does not exist.""" + for folder in folders: + if not os.path.exists(folder): + return False + return True + +def parse_alignments(file_name): + """Parses a given LibriSpeech-Alignments TextGrid file and + converts the results to the desired format (to be used in JSON + metadata) + + Arguments + --------- + file_name : path-like + the file name of the TextGrid file + + Returns + ------- + details: dict + the metadata details + """ + try: + import textgrids + except ImportError: + logger.error( + "Parsing LibriSpeech-alignments requires the" + "praat-textgrids package" + ) + raise + if not file_name.exists(): + return { + "has_alignments": False, + "phn": [], + "phn_stress": [], + "phn_start": [], + "phn_end": [], + "phn_count": 0, + "wrd": [], + "wrd_start": [], + "wrd_end": [], + "wrd_count": 0, + "unk_count": None + } + + text_grid = textgrids.TextGrid() + text_grid.read(file_name) + word_intervals = [ + {**word, "label": word["label"].upper()} + for word in text_grid.interval_tier_to_array("words") + ] + phn_intervals = text_grid.interval_tier_to_array("phones") + details = {} + details.update(intervals_to_dict(word_intervals, "wrd")) + phn = intervals_to_dict(phn_intervals, "phn") + phn_stress = phn["phn"] + phn_nostress = remove_stress_marks(phn_stress) + phn["phn"] = phn_nostress + phn["phn_stress"] = phn_stress + details.update(phn) + details["unk_count"] = sum(wrd == "" for wrd in details["wrd"]) + details["has_alignments"] = True + + return details + + +INTERVAL_MAP = [("label", ""), ("begin", "_start"), ("end", "_end")] +INTERVAL_EMPTY_LABELS = {"", "sil", "sp", "spn"} + + +def intervals_to_dict(intervals, prefix): + """ + Converts a parsed list of intervals from PRAAT TextGrid + to a learning-friendly array + + Arguments + --------- + intervals: list + A list of raw TextGrid intervals, as returned by + TextGrid.interval_tier_to_array + prefix: str + the prefix to add + + Returns + ------- + result: dict + A dictionary of the form + { + "{prefix}": , + "{prefix}_start": , + "{prefix}_end": , + "{prefix}_count: + } + + 
""" + # Remove meaningless labels + intervals_clean = [ + interval + for interval in intervals + if interval["label"] not in INTERVAL_EMPTY_LABELS + ] + result = { + f"{prefix}{suffix}": [interval[key] for interval in intervals_clean] + for key, suffix in INTERVAL_MAP + } + # This will map space labels to a single one + result[f"{prefix}_count"] = len(intervals_clean) + return result + + +RE_STRESS_MARK = re.compile(r"\d$") + + +def remove_stress_marks(phn): + """Removes stress marks from a phoneme annotation + + Arguments + --------- + phn: list + a list of phoneme annotations with or without stress marks + + Returns + ------- + result: list + a list of phoneme annotations without stress marks + """ + return [RE_STRESS_MARK.sub("", item) for item in phn] diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 5238beacd..6a2de5859 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -20,13 +20,13 @@ PositionalEncoding as TransformerPositionalEncoding, get_lookahead_mask, ) +from speechbrain.dataio.batch import PaddedBatch +from speechbrain.utils.data_utils import batch_pad_right from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss +from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, nll_loss from speechbrain.dataio.dataio import length_to_mask -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.decoders.seq2seq import S2STransformerBeamSearcher from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -157,8 +157,10 @@ def __init__( show_inference_progress=True, audio_token_shift=0, multihead_input=True, + multihead_output=True, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, + out_proj=None, ): super().__init__() self.num_tokens = num_tokens @@ -182,9 +184,11 @@ def __init__( if self.representation_mode == RepresentationMode.DISCRETE else audio_dim ) - self.out_proj = Linear( - input_size=d_model, n_neurons=self.out_dim * tokens_per_step, - ) + if out_proj is None: + out_proj = Linear( + input_size=d_model, n_neurons=self.out_dim * tokens_per_step, + ) + self.out_proj = out_proj self.gate = Linear(input_size=d_model, n_neurons=1) if audio_emb is None: if self.representation_mode == RepresentationMode.DISCRETE: @@ -222,6 +226,7 @@ def __init__( self.multihead_input = multihead_input self.d_model = d_model self.d_model_sqrt = math.sqrt(d_model) + self.multihead_output = multihead_output def decode( self, @@ -371,16 +376,17 @@ def forward( pos_embs_src, ) lin_out = self.out_proj(dec_out) - batch_size, audio_max_len, num_tokens = lin_out.shape - lin_out_heads = lin_out.reshape( - batch_size, - audio_max_len, - self.tokens_per_step, - num_tokens // self.tokens_per_step, - ) + if self.multihead_output: + batch_size, audio_max_len, num_tokens = lin_out.shape + lin_out = lin_out.reshape( + batch_size, + audio_max_len, + self.tokens_per_step, + num_tokens // self.tokens_per_step, + ) gate_out = self.gate(dec_out).squeeze(-1) return TokotronDecoderOutput( - lin_out_heads, + lin_out, gate_out, dec_self_attn, dec_attn, @@ -439,6 +445,8 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, show_inference_progress=True, + transform_audio=None, + feed_audio=None ): super().__init__() self.decoder = None @@ -451,6 +459,10 
@@ def __init__( self.representation_mode = RepresentationMode(representation_mode) self.audio_dim = audio_dim self.show_inference_progress = show_inference_progress + if transform_audio is None: + transform_audio = nn.Identity() + self.transform_audio = transform_audio + self.feed_audio = feed_audio def bind(self, model): """Binds this inference implementation to a model @@ -522,6 +534,7 @@ def forward(self, enc_out, length, emb=None): steps_range = tqdm(steps_range, desc="Inference") for idx in steps_range: # One autoregressive step + audio = self.transform_audio(audio) step_out = self.decoder.forward( enc_out=enc_out, src_length=length, @@ -530,7 +543,9 @@ def forward(self, enc_out, length, emb=None): ) audio_out = step_out.out - if self.representation_mode == RepresentationMode.DISCRETE: + if self.feed_audio: + audio_out = self.feed_audio(audio_out) + elif self.representation_mode == RepresentationMode.DISCRETE: audio_out = audio_out.argmax(-1) # The model outputs predictions without BOS. Add the BOS back for the @@ -592,357 +607,6 @@ def forward(self, enc_out, length, emb=None): ) -class TokotronSearchWrapper(nn.Module): - """A wrapper class to facilitate seach-based inference. It takes care of re-interpreting - a multi-headed sequence as multiple samples, for compatibility, and for the retention - of attention tensors - - Arguments - --------- - decoder : TokotronTransformerDecoder - the Tokotron transformer decoder - """ - - def __init__(self, decoder): - super().__init__() - self.tokens_per_step = decoder.tokens_per_step - self.decoder = decoder - - def decode(self, memory, enc_states, enc_lens): - """Wraps the decode operation, will all the necessary - reshaping - - Arguments - --------- - memory : torch.Tensor - Characters predicted so far - enc_states : torch.Tensor - Encoder states - enc_lens : torch.Tensor - Encoder state lengths - """ - batch_size = enc_states.size(0) // self.tokens_per_step - _, mem_len = memory.shape - memory = memory.reshape( - self.tokens_per_step, batch_size, mem_len - ).permute(1, 2, 0) - dec_out, dec_self_attn, dec_attn = self.decoder.decode( - enc_out=enc_states[:batch_size], - src_length=enc_lens[:batch_size], - tgt=memory, - ) - self.dec_self_attn = dec_self_attn - self.dec_attn = dec_attn - return dec_out, dec_attn - - -class TokotronTransformerBeamSearcher(S2STransformerBeamSearcher): - """A slight modification of S2STransformerBeamSearcher that uses an - explicit number of tokens instead of trying to infer it from the - weights of the linear layer. This is needed because Tokotron is - multi-header and the final output layer outputs multiple output states - - Arguments - --------- - num_tokens : int - The number of audio tokens available - """ - - def __init__(self, num_tokens, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_tokens = num_tokens - - def set_n_out(self): - """Set the number of output tokens.""" - return self.num_tokens - - -class SearchLinearWrapper(nn.Module): - """A wrapper for the final linear layer of the Transformer. The goal is to - make it compatible with the SpeechBrain Beam Search implementation, which is - single-headed, by expanding multiple heads along the batch dimensions. 
- - Arguments - --------- - lin : torch.Tensor - A linear layer with an output feature dimensions of - (tokens_per_step x num_tokens) - tokens_per_step : int - the numer of tokens the model outputs for each - time step - """ - - def __init__(self, lin, tokens_per_step): - super().__init__() - self.lin = lin - self.tokens_per_step = tokens_per_step - - def forward(self, x): - """Performs a forward pass with all the required reshape operations - - Arguments - --------- - x : torch.Tensor - The decoder output - - Returns - ------- - result : torch.Tensor - The layer output, reshaped along the batch dimension - """ - x = self.lin(x) - batch_size, max_len, out_dim = x.shape - num_tokens = x.size(-1) // self.tokens_per_step - x = ( - # batch x tokens x length - x.transpose(2, 1) - # batch x heads x tokens x length - .view(batch_size, self.tokens_per_step, num_tokens, max_len) - # heads x batch x tokens x length - .transpose(0, 1) - # heads * batch x tokens x length - .reshape(self.tokens_per_step * batch_size, num_tokens, max_len) - # heads * batch x length x tokens - .transpose(1, 2) - ) - return x - - -class TokotronSearchInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - """ - - def __init__(self, audio_token_shift=1, **kwargs): - super().__init__() - self.search_kwargs = kwargs - self.audio_token_shift = audio_token_shift - self.decoder, self.search, self.tokens_per_step = None, None, None - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - decoder = model.decoder - self.tokens_per_step = decoder.tokens_per_step - self.decoder = TokotronSearchWrapper(decoder) - self.search = TokotronTransformerBeamSearcher( - modules=[ - self.decoder, - SearchLinearWrapper(decoder.out_proj, self.tokens_per_step), - ], - num_tokens=decoder.num_tokens + self.audio_token_shift, - **self.search_kwargs, - ) - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - device = enc_out.device - # The search does not support multiple heads. "Trick" it by expanding encoded - # representations along the batch dimension so that the beam searcher - # treats it as if they were separate, independent samples. 
- batch_size, max_len, enc_dim = enc_out.shape - enc_out_search = ( - enc_out.unsqueeze(0) - .expand(self.tokens_per_step, batch_size, max_len, enc_dim) - .reshape(self.tokens_per_step * batch_size, max_len, enc_dim) - ) - length_search = ( - length.unsqueeze(0) - .expand(self.tokens_per_step, batch_size) - .reshape(self.tokens_per_step * batch_size) - ) - hyps, audio_length, scores, log_probs = self.search( - enc_out_search, length_search - ) - tokens_batch = PaddedBatch( - [ - {"hyps": torch.tensor(item, device=enc_out.device)} - for item in hyps - ] - ).to(device) - - audio_tokens, length = tokens_batch.hyps - _, audio_max_len = audio_tokens.shape - audio_tokens = audio_tokens.reshape( - self.tokens_per_step, batch_size, audio_max_len - ).permute(1, 2, 0) - length = ( - length.reshape(self.tokens_per_step, batch_size).min(dim=0) - ).values - audio_tokens = audio_tokens - self.audio_token_shift - - return TokotronDecoderInfernceOutput( - audio_tokens=audio_tokens, - length=length, - dec_self_attn=self.decoder.dec_self_attn, - dec_attn=self.decoder.dec_attn, - alignments=get_alignments(self.decoder.dec_attn), - p_eos=None, - ) - - -class TokotronForwardInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - - Arguments - --------- - scale_factor : float - The scaling factor for encoder representations - gate_threshold : float - The threshold for gate activation - min_length : int - The minimum length for generating sequences, in tokens - """ - - def __init__( - self, - scale_factor=5.0, - gate_threshold=0.5, - min_length=16, - eos_mode=EosMode.GATE, - eos_index=0, - representation_mode=RepresentationMode.DISCRETE, - ): - super().__init__() - self.scale_factor = scale_factor - self.gate_threshold = gate_threshold - self.min_length = min_length - self.decoder = None - self.gate = None - self.eos_mode = EosMode(eos_mode) - self.eos_index = eos_index - self.representation_mode = RepresentationMode(representation_mode) - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - self.decoder = model.decoder - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - max_len = enc_out.size(1) - src_key_padding_mask = length_to_mask( - length * max_len, max_len, - ).logical_not() - tgt = scale(enc_out, self.scale_factor) - dec_out = self.decoder( - enc_out=enc_out, - tgt=tgt, - tgt_length=length, - src_length=length, - src_key_padding_mask=src_key_padding_mask, - pos_embs_src=None, - ) - if self.eos_mode == EosMode.GATE: - p_eos, eos = self.get_length_gate(dec_out) - else: - p_eos, eos = self.get_length_token(dec_out) - - infer_length_abs = eos.max(dim=1).indices - infer_length_abs_nonzero = infer_length_abs[infer_length_abs > 0] - if len(infer_length_abs_nonzero) > 0: - infer_length_max = infer_length_abs_nonzero.max() - else: - infer_length_max = 0 - if infer_length_max == 0: - infer_length_max = p_eos.size(1) - infer_length_abs = torch.where( - infer_length_abs == 0, infer_length_max, infer_length_abs - ) - infer_length_abs = infer_length_abs.clip(min=self.min_length) - infer_length = infer_length_abs / 
infer_length_max - - audio = dec_out.out[:, :infer_length_max].argmax(-1) - if self.representation_mode == RepresentationMode.DISCRETE: - audio = audio.argmax(-1) - return TokotronDecoderInfernceOutput( - audio=audio, - length=infer_length, - dec_self_attn=dec_out.dec_self_attn, - dec_attn=dec_out.dec_attn, - alignments=get_alignments(dec_out.dec_attn), - p_eos=p_eos, - ) - - def get_length_gate(self, dec_out): - """Infers lengths using the gate module - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - - Returns - ------- - p_eos : torch.Tensor - EOS probabilities (as estimated by the gate) - eos : torch.Tensor - a Boolean tensor where positions indicate whether - the gate has activated - """ - p_eos = dec_out.gate_out.sigmoid() - eos = p_eos > self.gate_threshold - return p_eos, eos - - def get_length_token(self, dec_out): - """Infers lengths using an EOS token - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - eos : torch.Tensor - A Boolean tensor indicating whether EOS has been reached - """ - p_seq = dec_out.out[:, :, 0].softmax(dim=-1) - p_eos = p_seq[:, :, self.eos_index].softmax(-1) - eos = p_seq.argmax(dim=-1) == self.eos_index - return p_eos, eos - - class TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens @@ -1052,11 +716,13 @@ def __init__( eos_mode=EosMode.GATE, inference=None, audio_token_shift=0, - decoder_mode=DecoderMode.AUTOREGRESSIVE, scale_factor=5.0, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, emb=None, + audio_emb=None, + out_proj=None, + multihead_input=True ): super().__init__() self.in_emb = Embedding( @@ -1075,11 +741,6 @@ def __init__( activation=activation, normalize_before=True, ) - self.decoder_mode = DecoderMode(decoder_mode) - audio_emb = None - if self.decoder_mode == DecoderMode.FORWARD: - audio_emb = nn.Identity() - audio_emb_size = d_model self.decoder = TokotronTransformerDecoder( num_tokens=audio_num_tokens + self.audio_token_shift, tokens_per_step=audio_tokens_per_step, @@ -1099,9 +760,11 @@ def __init__( gate_threshold=gate_threshold, gate_offset=gate_offset, audio_token_shift=audio_token_shift, - multihead_input=self.decoder_mode == DecoderMode.AUTOREGRESSIVE, + multihead_input=multihead_input, + multihead_output=out_proj is None, representation_mode=representation_mode, audio_dim=audio_dim, + out_proj=out_proj, ) self.bos_idx = bos_idx self.attention_type = attention_type @@ -1255,17 +918,11 @@ def forward( src_key_padding_mask=src_key_padding_mask, pos_embs=pos_embs_encoder, ) - if self.decoder_mode == DecoderMode.AUTOREGRESSIVE: - tgt = audio - tgt_length = audio_length - else: - tgt = scale(enc_out, self.scale_factor) - tgt_length = input_length enc_out = self.add_emb(enc_out, emb) dec_out = self.decoder( enc_out=enc_out, - tgt=tgt, - tgt_length=tgt_length, + tgt=audio, + tgt_length=audio_length, src_length=input_length, src_key_padding_mask=src_key_padding_mask, pos_embs_src=pos_embs_encoder, @@ -1569,6 +1226,7 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_clip_min=-10.0, audio_clip_max=10.0, + multihead_output=True, ): super().__init__() self.guided_attention_weight = guided_attention_weight @@ -1597,6 +1255,7 @@ def __init__( self.register_buffer("audio_eos", audio_eos) self.audio_clip_min = audio_clip_min self.audio_clip_max = audio_clip_max + self.multihead_output = multihead_output def forward( self, @@ -1629,9 +1288,12 @@ def forward( out = 
out.log_softmax(dim=-1) batch_size, out_len, heads, tok_dim = out.shape max_len = out_len - 1 - out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) - )[:, :max_len] + if self.multihead_output: + out_reshaped = ( + out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) + )[:, :max_len] + else: + out_reshaped = out if self.eos_mode == EosMode.TOKEN: # NOTE: Shift only the tokens, but not EOS padding_lengths = torch.ones(batch_size, device=audio.device) @@ -1645,7 +1307,10 @@ def forward( ) tok_len = audio.size(1) - if self.representation_mode == RepresentationMode.DISCRETE: + if not self.multihead_output: + audio_reshaped = audio + lengths_reshaped = audio_length + elif self.representation_mode == RepresentationMode.DISCRETE: audio_reshaped = audio.transpose(1, 2).reshape( batch_size * heads, max_len ) @@ -1664,18 +1329,21 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - lengths_reshaped = ( - audio_length.unsqueeze(-1) - .expand(batch_size, heads) - .reshape(batch_size * heads) - ) + if self.multihead_output: + lengths_reshaped = ( + audio_length.unsqueeze(-1) + .expand(batch_size, heads) + .reshape(batch_size * heads) + ) + else: + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, length=lengths_reshaped, reduction=reduction, ) - if reduction == "batch": + if reduction == "batch" and self.multihead_output: seq_loss = seq_loss.reshape(batch_size, heads).mean(-1) lengths_abs = audio_length * out_len @@ -2229,178 +1897,199 @@ def all_weights(self): return torch.stack([emb.weight for emb in self.emb]) -class DACFeatureExtractor(nn.Module): - """An adapter for feature extraction +def get_silence_token( + model, + sample_length=100000, + unsqueeze=False, + device=None, + num_codebooks=None, + +): + """Attempts to find out the silence tokens for a given model, + if applicable Arguments --------- - dac : DAC - a DAC model - """ - - def __init__(self, dac, n_quantizers): - super().__init__() - self.dac = dac - self.dac.eval() - self.n_quantizers = n_quantizers + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) + device : str | torch.Device + The device to use + num_codebooks : int | list + The number of codebooks or the codebooks to use - def encode(self, inputs, length): - """Encodes a raw audio sample using DAC + Returns + ------- + silence_tokens : torch.Tensor + The token(s) corresponding to silence - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths + silece_emb : torch.Tensor + The embedding(s) corresponding to silence - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers + """ + if device is None: + device = next(model.parameters()).device + + audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) + length = torch.ones(1, device=device) + model_training = model.training + model.eval() + tokens = model.sig_to_tokens(audio, length, num_codebooks=num_codebooks) + if model_training: + model.train() + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values + return silence_tokens + + +def 
get_silence_repr(model, sample_length=100000, device=None): + """Gets continuous silence - """ - if inputs.dim() < 3: - inputs = inputs.unsqueeze(1) - emb, codes, _, _, _ = self.dac.encode( - inputs, n_quantizers=self.n_quantizers - ) - emb.transpose_(1, 2) - codes.transpose_(1, 2) - max_len = emb.size(1) - mask = length_to_mask( - length * max_len, max_len, device=inputs.device - ).unsqueeze(-1) - return codes * mask, emb * mask + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + device : str | torch.Device + The device to use - def forward(self, inputs, length): - """Encodes a raw audio sample using DAC + Returns + ------- + silence : torch.Tensor + A silecnce tensor + """ + audio = torch.zeros(1, sample_length, device=device) + length = torch.ones(1, device=device) + audio_repr = model(audio, length) + silence = audio_repr.mean(dim=1)[0] + return silence - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers +def feature_pad_to(tensor, length, padding=None): + """Pads feature dimensions to the specified length with the specified padding, + assuming a (Batch x Length x Features..) tensor - """ - return self.encode(inputs, length) + Arguments + --------- + tensor : torch.Tensor + The tensor to be padded - def embeddings(self, tokens): - """Converts token indexes to vector embeddings + length : int + The length to which the tensor will be padded - Arguments - --------- - tokens : torch.Tensor - a (Batch x Length x Heads) tensor of token indexes + padding : torch.Tensor, optional + The padding tensor - if omitted, zero padding + will be used - Returns - ------- - emb : torch.Tensor - a (Batch x Length x Heads x Embedding) tensor - of raw vector embeddings from the model's - quantizer codebooks - """ - emb, _, _ = self.dac.quantizer.from_codes(tokens.transpose(1, 2).int()) - return emb.transpose(1, 2) + Returns + ------- + result : torch.Tensor + The padded tensor + """ + if padding is None: + padding = torch.zeros(tensor.shape[1:]) + padding = padding[None, ...].expand( + (length - tensor.size(0),) + tensor.shape[1:] + ) + return torch.cat([tensor, padding], dim=0) -class SpeechTokenizerFeatureExtractor(nn.Module): - """This lobe enables the integration of HuggingFace and SpeechBrain - pretrained SpeechTokenizer. +def batch_feature_pad(tensors, padding=None): + """Similar to batch_pad_right but pads with the specified padding, which + can be a vector or a tensor - Please, install speechtokenizer: - pip install speechtokenizer + Arguments + --------- + tensors : list + The list of tensors to be padded + padding : torch.Tensor + The padding tensor - Source paper: https://arxiv.org/abs/2308.16692 + Returns + ------- + result : torch.Tensor + the padded tensor + """ + lengths_abs = torch.tensor( + [len(item) for item in tensors], device=tensors[0].device + ) + max_length = lengths_abs.max() + data = torch.stack( + [feature_pad_to(item, max_length, padding) for item in tensors] + ) + lengths = lengths_abs / max_length + return data, lengths - The model can be used as a fixed Discrete feature extractor or can be finetuned. It - will download automatically the model from HuggingFace or use a local path. 
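# Editor's note: a minimal sketch (not part of the patch) showing how the
# silence-aware padding helpers introduced above behave. The import path is an
# assumption; it mirrors the "model." package prefix used elsewhere in this
# patch and may need adjusting to the actual benchmark layout.
import torch
from model.Tokotron import batch_feature_pad

# Two feature sequences of different lengths (Length x Features) and a
# constant vector to pad with (e.g. a silence embedding).
a = torch.zeros(3, 4)
b = torch.ones(5, 4)
silence = torch.full((4,), -1.0)

data, lengths = batch_feature_pad([a, b], padding=silence)
# data: (2, 5, 4) - the shorter item is padded with the silence vector
# lengths: tensor([0.6000, 1.0000]) - relative lengths, SpeechBrain-style
assert data.shape == (2, 5, 4)
assert torch.allclose(data[0, 3:], silence.expand(2, 4))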
+def token_collate_fn(examples, silence_token, token_keys): + """A customized collation function for audio tokens where + the specified silence token will be used as padding - instead of + zeros Arguments --------- - speech_tokenizer : speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - The speech tokenizer interface - codebooks : int, optional - The number of codebooks to use - if omitted, - """ + examples : list + A list of examples - def __init__(self, speech_tokenizer, codebooks=None): - super().__init__() - self.speech_tokenizer = speech_tokenizer - self.codebooks = codebooks - - def forward(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. + silence_token : torch.Tensor + The token(s) representing silence - Returns - ------- - tokens : torch.Tensor - A tensor of audio tokens - Shape: (N_q x Batch x Time) by default - (Batch x Time x N_q) if shape == compat + token_keys : list + The list of keys to which special padding will be applied - """ - return self.encode(wav, wav_lens) + Returns + ------- + result : speechbrain.dataio.batch.PaddedBatch + A padded batch + """ + token_tensor_ids = {id(examples[0][key]) for key in token_keys} + return PaddedBatch( + examples, + padding_func=_silence_padding, + padding_kwargs={ + "silence_token": silence_token, + "token_tensor_ids": token_tensor_ids, + }, + ) - def encode(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. +def _silence_padding(values, silence_token, token_tensor_ids): + return ( + batch_feature_pad(values, silence_token) + if id(values[0]) in token_tensor_ids + else batch_pad_right(values) + ) - Returns - ------- - tokens : torch.Tensor - A (Batch x Seq, N_q) tensor of audio tokens - """ - # Extract discrete codes from SpeechTokenizer - codes = self.speech_tokenizer.encode( - wav.unsqueeze(1), wav_lens - ) # codes: (n_q, B, T) - if self.codebooks is not None: - codes = codes[: self.codebooks] - codes = codes.permute(1, 2, 0) - return codes - - def decode(self, codes): - """Takes an input waveform and return its corresponding wav2vec encoding. +def use_silence_padding(dataloader_opts, silence_token, token_keys): + """Overrides the collation function to add silence padding to + audio token features - Arguments - --------- - tokens : torch.Tensor - A (N_q, Batch x Seq) tensor of audio tokens + Arguments + --------- + dataloder_opts : dict + Dataloader options + silence_token : torch.Tensor + The tensor to be used as silence padding + token_keys : torch.Tensor + The keys to apply silence padding to - Returns - ------- - wav : torch.Tensor (signal) - A batch of reconstructed audio signals. 
- """ - codes = codes.permute(2, 0, 1) - return self.speech_tokenizer.decode(codes) + Returns + ------- + dataloader_opts : dict + Updated data loader options + """ + return { + **dataloader_opts, + "collate_fn": partial( + token_collate_fn, silence_token=silence_token, token_keys=token_keys + ), + } \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 972d35c66..6f28ee44d 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,4 +1,7 @@ +import math import torch +from speechbrain.nnet.linear import Linear +from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens class AttentionMLP(torch.nn.Module): @@ -109,3 +112,154 @@ def forward(self, in_tokens): if self.proj_layer is not None: in_embs = self.proj_layer(in_embs) return in_embs + + +class TernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() + self.lin_hidden = Linear( + input_size=d_model, + n_neurons=d_hidden, + ) + self.act = torch.nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + bias=False + ) + + def forward(self, x, track=None): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + track : int + The track index (if applicable) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.norm(x) + x = self.lin_hidden(x) + x = self.act(x) + p = self.lin_p(x) + p = p.reshape(batch_size, max_len, self.num_positions, 3) + return p + + +class MultitrackPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts multiple + tracks of tokens simultaneously + + Arguments + --------- + d_model : int + The model dimension + num_tracks: int + The number of tracks + vocab_size : int + The vocabulary size + """ + def __init__(self, d_model, num_tracks, vocab_size): + super().__init__() + self.num_tracks = num_tracks + self.vocab_size = vocab_size + self.lin = Linear( + input_size=d_model, + n_neurons=num_tracks * vocab_size + ) + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the input + Returns + ------- + result : torch.Tensor + a result of shape (Batch x Length x Tracks x Tokens) + """ + batch_size, max_len, _ = x.shape + x = self.lin(x) + x = x.reshape(batch_size, max_len, self.num_tracks, self.vocab_size) + return x + + + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of ternary digits/positions + num_tokens : int + The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) + mode : str + "probability" : treats the outputs as a probability distribution + "argmax" : "hard" mode, only the top 
probability is used. Cannot be used with + top_k sampling with k > 1 + + """ + def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.num_tracks = num_tracks + self.chunk_size = chunk_size + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + batch_size, max_len, num_positions, _ = logits.shape + logits = logits.softmax(-1) + logits = logits.reshape(batch_size, max_len, self.num_tracks, 1, num_positions // self.num_tracks, 3) + chunks = logits.chunk( + dim=1, + chunks=math.ceil(logits.size(1) / self.chunk_size) + ) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + torch.ones_like(chunk) + ).prod(-1).log().sum(-1).exp() + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) + token_logits = torch.cat( + token_logits_chunks, + dim=1 + ) + return token_logits diff --git a/benchmarks/DASB/model/fairseq_hubert.py b/benchmarks/DASB/model/fairseq_hubert.py new file mode 100644 index 000000000..3f86b54c5 --- /dev/null +++ b/benchmarks/DASB/model/fairseq_hubert.py @@ -0,0 +1,80 @@ +import joblib +import torch +import torch.nn.functional as F +from speechbrain.utils.data_utils import batch_pad_right + + + +MIN_WAV_LEN = 720 +MODEL_SR = 16000 + + +class FairseqHuBERT(torch.nn.Module): + def __init__( + self, + feat_extractor_path, + layer, + km_path, + max_chunk=1600000, + vocoder=None, + ): + super().__init__() + import fairseq + + # Feature extractor + ( + model, + cfg, + task, + ) = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [feat_extractor_path] + ) + self.model = model[0] + self.task = task + self.layer = layer + self.max_chunk = max_chunk + # Quantizer + km_model = joblib.load(km_path) + self.C_np = km_model.cluster_centers_.transpose() + self.Cnorm_np = (self.C_np**2).sum(0, keepdims=True) + self.register_buffer("C", torch.from_numpy(self.C_np)) + self.register_buffer("Cnorm", torch.from_numpy(self.Cnorm_np)) + self.sample_rate = MODEL_SR + self.vocoder = vocoder + + def encode(self, x, wav_lens=None): + if self.task.cfg.normalize: + x = F.layer_norm(x, x.shape) + + feat = [] + for start in range(0, x.size(1), self.max_chunk): + x_chunk = x[:, start : start + self.max_chunk] + if x_chunk.size(1) < MIN_WAV_LEN: + continue + feat_chunk, _ = self.model.extract_features( + source=x_chunk, + padding_mask=None, + mask=False, + output_layer=self.layer, + ) + feat.append(feat_chunk) + feat = torch.cat(feat, 1).squeeze(0) + dist = ( + feat.pow(2).sum(-1, keepdim=True) + - 2 * torch.matmul(feat, self.C) + + self.Cnorm + ) + tokens = dist.argmin(dim=-1) + if tokens.dim() < 2: + tokens = tokens.unsqueeze(0) + return tokens + + def decode(self, tokens): + if self.vocoder is None: + raise ValueError("Vocoder is not set") + sig_items = [ + self.vocoder(item, dur_prediction=True) + for item in tokens + ] + sig, _ = batch_pad_right(sig_items) + return sig diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 0e1ffe3f8..92f7ee7d2 100644 --- 
a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -21,6 +21,8 @@ from torch.autograd import Function from torch.nn.utils import remove_weight_norm, weight_norm +from speechbrain.dataio.dataio import length_to_mask + class SQCodec(nn.Module): """ @@ -124,7 +126,7 @@ def build_codec_model(self, config): exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) device = next(iter(scalar_codec.parameters())).device - parameter_dict = torch.load(self.ckpt_path, map_location=device) + parameter_dict = torch.load(self.ckpt_path, map_location=device, weights_only=False) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -1281,6 +1283,109 @@ def forward(self, x): return x +class TernaryEmbedding(nn.Module): + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + num_digits : int + The number of ternary digits + shift : int + The number of digits to "shift" embeddings by. + This is needed when text and special tokens are concatenated + shift_cutoff : int + The shifted tokens + flat : bool + Where to enable "flat" embeddings (e.g. multiple codebooks "flattened") + """ + def __init__( + self, + num_digits, + shift=None, + shift_cutoff=None, + hybrid=False, + hybrid_cutoff=None, + hybrid_size=None, + flat=False): + super().__init__() + self.num_digits = num_digits + if hybrid: + shift = None + self.shift = shift + if shift_cutoff is None and shift: + shift_cutoff = 3**shift + self.shift_cutoff = shift_cutoff + if hybrid and not flat: + raise ValueError( + "Hybrid embeddings are currently supported" + "only for flattened mode") + self.flat = flat + self.hybrid = hybrid + self.hybrid_cutoff = hybrid_cutoff + if hybrid: + self.emb = torch.nn.Embedding(hybrid_cutoff + 1, hybrid_size) + torch.nn.init.uniform_(self.emb.weight, a=-1., b=1.) + + def forward(self, tokens): + """Computes the forward pass + + Arguments + --------- + tokens : torch.Tensor + the tokens + """ + squeeze = False + if tokens.dim() < 3: + squeeze = True + tokens = tokens.unsqueeze(-1) + batch_size, max_len, tracks = tokens.shape + tokens = self._shift(tokens) + if self.hybrid: + # Note: Yes, text tokens will be "floored" but + emb_tokens = (tokens - self.hybrid_cutoff).clip(0) + else: + emb_tokens = tokens + emb = tokens_to_ternary(emb_tokens, D=self.num_digits).float() + if self.hybrid: + emb = self._hybrid_emb(emb, tokens) + positions = emb.size(-1) + if self.flat: + emb = emb.unsqueeze(-2) + else: + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if squeeze: + emb = emb.squeeze(-2) + return emb + + def _hybrid_emb(self, emb, tokens): + batch_size, max_len, tracks = tokens.shape + hybrid_emb = torch.cat( + [ + self.emb(tokens[:, :, 0].clip(max=self.hybrid_cutoff)), + torch.where( + (tokens[:, :, 0] < self.hybrid_cutoff).unsqueeze(-1), + torch.ones(batch_size, max_len, emb.size(-1), device=tokens.device) * -1, + emb + ) + ], + dim=-1 + + ) + return hybrid_emb + + def _shift(self, tokens): + if not self.shift: + return tokens + shift_multiplier = 3**self.shift + shift_offset = shift_multiplier - 1 + tokens_shift = torch.where( + tokens < self.shift_cutoff, + tokens, + (tokens - self.shift_cutoff) * shift_multiplier + shift_offset + ) + return tokens_shift + + def decimal_to_ternary_matrix(decimals, D): """ Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. 
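# Editor's note: a small round-trip sketch (not part of the patch) for the
# decimal/ternary helpers that TernaryEmbedding above relies on;
# ternary_matrix_to_decimal_torch is added in the next hunk. The import path
# is an assumption, mirroring the "from model.sq_codec import ..." convention
# used elsewhere in this patch.
import torch
from model.sq_codec import (
    decimal_to_ternary_matrix,
    ternary_matrix_to_decimal_torch,
)

tokens = torch.tensor([[5, 17, 0]])  # (B=1, T=3), all indices < 3**9
# decimal_to_ternary_matrix modifies its input in place, hence the clone()
digits = decimal_to_ternary_matrix(tokens.clone(), D=9)  # (1, 9, 3), digits in {0, 1, 2}
# e.g. 5 = 2 * 3**0 + 1 * 3**1, so digits[0, :, 0] begins with [2, 1, 0, ...]
restored = ternary_matrix_to_decimal_torch(digits)  # (1, 3)
assert torch.equal(restored, tokens)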
@@ -1300,7 +1405,7 @@ def decimal_to_ternary_matrix(decimals, D): corresponds to a batch, and each column is represented as a ternary number. """ B, T = decimals.shape - ternary_matrix = torch.zeros((B, D, T), dtype=torch.long) + ternary_matrix = torch.zeros((B, D, T), dtype=torch.long, device=decimals.device) for pos in range(D): ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation decimals //= 3 # Floor division for next ternary digit @@ -1342,6 +1447,40 @@ def ternary_matrix_to_decimal(matrix): return decimals +def ternary_matrix_to_decimal_torch(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + def get_padding(kernel_size, dilation=1): """ Computes the padding size for a given kernel size and dilation. @@ -1359,3 +1498,123 @@ def get_padding(kernel_size, dilation=1): Calculated padding size. 
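+
+    Example
+    -------
+    >>> get_padding(kernel_size=7, dilation=1)
+    3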
""" return int((kernel_size * dilation - dilation) / 2) + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of codebooks + + Returns + ------- + result: torch.Tensor + the result (Batch x Length x codebooks) + """ + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal_torch(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits, n_codebook=4): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2), n_codebook=n_codebook) + return tokens + + +def tokens_to_ternary(tokens, D=9): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + D : int + The number of ternary digits + + Returns + ------- + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" + has_batch = tokens.dim() > 2 + if not has_batch: + tokens = tokens.unsqueeze(0) + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat([ + decimal_to_ternary_matrix(item, D=D) - 1 + for item in tokens + ], dim=1) + ternary_matrix = ternary_matrix.transpose(1, 2) + if not has_batch: + ternary_matrix = ternary_matrix[0] + return ternary_matrix + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 3) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + ternary = logits.argmax(-1) - 1 + return ternary + + +def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", num_positions=9, reduction="mean"): + if targets.dim() < 3: + targets = targets.unsqueeze(-1) + if targets_type == "tokens": + targets = tokens_to_ternary(targets.unsqueeze(-1), D=num_positions) + batch_size, max_len, positions = targets.shape + targets_cat = targets + 1 + predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() + loss = nn.functional.nll_loss( + predictions_loss, + targets_cat, + reduction="none" + ) + if length is not None: + mask = length_to_mask( + length * max_len, + max_len + ) + mask = mask.unsqueeze(-1) + if mask is not None: + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).sum(1).sum(0) / mask.sum() + elif reduction == "batch": + loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) + return loss diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py new file mode 100644 index 000000000..0ff414c68 --- /dev/null +++ b/benchmarks/DASB/model/valle.py @@ -0,0 +1,1234 @@ +"""An adaptation of ESPNET VALL-E +Originally by Jinchuan Tian + +https://github.com/espnet/espnet + +Authors + * Artem Ploujnikov 2024 (adaptation only) +""" + +# Copyright 2024 Jinchuan Tian +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# 
Implementation of Vall-E: https://arxiv.org/abs/2301.02111 + +import logging +import torch +import inspect +from typing import Tuple, Optional +from speechbrain.dataio.dataio import length_to_mask + +from torch import Tensor +from torch import nn +from torch.nn import functional as F +from dataclasses import dataclass + +from speechbrain.nnet.losses import reduce_loss, truncate + + +@dataclass +class SpeechLMInferenceOptions: + """Inference options + """ + + device: str = None + search_algo: str = "topk_sampling" + nbest: int = 1 + sampling_temperature: float = 1.0 + top_k: int = 20 + maxlenratio: float = 0.0 + minlenratio: float = 0.0 + eos: int = 5 + start: int = 1 + masks: torch.Tensor = None + nq: int = None + allow_invalid: bool = True + + +class ValleLM(nn.Module): + """The Vall-E TTS model (decoder-only transformer), adopted from + ESPNET2 + + Arguments + --------- + vocab_size : int + Dimention of vocabulary. + nq : int + Number of codes for each token / frame, usually for speech codec. + share_emb : bool + If true, share the embedding and lm_head weight. + qk_norm : bool + If true, apply LayerNorm to q and k in atention. + dropout : float + dropout rate for attention layers. + target_dropout : float + a separate dropout applied to targets only (may be + useful to mitigate autorgressive prediction instability) + att_unit: int + Dimention of Transformer attention. + head : int + Number of heads in Transformer attention. + ar_layer : int + Number of layers in AR Transformer. + nar_layer : int + Number of layers in NAR Transformer. + n_ctx : int + maximum context length of AR & NAR Transformer. + lm_head : torch.nn.Module, optional + an alternative LM head implementation head, an alternative + to the default Linear, useful for non-trivial codecs, + such as SQ-Codec + logits_to_probs : callable, optional + A module or a function that converts logits to token probabilities to + support top-K sampling + """ + + def __init__( + self, + vocab_size, + nq, + pad_id=0, + share_emb=True, + qk_norm=False, + dropout=0.0, + target_dropout=0.0, + att_unit=256, + head=2, + ar_layer=4, + nar_layer=4, + n_ctx=3000, + emb=None, + lm_head=None, + logits_to_probs=None, + ): + super().__init__() + if emb is None: + emb = torch.nn.Embedding(vocab_size, att_unit) + self.emb = emb + if lm_head is None: + lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + self.lm_head = lm_head + spec = inspect.getfullargspec(lm_head.forward) + self.lm_head_multitrack = "track" in spec.args + if logits_to_probs is None: + logits_to_probs = nn.Identity() + self.logits_to_probs = logits_to_probs + if share_emb: + self.lm_head.weight = self.emb.weight + + self.ar_decoder = TransformerDecoder( + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=ar_layer, + qk_norm=qk_norm, + dropout=dropout, + target_dropout=target_dropout + ) + if nq > 1: + # NOTE: An NAR encoder is not needed if there is only one track + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nq = nq + self.n_ctx = n_ctx + self.pad_id = pad_id + self._initialize() + + def forward( + self, + dec_seq, + dec_seq_lengths=None, + prefix_len=None, + conti_feats=None, + nar_level_idx=1, + predict_ar=True, + predict_nar=True, + ): + """Vall-E forward for training + + Arguments + --------- + dec_seq : torch.Tensor + Batch of decoder sequences (B, T, nq). + dec_seq_lengths : torch.Tensor + Lengths of batched decoder sequences (B,). 
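+            These are relative lengths (fractions of the padded length), in the
+            SpeechBrain convention; the forward pass rescales them by the padded
+            length before building the attention mask.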
+ enc_seq : torch.Tensor + Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. + enc_seq_lengths : torch.Tensor + Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len : torch.Tensor + Lengths of condition part in dec_seq (B,). + nar_level_idx : int + the index of the non-autoregressive level to train + predict_ar : bool + Whether to make an autoregressive prediction + predict_nar : bool + Whether to make a non-autoregressive prediction + + Returns + ------- + logits_ar : torch.Tensor + Autoregressive predictions + logits_nar : torch.Tensor + Non-autoregressive predictions + """ + + assert dec_seq.dim() == 3 + + dec_seq_emb = self.emb(dec_seq) # [B, T, nq, D] + dec_seq_emb, _ = install_continuous_features( + dec_seq_emb, None, conti_feats + ) + + # Auto-Regressive part + if predict_ar: + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) + + # Non-Auto-Regressive part + if predict_nar: + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + + # Logits + logits_ar, logits_nar = None, None + if predict_ar: + logits_ar = self.apply_lm_head(h_ar, 0) + if predict_nar: + logits_nar = self.apply_lm_head(h_nar, nar_level_idx + 1) + + return logits_ar, logits_nar + + def prepare_input(self, dec_seq_emb, prefix_len, level): + # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. + # This is because both prefix_mask and level_mask are broadcastable and will + # trigger user warning. + + # (1) level mask, [B, 1, nq, 1], True is to include + if isinstance(level, int): + level = torch.ones_like(dec_seq_emb[:, 0, 0, 0]) * level + level_mask = length_to_mask(level, self.nq).bool() + level_mask = ( + level_mask.unsqueeze(1).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (2) prefix mask, [B, T, 1, 1], True is the prefix + prefix_mask = length_to_mask( + prefix_len * dec_seq_emb.size(1), dec_seq_emb.size(1) + ).bool() + prefix_mask = ( + prefix_mask.unsqueeze(2).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (3) mask and then sum in nq-axis. + mask = torch.logical_or(level_mask, prefix_mask) + return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) + + @torch.no_grad() + def inference( + self, prefix, opts, enc_seq=None, suffix=None, + ): + """Vall-E Inference. + + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. 
+ + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens + """ + + # (1) initialization + cache = self.ar_decoder.init() + + # (2) auto-regressive prefix forward on first code layer + prefix = prefix.expand(opts.nbest, -1, -1) + if opts.search_algo == "teacher_force": + suffix = suffix.expand(opts.nbest, -1, -1) + prefix_emb = self.emb(prefix).sum(dim=2) # [B, T, D] + _ = self.ar_decoder(prefix_emb, kv_cache=cache) + + # (3) auto-regressive loop on first code layer + # (3.1) AR initialization + minlen = ( + int(prefix.size(1) * opts.minlenratio) + if opts.minlenratio > 0 + else 0 + ) + maxlen = int(prefix.size(1) * opts.maxlenratio) + if opts.search_algo == "teacher_force": + assert suffix is not None + minlen = suffix.size(1) + maxlen = suffix.size(1) + if maxlen + prefix.size(1) > self.n_ctx: + maxlen = self.n_ctx - prefix.size(1) + logging.info(f"maxlen={maxlen}, minlen={minlen}") + + generated = {"token": [], "score": []} + finish_idx = ( + torch.Tensor([-1]).expand(opts.nbest).long().to(opts.device) + ) + prev_tok = ( + torch.Tensor([opts.start]) + .tile(opts.nbest, 1) + .long() + .to(opts.device) + ) + modality_index = prev_tok.flatten() + mask = modality_index_to_mask(modality_index, opts) + tracks = prefix.size(-1) + is_flattened = opts.nq == 1 and tracks > 1 + if is_flattened: + prev_tok = prev_tok.expand(1, tracks) + mask_cache = [] + modality_tokens = torch.tensor( + list(opts.masks.keys()), device=prefix.device + ) + + for step in range(maxlen): + # (3.2) AR loop + if is_flattened: + prev_tok = prev_tok.unsqueeze(1) + prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] + h_ar = self.ar_decoder(prev_emb, kv_cache=cache) + logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] + if logits.dim() < 4: + logits = logits.unsqueeze(-2) + gen_tok, gen_score = logits_to_tokens( + logits, + opts, + mask, + allow_eos=step >= minlen, + nq_level=0, + ) + # [B, 1, 1] -> [B, 1] + gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) + + generated["token"].append(gen_tok) + generated["score"].append(gen_score) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, step : step + 1, 0] + else: + prev_tok = gen_tok # [B, 1] + + # (3.3) detect modality swtich + mask_cache.append(mask.clone()) + modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) + # Note: The ESPNET VALL-E had + # modality_change_mask = torch.logical_and( + # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + # ) + if torch.any(modality_change_mask): + modality_index = torch.where( + modality_change_mask, prev_tok[:, 0], modality_index, + ).flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] + mask = modality_index_to_mask(modality_index, opts) + logging.warning( + f"Step {step}: change modality index {modality_index}" + ) + + # (3.4) detect ended hypotheses. 
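+            # finish_idx records, per hypothesis, the step at which EOS was first
+            # emitted (-1 while still generating); once every entry is non-negative
+            # the autoregressive loop terminates early below.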
+ finish_idx = torch.where( + torch.logical_and(prev_tok[:, 0] == opts.eos, finish_idx == -1), + step, + finish_idx, + ) + + if torch.all(torch.ge(finish_idx, 0)): + break + + if step == maxlen - 1: + logging.warning( + f"Some examples cannot finish in {maxlen} steps: {finish_idx}" + f"Consider increasing the maxlenratio" + ) + + logging.info(f"Terminate at steps: {finish_idx.cpu().tolist()}") + + # (3.4) finalize auto-regressive + if opts.allow_invalid: + valid_idx = torch.arange(len(finish_idx), device=finish_idx.device) + finish_idx = torch.where(finish_idx == -1, step, finish_idx) + else: + valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] + if len(valid_idx) == 0: + self.ar_decoder.reset() + logging.warning(f"No valid examples. Return None") + return [], [] + elif len(valid_idx) < prefix.size(0): + logging.info(f"Only {len(valid_idx)} of {prefix.size(0)} are valid") + + finish_idx = finish_idx[valid_idx] + prefix_emb = prefix_emb[valid_idx] + if opts.search_algo == "teacher_force": + suffix = suffix[valid_idx] + gen_tokens_ar = torch.cat(generated["token"], dim=1)[ + valid_idx + ].unsqueeze( + 2 + ) # [B, T, 1] + gen_scores_ar = torch.cat(generated["score"], dim=1)[ + valid_idx + ].unsqueeze(2) + gen_tokens_ar = gen_tokens_ar[:, : finish_idx.max() + 1] # idx -> count + gen_scores_ar = gen_scores_ar[:, : finish_idx.max() + 1] + + self.ar_decoder.reset() + + # (4) non-auto-regressive loop on the remained code layers + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_token = torch.tensor( + [opts.start], device=prefix.device + )[None, None, :] + + # (4.2) NAR loop + if self.nq > 1: + start_emb = self.emb(start_token).squeeze().tile( + len(valid_idx), 1, 1 + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + for step in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + else: + gen_tokens = gen_tokens_ar + gen_scores = gen_scores_ar + + gen_tokens_list, gen_scores_list = [], [] + for b in range(len(valid_idx)): + item_finish_idx = finish_idx[b] + 
gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) + gen_scores_list.append(gen_scores[b][:item_finish_idx]) + return gen_tokens_list, gen_scores_list + + def apply_lm_head(self, x, track): + """Applies the language model head + + Arguments + --------- + """ + + if self.lm_head_multitrack: + result = self.lm_head(x, track) + else: + result = self.lm_head(x) + return result + + def _initialize(self): + for m in self.modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + if m.bias is not None: + torch.nn.init.zeros_(m.bias) + elif isinstance(m, torch.nn.Embedding): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + + +class ResidualAttentionBlock(nn.Module): + """A VALL-E residual attention block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of heads + cross_attention : bool + Whether to use cross-attention + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + """ + + def __init__( + self, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, + ): + super().__init__() + + self.attn = MultiHeadAttention( + n_state, n_head, causal=causal, qk_norm=qk_norm, dropout=dropout, + ) + self.attn_ln = LayerNorm(n_state) + self.attn_dropout = nn.Dropout(p=dropout) + + self.cross_attn = ( + MultiHeadAttention( + n_state, n_head, causal=False, qk_norm=qk_norm, dropout=dropout, + ) + if cross_attention + else None + ) + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + self.cross_attn_dropout = ( + nn.Dropout(p=dropout) if cross_attention else None + ) + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state) + ) + self.mlp_ln = LayerNorm(n_state) + self.mlp_dropout = nn.Dropout(p=dropout) + + def forward( + self, x, xa=None, mask=None, kv_cache=None, + ): + """The forward pass implementation + + Arguments + --------- + x : torch.Tensor + the feature tensor + xa : torch.Tensor + The tensor for cross-attention + mask : torch.Tensor + The attention mask to be applied + + """ + x = x + self.attn_dropout( + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x))) + return x + + +class TransformerDecoder(nn.Module): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + target_dropout : float + The target dropout probability + layer_class : type + The layer type to be used + """ + def __init__( + self, + n_ctx, + n_state, + n_head, + n_layer, + causal=True, + qk_norm=False, + dropout=0.0, + target_dropout=0.0, + layer_class=ResidualAttentionBlock, + ): + + super().__init__() + + self.pos_emb = nn.Embedding(n_ctx, n_state) + + self.blocks = nn.ModuleList( + [ + layer_class( + n_state=n_state, + n_head=n_head, + cross_attention=False, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + for _ in range(n_layer) + ] + ) + self.ln = LayerNorm(n_state) + self.target_dropout = nn.Dropout(target_dropout) + + self.causal = causal + self.kv_cache = None + + def forward( + self, x, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the feature tensor + mask : torch.Tensor + The attention mask to be applied + kv_cache : dict + The key/value cache (for inference) + + Returns + ------- + result : torch.Tensor + The decoder output + """ + if self.causal and mask is not None: + raise ValueError("Causal Transformer dones't allow mask") + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + tgt = self.target_dropout(x) + + for block in self.blocks: + x = block(x, tgt, mask=mask, kv_cache=kv_cache) + tgt = x + + x = self.ln(x) + return x + + def init(self): + """Initializes the key/value cache and the hooks to update it""" + self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) + return self.kv_cache + + def reset(self): + """Resets the key-value cache""" + for hook in self.hooks: + hook.remove() + self.kv_cache = None + + +class LayerNorm(nn.LayerNorm): + """A layer normalziation wrapper""" + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor to be normalized + + Returns + ------- + result : torch.Tensor + A normalzied tensor + """ + return super().forward(x.float()).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, + self.weight.to(x.dtype), + None if self.bias is None else self.bias.to(x.dtype), + ) + + +class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + """"The Vall-E Adaptive Residual Attention Block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of states + n_head : int + The number of attention heads + cross_attention : bool + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + + def __init__( + self, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, + ): + super(ResidualAttentionBlockAdaLN, self).__init__( + n_state=n_state, + n_head=n_head, + cross_attention=cross_attention, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.attn_ln = AdaLN(n_state) + self.mlp_ln = AdaLN(n_state) + + def forward( + self, x, level, xa=None, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + x = x + self.attn_dropout( + self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn( + self.cross_attn_ln(x, level), xa, kv_cache=kv_cache + ) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x, level))) + return x + + +class ValleNARDecoder(TransformerDecoder): + def __init__( + self, + n_level, + n_ctx, + n_state, + n_head, + n_layer, + causal=False, + qk_norm=False, + dropout=0.0, + layer_class=ResidualAttentionBlockAdaLN, + ): + """The VALL-E non-autoregressive decoder + + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ + super().__init__( + n_ctx=n_ctx, + n_state=n_state, + n_head=n_head, + n_layer=n_layer, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + layer_class=layer_class, + ) + + self.level_emb = nn.Embedding(n_level, n_state) + self.ln = AdaLN(n_state) + + def forward( + self, x, level, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + if self.causal and mask is not None: + raise ValueError("mask is not allowed when causal") + + level = self.level_emb(level) + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, level=level, mask=mask, kv_cache=kv_cache) + + x = self.ln(x, level) + return x + + +class MultiHeadAttention(nn.Module): + """A Multi-Head Attention implementation + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + + def __init__( + self, n_state, n_head, causal=False, qk_norm=False, dropout=0.0, + ): + super().__init__() + assert n_state % n_head == 0 + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + self.causal = causal + self.dropout = dropout + + self.qk_norm = qk_norm + if qk_norm: + self.q_norm = LayerNorm(n_state // n_head) + self.k_norm = LayerNorm(n_state // n_head) + + def forward( + self, x, xa=None, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. + k = kv_cache[self.key] + v = kv_cache[self.value] + + wv = self.qkv_attention(q, k, v, mask) + + return self.out(wv) + + def qkv_attention(self, q, k, v, mask=None): + """Computes scaled dot-product attention + + Arguments + --------- + q : torch.Tensor + The queries tensor + k : torch.Tensor + The keys tensor + v : torch.Tensor + The values tensor + mask : torch.Tensor, optional + The attention mask + + Returns + ------- + wv : torch.Tensor + The attention output + """ + if self.causal and mask is not None: + raise ValueError("mask is not allowed when the attention is causal") + + if self.causal and q.size(1) == k.size(1): + causal = True + else: + causal = False + + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + if self.qk_norm: + q = self.q_norm(q) + k = self.k_norm(k) + wv = ( + F.scaled_dot_product_attention( + q, k, v, mask, is_causal=causal, dropout_p=self.dropout if self.training else 0.0 + ) + .permute(0, 2, 1, 3) + .flatten(start_dim=2) + ) + + return wv + + +class AdaLN(nn.Module): + """Adaptive Layer Normalization, a Layer Norm implementation + that learns an affine transformation based on the level + embedding + + Arguments + --------- + n_state : int + The number of states + eps : float + The layer norm epsilon parameter""" + + def __init__(self, n_state, eps=1e-5): + super().__init__() + self.weight = nn.Linear(n_state, n_state, bias=False) + self.bias = nn.Linear(n_state, n_state, bias=False) + nn.init.constant_(self.weight.weight, 1.0) + nn.init.constant_(self.bias.weight, 0.0) + + self.n_state = n_state + self.eps = eps + + def forward(self, x, level_emb): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor + level_emb : torch.Tensor + The level embedding + """ + w = self.weight(level_emb).unsqueeze(1) + b = self.bias(level_emb).unsqueeze(1) + x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) + x = w * x + b + return x + + +def install_kv_cache_hook(model, cache): + """Sets up the key/value cache hook + + Arguments + --------- + model : torch.nn.Module + The model + cache : dict + The cache
content + + Returns + ------- + cache : dict + The cache dictionary (new or copied) + hooks : list + The installed forward hooks + """ + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache: + # save as-is, for the first token or cross attention + cache[module] = output + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: torch.nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + model.apply(install_hooks) + return cache, hooks + + +def logits_to_tokens( + logits, opts, mask, search_algo=None, allow_eos=True, nq_level=None, +): + """ + Select the generated tokens and their scores based on logits prediction. + + Arguments + --------- + logits : torch.Tensor + predicted logits, of size [B, T, nq, V] + opts : SpeechLMInferenceOptions + search options + mask : torch.Tensor + mask to specify valid tokens, of size [B, 1, nq, V] + search_algo : str + search algorithm + allow_eos : bool + whether to allow end-of-sentence prediction + nq_level : int, optional + if not None, only compute the specified codec level nq. + + Returns + ------- + gen_token_idx : torch.Tensor + The token indices + gen_token_score : torch.Tensor + The token scores + """ + + assert logits.dim() == 4 + search_algo = search_algo if search_algo is not None else opts.search_algo + neg_inf = torch.finfo(logits.dtype).min + + # (1) Apply mask + if nq_level is not None: + mask = mask[:, :, nq_level : nq_level + 1] + + if allow_eos: + mask = mask.clone() + mask[:, :, 0, opts.eos] = False + + logits.masked_fill_(mask, neg_inf) + + # (2) token selection + if search_algo in ["topk_sampling"]: + topk_values, topk_indices = torch.topk(logits, opts.top_k, dim=-1) + probs = torch.softmax(topk_values / opts.sampling_temperature, dim=-1) + inner_indices = torch.multinomial( + probs.flatten(end_dim=-2), num_samples=1 + ).view(probs[..., :1].size()) + gen_token_idx = torch.gather(topk_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["topp_sampling"]: + probs = torch.softmax(logits / opts.sampling_temperature, dim=-1) + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + accum_probs = torch.cumsum(sorted_probs, dim=-1) + clip_probs = torch.where(accum_probs <= opts.top_p, sorted_probs, 0.0) + # always keep at least one candidate no matter what value it is + if torch.any(clip_probs[..., 0] == 0.0): + clip_probs[..., 0] = sorted_probs[..., 0] + clip_probs = clip_probs / clip_probs.sum(dim=-1, keepdim=True) + inner_indices = torch.multinomial( + clip_probs.flatten(end_dim=-2), num_samples=1 + ).view(clip_probs[..., :1].size()) + gen_token_idx = torch.gather(sorted_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(clip_probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["greedy_search", "teacher_force"]: + probs = logits.softmax(dim=-1) + # select from probabilities so the score is a log-probability, consistent with the sampling branches + topk_values, topk_indices = torch.topk(probs, 1, dim=-1) + gen_token_idx = topk_indices[:, :, :, 0] + gen_token_score = topk_values[:, :, :, 0].log() + + else: + raise NotImplementedError(f"opts.search_algo={opts.search_algo}") + + return gen_token_idx, gen_token_score + + +@torch.no_grad() +def install_continuous_features( + dec_emb: torch.Tensor, +
enc_emb: Optional[torch.Tensor] = None, + conti_feats: Tuple = None, +): + if conti_feats is None: + return dec_emb, enc_emb + + assert dec_emb.size(0) == len(conti_feats) + if enc_emb is not None: + assert enc_emb.size(0) == len(conti_feats) + + for b, conti_feat in enumerate(conti_feats): + for conti_emb, start, end, part in conti_feat: + if part == "dec": + assert conti_emb.size(1) == dec_emb.size(2) + dec_emb[b, start:end] = conti_emb + else: + assert conti_emb.size(1) == enc_emb.size(2) + enc_emb[b, start:end] = conti_emb + + return dec_emb, enc_emb + + +def modality_index_to_mask( + modality_index: torch.Tensor, inference_opts: SpeechLMInferenceOptions, +): + assert modality_index.dim() == 1 + modality_index = modality_index.cpu().tolist() + mask = torch.stack( + [inference_opts.masks[idx] for idx in modality_index], dim=0 + ).unsqueeze( + 1 + ) # [B, 1, nq, V] + + return mask + + +def masked_nll_loss( + log_probabilities, targets, mask, allowed_len_diff=3, reduction="mean" +): + """Similar to the standard nll_loss from SpeechBrain + but applies a custom mask + + Arguments + --------- + log_probabilities : torch.Tensor + The probabilities after log has been applied. + Format is [batch, log_p] or [batch, frames, log_p]. + targets : torch.Tensor + The targets, of shape [batch] or [batch, frames]. + mask : torch.Tensor + The mask for loss calculation + allowed_len_diff : int + Length difference that will be tolerated before raising an exception. + reduction : str + Options are 'mean', 'batch', 'batchmean', 'sum'. + See pytorch for 'mean', 'sum'. The 'batch' option returns + one loss per item in the batch, 'batchmean' returns sum / batch size. + """ + log_probabilities, targets = truncate( + log_probabilities, targets, allowed_len_diff + ) + dims = [0, log_probabilities.dim() - 1] + list(range(1, log_probabilities.dim() - 1)) + log_probabilities = log_probabilities.permute(dims).contiguous() + loss = torch.nn.functional.nll_loss( + input=log_probabilities, target=targets.long(), reduction="none" + ) + while mask.dim() < loss.dim(): + mask = mask.unsqueeze(-1) + mask = mask.expand_as(loss) + loss *= mask + loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) + return loss.contiguous() diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh old mode 100644 new mode 100755 index e0f848aef..5dcd6b397 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -149,8 +149,13 @@ seed="${seed:-$RANDOM}" if [ "$rnd_dir" = True ]; then - rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) - output_folder="$output_folder/$rnd_dirname" + if [[ ! 
-z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi fi # Make sure the output_folder is created @@ -181,7 +186,7 @@ mkdir -p $cached_data_folder # Function to run the training experiment run_experiment() { -python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +eval python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ $additional_flags } @@ -201,4 +206,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh old mode 100644 new mode 100755 index 2ad1dddf3..1ee79c0cd --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -63,6 +63,7 @@ orion_db_type="PickledDB" exp_max_trials=50 store_all=True compress_exp=True +hparam_filter="" # Function to print argument descriptions and exit print_argument_descriptions() { @@ -202,12 +203,28 @@ while [[ $# -gt 0 ]]; do shift ;; + --hparam_filter) + hparam_filter="$2" + shift + shift + ;; + --help) print_argument_descriptions ;; -*|--*) - additional_flags+="$1 $2 " # store additional flags + name=$1 + value=$2 + if [[ "$name" =~ ^--eval_run_ ]]; then + name=$(echo $name | sed s/^--eval_run_/--/) + eval_run_additional_flags+="$name $value " + else + if [[ ! "$eval_run_additional_flags" =~ "$name " ]]; then + eval_run_additional_flags+="$name $value " + fi + additional_flags+="$name $value " # store additional flags + fi shift # past argument ;; @@ -271,6 +288,7 @@ echo "-------------------------------------" get_flag() { local file_path="$1" local pattern="$2" + local filter="$3" # Check if the file exists if [ ! -f "$file_path" ]; then @@ -279,7 +297,7 @@ get_flag() { fi # Use grep to find all lines containing the pattern and then extract the flags using sed - grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n' + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | grep "$filter" | tr -d '\n' } @@ -323,7 +341,7 @@ function extract_best_params() { step_id=1 hparams_step=$hparams pattern="@orion_step1:" -opt_flags=$(get_flag "$hparams_step" "$pattern") +opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") # Check if the string is empty and exit with an error if it is if [ -z "$opt_flags" ]; then @@ -365,7 +383,7 @@ while [ -n "$opt_flags" ]; do eval $orion_hunt_command # Compress the exp folder (if required) - if [ "$compress_exp" = True ]; then + if [ "$compress_exp" = True ] && [ ! 
-e "$output_folder_step/exp.tar.gz" ]; then tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" if [ -d "$output_folder_step/exp" ]; then rm -rf "$output_folder_step/exp" @@ -399,7 +417,7 @@ while [ -n "$opt_flags" ]; do pattern="@orion_step$step_id:" # update optimization flags pattern - opt_flags=$(get_flag "$hparams_step" "$pattern") + opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") done echo @@ -415,6 +433,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing True $additional_flags + --rnd_dir False --testing True $eval_run_additional_flags -echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file +echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 0df315b7e..e11046ade 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -144,6 +144,8 @@ def aggregate_metrics(prototype, metrics): # Report final metric to Orion # Remember: orion expects metrics to be minimized! - if eval_metric == "acc" or eval_metric == "f1": + if eval_metric in ["acc", "f1"]: final_metric = 1 - final_metric + elif eval_metric == "utmos": + final_metric = -final_metric report_objective(final_metric) diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py deleted file mode 100644 index 9dc4014c4..000000000 --- a/benchmarks/DASB/utils/audio_tokens.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Utilities for discrete audio token models - - -Authors - * Artem Ploujnikov 2023 -""" -import torch -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.utils.data_utils import batch_pad_right -from functools import partial - - -def get_silence_token( - model, - sample_length=100000, - extract_emb=True, - device=None, - model_kwargs=None, -): - """Attempts to find out the silence tokens for a given model, - if applicable - - Arguments - --------- - model : nn.Module - A discrete token model, taking (wav, lengths) as arguments - sample_length : int - The length of the sample - extract_emb : bool - Whether to extract embeddings - device : str | torch.Device - The device to use - model_kwargs : dict - Additional arguments to pass to the model - - Returns - ------- - silence_tokens : torch.Tensor - The token(s) corresponding to silence - - silece_emb : torch.Tensor - The embedding(s) corresponding to silence - - """ - if device is None: - device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} - - audio = torch.zeros(1, sample_length, device=device) - length = torch.ones(1, device=device) - result = model(audio, length, **model_kwargs) - tokens = result[0] - silence_tokens = tokens.squeeze(0).mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[0, :, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in 
enumerate(mode_idx)] - ) - return silence_tokens, silence_emb - - -def feature_pad_to(tensor, length, padding=None): - """Pads feature dimensions to the specified length with the specified padding, - assuming a (Batch x Length x Features..) tensor - - Arguments - --------- - tensor : torch.Tensor - The tensor to be padded - - length : int - The length to which the tensor will be padded - - padding : torch.Tensor, optional - The padding tensor - if omitted, zero padding - will be used - - Returns - ------- - result : torch.Tensor - The padded tensor - """ - if padding is None: - padding = torch.zeros(tensor.shape[1:]) - padding = padding[None, ...].expand( - (length - tensor.size(0),) + tensor.shape[1:] - ) - return torch.cat([tensor, padding], dim=0) - - -def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih - can be a vector or a tensor - - Arguments - --------- - tensors : list - The list of tensors to be padded - padding : torch.Tensor - The padding tensor - - Returns - ------- - result : torch.Tensor - the padded tensor - """ - lengths_abs = torch.tensor( - [len(item) for item in tensors], device=tensors[0].device - ) - max_length = lengths_abs.max() - data = torch.stack( - [feature_pad_to(item, max_length, padding) for item in tensors] - ) - lengths = lengths_abs / max_length - return data, lengths - - -def token_collate_fn(examples, silence_token, token_keys): - """A customized collation function for audio tokens where - the specified silence token will be used as padding - instead of - zeros - - Arguments - --------- - examples : list - A list of examples - - silence_token : torch.Tensor - The token(s) representing silence - - token_keys : list - The list of keys to which special padding will be applied - - Returns - ------- - result : speechbrain.dataio.batch.PaddedBatch - A padded batch - """ - token_tensor_ids = {id(examples[0][key]) for key in token_keys} - return PaddedBatch( - examples, - padding_func=_silence_padding, - padding_kwargs={ - "silence_token": silence_token, - "token_tensor_ids": token_tensor_ids, - }, - ) - - -def _silence_padding(values, silence_token, token_tensor_ids): - return ( - batch_feature_pad(values, silence_token) - if id(values[0]) in token_tensor_ids - else batch_pad_right(values) - ) - - -def use_silence_padding(dataloader_opts, silence_token, token_keys): - """Overrides the collation function to add silence padding to - audio token features - - Arguments - --------- - dataloder_opts : dict - Dataloader options - silence_token : torch.Tensor - The tensor to be used as silence padding - token_keys : torch.Tensor - The keys to apply silence padding to - - Returns - ------- - dataloader_opts : dict - Updated data loader options - """ - return { - **dataloader_opts, - "collate_fn": partial( - token_collate_fn, silence_token=silence_token, token_keys=token_keys - ), - } diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index c0e14f867..76f2a6c2f 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -7,32 +7,52 @@ """ from speechbrain.inference.interfaces import Pretrained -from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import length_to_mask from speechbrain.decoders.seq2seq 
import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats -from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.data_utils import pad_right_to +from speechbrain.utils.fetching import fetch from collections import namedtuple from pathlib import Path -import os +from torch import nn import torch import torchaudio import re import string import logging -import shutil -import shlex -import subprocess + logger = logging.getLogger(__name__) + +has_transformers = False +try: + from transformers import AutoModelForAudioXVector + + has_transformers = True +except ImportError: + logger.warning( + "transformers library not found - some evaluators may be disabled" + ) + + RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) +SAMPLE_RATE = 16000 +DEFAULT_ENCODER_HUB = "chaanks/wav2vec2-small" +DEFAULT_MODEL_URL = "https://huggingface.co/chaanks/UTMOS/resolve/main" +DEFAULT_MODEL_NAME = "utmos.ckpt" +DEFAULT_SAVE_DIR = "./pretrained_models" +DEFAULT_JUDGE_ID = 288 +DEFAULT_DOMAIN_ID = 0 + SpeechEvaluationResult = namedtuple( "SpeechEvaluationResult", ["score", "details"] ) @@ -217,77 +237,6 @@ def __call__(self, wavs, length): return self.mods.model(wavs, length) -class RegressionModelSpeechEvaluator(SpeechEvaluator): - """A speech evaluator that uses a regression model - that produces a quality score (e.g. SSL fine-tuning) - for a sample of speech - - Arguments - --------- - source : str - The source model path or HuggingFace hub name - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.model = SpeechEvaluationRegressionModel.from_hparams( - source, *args, **kwargs - ) - - def evaluate( - self, - wavs, - length, - text=None, - wavs_ref=None, - length_ref=None, - sample_rate=None, - sample_rate_ref=None, - ): - """Evaluates a batch of waveforms - - Arguments - --------- - Arguments - --------- - wavs: torch.Tensor - the waveforms to evaluate - - length: torch.Tensor - relative lengths (a 1-D tensor) - - text : list, optional - Ground truth text - - wavs_ref : torch.Tensor - the reference waveforms - - length_ref : torch.Tensor - the reference waveform lengths - - sample_rate : int, optional - The sample rate of the audio. If not provided, - the audio is assumed to be at the same sample - rate as the model - - sample_rate_ref : int, optional - The sample rate of the reference samples - - Returns - ------- - result : SpeechEvaluationResult - an aggregated speech evaluation result with a score - for each item - """ - wavs = self.resample(wavs, sample_rate) - scores = self.model(wavs, length) - while scores.dim() > 1 and scores.size(-1) == 1: - scores = scores.squeeze(-1) - return SpeechEvaluationResult(score=scores, details={"score": scores}) - - class ASRSpeechEvaluator(SpeechEvaluator): """A superclass for ASR speech evaluators""" @@ -401,105 +350,6 @@ def _replace_blanks(self, preds): return [" " if item == "" else item for item in preds] -class EncoderDecoderASRSpeechEvaluator(ASRSpeechEvaluator): - """A speech evaluator implementation based on ASR. 
- Computes the Word Error Rate (WER), Character Error Rate (CER) - and a few other metrics - - Arguments - --------- - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.asr = EncoderDecoderASR.from_hparams(source, *args, **kwargs) - self.device = next(self.asr.mods.parameters()).device - - def evaluate_samples(self, wavs, length, text, sample_rate): - wavs = self.resample(wavs, sample_rate) - if text is None: - raise ValueError("This evaluator requires ground-truth text") - predicted_words, scores, log_probs = self.transcribe_batch_with_details( - wavs, length - ) - ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) - wer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=wavs.device - ) - cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device - ) - prob_mean = log_probs.exp().mean(dim=-1) - return { - "wer": wer, - "cer": cer, - "beam_score": scores, - "prob_mean": prob_mean, - "pred": predicted_words, - "target": text, - } - - def transcribe_batch_with_details(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - predicted_words : list - The raw ASR predictions, fully decoded - best_scores : list - The best scores (from beam search) - best_log_probs : list - The best predicted log-probabilities (from beam search) - - - Returns - ------- - predicted_words : list - The predictions - - best_scores : torch.Tensor - The best scores (from beam search) - - best_log_probs : torch.Tensor - The best log-probabilities - - """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.asr.encode_batch(wavs, wav_lens) - ( - hyps, - best_lens, - best_scores, - best_log_probs, - ) = self.asr.mods.decoder(encoder_out, wav_lens) - predicted_words = [ - self.asr.tokenizer.decode_ids(token_seq) for token_seq in hyps - ] - return predicted_words, best_scores, best_log_probs - - def to(self, device): - """Transfers this module to the spcieifed device - - Arguments - --------- - device : str | torch.Device - the target device - """ - self.asr = self.asr.to(device) - return self - - class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): """A speech evaluator implementation based on Whisper ASR @@ -743,171 +593,320 @@ def evaluate_files(self, file_names, text=None, file_names_ref=None): raise NotImplementedError() -UTMOS_REPO = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo" +class UTMOSModel(nn.Module): + """The UTMOS model wrapper + + Arguments + --------- + source : str + The WavLM source + save_path : str | path-like + The path where the model will be saved + features_dim : int, optional + The features dimension + num_domains : int, optional + The number of domains + domain_dim : int, optional + The dimension of each domain + num_judges : int, optional + The number of "judges" + judge_dim : int, optional + The dimension of each judge + decoder_hidden_size : int, optional + The size of the decoder hidden state + multiplier : float, optional + The number that the raw model output is multiplied by + to compute the 
score + offset : float, optional + The number that (raw output * multiplier) will be added + to in order to get the score + """ + + def __init__( + self, + source, + save_path, + features_dim=768, + num_domains=3, + domain_dim=128, + num_judges=3000, + judge_dim=128, + decoder_hidden_size=512, + multiplier=2.0, + offset=3.0, + ): + super().__init__() + + self.ssl_encoder = Wav2Vec2( + source, + save_path, + freeze=True, + output_norm=False, + freeze_feature_extractor=True, + output_all_hiddens=False, + ) + + self.domain_embedding = nn.Embedding(num_domains, domain_dim) + self.judge_embedding = nn.Embedding(num_judges, judge_dim) + + self.decoder = nn.LSTM( + input_size=features_dim + domain_dim + judge_dim, + hidden_size=decoder_hidden_size, + num_layers=1, + batch_first=True, + bidirectional=True, + ) + + self.classifier = nn.Sequential( + nn.Linear(decoder_hidden_size * 2, 2048), + torch.nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(2048, 1), + ) + self.multiplier = multiplier + self.offset = offset + + def forward(self, wav, domain_id=None, judge_id=None): + """Computes the forward pass + + Arguments + --------- + wav : torch.Tensor + The raw waveforms + domain_id : torch.Tensor + The domain identifiers + judge_id : torch.Tensor + The judge identifier + + Returns + ------- + result : torch.Tensor + The predicted rating(s) + """ + + if domain_id is None: + domain_id = torch.zeros( + len(wav), dtype=torch.int, device=wav.device + ) + if judge_id is None: + judge_id = ( + torch.ones(len(wav), dtype=torch.int, device=wav.device) + * DEFAULT_JUDGE_ID + ) + + ssl_features = self.ssl_encoder(wav) + domain_emb = self.domain_embedding(domain_id) + judge_emb = self.judge_embedding(judge_id) + + domain_emb = domain_emb.unsqueeze(1).expand( + -1, ssl_features.size(1), -1 + ) + judge_emb = judge_emb.unsqueeze(1).expand(-1, ssl_features.size(1), -1) + concatenated_feature = torch.cat( + [ssl_features, domain_emb, judge_emb], dim=2 + ) + + decoder_output, _ = self.decoder(concatenated_feature) + pred = self.classifier(decoder_output) + return pred.mean(dim=1).squeeze(1) * self.multiplier + self.offset -class UTMOSSpeechEvaluator(BulkSpeechEvaluator): - """An evaluation wrapper for UTMOS + +class UTMOSSpeechEvaluator(SpeechEvaluator): + """The UTMOS speech evaluator wrapper Github: https://github.com/sarulab-speech/UTMOS22 HuggingFace: https://huggingface.co/spaces/sarulab-speech/UTMOS-demo + Arguments --------- - model_path : str | path-like - The path where the HuggingFace repository was extracted - output_folder : str | path-like - The folder where results will be output - ckpt_path : str | path-like - The path to the checkpoint to be used - script : str | path-like - The path to the evaluation script, defaults to the bundled - predict.py - python : str | path-like, optional - The path to the Python interpreter to be used, defaults to - "python". Depending on the environment, it might need to be - changed (e.g. to "python3" or an absolute path to the interpreter) - use_python : bool - Whether to launch the script using python. This flag will need to be - set to False in environments where running UTMOS requires a wrapper shell - script (e.g. to initialize a different Python virtual environment from - the one in which SpeechBrain is running) - tmp_folder : str | path-like, optional - The temporary folder where files will be copied for evaluation. If - omitted, it will be set to output_folder. This can be useful on - compute environments that provide fast local storage (e.g. 
certain - compute clusters) - repo : str - The repor + source : str, optional + The WavLM source + save_path : str | path-like, optional + The path where the model will be saved + model_name : str + The name of the model hub + model_url : str + The model URL (if applicable) + domain_id : int + The domain ID of the underlying model + judge_id : int + The judge ID to use (given UTMOS was trained as an ensemble + of judges) + run_opts: dict, optional + The run options + sample_rate : int + The sample rate of the underlying model """ def __init__( self, - model_path, - output_folder, - ckpt_path, - script="predict.py", - python="python", - use_python=True, - batch_size=8, - tmp_folder=None, - repo=UTMOS_REPO, + source=None, + save_path=None, + model_name=None, + model_url=None, + domain_id=None, + judge_id=None, + run_opts=None, + sample_rate=16000, ): - self.output_folder = Path(output_folder) - rand = torch.randint(1, 999999999, (1,)).item() - if tmp_folder is None: - tmp_folder = self.output_folder - else: - tmp_folder = Path(tmp_folder) - self.eval_path = (tmp_folder / f"eval_{rand}").absolute() - self.model_path = Path(model_path).absolute() - script = self.model_path / script - self.script = script - self.ckpt_path = Path(ckpt_path).absolute() - self.batch_size = batch_size - self.python = python - self.use_python = use_python - self.repo = repo - self.install() - - def install(self): - if self.model_path.exists(): - logger.info("UTMOS is already installed in %s", self.model_path) - return - logger.info( - "Attempting to install UTMOS from %s to %s", - self.repo, - self.model_path, - ) - cmd = shlex.join( - [ - "git", - "-C", - str(self.model_path.parent), - "clone", - self.repo, - str(self.model_path.name), - ] - ) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - logger.info("Repository clone successful, performing an LFS fetch") - cwd = Path.cwd() - try: - os.chdir(self.model_path) - cmd = shlex.join(["git", "lfs", "fetch"]) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - finally: - os.chdir(cwd) - if not self.ckpt_path.exists(): - raise ValueError("ckpt_path {ckpt_path} does not exist") - - def evaluate_files(self, file_names, text, file_names_ref=None): - """Evaluates multiple files + super().__init__(sample_rate=sample_rate) + self.model = UTMOSModel(source=source, save_path=save_path,) + if run_opts is not None: + device = run_opts.get("device") + if device: + self.model = self.model.to(device) + fetch(model_name, model_url, save_path) + model_path = Path(save_path) / model_name + state_dict = torch.load(model_path) + self.model.load_state_dict(state_dict) + self.model.eval() + + self.domain_id = domain_id + self.judge_id = judge_id + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + """Evaluates a batch of waveforms using UTMOS Arguments --------- - file_names : list - A list of files - - text : list - File transcripts (not required for all evaluators) - Not used in this evaluator - - file_names_ref : list, optional - A list of reference files / ground truths (if applicable) - Not used in this evaluator + wavs: torch.Tensor + the waveforms to evaluate + length: torch.Tensor + relative lengths (a 1-D tensor) + text : list, optional + Ground truth text. Ignored for UTMOS. + wavs_ref : torch.Tensor + the reference waveforms. Ignored for UTMOS. 
+ length_ref : torch.Tensor + the reference waveform lengths. Ignored for UTMOS. + sample_rate : int, optional + The sample rate of the audio. If not provided, + the audio is assumed to be at the same sample + rate as the model + sample_rate_ref : int, optional + The sample rate of the reference samples. Ignored for UTMOS. Returns ------- result : SpeechEvaluationResult - a consolidated evaluation result + an aggregated speech evaluation result with a score + for each item """ - current_path = os.getcwd() - try: - self.eval_path.mkdir(parents=True, exist_ok=True) - logger.info("Copying the files to '%s'", self.eval_path) - for file_name in file_names: - target_file_name = self.eval_path / Path(file_name).name - shutil.copy(file_name, target_file_name) - - logger.info("Running evaluation") - result_path = self.eval_path / "result.txt" - os.chdir(self.model_path) - cmd = [ - str(self.script), - "--mode", - "predict_dir", - "--bs", - str(self.batch_size), - "--inp_dir", - str(self.eval_path), - "--out_path", - str(result_path), - "--ckpt_path", - str(self.ckpt_path), - ] - if self.use_python: - cmd = [self.python] + cmd - - output = subprocess.check_output(cmd) - logger.info("Evaluation finished, output: %s", output) - file_names = [path.name for path in self.eval_path.glob("*.wav")] - with open(result_path) as result_path: - scores = [float(line.strip()) for line in result_path] - score_map = dict(zip(file_names, scores)) - scores_ordered = [ - score_map[Path(file_name).name] for file_name in file_names - ] - return SpeechEvaluationResult( - scores_ordered, {"utmos": scores_ordered} + wavs = self.resample(wavs, sample_rate=sample_rate) + domain_id, judge_id = None, None + if self.domain_id is not None: + domain_id = ( + torch.ones(len(wavs), device=wavs.device) * self.domain_id + ) + if self.judge_id is not None: + judge_id = torch.ones(len(wavs), device=wavs.device) * self.judge_id + + scores = self.model(wav=wavs, domain_id=domain_id, judge_id=judge_id) + return SpeechEvaluationResult(score=scores, details={"utmos": scores}) + + +class SpkSimWavLM(SpeechEvaluator): + """A speaker similarity evaluator based on WavLM / XVector + + Arguments + --------- + source : str + The model hub to use + savedir : str + The path where the model will be saved + model_sample_rate : int, optional + The sample rate to which all samples will be resampled + before being processed + """ + + def __init__( + self, + source, + savedir, + model_sample_rate=16000, + run_opts=None, + *args, + **kwargs, + ): + if not has_transformers: + raise ValueError( + "Unable to use the SpkSimWavLM evaluator because the " + "transformers library is not enabled" ) - finally: - os.chdir(current_path) - shutil.rmtree(self.eval_path) + if run_opts is None: + run_opts = {} + device = run_opts.get("device") + self.model = AutoModelForAudioXVector.from_pretrained( + source, cache_dir=savedir, *args, **kwargs + ) + if device is not None: + self.model = self.model.to(device) + + self.model.eval() + self.model_sample_rate = model_sample_rate + self.device = next(self.model.parameters()).device + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + # Resample + if sample_rate is not None: + wavs = torchaudio.functional.resample( + wavs, orig_freq=sample_rate, new_freq=self.model_sample_rate + ) + if sample_rate_ref is not None: + wavs_ref = torchaudio.functional.resample( + wavs_ref, + orig_freq=sample_rate_ref, + new_freq=self.model_sample_rate, + ) + 
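+        # Hypothesis and reference waveforms are padded to a shared maximum length and stacked into a single batch below, so the x-vector model needs only one forward pass; the absolute lengths are kept to build the attention mask.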
+ # Concatenate + batch_size, wavs_max_len = wavs.shape + _, wavs_ref_max_len = wavs_ref.shape + length_abs = length * wavs_max_len + length_ref_abs = length_ref * wavs_ref_max_len + max_len = max(wavs_max_len, wavs_ref_max_len) + wavs, _ = pad_right_to(wavs, (batch_size, max_len)) + wavs_ref, _ = pad_right_to(wavs_ref, (batch_size, max_len)) + audio = torch.cat([wavs, wavs_ref]) + + length_cat_abs = torch.cat([length_abs, length_ref_abs]) + # Attention mask + attention_mask = length_to_mask( + length_cat_abs.int() + ).long() # 0 for masked tokens + # Forward + with torch.inference_mode(): + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings + hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) + scores = torch.nn.functional.cosine_similarity( + hyp_embs, ref_embs, dim=-1 + ) + + return SpeechEvaluationResult(scores, {"score": scores}) def vocoder_to_device(vocoder, device): diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index be73fda74..a522aaecf 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -10,7 +10,9 @@ import sys import os import torch +import re from abc import ABC, abstractmethod +from pathlib import Path from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( DiscreteSSL, @@ -19,6 +21,17 @@ from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi +from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.fetching import fetch +from model.fairseq_hubert import FairseqHuBERT +from torch import nn +import logging +import shlex +import yaml + +logger = logging.getLogger(__name__) + + base_dir = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") @@ -499,7 +512,7 @@ def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - signal = self.decode(tokens.view(tokens.shape[0], -1), **kwargs) + signal = self.decode(tokens.reshape(tokens.shape[0], -1), **kwargs) return signal.squeeze(1) @torch.no_grad() @@ -513,3 +526,132 @@ def get_pretrained_embeddings( raise ValueError( "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." 
) + + +DEFAULT_ESPNET_REPO = "https://github.com/espnet/espnet" + + +class ESPNetEncodecInterface(BaseTokenizer, nn.Module): + """An interface for pretrained ESPNet Encodec implementations + + Arguments + --------- + source : str + The HuggingFace hub or local path from which the checkpoint and configuration are fetched + model_ckpt : str + The name of the checkpoint file within the source + model_config : str + The name of the configuration file within the source + save_path : str | path-like + The path where downloaded artifacts will be saved + sample_rate : int + The model sample rate + n_codebook : int + The number of codebooks to retain + espnet_repo : str + The ESPnet repository URL (cloned if espnet2 is not installed) + espnet_commit : str, optional + The ESPnet commit to check out + """ + + def __init__( + self, + source, + model_ckpt, + model_config, + save_path, + sample_rate=24000, + n_codebook=32, + espnet_repo=DEFAULT_ESPNET_REPO, + espnet_commit=None, + ): + super().__init__() + self.source = source + self.model_ckpt = model_ckpt + self.model_config = model_config + self.save_path = Path(save_path) + self.sample_rate = sample_rate + self.n_codebook = n_codebook + self.espnet_repo = espnet_repo + self.espnet_commit = espnet_commit + self._load() + + def _load(self): + self._load_espnet() + ckpt_file_name = fetch( + filename=self.model_ckpt, + source=self.source, + savedir=str(self.save_path), + save_filename=str(Path(self.model_ckpt).name) + ) + config_file_name = fetch( + filename=self.model_config, + source=self.source, + savedir=str(self.save_path), + save_filename="config.yaml" + ) + with open(config_file_name) as config_file: + config = yaml.safe_load(config_file) + from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + self.encodec = ESPNetEncodec(**config["codec_conf"]) + device = next(iter(self.encodec.parameters())).device + state_dict = torch.load(ckpt_file_name, map_location=device) + state_dict = { + re.sub(r"^codec\.", "", key): value + for key, value in state_dict.items() + } + self.encodec.load_state_dict(state_dict) + + def _load_espnet(self): + try: + import espnet2 + except ModuleNotFoundError: + self._download_espnet() + + def _download_espnet(self): + logger.info("espnet is not installed, installing") + espnet_path = self.save_path / "espnet" + if not espnet_path.exists(): + logger.info("Cloning %s into %s", self.espnet_repo, espnet_path) + cmd = shlex.join(["git", "clone", self.espnet_repo, str(espnet_path)]) + run_shell(cmd) + else: + logger.info("%s already exists", espnet_path) + if self.espnet_commit: + logger.info("Checking out %s", self.espnet_commit) + cmd = shlex.join(["git", "-C", str(espnet_path), "checkout", self.espnet_commit]) + run_shell(cmd) + logger.info("Installing") + cmd = shlex.join(["pip", "install", "-e", str(espnet_path)]) + run_shell(cmd) + logger.info("Installation completed") + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.encodec.eval() + if signal.dim() < 3: + signal = signal.unsqueeze(1) + tokens = self.encodec.encode(signal) + return tokens.permute(1, 2, 0)[:, :, :self.n_codebook] + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.encodec.eval() + tokens = tokens.permute(2, 0, 1) + signal = self.encodec.decode(tokens, **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + Extracting pretrained quantizer embeddings is not implemented + for the ESPNet Encodec interface. + """ + raise ValueError( + "Extracting pretrained quantizer embeddings is not supported by the ESPNet Encodec interface."
+ ) + + +class FairseqHuBERTTokenizer(FairseqHuBERT, BaseTokenizer): + def __init__(self, *args, **kwargs): + FairseqHuBERT.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + self.eval() + tokens = self.encode(signal) + tokens = tokens.unsqueeze(-1) + return tokens + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + return self.decode(tokens.permute(0, 2, 1)) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + raise NotImplementedError("Fairseq HuBERT does not support embeddings")
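A minimal usage sketch for the ESPNetEncodecInterface added above (the hub name and the checkpoint/config file names below are hypothetical placeholders; substitute those of the pretrained ESPnet codec actually being used, and make sure benchmarks/DASB is on the Python path so that utils.tokenizer_interface can be imported):

    import torch
    from utils.tokenizer_interface import ESPNetEncodecInterface

    tokenizer = ESPNetEncodecInterface(
        source="espnet/encodec-24khz",                       # hypothetical model hub
        model_ckpt="exp/codec/train.total_count.best.pth",   # hypothetical checkpoint name
        model_config="exp/codec/config.yaml",                # hypothetical config name
        save_path="./pretrained_models/espnet_encodec",
        sample_rate=24000,
        n_codebook=8,
    )
    audio = torch.randn(1, 24000)                # one second of audio at 24 kHz
    tokens = tokenizer.sig_to_tokens(audio)      # [batch, length, n_codebook]
    rebuilt = tokenizer.tokens_to_sig(tokens)    # decoded waveform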