Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Debatts Some Code #295

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
{
"model_type": "SoundStorm",
"dataset": ["emilia"],
"preprocess": {
"hop_size": 480,
"sample_rate": 24000,
"processed_dir": "",
"valid_file": "valid.json",
"train_file": "train.json",
"use_phone_cond": false
},
"model": {
"soundstorm": {
"num_quantizer": 1,
"hidden_size": 1024,
"num_layers": 16,
"num_heads": 16,
"codebook_size": 1024,
"cfg_scale": 0.15,
"mask_layer_schedule": "linear",
"use_cond_code": true,
"cond_codebook_size": 8192,
"cond_dim":1024,
"use_llama_style": true,
"use_phone_cond": false,
"use_pretrained_model": false
},
"kmeans": {
"type": "repcodec",
"stat_mean_var_path": "./stat_ckpt/emilia_wav2vec2bert_stats_10k.pt In HuggingFace",
"repcodec": {
"codebook_size": 8192,
"hidden_size": 1024,
"codebook_dim": 8,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12
},
"pretrained_path": "./semantic_codec/emilia_50k_8192_norm_8d_86k_steps_model.safetensors In HuggingFace"
},
"codec": {
"encoder": {
"d_model": 96,
"up_ratios": [3, 4, 5, 8],
"out_channels": 256,
"use_tanh": false,
"pretrained_path": "./acoustic_codec/emilia_50k_model.safetensors In HuggingFace"
},
"decoder": {
"in_channel": 256,
"upsample_initial_channel": 1536,
"up_ratios": [8, 5, 4, 3],
"num_quantizers": 12,
"codebook_size": 1024,
"codebook_dim": 8,
"quantizer_type": "fvq",
"quantizer_dropout": 0.5,
"commitment": 0.25,
"codebook_loss_weight": 1.0,
"use_l2_normlize": true,
"codebook_type": "euclidean",
"kmeans_init": false,
"kmeans_iters": 10,
"decay": 0.8,
"eps": 0.5,
"threshold_ema_dead_code": 2,
"weight_init": false,
"use_vocos": true,
"vocos_dim": 512,
"vocos_intermediate_dim": 4096,
"vocos_num_layers": 30,
"n_fft": 1920,
"hop_size": 480,
"padding": "same",
"pretrained_path": "./acoustic_codec/emilia_50k_model_1.safetensors In HuggingFace"
}
}
},
"log_dir": "",
"train": {
"max_epoch": 0,
"use_dynamic_batchsize": true,
"max_tokens": 2000000,
"max_sentences": 20,
"lr_warmup_steps": 32000,
"lr_scheduler": "inverse_sqrt",
"num_train_steps": 800000,
"adam": {
"lr": 1e-4
},
"ddp": false,
"random_seed": 114,
"batch_size": 10,
"epochs": 5000,
"max_steps": 1000000,
"total_training_steps": 800000,
"save_summary_steps": 500,
"save_checkpoints_steps": 1000,
"valid_interval": 2000,
"keep_checkpoint_max": 100,
"gradient_accumulation_step": 1,
"tracker": ["tensorboard"],
"save_checkpoint_stride": [1],
"keep_last": [10],
"run_eval": [true],
"dataloader": {
"num_worker": 16,
"pin_memory": true
},
"use_emilia_dataset": true
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"model_type": "SoundStorm",
"dataset": ["emilia"],
"preprocess": {
"hop_size": 480,
"sample_rate": 24000,
"processed_dir": "",
"valid_file": "valid.json",
"train_file": "train.json",
"use_phone_cond": false
},
"model": {
"soundstorm": {
"num_quantizer": 12,
"hidden_size": 1024,
"num_layers": 16,
"num_heads": 16,
"codebook_size": 1024,
"cfg_scale": 0.15,
"mask_layer_schedule": "linear",
"use_cond_code": true,
"cond_codebook_size": 8192,
"cond_dim":1024,
"use_llama_style": true,
"use_phone_cond": false,
"use_pretrained_model": false,
"predict_layer_1": false
},
"kmeans": {
"type": "repcodec",
"stat_mean_var_path": "./stat_ckpt/emilia_wav2vec2bert_stats_10k.pt In HuggingFace",
"repcodec": {
"codebook_size": 8192,
"hidden_size": 1024,
"codebook_dim": 8,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12
},
"pretrained_path": "./semantic_codec/emilia_50k_8192_norm_8d_86k_steps_model.safetensors In HuggingFace"
},
"codec": {
"encoder": {
"d_model": 96,
"up_ratios": [3, 4, 5, 8],
"out_channels": 256,
"use_tanh": false,
"pretrained_path": "./acoustic_codec/emilia_50k_model.safetensors In HuggingFace"
},
"decoder": {
"in_channel": 256,
"upsample_initial_channel": 1536,
"up_ratios": [8, 5, 4, 3],
"num_quantizers": 12,
"codebook_size": 1024,
"codebook_dim": 8,
"quantizer_type": "fvq",
"quantizer_dropout": 0.5,
"commitment": 0.25,
"codebook_loss_weight": 1.0,
"use_l2_normlize": true,
"codebook_type": "euclidean",
"kmeans_init": false,
"kmeans_iters": 10,
"decay": 0.8,
"eps": 0.5,
"threshold_ema_dead_code": 2,
"weight_init": false,
"use_vocos": true,
"vocos_dim": 512,
"vocos_intermediate_dim": 4096,
"vocos_num_layers": 30,
"n_fft": 1920,
"hop_size": 480,
"padding": "same",
"pretrained_path": "./acoustic_codec/emilia_50k_model_1.safetensors In HuggingFace"
}
}
},
"log_dir": "",
"train": {
"max_epoch": 0,
"use_dynamic_batchsize": true,
"max_tokens": 2000000,
"max_sentences": 20,
"lr_warmup_steps": 32000,
"lr_scheduler": "inverse_sqrt",
"num_train_steps": 800000,
"adam": {
"lr": 1e-4
},
"ddp": false,
"random_seed": 114,
"batch_size": 10,
"epochs": 5000,
"max_steps": 1000000,
"total_training_steps": 800000,
"save_summary_steps": 500,
"save_checkpoints_steps": 1000,
"valid_interval": 2000,
"keep_checkpoint_max": 100,
"gradient_accumulation_step": 1,
"tracker": ["tensorboard"],
"save_checkpoint_stride": [1],
"keep_last": [10],
"run_eval": [true],
"dataloader": {
"num_worker": 16,
"pin_memory": true
},
"use_emilia_dataset": true
}
}
9 changes: 9 additions & 0 deletions models/tts/debatts/speech_examples/87_SPEAKER01_2_part03.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"key": "87_SPEAKER01_2_part03",
"text": "你方要论证的是, 他在技术上完全突破了壁垒, 在现实里真的可以落地, 而不是举出一些跟越宇宙可能没有关, 没有关系, 实际关系不大的东西告诉我, 这叫做越宇宙。我方告诉你, 不只是算力和芯片的问题, 还有包括VR的问题, 硬件上算力有问题, 题片有问题, 软软。",
"duration": 15.34,
"language": "zh",
"wav_path": "./87_SPEAKER01_2_part03.wav",
"chenci_prompt_wav_path": "./87_SPEAKER01_2_part03_213_chenci_prompt_6s.wav",
"prompt0_wav_path": "./87_SPEAKER00_1_part01.wav"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
{
"model_type": "T2S",
"dataset": ["not"],
"preprocess": {
"hop_size": 320,
"sample_rate": 16000,
"processed_dir": "TODO",
"valid_file": "valid.json",
"train_file": "train.json",
"use_phone_cond": false,
"min_dur": 3,
"max_dur": 40,
"use_emilia_101k": true
},
"model": {
"t2sllama": {
"phone_vocab_size": 1024,
"target_vocab_size": 8192,
"hidden_size": 2048,
"intermediate_size": 8192,
"num_hidden_layer": 8,
"num_attention_head": 16,
"pad_token_id": 9216,
"bos_target_id": 9217,
"eos_target_id": 9218,
"bos_phone_id": 9219,
"eos_phone_id": 9220,
"bos_prompt0_id": 9221,
"eos_prompt0_id": 9222,
"use_lang_emb": false
},
"kmeans": {
"type": "repcodec",
"stat_mean_var_path":"./ckpt/emilia_wav2vec2bert_stats_10k.pt",
"repcodec": {
"codebook_size": 8192,
"hidden_size": 1024,
"codebook_dim": 8,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12
},
"pretrained_path":"./ckpt/repcodec/emilia_50k_8192_norm_8d_86k_steps_model.safetensors"
},
"codec": {
"encoder": {
"d_model": 96,
"up_ratios": [4, 4, 4, 5],
"out_channels": 256,
"use_tanh": false,
"pretrained_path":"./ckpt/codec_16K_320_8/pytorch_model.bin"
},
"decoder": {
"in_channel": 256,
"upsample_initial_channel": 1536,
"up_ratios": [5, 4, 4, 4],
"num_quantizers": 8,
"codebook_size": 1024,
"codebook_dim": 8,
"quantizer_type": "fvq",
"quantizer_dropout": 0.5,
"commitment": 0.25,
"codebook_loss_weight": 1.0,
"use_l2_normlize": true,
"codebook_type": "euclidean",
"kmeans_init": false,
"kmeans_iters": 10,
"decay": 0.8,
"eps": 0.5,
"threshold_ema_dead_code": 2,
"weight_init": false,
"use_vocos": true,
"vocos_dim": 512,
"vocos_intermediate_dim": 4096,
"vocos_num_layers": 24,
"n_fft": 1280,
"hop_size": 320,
"padding": "same",
"pretrained_path": "./ckpt/codec_16K_320_8/pytorch_model_1.bin"
}
}
},
"log_dir": "TODO",
"train": {
"max_epoch": 0,
"use_dynamic_batchsize": true,
"max_tokens": 3000,
"max_sentences": 20,
"lr_warmup_steps": 3200,
"lr_scheduler": "inverse_sqrt",
"num_train_steps": 800,
"adam": {
"lr": 1e-5
},
"ddp": false,
"random_seed": 114,
"batch_size": 1,
"epochs": 500,
"max_steps": 10000,
"total_training_steps": 8000,
"save_summary_steps": 500,
"save_checkpoints_steps": 300,
"valid_interval": 2000,
"keep_checkpoint_max": 100,
"gradient_accumulation_step": 10,
"tracker": ["tensorboard"],
"save_checkpoint_stride": [1],
"keep_last": [15],
"run_eval": [true],
"dataloader": {
"num_worker": 4,
"pin_memory": true
},
"use_emilia_dataset": false
}
}

Loading
Loading