# Hyper_Parameters.yaml

# Audio feature-extraction settings.
# NOTE(review): units are not stated in this file. Frame_Length / Frame_Shift
# are presumably in samples and F0_Min / F0_Max in Hz — confirm with the consumer.
Sound:
  N_FFT: 1024
  Mel_Dim: 80
  Frame_Length: 1024
  Frame_Shift: 256
  Sample_Rate: 22050
  F0_Min: 65
  F0_Max: 2094

# Inventory sizes.
# NOTE(review): presumably these must match the entries in the files referenced
# by Token_Path and Speaker_Info_Path — TODO confirm.
Tokens: 63
Speakers: 157

# Encoder settings: a hidden size plus a Transformer sub-section that carries
# its own dropout and a nested FFN (kernel size and dropout) configuration.
Encoder:
  Size: 192
  Transformer:
    Stack: 6
    Head: 2
    Dropout_Rate: 0.1
    FFN:
      Kernel_Size: 3
      Dropout_Rate: 0.1

# Duration predictor settings: convolution and flow stack depths, kernel size
# and dropout.
Duration_Predictor:
  Kernel_Size: 3
  Conv_Stack: 3
  Flow_Stack: 4
  Dropout_Rate: 0.5

# Acoustic encoder settings. The values are identical to those under
# Linguistic_Encoder in this file.
Acoustic_Encoder:
  Conv_Stack: 16
  Kernel_Size: 5
  Dilation_Rate: 1
  Dropout_Rate: 0.1

# Acoustic flow settings.
Acoustic_Flow:
  Stack: 4
  Conv_Stack: 4
  # NOTE(review): 'Kernel_Szie' looks like a misspelling of 'Kernel_Size' (the
  # spelling used under Acoustic_Encoder). It is kept as-is because the code
  # that loads this file may look the key up by this exact name; rename it only
  # together with the consumer.
  Kernel_Szie: 5
  Dilation_Rate: 1
  Dropout_Rate: 0.1

# Linguistic encoder settings. The values are identical to those under
# Acoustic_Encoder in this file.
Linguistic_Encoder:
  Conv_Stack: 16
  Kernel_Size: 5
  Dilation_Rate: 1
  Dropout_Rate: 0.1

# Linguistic flow settings.
Linguistic_Flow:
  Stack: 4
  Conv_Stack: 4
  # NOTE(review): 'Kernel_Szie' looks like a misspelling of 'Kernel_Size' (the
  # spelling used under Linguistic_Encoder). It is kept as-is because the code
  # that loads this file may look the key up by this exact name; rename it only
  # together with the consumer.
  Kernel_Szie: 5
  Dilation_Rate: 1
  Dropout_Rate: 0.1

# Token predictor settings: a hidden size and an LSTM sub-section.
Token_Predictor:
  Size: 256
  LSTM:
    Stack: 2
    Dropout_Rate: 0.1

# Decoder settings: prenet, upsampling stages, residual blocks and postnet.
Decoder:
  Prenet:
    Kernel_Size: 7
  Upsample:
    Base_Size: 512
    # Rate and Kernel_Size each list four entries.
    # NOTE(review): the product of Rate (8 * 8 * 2 * 2 = 256) equals
    # Sound.Frame_Shift; presumably the two must be kept in sync — TODO confirm.
    Rate: [8, 8, 2, 2]
    Kernel_Size: [16, 16, 4, 4]
  Residual_Block:
    # Three kernel sizes and three dilation lists; presumably one dilation
    # list per kernel size — TODO confirm.
    Kernel_Size: [3, 7, 11]
    Dilation_Size:
      - [1, 3, 5]
      - [1, 3, 5]
      - [1, 3, 5]
  Postnet:
    Kernel_Size: 7
  LeakyRelu_Negative_Slope: 0.1

# Discriminator settings. Period, STFT_N_FFT and Scale_Pool_Kernel_Size each
# list five entries.
Discriminator:
  Use_STFT: true
  Period: [2, 3, 5, 7, 11]
  STFT_N_FFT: [1024, 2048, 512, 300, 1200]
  Scale_Pool_Kernel_Size: [1, 4, 8, 16, 32]

# Locations of dataset metadata files. Every entry points into the same
# directory, F:/Datasets/22K.VITS.VCTK, using an absolute Windows drive path,
# so these values are machine-specific and must be edited per environment.
Token_Path: 'F:/Datasets/22K.VITS.VCTK/Token.yaml'
GE2E_Path: 'F:/Datasets/22K.VITS.VCTK/GE2E.pickle'
Spectrogram_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Spectrogram_Range_Info.yaml'
Mel_Range_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Mel_Range_Info.yaml'
F0_Info_Path: 'F:/Datasets/22K.VITS.VCTK/F0_Info.yaml'
Energy_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Energy_Info.yaml'
Speaker_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Speaker_Info.yaml'
Emotion_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Emotion_Info.yaml'
Language_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Language_Info.yaml'
Gender_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Gender_Info.yaml'
Language_and_Gender_Info_by_Speaker_Path: 'F:/Datasets/22K.VITS.VCTK/Language_and_Gender_Info_by_Speaker.yaml'
# Training configuration: dataset patterns, batch settings, optimiser,
# step-based schedules and the samples synthesised during training.
Train:
  Pattern_Cache: false
  Train_Pattern:
    Path: 'F:/Datasets/22K.VITS.VCTK/Train'
    Metadata_File: 'METADATA.PICKLE'
    Feature_Length:
      Min: 50
      Max: 800
    Text_Length:
      Min: 1
      Max: 200
    # Prevents the slowdown of torch.utils.data.DataLoader when the number of
    # patterns is small.
    Accumulated_Dataset_Epoch: 1
    Augmentation_Ratio: 0.10
  Eval_Pattern:
    Path: 'F:/Datasets/22K.VITS.VCTK/Eval'
    Metadata_File: 'METADATA.PICKLE'
    Feature_Length:
      Min: 50
      Max: 800
    Text_Length:
      # NOTE(review): Train_Pattern.Text_Length.Min is 1 while this is 10 —
      # confirm the difference is intentional.
      Min: 10
      Max: 200
  Num_Workers: 0
  Batch_Size: 16
  Segment_Size: 64
  Learning_Rate:
    Initial: 2.0e-4
    Warmup_Step: 4000
    # NOTE(review): these look like loss weights yet are nested under
    # Learning_Rate. The nesting is kept because the consumer presumably reads
    # them from this exact path — TODO confirm before moving.
    Lambda:
      STFT: 45.0
      Token_CTC: 45.0
      Feature_Map: 2.0
  ADAM:
    Beta1: 0.8
    Beta2: 0.99
    Epsilon: 1.0e-9
  # NOTE(review): 0.0 presumably disables gradient clipping — TODO confirm.
  Gradient_Norm: 0.0
  Max_Step: 1000000
  Checkpoint_Save_Interval: 5000
  Logging_Interval: 1
  Evaluation_Interval: 1000
  Inference_Interval: 5000
  Initial_Inference: true
  # Text, Speaker and Language each hold two entries; presumably they are
  # index-aligned (entry i of each list describes one sample) — TODO confirm.
  Inference_in_Train:
    Text:
      - 'Do not kill the goose that lays the golden eggs.'
      - 'A good medicine tastes bitter.'
    Speaker:
      - 'VCTK.P250'
      - 'VCTK.P251'
    Language:
      - 'English'
      - 'English'

# Batch size setting for inference; kept separate from Train.Batch_Size
# (both are 16 in this file).
Inference_Batch_Size: 16

# Output locations, all under ./results. The paths are relative, so they
# presumably resolve against the process working directory — TODO confirm.
Inference_Path: './results/Inference'
Checkpoint_Path: './results/Checkpoint'
Log_Path: './results/Log'

# Weights & Biases settings. Currently disabled via 'Use: false'.
Weights_and_Biases:
  # Use: true
  Use: false
  Project: 'HierSpeech'
  Entity: 'codejin'
  Name: 'VCTK'
  Save_Checkpoint:
    Use: false
    # Unlike local storage, WandB capacity is small, so a large interval is used.
    Interval: 50000

# Runtime / device settings.
# Mixed precision must not be used with this model, so the flag is disabled.
# NOTE(review): the value was previously 'true', contradicting its own
# "Don't use mixed precision in this model." comment; it is now aligned with
# that warning. Confirm with the model author if 'true' was actually intended.
Use_Mixed_Precision: false
# Single-GPU setup; Device is a quoted string holding the GPU index.
Use_Multi_GPU: false
Device: '0'
# Multi-GPU alternative (comma-separated device indices):
# Use_Multi_GPU: true
# Device: '0,1,2,3,4,5,6,7'