-
Notifications
You must be signed in to change notification settings - Fork 9
/
Hyper_Parameters.yaml
169 lines (154 loc) · 4.06 KB
/
Hyper_Parameters.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Audio feature-extraction settings.
# NOTE(review): units are not stated in this file; presumably N_FFT,
# Frame_Length and Frame_Shift are in samples and Sample_Rate / F0_* are in Hz
# - confirm against the feature-extraction code.
Sound:
N_FFT: 1024
Mel_Dim: 80
Frame_Length: 1024
Frame_Shift: 256
Sample_Rate: 22050
F0_Min: 65
F0_Max: 2094
# Presumably the sizes of the token and speaker inventories; they likely have
# to agree with the files referenced by Token_Path and Speaker_Info_Path
# - verify against the dataset.
Tokens: 63
Speakers: 157
# Encoder hyperparameters: a Size value, Transformer settings (Stack, Head,
# Dropout_Rate) and FFN settings (Kernel_Size, Dropout_Rate).
# NOTE(review): Head is presumably the number of attention heads and Size the
# hidden width - confirm in the model code.
Encoder:
Size: 192
Transformer:
Stack: 6
Head: 2
Dropout_Rate: 0.1
FFN:
Kernel_Size: 3
Dropout_Rate: 0.1
# Duration predictor hyperparameters.
# Dropout_Rate is 0.5 here, whereas every other Dropout_Rate in this file is
# 0.1. NOTE(review): presumably intentional - confirm.
Duration_Predictor:
Kernel_Size: 3
Conv_Stack: 3
Flow_Stack: 4
Dropout_Rate: 0.5
# Acoustic encoder hyperparameters.
Acoustic_Encoder:
Conv_Stack: 16
Kernel_Size: 5
Dilation_Rate: 1
Dropout_Rate: 0.1
# Acoustic flow hyperparameters.
# NOTE(review): the key below is spelled "Kernel_Szie", unlike the
# "Kernel_Size" key used by Acoustic_Encoder. It looks like a typo, but the
# consuming code may read this exact spelling, so rename it only together with
# every reader of the key.
Acoustic_Flow:
Stack: 4
Conv_Stack: 4
Kernel_Szie: 5
Dilation_Rate: 1
Dropout_Rate: 0.1
# Linguistic encoder hyperparameters.
Linguistic_Encoder:
Conv_Stack: 16
Kernel_Size: 5
Dilation_Rate: 1
Dropout_Rate: 0.1
# Linguistic flow hyperparameters.
# NOTE(review): the key below is spelled "Kernel_Szie", unlike the
# "Kernel_Size" key used by Linguistic_Encoder. It looks like a typo, but the
# consuming code may read this exact spelling, so rename it only together with
# every reader of the key.
Linguistic_Flow:
Stack: 4
Conv_Stack: 4
Kernel_Szie: 5
Dilation_Rate: 1
Dropout_Rate: 0.1
# Token predictor hyperparameters: a Size value and LSTM settings.
# NOTE(review): presumably this module is trained with the Token_CTC loss
# weight listed under Lambda - confirm in the training code.
Token_Predictor:
Size: 256
LSTM:
Stack: 2
Dropout_Rate: 0.1
# Decoder hyperparameters: Prenet, Upsample, Residual_Block and Postnet
# settings plus the LeakyReLU negative slope.
Decoder:
Prenet:
Kernel_Size: 7
# The four Rate values multiply to 8 * 8 * 2 * 2 = 256, which equals
# Sound.Frame_Shift (256). NOTE(review): presumably the product must stay equal
# to the frame shift so that one feature frame expands to one hop of samples
# - confirm before changing either value.
# Each Kernel_Size entry is twice the Rate entry at the same index.
Upsample:
Base_Size: 512
Rate: [8, 8, 2, 2]
Kernel_Size: [16, 16, 4, 4]
# Three kernel sizes and three dilation lists; presumably Dilation_Size[i]
# belongs to Kernel_Size[i] - confirm.
Residual_Block:
Kernel_Size: [3, 7, 11]
Dilation_Size: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
Postnet:
Kernel_Size: 7
LeakyRelu_Negative_Slope: 0.1
# Discriminator hyperparameters. Period, STFT_N_FFT and Scale_Pool_Kernel_Size
# each list five values.
# NOTE(review): presumably each list entry configures one sub-discriminator and
# STFT_N_FFT only matters when Use_STFT is true - confirm in the model code.
Discriminator:
Use_STFT: true
Period: [2, 3, 5, 7, 11]
STFT_N_FFT: [1024, 2048, 512, 300, 1200]
Scale_Pool_Kernel_Size: [1, 4, 8, 16, 32]
# Dataset metadata files. Every path is an absolute path on drive F: under
# 'F:/Datasets/22K.VITS.VCTK/', so these values are machine-specific and must
# be edited to match the local dataset location.
# NOTE(review): GE2E_Path points to a .pickle file; if it is loaded with
# Python's pickle module, only use files from a trusted source.
Token_Path: 'F:/Datasets/22K.VITS.VCTK/Token.yaml'
GE2E_Path: 'F:/Datasets/22K.VITS.VCTK/GE2E.pickle'
Spectrogram_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Spectrogram_Range_Info.yaml'
Mel_Range_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Mel_Range_Info.yaml'
F0_Info_Path: 'F:/Datasets/22K.VITS.VCTK/F0_Info.yaml'
Energy_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Energy_Info.yaml'
Speaker_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Speaker_Info.yaml'
Emotion_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Emotion_Info.yaml'
Language_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Language_Info.yaml'
Gender_Info_Path: 'F:/Datasets/22K.VITS.VCTK/Gender_Info.yaml'
Language_and_Gender_Info_by_Speaker_Path: 'F:/Datasets/22K.VITS.VCTK/Language_and_Gender_Info_by_Speaker.yaml'
# Training-run settings: dataset locations and length filters, batching,
# learning-rate and optimizer values, loss weights, periodic-task intervals,
# and the sample inputs used for inference during training.
Train:
Pattern_Cache: false
# Training patterns: a directory and a metadata file name, with Min/Max bounds
# on feature length and text length. The path is an absolute F: drive path and
# is machine-specific.
Train_Pattern:
Path: 'F:/Datasets/22K.VITS.VCTK/Train'
Metadata_File: 'METADATA.PICKLE'
Feature_Length:
Min: 50
Max: 800
Text_Length:
Min: 1
Max: 200
Accumulated_Dataset_Epoch: 1 # Works around torch.utils.data.DataLoader slowing down when the number of patterns is small.
Augmentation_Ratio: 0.10
# Evaluation patterns: same layout as Train_Pattern.
# NOTE(review): Text_Length Min is 10 here but 1 for Train_Pattern - confirm
# the difference is intended.
Eval_Pattern:
Path: 'F:/Datasets/22K.VITS.VCTK/Eval'
Metadata_File: 'METADATA.PICKLE'
Feature_Length:
Min: 50
Max: 800
Text_Length:
Min: 10
Max: 200
Num_Workers: 0
Batch_Size: 16
# NOTE(review): presumably the number of feature frames per training segment
# - confirm.
Segment_Size: 64
Learning_Rate:
Initial: 2.0e-4
Warmup_Step: 4000
# NOTE(review): presumably per-loss weights (STFT, token CTC, feature map)
# - confirm in the training code.
Lambda:
STFT: 45.0
Token_CTC: 45.0
Feature_Map: 2.0
ADAM:
Beta1: 0.8
Beta2: 0.99
Epsilon: 1.0e-9
# NOTE(review): presumably 0.0 disables gradient-norm clipping - confirm.
Gradient_Norm: 0.0
# NOTE(review): the intervals below are presumably counted in training steps,
# like Max_Step - confirm.
Max_Step: 1000000
Checkpoint_Save_Interval: 5000
Logging_Interval: 1
Evaluation_Interval: 1000
Inference_Interval: 5000
Initial_Inference: true
# Sample inputs for inference during training. Text, Speaker and Language each
# list two entries; presumably they are matched by index (entry i of each list
# forms one sample) - confirm.
Inference_in_Train:
Text: [
'Do not kill the goose that lays the golden eggs.',
'A good medicine tastes bitter.',
]
Speaker: [
'VCTK.P250',
'VCTK.P251',
]
Language: [
'English',
'English',
]
# Inference batch size and output locations. Unlike the dataset paths, these
# three paths are relative ('./results/...'), so they depend on the directory
# the program is started from.
Inference_Batch_Size: 16
Inference_Path: './results/Inference'
Checkpoint_Path: './results/Checkpoint'
Log_Path: './results/Log'
# Weights & Biases logging settings. Logging is currently switched off
# (Use: false); the commented-out line is the switch for turning it on.
# Project, Entity and Name are account-specific values.
Weights_and_Biases:
# Use: true
Use: false
Project: 'HierSpeech'
Entity: 'codejin'
Name: 'VCTK'
# Checkpoint upload settings; uploading is switched off here (Use: false).
Save_Checkpoint:
Use: false
Interval: 50000 # WandB storage is small compared with local disk, so this is larger than Checkpoint_Save_Interval (5000).
# Precision and device selection.
# NOTE(review): the value below enables mixed precision, yet its own inline
# comment warns against using mixed precision with this model. One of the two
# is stale - confirm which and make them agree.
Use_Mixed_Precision: true # Don't use mixed precision in this model.
Use_Multi_GPU: false
Device: '0'
# Multi-GPU alternative (commented out); swap it with the two lines above to
# use it.
# Use_Multi_GPU: true
# Device: '0,1,2,3,4,5,6,7'