# hyperparams.py
# -*- coding: utf-8 -*-
'''
modified from
https://www.github.com/kyubyong/tacotron
'''
from zhon import zhuyin
class Hyperparams:
    """Central container of configuration constants.

    Every setting is a plain class attribute, so values are read as
    ``Hyperparams.<name>`` without creating an instance.  ``vocab`` is
    derived from ``input_mode`` and ``withtone`` when the class body runs.
    """

    # ------------------------------------------------------------------
    # Pipeline
    # ------------------------------------------------------------------
    withtone = False            # include tone symbols in the vocabulary
    input_mode = "bopomofo"     # "bopomofo" or "pinyin"; anything else falls
                                # back to the tone-less bopomofo vocabulary
    # If True, run `python prepro.py` first before running `python train.py`.
    prepro = True
    # Other locations that have been used for the preprocessed data:
    #   "/nfs/Athena/yangchiyi/lecture_tts_data/prepro_data"
    #   "/home/yangchiyi/DaAiSermon/prepro_data"
    prepro_path = "/home/yangchiyi/lecture_tts_data/prepro_data"

    # Vocabulary.  "E" marks End of Sentence.  Earlier variants of these
    # strings also started with a "P" (Padding) symbol; the active ones
    # do not.
    if input_mode == "pinyin":
        vocab = (
            "E abcdefghijklmnopqrstuvwxyz?1234"
            if withtone
            else "E abcdefghijklmnopqrstuvwxyz?"
        )
    elif input_mode == "bopomofo" and withtone:
        # Zhuyin symbols (last character of zhuyin.characters dropped),
        # the end marker, then the tone marks.
        vocab = zhuyin.characters[:-1] + "E" + "˙ˊˇˋ"
    else:
        # Tone-less bopomofo, which is also the fallback for an
        # unrecognised input_mode.
        vocab = zhuyin.characters[:-1] + "E"

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    # Other locations that have been used for the audio data:
    #   "/nfs/Athena/yangchiyi/lecture_tts_data/wav_trimmed"
    #   "/home/yangchiyi/DaAiSermon/wav_nosilence"
    data = "/home/yangchiyi/lecture_tts_data/wav_trimmed"
    test_data = "test_sentences.txt"
    max_duration = 10.0         # presumably seconds of audio -- TODO confirm
    max_len = 25                # presumably max input length -- TODO confirm

    # ------------------------------------------------------------------
    # Signal processing
    # ------------------------------------------------------------------
    sr = 22050                  # sample rate (16000 was used previously)
    n_fft = 2048                # FFT points (samples)
    frame_shift = 0.0125        # seconds
    frame_length = 0.05         # seconds
    # Frame shift / length converted to samples; int() truncates the
    # fractional part.
    hop_length = int(sr * frame_shift)
    win_length = int(sr * frame_length)
    n_mels = 80                 # number of Mel banks to generate
    power = 1.2                 # exponent for amplifying the predicted magnitude
    n_iter = 300                # number of inversion iterations
    preemphasis = 0.97          # or None
    max_db = 100
    ref_db = 20

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    embed_size = 256            # alias = E
    encoder_num_banks = 16
    decoder_num_banks = 8
    num_highwaynet_blocks = 4
    r = 5                       # reduction factor. Paper => 2, 3, 5
    dropout_rate = 0.5
    guided_attention = True
    # Probability of schedule sampling using the ground truth as input.
    schedule_prob = 0.5

    # ------------------------------------------------------------------
    # Training scheme
    # ------------------------------------------------------------------
    lr = 0.001                  # initial learning rate
    logdir = "./test_logdir"
    logfile = "./test.log"
    sampledir = "./"
    batch_size = 32
    num_epochs = 500