-
Notifications
You must be signed in to change notification settings - Fork 81
/
Copy pathmodel.py
93 lines (72 loc) · 3.42 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import tensorflow as tf
from tensorpack.graph_builder.model_desc import ModelDesc, InputDesc
from hparam import hparam as hp
from modules import conv1d_banks, conv1d, normalize, highwaynet, gru
from tensorpack.train.tower import get_current_tower_context
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
class ClassificationModel(ModelDesc):
    """Speaker classifier built on tensorpack's ``ModelDesc``.

    Mel-spectrogram frames are encoded by a conv-bank / highway / GRU stack
    (helpers imported from ``modules``), the last timestep is projected to
    ``num_classes`` logits, and the model is trained with sparse softmax
    cross-entropy against integer speaker ids.

    Shape legend used in the comments below:
        n = batch size
        t = timestep size
        h = hidden size
        e = embedding size
    """

    def __init__(self, num_banks, hidden_units, num_highway, norm_type, num_classes):
        """Store the architecture hyper-parameters; no graph is built here.

        :param num_banks: forwarded as ``K`` to ``conv1d_banks``.
        :param hidden_units: width of the dense, conv, highway and GRU layers.
        :param num_highway: how many ``highwaynet`` layers are stacked.
        :param norm_type: forwarded to ``conv1d_banks`` and ``normalize``.
        :param num_classes: number of units in the final projection.
        """
        self.num_banks = num_banks
        self.hidden_units = hidden_units
        self.num_highway = num_highway
        self.norm_type = norm_type
        self.num_classes = num_classes

    @auto_reuse_variable_scope
    def embedding(self, x, is_training=False):
        """Encode a batch of mel frames into one vector per utterance.

        The result has ``num_classes`` units and is what ``_build_graph``
        feeds to the softmax, so it doubles as the classification logits.

        :param x: shape=(n, t, n_mels)
        :param is_training: forwarded to ``conv1d_banks`` and ``normalize``.
        :return: embedding. shape=(n, e), with e == ``num_classes``; the
            tensor is named "embedding" within the enclosing scope.
        """
        # Frame-level pre-net. Its output is kept for the residual sum below.
        prenet = tf.layers.dense(
            x, units=self.hidden_units, activation=tf.nn.relu)  # (n, t, h)

        # Bank of 1-D convolutions followed by stride-1 max pooling.
        feats = conv1d_banks(
            prenet,
            K=self.num_banks,
            num_units=self.hidden_units,
            norm_type=self.norm_type,
            is_training=is_training,
        )  # (n, t, k * h)
        feats = tf.layers.max_pooling1d(
            feats, pool_size=2, strides=1, padding="same")  # (n, t, k * h)

        # Two width-3 projections with a normalisation + ReLU in between.
        feats = conv1d(feats, self.hidden_units, 3, scope="conv1d_1")  # (n, t, h)
        feats = normalize(
            feats,
            type=self.norm_type,
            is_training=is_training,
            activation_fn=tf.nn.relu,
        )
        feats = conv1d(feats, self.hidden_units, 3, scope="conv1d_2")  # (n, t, h)

        # Residual connection back to the pre-net output.
        feats = feats + prenet  # (n, t, h)

        # Each highway layer gets its own, index-suffixed variable scope.
        for layer_idx in range(self.num_highway):
            feats = highwaynet(
                feats,
                num_units=self.hidden_units,
                scope='highwaynet_{}'.format(layer_idx),
            )  # (n, t, h)

        # NOTE(review): the third positional argument of `gru` is presumably a
        # "bidirectional" switch -- confirm against modules.gru.
        feats = gru(feats, self.hidden_units, False)  # (n, t, h)

        # Summarise the sequence by its final timestep.
        last_step = feats[..., -1, :]  # (n, h)

        # Project to the output size and expose the result under a fixed name.
        projected = tf.layers.dense(
            last_step, units=self.num_classes, name='projection')  # (n, e)
        return tf.identity(projected, name="embedding")

    def loss(self):
        """Return the batch-mean sparse softmax cross-entropy.

        Reads ``self.y`` (logits) and ``self.speaker_id`` (integer labels),
        both of which are set by ``_build_graph``.
        """
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.y, labels=self.speaker_id))

    def accuracy(self):
        """Return the fraction of the batch where ``self.pred`` equals the label.

        The resulting scalar op is named 'accuracy'.
        """
        is_correct = tf.equal(self.speaker_id, self.pred)
        return tf.reduce_mean(tf.to_float(is_correct), name='accuracy')

    def _get_inputs(self):
        """Declare the model's three input placeholders.

        The sizes are derived from ``hp.signal``: ``duration * sr`` samples per
        clip and ``samples // hop_length + 1`` spectrogram frames.

        :return: ``InputDesc`` list in the order 'wav', 'x', 'speaker_id'.
        """
        num_samples = hp.signal.duration * hp.signal.sr
        # NOTE(review): the "+ 1" presumably matches the frame count of the
        # feature extractor -- confirm against the data pipeline.
        num_frames = num_samples // hp.signal.hop_length + 1

        wav_desc = InputDesc(tf.float32, (None, num_samples), 'wav')
        mel_desc = InputDesc(
            tf.float32, (None, num_frames, hp.signal.n_mels), 'x')
        label_desc = InputDesc(tf.int32, (None,), 'speaker_id')
        return [wav_desc, mel_desc, label_desc]

    def _build_graph(self, inputs):
        """Build the forward pass, the cost and the training summaries.

        Sets ``self.x``, ``self.speaker_id``, ``self.y``, ``self.prob``,
        ``self.pred`` and ``self.cost``.

        :param inputs: tensors in the order declared by ``_get_inputs``; the
            raw waveform (first entry) is not used by the graph.
        """
        _, self.x, self.speaker_id = inputs
        training = get_current_tower_context().is_training

        # NOTE(review): only the embedding call is placed under the
        # 'embedding' scope, so 'probability' and 'prediction' are created
        # outside it -- confirm against the op names that consumers look up.
        with tf.variable_scope('embedding'):
            self.y = self.embedding(self.x, training)  # (n, e)

        self.prob = tf.nn.softmax(self.y, name='probability')
        top_class = tf.argmax(self.prob, axis=1)
        self.pred = tf.to_int32(top_class, name='prediction')
        self.cost = self.loss()

        # summaries
        tf.summary.scalar('train/loss', self.cost)
        tf.summary.scalar('train/accuracy', self.accuracy())

    def _get_optimizer(self):
        """Return an Adam optimizer driven by a 'learning_rate' variable.

        The variable is non-trainable and initialised from ``hp.train.lr``.
        """
        learning_rate = tf.get_variable(
            'learning_rate', initializer=hp.train.lr, trainable=False)
        return tf.train.AdamOptimizer(learning_rate)