-
Notifications
You must be signed in to change notification settings - Fork 70
/
Copy pathaudio_test.py
237 lines (215 loc) · 9.88 KB
/
audio_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
'''
Script to test a audio sample using trained model
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os.path
import time
import numpy as np
import librosa
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import tensorflow as tf
import ipdb
import itertools
from numpy.lib import stride_tricks
import matplotlib as mpl
mpl.use('agg')
from matplotlib import pyplot as plt
from AudioSampleReader import AudioSampleReader
from model import Model
from GlobalConstont import *
# dir for test audio
data_dir = 'mix.wav'
# not useful during sample test
sum_dir = 'sum'
# dir to load model
train_dir = 'train'
lr = 0.00001 # not useful during test
n_hidden = 300 # hidden state size
batch_size = 1 # 1 for audio sample test
hop_size = 64
# oracle flag to decide if a frame need to be seperated
sep_flag = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] * 10
# oracle permutation to concatenate the chuncks of output frames
oracal_p = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] * 10
# import ipdb; ipdb.set_trace()
def stft(sig, frameSize, overlapFac=0.75, window=np.hanning):
""" short time fourier transform of audio signal """
win = window(frameSize)
hopSize = int(frameSize - np.floor(overlapFac * frameSize))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
# samples = np.append(np.zeros(np.floor(frameSize / 2.0)), sig)
samples = np.array(sig, dtype='float64')
# cols for windowing
cols = np.ceil((len(samples) - frameSize) / float(hopSize)) + 1
# zeros at end (thus samples can be fully covered by frames)
samples = np.append(samples, np.zeros(frameSize))
frames = stride_tricks.as_strided(
samples,
shape=(cols, frameSize),
strides=(samples.strides[0] * hopSize, samples.strides[0])).copy()
frames *= win
return np.fft.rfft(frames)
def out_put(N_frame):
'''Use trained model to infer N _frame chuncks of
frames of input audio'''
with tf.Graph().as_default():
# feed forward keep prob
p_keep_ff = tf.placeholder(tf.float32, shape=None)
# recurrent keep prob
p_keep_rc = tf.placeholder(tf.float32, shape=None)
# audio sample generator
data_generator = AudioSampleReader(data_dir)
# placeholder for model input
in_data = tf.placeholder(
tf.float32, shape=[batch_size, FRAMES_PER_SAMPLE, NEFF])
# init the model
BiModel = Model(n_hidden, batch_size, p_keep_ff, p_keep_rc)
# make inference of embedding
embedding = BiModel.inference(in_data)
saver = tf.train.Saver(tf.all_variables())
sess = tf.Session()
# restore the model
saver.restore(sess, 'train/model.ckpt-492000')
tot_frame = N_frame * FRAMES_PER_SAMPLE
# arrays to store output waveform
out_audio1 = np.zeros([(tot_frame - 1) * hop_size + FRAME_SIZE])
out_audio2 = np.zeros([(tot_frame - 1) * hop_size + FRAME_SIZE])
mix = np.zeros([(tot_frame - 1) * hop_size + FRAME_SIZE])
N_assign = 0
# for every chunk of frames of data
for step in range(N_frame):
# import ipdb; ipdb.set_trace()
data_batch = data_generator.gen_next()
if data_batch is None:
break
# log spectrum info.
in_data_np = np.concatenate(
[np.reshape(item['Sample'], [1, FRAMES_PER_SAMPLE, NEFF])
for item in data_batch])
# phase info.
in_phase_np = np.concatenate(
[np.reshape(item['Phase'], [1, FRAMES_PER_SAMPLE, NEFF])
for item in data_batch])
# VAD info.
VAD_data_np = np.concatenate(
[np.reshape(item['VAD'], [1, FRAMES_PER_SAMPLE, NEFF])
for item in data_batch])
# get inferred embedding using trained model
# with keep prob = 1
embedding_np, = sess.run(
[embedding],
feed_dict={in_data: in_data_np,
p_keep_ff: 1,
p_keep_rc: 1})
# ipdb.set_trace()
# get active TF-bin embedding according to VAD
embedding_ac = [embedding_np[i, j, :]
for i, j in itertools.product(
range(FRAMES_PER_SAMPLE), range(NEFF))
if VAD_data_np[0, i, j] == 1]
if(sep_flag[step] == 1):
# if the frame need to be seperated
# cluster the embeddings
# import ipdb; ipdb.set_trace()
if embedding_ac == []:
break
kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)
else:
# if the frame don't need to be seperated
# don't split the embeddings
kmean = KMeans(n_clusters=1, random_state=0).fit(embedding_ac)
mask = np.zeros([FRAMES_PER_SAMPLE, NEFF, 2])
ind = 0
if N_assign == 0:
# if their is no existing speaker in previous frame
center = kmean.cluster_centers_
N_assign = center.shape[0]
elif N_assign == 1:
# if their is one speaker in previous frame
center_new = kmean.cluster_centers_
# assign the embedding for a speaker to the speaker with the
# closest centroid in previous frames
if center_new.shape[0] == 1:
# update and smooth the centroid for 1 speaker
center = 0.7 * center + 0.3 * center_new
else:
# update and smooth the centroid for 2 speakers
N_assign = 2
# compute their relative affinity
cor = np.matmul(center_new, np.transpose(center))
# ipdb.set_trace()
if(cor[1] > cor[0]):
# rearrange their sequence if not consistant with
# previous frames
kmean.cluster_centers_ = np.array(
[kmean.cluster_centers_[1],
kmean.cluster_centers_[0]])
kmean.labels_ = (kmean.labels_ == 0).astype('int')
center = kmean.cluster_centers_
else:
# two speakers have appeared
center_new = kmean.cluster_centers_
cor = np.matmul(center_new[0, :], np.transpose(center))
# rearrange their sequence if not consistant with previous
# frames
if(cor[1] > cor[0]):
if(sep_flag[step] == 1):
kmean.cluster_centers_ = np.array(
[kmean.cluster_centers_[1],
kmean.cluster_centers_[0]])
kmean.labels_ = (kmean.labels_ == 0).astype('int')
else:
kmean.labels_ = (kmean.labels_ == 1).astype('int')
# need permutation of their order(Oracle)
if(oracal_p[step]):
kmean.cluster_centers_ = np.array(
[kmean.cluster_centers_[1],
kmean.cluster_centers_[0]])
kmean.labels_ = (kmean.labels_ == 0).astype('int')
else:
kmean.labels_ = (~kmean.labels_).astype('int')
center = center * 0.7 + 0.3 * kmean.cluster_centers_
# transform the clustering result and VAD info. into masks
for i in range(FRAMES_PER_SAMPLE):
for j in range(NEFF):
if VAD_data_np[0, i, j] == 1:
mask[i, j, kmean.labels_[ind]] = 1
ind += 1
for i in range(FRAMES_PER_SAMPLE):
# apply the mask and reconstruct the waveform
tot_ind = step * FRAMES_PER_SAMPLE + i
# ipdb.set_trace()
# amp = (in_data_np[0, i, :] *
# data_batch[0]['Std']) + data_batch[0]['Mean']
amp = in_data_np[0, i, :] * GLOBAL_STD + GLOBAL_MEAN
out_data1 = (mask[i, :, 0] * amp *
VAD_data_np[0, i, :])
out_data2 = (mask[i, :, 1] * amp *
VAD_data_np[0, i, :])
out_mix = amp
out_data1_l = 10 ** (out_data1 / 20) / AMP_FAC
out_data2_l = 10 ** (out_data2 / 20) / AMP_FAC
out_mix_l = 10 ** (out_mix / 20) / AMP_FAC
out_stft1 = out_data1_l * in_phase_np[0, i, :]
out_stft2 = out_data2_l * in_phase_np[0, i, :]
out_stft_mix = out_mix_l * in_phase_np[0, i, :]
con_data1 = out_stft1[-2:0:-1].conjugate()
con_data2 = out_stft2[-2:0:-1].conjugate()
con_mix = out_stft_mix[-2:0:-1].conjugate()
out1 = np.concatenate((out_stft1, con_data1))
out2 = np.concatenate((out_stft2, con_data2))
out_mix = np.concatenate((out_stft_mix, con_mix))
frame_out1 = np.fft.ifft(out1).astype(np.float64)
frame_out2 = np.fft.ifft(out2).astype(np.float64)
frame_mix = np.fft.ifft(out_mix).astype(np.float64)
out_audio1[tot_ind * hop_size:tot_ind * hop_size + FRAME_SIZE] += frame_out1 * 0.5016
out_audio2[tot_ind * hop_size:tot_ind * hop_size + FRAME_SIZE] += frame_out2 * 0.5016
mix[tot_ind * hop_size:tot_ind * hop_size + FRAME_SIZE] += frame_mix * 0.5016
librosa.output.write_wav('out_31.wav', out_audio1, SAMPLING_RATE)
librosa.output.write_wav('out_32.wav', out_audio2, SAMPLING_RATE)
out_put(4)