# umm_seg.py
import os
import h5py
import numpy as np
import json
import pickle as pkl
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import librosa
import pydub
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS
import CRNN
import utils
# TensorFlow initialization (TF 1.x eager mode via tf.contrib)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
config = tf.ConfigProto(gpu_options=gpu_options, intra_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
tf.enable_eager_execution(config=config, device_policy=tfe.DEVICE_PLACEMENT_SILENT)
# Current model name: ckpt128logmel_2conv-74
# Feature extraction
def extract_feature2d_file(file_name, num_mfcc=40):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    # 30 ms frames with a 15 ms hop (50% overlap); each feature matrix is [n_features, n_frames].
    mel = librosa.feature.melspectrogram(X, sr=sample_rate, hop_length=int(0.015 * sample_rate),
                                         n_fft=int(0.030 * sample_rate))
    log_mel = librosa.core.power_to_db(mel)
    mfccs = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=num_mfcc)  # reuse the precomputed mel spectrogram for the MFCCs
    chroma = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
    contrast = librosa.feature.spectral_contrast(S=stft, sr=sample_rate)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate, chroma=chroma)
    return mfccs, chroma, mel, contrast, tonnetz, log_mel, X, sample_rate
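# Example of the resulting shapes (hypothetical clip): with librosa's default
# 22050 Hz resampling, hop_length = int(0.015 * 22050) = 330 samples, so mel
# and log_mel come back as (128, n_frames) and mfccs as (num_mfcc, n_frames)
# on the same 15 ms frame grid.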
def feat_ext(file_path, file_name):
    if file_name.endswith(".wav"):
        mfccs, chroma, mel, contrast, tonnetz, log_mel, X, sample_rate = extract_feature2d_file(
            os.path.join(file_path, file_name), num_mfcc=40)
    else:
        raise ValueError("File type not supported!")  # falling through would leave the names below undefined
    ## TODO: currently just use the log-mel spectrogram
    return mfccs, X, sample_rate
def normalization(tensor_in, epsilon=.0001):
    # Normalize each frame across the feature axis (note: divides by variance, not std).
    tensor_in = tf.reshape(tensor_in, [-1, utils.input_dim, utils.time_steps])
    mean, variance = tf.nn.moments(tensor_in, axes=[1], keep_dims=True)
    tensor_normalized = (tensor_in - mean) / (variance + tf.cast(epsilon, tf.float64))
    return tensor_normalized
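# Shape sketch (assuming utils.input_dim = 128 and utils.time_steps = 201, as
# the checkpoint name suggests): a stack of k segments is reshaped to
# [k, 128, 201], and moments are taken over axis 1, so each of the 201 frames
# is centered and scaled across its 128 features independently.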
def compute_timeline(logits, pad, contiguous, random_wins):
    """
    :param logits: model outputs for the contiguous segments followed by the random windows
    :param pad: number of zero-padded frames on the last contiguous segment
    :param contiguous: number of contiguous segments
    :param random_wins: [(start, end)] frame offsets of the random windows
    :return: (start_time, end_time) in milliseconds for every run of zeros in the predictions
    """
    segments = []
    logits_class_id = tf.argmax(logits, axis=1)
    logits_class_id = np.array(logits_class_id)
    list_logits_class_id = list(logits_class_id)
    # Stitch the contiguous segments back into one frame-level prediction sequence.
    seq_logits_class_id = list_logits_class_id[0].reshape(1, -1)
    for arr in list_logits_class_id[1:contiguous]:
        seq_logits_class_id = np.concatenate((seq_logits_class_id, arr.reshape(1, -1)), axis=1)
    if pad > 0:
        seq_logits_class_id = seq_logits_class_id.reshape(-1)[:-pad]
    else:
        seq_logits_class_id = seq_logits_class_id.reshape(-1)
    # Overlay the random-window predictions: every zero run found inside a
    # window overwrites the corresponding span of the stitched sequence.
    for i in range(len(list_logits_class_id[contiguous:])):
        logits_idx = i + contiguous
        curr_logits = list_logits_class_id[logits_idx].reshape(-1)
        idx = 0
        while idx < len(curr_logits):
            if curr_logits[idx] == 1:
                idx += 1
                continue
            else:
                start_idx = idx + random_wins[i][0]
                while idx < len(curr_logits) and curr_logits[idx] != 1:
                    idx += 1
                end_idx = idx + random_wins[i][0] - 1
                if end_idx >= len(seq_logits_class_id):
                    end_idx = len(seq_logits_class_id) - 1  # this should take care of the extra pad
                seq_logits_class_id[start_idx:end_idx] = 0
    # Scan the full stitched sequence for zero runs and convert them to times.
    idx = 0
    while idx < len(seq_logits_class_id):
        if seq_logits_class_id[idx] == 1:
            idx += 1
            continue
        else:
            start_idx = idx
            while idx < len(seq_logits_class_id) and seq_logits_class_id[idx] != 1:
                idx += 1
            end_idx = idx - 1
            # if end_idx - start_idx > 6:
            start_time = (start_idx - 2) * 15  # 15 ms per frame
            end_time = (end_idx + 1) * 15
            if end_time - start_time > 100:  # drop segments shorter than ~100 ms
                segments.append((start_time, end_time))
    return segments
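# Worked example of the timing arithmetic (15 ms hop): a zero run over frames
# 40..52 gives start_time = (40 - 2) * 15 = 570 ms and
# end_time = (52 + 1) * 15 = 795 ms; 795 - 570 = 225 > 100, so it is kept.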
def call_umm_segmentation(features, pad, contiguous, random_wins):
    '''
    Parameters
    ----------
    features : list of feature matrices, each of size (128, 201)
    pad : length of the zero padding on the last contiguous segment
    contiguous : number of contiguous segments
    random_wins : [(start, end)] for all the random windows
    '''
    model = CRNN.Model(utils.hidden_dim, utils.num_layers, utils.input_dim)
    # Load checkpoint
    checkpoint_prefix = os.path.join(utils.model_dir, utils.model_name)
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(model=model, step_counter=step_counter)
    if tf.train.checkpoint_exists(checkpoint_prefix):
        checkpoint.restore(checkpoint_prefix)
    norm_feats = normalization(tf.convert_to_tensor(features))
    logit = model(norm_feats, training=False)
    time_segments = compute_timeline(logit, pad, contiguous, random_wins)
    return time_segments
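# Note on shapes: tf.convert_to_tensor(features) stacks the contiguous
# segments and random windows into one [k, 128, 201] batch, so a single
# forward pass scores every window before compute_timeline() stitches the
# predictions back into one timeline.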
def segment_feat(features):
    """
    Parameters
    ----------
    features : feature matrix of the entire audio segment
    Returns
    -------
    list of features of size (128, 201)
    length of the zero padding
    number of contiguous segments
    [(start, end)] for all the random windows
    """
    # Pad with zeros to make the number of frames divisible by 201 (utils.time_steps).
    cols = features.shape[1]
    multiplier = (cols // utils.time_steps) + ((cols % utils.time_steps) > 0)  # integer division; the bool counts as 1
    pad = utils.time_steps * multiplier - cols
    padded_feats = np.pad(features, ((0, 0), (0, pad)), 'constant', constant_values=0)
    # Create contiguous segments
    seg_features = np.hsplit(padded_feats, multiplier)
    contiguous = len(seg_features)
    split_size = utils.time_steps
    offset = utils.offset
    random_wins = []
    # Add randomly shifted windows so events on segment boundaries are still seen whole.
    for i in range(0, cols, offset):
        if i > 0 and i <= (cols - split_size):
            temp = np.random.randint(i - offset, i)
            random_wins.append((temp, temp + split_size))
            seg_features.append(padded_feats[:, temp:temp + split_size])
    return seg_features, pad, contiguous, random_wins
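# Worked example (assuming utils.time_steps = 201): cols = 500 gives
# multiplier = 500 // 201 + 1 = 3 and pad = 3 * 201 - 500 = 103, so the padded
# matrix has 603 frames and hsplit yields 3 contiguous segments.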
def silence_intervals(file_path, file_name):
    """
    Returns two lists of silence start and end times (as strings, in seconds).
    """
    nsil_start_time = []
    nsil_end_time = []
    sil_start_time = []
    sil_end_time = []
    # Read file
    audio, sample_rate = librosa.load(os.path.join(file_path, file_name))
    # Silence extraction using librosa: sample indices converted to seconds
    nsil_intv = librosa.effects.split(audio, top_db=30).astype('float32') / sample_rate
    # Silence extraction using pyAudioAnalysis
    # [Fs, x] = aIO.readAudioFile(os.path.join(file_path, file_name))
    # nsil_intv = np.array(aS.silenceRemoval(x, Fs, 0.020, 0.020, smoothWindow=0.7, Weight=0.3, plot=False))
    # print("non-sil segments=" + str(nsil_intv))
    # Silence detection using webrtcvad (voice activity detection)
    # nsil_intv = np.array(vad_webrtcvad(file_path, file_name))
    dur = librosa.get_duration(y=audio, sr=sample_rate)
    print(nsil_intv)
    print(dur)
    print(sample_rate)
    curr_sil_start = 0.0
    curr_sil_end = 0.0
    for i in range(nsil_intv.shape[0]):
        nsil_start_time.append(nsil_intv[i][0])
        nsil_end_time.append(nsil_intv[i][1])
    # Silence intervals are the gaps between consecutive non-silent intervals.
    for i in range(len(nsil_start_time)):
        curr_sil_end = nsil_start_time[i]
        sil_start_time.append(str(curr_sil_start))
        sil_end_time.append(str(curr_sil_end))
        curr_sil_start = nsil_end_time[i]
    print(sil_start_time)
    print(sil_end_time)
    return sil_start_time, sil_end_time
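# Minimal end-to-end sketch (hypothetical file "data/example.wav"; assumes the
# CRNN checkpoint and the utils constants are configured as above):
if __name__ == "__main__":
    # The checkpoint name (ckpt128logmel_2conv-74) suggests 128-band log-mel input.
    _, _, _, _, _, log_mel, _, _ = extract_feature2d_file("data/example.wav")
    segs, pad, contiguous, wins = segment_feat(log_mel)
    print(call_umm_segmentation(segs, pad, contiguous, wins))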