kerasifyDNN1.py
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.optimizers import SGD
# load and evaluate a saved model
#from numpy import loadtxt
#from keras.models import load_model
#will train over one or more audio files
paths = [
#'/Users/ioi/Desktop/testoutput/vocals0.wav',
#'/Users/ioi/Desktop/testoutput/vocals1.wav',
#'/Users/ioi/Desktop/testoutput/vocals2.wav'
#'/data/audio/classical/improv/pianoimprov1.aif',
#'/data/audio/sicklincoln/curioussamples/ecstasyvocalonly.wav'#
#"/data/audio/classical/improv/improv1.wav",
#'/data/audio/field/galesonbrightonbeach/R09_0007.WAV'
'/data/audio/SCsamp/acapella/thehype.wav'
];
fftsize = 4096 #2048
halffftsize = 2048 #1024
numprevframes = 1  #multiple previous frames already work on the javascript side, not yet implemented for SuperCollider
numbinsused = halffftsize
inputdimension = numbinsused * numprevframes
#alternatives for rolling your own spectral analysis:
#https://github.com/CPJKU/madmom
#pyaudio
import librosa
whichfftbins = range(halffftsize)
#subset of fft bins based around 88 piano keys in 12TET and some harmonics; a bin and its neighbour are taken
#whichfftbins = [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 67, 68, 69, 71, 72, 73, 75, 76, 77, 78, 80, 81, 82, 85, 86, 87, 90, 91, 92, 95, 96, 97, 98, 101, 102, 103, 104, 107, 108, 109, 110, 113, 114, 115, 116, 120, 121, 122, 123, 127, 128, 129, 130, 135, 136, 137, 138, 143, 144, 145, 146, 151, 152, 153, 154, 155, 160, 161, 162, 163, 164, 170, 171, 172, 173, 174, 180, 181, 182, 183, 184, 190, 191, 192, 193, 194, 195, 202, 203, 204, 205, 206, 207, 214, 215, 216, 217, 218, 219, 227, 228, 229, 230, 231, 232, 240, 241, 242, 243, 244, 245, 246, 254, 255, 257, 258, 259, 260, 270, 271, 272, 273, 274, 275, 276, 286, 287, 288, 289, 291, 292, 303, 304, 306, 307, 308, 309, 321, 322, 324, 325, 326, 327, 328, 340, 341, 343, 344, 346, 347, 360, 361, 364, 365, 366, 367, 368, 381, 382, 385, 386, 388, 389, 390, 404, 405, 408, 409, 411, 412, 413, 428, 429, 432, 433, 436, 437, 454, 455, 458, 459, 462, 463, 481, 482, 485, 486, 489, 490, 491, 509, 510, 514, 515, 518, 519, 520, 540, 541, 545, 546, 549, 550, 551, 572, 573, 577, 578, 582, 583, 584, 606, 607, 612, 613, 617, 618, 642, 643, 648, 649, 653, 654, 655, 680, 681, 687, 688, 692, 693, 694, 720, 721, 728, 729, 733, 734, 735, 763, 764, 771, 772, 777, 778, 779, 809, 810, 817, 818, 823, 824, 825, 857, 858, 865, 866, 872, 873, 874, 908, 909, 917, 918, 924, 925, 926, 962, 963, 971, 972, 979, 980, 981, 1019, 1020, 1029, 1030, 1037, 1038, 1039, 1040, 1080, 1081, 1091, 1092, 1099, 1100, 1101, 1144, 1145, 1155, 1156, 1165, 1166, 1167, 1212, 1213, 1224, 1225, 1234, 1235, 1236, 1284, 1285, 1297, 1298, 1307, 1308, 1309, 1310, 1360, 1361, 1374, 1375, 1385, 1386, 1387, 1388, 1441, 1442, 1456, 1457, 1467, 1468, 1469, 1470 ];
numbinsused = len(whichfftbins)  #2048 when using all bins, 379 for the piano-key subset above
inputdimension = numbinsused * numprevframes
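#A hedged sketch (not from the original) of how a piano-key bin list like the one
#commented out above could be generated: map the 88 piano keys (MIDI 21-108) in
#12TET, plus optional harmonics, to their nearest FFT bin at 44100 Hz, taking each
#bin and its upper neighbour. The function name and defaults are illustrative.
def pianokeyfftbins(sr=44100, nfft=4096, numharmonics=1):
    bins = set()
    for midinote in range(21, 109):  #88 piano keys
        fundamental = 440.0 * 2.0 ** ((midinote - 69) / 12.0)
        for h in range(1, numharmonics + 1):
            b = int(round(fundamental * h * nfft / sr))
            if b + 1 < nfft // 2:
                bins.update((b, b + 1))  #a bin and its neighbour
    return sorted(bins)
#whichfftbins = pianokeyfftbins()  #would select the reduced bin set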
def audiofiletotrainingdata(path, usesubset=True):
    #e.g. '/data/audio/littleaudio/numan1.wav'
    yt, sr = librosa.load(path, sr=44100, mono=True)
    inputdimensionnow = inputdimension  #numbinsused * numprevframes
    #remove any start and end silence
    #https://librosa.github.io/librosa/generated/librosa.effects.trim.html
    y, index = librosa.effects.trim(yt)
    #only take louder sections
    splitintervals = librosa.effects.split(y, top_db=50, frame_length=fftsize, hop_length=halffftsize)
    z = y[splitintervals[0][0]:splitintervals[0][1]]
    for i in range(len(splitintervals) - 1):
        z = np.concatenate((z, y[splitintervals[i + 1][0]:splitintervals[i + 1][1]]))
    y = z
    #rectangular (boxcar) analysis window
    D_left = librosa.stft(y, n_fft=fftsize, center=False, hop_length=halffftsize, win_length=fftsize, window='boxcar')
    #matches the javascript scaling: 0.5*Math.log(power+1)*scalefactor
    magssource = np.log(np.square(np.abs(D_left)) + 1) * (0.5 * (1 / 5.456533600026138))
    #fft frames are packed newest to oldest, e.g. now, now-1, now-2...now-numprevframes+1,
    #so reverse the frame order along the last (time) dimension
    magssource = np.flip(magssource, 1)
    if usesubset:
        mags = np.array([magssource[i] for i in whichfftbins])
    else:
        mags = magssource[0:halffftsize, :]
        inputdimensionnow = halffftsize * numprevframes
    numtrainingexamples = len(mags[0]) - numprevframes + 1
    x_trainnow = [None] * numtrainingexamples
    #librosa.stft returns fftsize//2 + 1 bins; only the first halffftsize are kept
    for i in range(numtrainingexamples):
        #print(i,numtrainingexamples,np.shape(mags),np.shape(magssource))
        #print(np.shape(mags[0]),mags[0])
        #print(np.array(mags)[:,0:(0+5)])
        x_trainnow[i] = mags[:, i:(i + numprevframes)]
    x_trainnow = np.asarray(x_trainnow, dtype=np.float32)
    #flatten (numtrainingexamples, numbinsused, numprevframes) to 2D
    x_trainnow.shape = (len(x_trainnow), inputdimensionnow)
    #other way to get y_train, same result:
    #z_train = x_train[:,:,0,0]
    #z_train = np.transpose(z_train)
    #z_train.shape = (halffftsize, numtrainingexamples)
    y_trainnow = np.transpose(magssource[0:halffftsize, 0:numtrainingexamples])
    return x_trainnow, y_trainnow
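#Hedged sketch (assumption, not part of the original pipeline): the inverse of the
#log-power scaling inside audiofiletotrainingdata, mapping a network output back to
#a linear magnitude spectrum, e.g. for resynthesis checks.
def scaledlogpowertomag(scaled):
    power = np.exp(scaled / (0.5 * (1.0 / 5.456533600026138))) - 1.0  #undo 0.5*log(power+1)*scalefactor
    return np.sqrt(np.maximum(power, 0.0))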
#paths = ['/data/audio/littleaudio/numan1.wav','/data/audio/littleaudio/galv.wav'];
x_train, y_train = audiofiletotrainingdata(paths[0], True)
#print('Assumes length of first audio file less than second')
#
#output = audiofiletotrainingdata(paths[0],True)
#x_train = output[0]
#print(len(x_train))
#
#output = audiofiletotrainingdata(paths[1],False)
#y_train = output[1][0:len(x_train),:]
#print(len(y_train))
#for i in range(len(paths)-1):
# output = audiofiletotrainingdata(paths[i+1])
# x_train = np.concatenate((x_train,output[0]))
# y_train = np.concatenate((y_train,output[1]))
model = Sequential()
#model.add(Dense(inputdimension//2, input_dim=inputdimension, activation='relu'))
#model.add(Dropout(0.25))
#
#model.add(Dense(inputdimension//4, activation='relu'))
#model.add(Dropout(0.25))
model.add(Dense(halffftsize//2, input_dim=inputdimension, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(halffftsize//4, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(halffftsize//2, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(halffftsize, activation='linear'))
#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
#model.compile(loss='categorical_crossentropy', optimizer=sgd)
model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
print(np.shape(x_train))
print(np.shape(y_train))
model.fit(x_train, y_train, batch_size=32, epochs=100)
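#hedged alternative (not in the original): hold out a validation split and stop
#early once validation loss stalls, to guard against overfitting a single file
#from keras.callbacks import EarlyStopping
#model.fit(x_train, y_train, batch_size=32, epochs=100, validation_split=0.1,
#          callbacks=[EarlyStopping(monitor='val_loss', patience=5)])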
score = model.evaluate(x_train, y_train, batch_size=32)
print(score)
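#quick sanity check (illustration, not from the original): the network maps one
#flattened input of inputdimension log-power values to halffftsize output bins
testframe = x_train[0:1]  #shape (1, inputdimension)
predictedframe = model.predict(testframe)
print(np.shape(predictedframe))  #expect (1, halffftsize)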
#model export for SuperCollider
from kerasify import export_model
export_model(model, 'DNN1.model')
#model export for javascript
import onnxmltools
#from keras.models import load_model
# Update the input name and path for your Keras model
#input_keras_model = 'model.h5'
# Change this path to the output name and path for the ONNX model
output_onnx_model = '/Users/ioi/Desktop/onnxoutput/modelcheck.onnx'
# Load your Keras model
#keras_model = load_model(input_keras_model)
# Convert the Keras model into ONNX
onnx_model = onnxmltools.convert_keras(model)
# Save as protobuf
onnxmltools.utils.save_model(onnx_model, output_onnx_model)
#X_test.astype(np.float32)
# Compute the prediction with ONNX Runtime
#import onnxruntime as rt
#sess = rt.InferenceSession(output_onnx_model)
#input_name = sess.get_inputs()[0].name
#label_name = sess.get_outputs()[0].name
#
#print(input_name, label_name)
#
#print(np.shape(x_train[0]))
#
#print(x_train[0])
#
#pred_onx = sess.run(None, {input_name: x_train})[0]
#
#print(pred_onx)
#import tensorflowjs as tfjs
##tfjs.converters.save_keras_model(model, '/Users/ioi/Desktop/tfjsoutput2')
#
##can save memory and file size this way
##https://medium.com/huia/creating-an-interactive-artificial-intelligence-experience-in-the-browser-with-tensorflowjs-ea205ee08c02
#tfjs.converters.save_keras_model(model, '/Users/ioi/Desktop/tfjsoutput2',quantization_dtype=np.uint16);
#
# save model and architecture to single file
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
#model.save("model.h5")
#print("Saved model to disk")
#
## load model
#model = load_model('model.h5')
## summarize model.
#model.summary()
#
#score = model.evaluate(x_test, y_test, batch_size=32)
#
#print(score)
#