-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathread_audio_make_RGB_VGG.py
125 lines (90 loc) · 4.25 KB
/
read_audio_make_RGB_VGG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import numpy as np
import pandas as pd
import librosa
import os
import pywt
import cv2 as cvlib
from args import parser
import matplotlib.pyplot as plt
args = parser.parse_args()
#normalize each colour image between -1 and 1
def normalize_data_all_gather(vect_in, out_min, out_max, percent_acceptation=80,
not_clip_until_acceptation_time_factor=1.5):
# nb_dim = len(vect_in.shape)
percent_val = np.percentile(abs(vect_in).reshape((vect_in.shape[0], vect_in.shape[1] * vect_in.shape[2])),
percent_acceptation, axis=1)
percent_val_matrix = not_clip_until_acceptation_time_factor * np.repeat(percent_val,
vect_in.shape[1] * vect_in.shape[2],
axis=0).reshape(
(vect_in.shape[0], vect_in.shape[1], vect_in.shape[2]))
matrix_clip = np.maximum(np.minimum(vect_in, percent_val_matrix), -percent_val_matrix)
return np.divide(matrix_clip, percent_val_matrix) * ((out_max - out_min) / 2) + (out_max + out_min) / 2
# Read data from file 'filename.csv'
path_liste_file= args.audio_listefile_folder_path
# Control delimiters, rows, column names with read_csv (see later)
#data = pd.read_csv("/Users/vincentbelz/Documents/Data/audio_classification/liste_file/esc50.csv")
data = pd.read_csv(path_liste_file)
# Preview the first 5 lines of the loaded data
print(data.head())
print(data['filename'].iloc[250])
print(data['target'].iloc[250])
print(data['category'].iloc[250])
RGB_sound_VGG = np.zeros((2000, 224, 224, 3))
y_label_sound=np.zeros((2000,1))
y_label_hot_sound=np.zeros((2000,50))
n_fft = 1024 # frame length
hop_length = 512
path= args.audio_folder_path
path_save= args.audio_images_folder_path
#path = '/Users/vincentbelz/Documents/Data/audio_classification/audio_files/'
for i in range(2000) :
print('loop nb',i)
print('file name',data['filename'].iloc[i])
print('target nb', data['target'].iloc[i])
print('target category', data['category'].iloc[i])
y_label_sound[i,:]=data['target'].iloc[i]
y_label_hot_sound[i, data['target'].iloc[i]] = 1
namefile = data['filename'].iloc[i]
print(namefile)
filepath = os.path.join(path, namefile)
# open the audio file
clipnoise, sample_rate = librosa.load(filepath, sr=22500)
#print(clipnoise.shape)
scales = np.arange(1, 128)
waveletname = 'morl'
coeffnoise, freqnoise = pywt.cwt(clipnoise, scales, waveletname)
#print('coeff noise',coeffnoise.shape)
#print('freq noise',freqnoise.shape)
scalogramimg=cvlib.resize(coeffnoise, dsize=(224, 224), interpolation=cvlib.INTER_CUBIC)
#print(scalogramimg.shape)
stft = librosa.stft(clipnoise, n_fft=n_fft, hop_length=hop_length)
stft_magnitude, stft_phase = librosa.magphase(stft)
stft_magnitude_db = librosa.amplitude_to_db(stft_magnitude, ref=np.max)
#print(stft_magnitude_db.shape)
spectrogramimg=cvlib.resize(stft_magnitude_db, dsize=(224, 224), interpolation=cvlib.INTER_CUBIC)
#print(spectrogramimg.shape)
# mfcc
mfcc = librosa.feature.mfcc(y=clipnoise, sr=sample_rate, n_mfcc=200)
#print(mfcc.shape)
mfccimg=cvlib.resize(mfcc, dsize=(224, 224), interpolation=cvlib.INTER_CUBIC)
#print(mfccimg.shape)
RGB_sound_VGG[i, :, :, 0] = spectrogramimg
RGB_sound_VGG[i, :, :, 1] = scalogramimg
RGB_sound_VGG[i, :, :, 2] = mfccimg
print('finish loop nb', i)
#print(abs(RGB_sound[0,:,:,0]).max())
#print(abs(RGB_sound[0,:,:,1]).max())
#print(abs(RGB_sound[0,:,:,2]).max())
RGB_sound_VGG[:, :, :, 0] = normalize_data_all_gather(RGB_sound_VGG[:, :, :, 0], 0, 1, 95, 2)
RGB_sound_VGG[:, :, :, 1] = normalize_data_all_gather(RGB_sound_VGG[:, :, :, 1], 0, 1, 95, 2)
RGB_sound_VGG[:, :, :, 2] = normalize_data_all_gather(RGB_sound_VGG[:, :, :, 2], 0, 1, 95, 2)
print(abs(RGB_sound_VGG[0,:,:,0]).max())
print(abs(RGB_sound_VGG[0,:,:,1]).max())
print(abs(RGB_sound_VGG[0,:,:,2]).max())
path_save_rgb= os.path.join(path_save, 'RGB_sound_VGG')
path_save_ylabel= os.path.join(path_save, 'y_label_sound')
path_save_ylabel_hot= os.path.join(path_save, 'y_label_hot_sound')
np.save(path_save_rgb,RGB_sound_VGG)
#np.save(path_save_ylabel,y_label_sound)
#np.save(path_save_ylabel_hot,y_label_hot_sound)
print('saved to disk')