-
Notifications
You must be signed in to change notification settings - Fork 0
/
prep_data_STRIP.py
93 lines (83 loc) · 2.83 KB
/
prep_data_STRIP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import librosa as lr
import pandas as pd
import matplotlib.pyplot as plt
import os
from pydub import AudioSegment
import ffmpeg
import json
import numpy as np
# truncate all songs to 15 seconds
def prep_audio_files(audio_root='./audio', clip_ms=15000, offset_ms=30000):
    """Trim every song under ``audio_root`` to a fixed-length excerpt, in place.

    ``audio_root`` is expected to contain one sub-directory per genre, each
    holding audio files. Hidden entries (names starting with ``.``) and
    non-directory entries at the top level are ignored. Each remaining file is
    decoded with pydub; files no longer than ``clip_ms`` are left untouched,
    longer files are overwritten with a ``clip_ms``-long excerpt.

    Args:
        audio_root: Directory holding the per-genre sub-directories.
        clip_ms: Length of the excerpt to keep, in milliseconds.
        offset_ms: Preferred start of the excerpt, in milliseconds. When the
            song is too short for a full excerpt starting there, the start is
            moved earlier so the excerpt is still ``clip_ms`` long.

    Returns:
        None. The audio files are rewritten on disk.
    """
    for genre in os.listdir(audio_root):
        if genre.startswith('.'):
            continue
        genre_dir = os.path.join(audio_root, genre)
        if not os.path.isdir(genre_dir):
            # Stray files next to the genre folders would make listdir() fail.
            continue
        for file_name in os.listdir(genre_dir):
            if file_name.startswith('.'):
                continue
            print(file_name)
            path = os.path.join(genre_dir, file_name)
            song = AudioSegment.from_file(path)
            if len(song) <= clip_ms:
                continue
            # Clamp the start so the excerpt is never shorter than clip_ms.
            # A fixed [offset_ms, offset_ms + clip_ms) window would be short or
            # empty for songs shorter than offset_ms + clip_ms, and would blank
            # an already-trimmed file on a second run if re-encoding left it
            # slightly longer than clip_ms.
            start = min(offset_ms, len(song) - clip_ms)
            song = song[start:start + clip_ms]
            # NOTE(review): export() is called without a format, so pydub's
            # default output format applies regardless of the file extension
            # -- confirm that matches the input files.
            song.export(path)
def _strip_list_syntax(csv_path):
    """Remove every double quote and square bracket from ``csv_path`` in place."""
    with open(csv_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # One translate() pass deletes '"', '[' and ']' together; this also covers
    # the '[[' / ']]' sequences, which are just repeated brackets.
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write(text.translate(str.maketrans('', '', '"[]')))


def main():
    """Build ``data.csv`` with MFCC features for every labelled song slice.

    Steps:
      1. Trim the songs under ``./audio`` via ``prep_audio_files()``.
      2. Load each song in a labelled genre directory, split its first
         15 seconds into ``NUM_SLICES`` equal slices and compute 13 MFCCs
         for each slice.
      3. Write the columns ``sample_num``, ``labels`` and ``mfcc`` to
         ``data.csv`` and strip the quotes and list brackets that pandas
         emits for the nested ``mfcc`` lists.

    Genre directories without an entry in ``GENRE_LABELS`` are skipped, so the
    three output columns always stay the same length. Slices that contain
    fewer than ``SAMPLES_PER_SLICE`` samples (audio shorter than 15 seconds)
    are skipped as well, so every row holds the same number of MFCC frames.
    """
    SAMPLE_RATE = 22050
    NUM_SLICES = 10
    TOTAL_SAMPLES = 15 * SAMPLE_RATE
    SAMPLES_PER_SLICE = TOTAL_SAMPLES // NUM_SLICES
    GENRE_LABELS = {'hip-hop': 0, 'rock': 1}

    prep_audio_files()

    mfcc_dict = {'sample_num': [], 'labels': [], 'mfcc': []}
    sample_num = 0
    for genre in os.listdir('./audio'):
        if genre.startswith('.'):
            continue
        label = GENRE_LABELS.get(genre)
        if label is None:
            # Without a label the 'labels' column would fall out of step with
            # the other columns and the DataFrame could not be built.
            print('skipping unlabeled directory: ' + genre)
            continue
        genre_dir = os.path.join('./audio', genre)
        for file_name in os.listdir(genre_dir):
            if file_name.startswith('.'):
                continue
            path = os.path.join(genre_dir, file_name)
            # Resample explicitly to SAMPLE_RATE so the slice arithmetic below
            # always matches the loaded signal.
            y, sr = lr.load(path, sr=SAMPLE_RATE)
            for s in range(NUM_SLICES):
                start = SAMPLES_PER_SLICE * s
                finish = start + SAMPLES_PER_SLICE
                segment = y[start:finish]
                if len(segment) < SAMPLES_PER_SLICE:
                    print('skipping incomplete slice %d of %s' % (s, file_name))
                    continue
                # Transpose so each inner list holds the 13 coefficients of
                # one frame.
                mfcc = lr.feature.mfcc(y=segment, sr=sr, n_mfcc=13).T
                mfcc_dict['sample_num'].append(sample_num)
                mfcc_dict['mfcc'].append(mfcc.tolist())
                mfcc_dict['labels'].append(label)
                # NOTE(review): the counter advances once per slice, i.e. one
                # id per CSV row -- confirm this is the intended granularity.
                sample_num += 1

    mfcc_df = pd.DataFrame(mfcc_dict)
    mfcc_df.to_csv('data.csv', index=False)
    # Flatten the stringified nested lists into plain comma-separated numbers.
    _strip_list_syntax('data.csv')
# Script entry point: run the full pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()