dataset.py
#!/usr/bin/env python3
"""Genre classification on a tab-separated feature data set, using a
KNN or SVM classifier with optional scaling, PCA and plotting."""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import ConfusionMatrixDisplay


class Dataset:
    """Classifies music samples by genre using either a KNN or an
    SVM classifier, with helpers for scaling, PCA and plotting."""

    def __init__(self, filename, k, features=None, genres=None):
        self.k = k
        data_frame = pd.read_csv(filename, sep='\t')
        train_data = data_frame.loc[data_frame['Type'] == 'Train']
        test_data = data_frame.loc[data_frame['Type'] == 'Test']
        if features is None:
            features = list(data_frame.columns.values)[2:65]
        # Extract only the wanted features; copy so that later
        # in-place scaling does not write into a view of the
        # original frame.
        self.features = features
        self.train_data = train_data[features].copy()
        self.train_labels = train_data[['GenreID']]
        self.train_data.index = self.train_labels['GenreID']
        self.test_data = test_data[features].copy()
        self.test_labels = test_data[['GenreID']]
        self.test_data.index = self.test_labels['GenreID']
        # Extract only the wanted genres
        genredict = {'pop': 0, 'metal': 1, 'disco': 2,
                     'blues': 3, 'reggae': 4, 'classical': 5,
                     'rock': 6, 'hip_hop': 7, 'country': 8, 'jazz': 9}
        if genres is None:
            # Keep all genres, ordered by their ID.
            self.genres = sorted(genredict, key=genredict.get)
            self.genreIDs = [genredict[g] for g in self.genres]
        else:
            self.genres = genres
            self.genreIDs = [genredict[g] for g in genres]
        # Keep only the rows belonging to the selected genres and use
        # the GenreID index as the label vector.
        self.train_data = pd.concat(
            [self.train_data[self.train_data.index == gid] for gid in self.genreIDs])
        self.train_labels = self.train_data.index
        self.test_data = pd.concat(
            [self.test_data[self.test_data.index == gid] for gid in self.genreIDs])
        self.test_labels = self.test_data.index
        self.pca = PCA()

    def scale(self, normalization_method='min-max'):
        """
        Fits a scaler to the training set and uses it to transform
        both the training and test sets.

        The transform methods of the scaler classes return arrays,
        so the scaled arrays are written back into the existing data
        frames via df.loc to preserve the index and columns.
        """
        if normalization_method == 'min-max':
            scaler = MinMaxScaler()
        else:
            scaler = StandardScaler()
        self.train_data.loc[:, :] = scaler.fit_transform(self.train_data)
        self.test_data.loc[:, :] = scaler.transform(self.test_data)

    def hist(self, m, n, legend=False, size=(20, 20)):
        """
        Makes an m x n subplot grid where each cell is a feature and
        the overlapping histograms within a cell are the genres.
        Returns a nested dict with the per-genre mean and variance of
        every feature.
        """
        all_params = {}
        if m * n >= len(self.features):
            plt.subplots(m, n, figsize=size)
            for i in range(len(self.genres)):
                samples = self.train_data[self.train_data.index == self.genreIDs[i]]
                my_label = self.genres[i]
                all_params[self.genres[i]] = {}
                for j, feature in enumerate(self.features, start=1):
                    plt.subplot(m, n, j)
                    data = samples.loc[:, feature]
                    all_params[self.genres[i]][feature] = {
                        'mean': data.mean(), 'var': data.var()}
                    plt.hist(data, bins=40, alpha=0.5, label=my_label)
                    plt.title(feature)
                    if legend:
                        plt.legend(loc='upper left')
        return all_params

    def three_feature_plot(self):
        """
        Makes a 3-dimensional scatter plot of a 3D feature space.
        Note that the function does not call plt.show().
        """
        if len(self.features) == 3:
            fig = plt.figure(figsize=(12, 9))
            ax = fig.add_subplot(projection='3d')
            for i in range(len(self.genres)):
                samples = self.train_data[self.train_data.index == self.genreIDs[i]]
                x = samples.loc[:, self.features[0]]
                y = samples.loc[:, self.features[1]]
                z = samples.loc[:, self.features[2]]
                ax.scatter(x, y, z, label=self.genres[i])
            ax.set_xlabel(self.features[0])
            ax.set_ylabel(self.features[1])
            ax.set_zlabel(self.features[2])
            ax.legend()
            ax.set_title("Scatter plot of three features")
        else:
            print("Did not plot in 3D: the feature space is not 3-dimensional")

    def two_feature_plot(self):
        """
        Makes a 2-dimensional scatter plot of a 2D feature space.
        Note that the function does not call plt.show().
        """
        if len(self.features) == 2:
            # Iterate over the selected genres rather than a
            # hard-coded range so the plot also works with a subset.
            for gid in self.genreIDs:
                samples = self.train_data[self.train_data.index == gid]
                x = samples.loc[:, self.features[0]]
                y = samples.loc[:, self.features[1]]
                plt.scatter(x.values, y.values)
            plt.title("Scatter plot of two features")
            plt.xlabel(self.features[0])
            plt.ylabel(self.features[1])
            plt.legend(self.genres)
        else:
            print("Did not plot in 2D: the feature space is not 2-dimensional")

    def classify(self, method='knn', conf_matrix=False, kernel='rbf', C=1, gamma='scale'):
        """
        Trains a KNN or SVM classifier on the training data and
        tests it on the test set. If conf_matrix is True, it also
        plots a confusion matrix.
        Returns: error rate on the test set.
        """
        if method == 'knn':
            classifier = KNeighborsClassifier(n_neighbors=self.k)
        elif method == 'SVM':
            if kernel == 'rbf':
                classifier = SVC(kernel=kernel, C=C, gamma=gamma)
            else:
                classifier = SVC(kernel=kernel, C=C)
        else:
            raise ValueError("method must be 'knn' or 'SVM'")
        classifier.fit(self.train_data, self.train_labels)
        if conf_matrix:
            fig, ax = plt.subplots(figsize=(10, 10))
            ConfusionMatrixDisplay.from_estimator(
                classifier, self.test_data, self.test_labels,
                display_labels=self.genres, colorbar=False,
                xticks_rotation=45, cmap="Blues", ax=ax)
        error_rate = 1 - classifier.score(self.test_data, self.test_labels)
        return error_rate

    def do_pca(self, n=3):
        """
        Performs principal component analysis on the data set,
        reducing the feature space down to n components. Note that n
        must be smaller than or equal to the number of features in
        the data set.
        """
        # Build new data frames since the dimensions change.
        pca = PCA(n_components=n)
        pca_train_data = pca.fit_transform(self.train_data)
        pca_test_data = pca.transform(self.test_data)
        self.train_data = pd.DataFrame(pca_train_data, index=self.train_data.index)
        self.test_data = pd.DataFrame(pca_test_data, index=self.test_data.index)
        self.pca = pca
        self.features = self.train_data.columns.values

    def plot_train_data_pca(self):
        """
        Creates a 3D scatter plot from the three principal
        components. Note that the function does not call plt.show().
        """
        if self.train_data.shape[1] == 3:
            self.train_data.columns = ['PC1', 'PC2', 'PC3']
            fig = plt.figure(figsize=(12, 9))
            ax = fig.add_subplot(projection='3d')
            # Iterate over the selected genres so the plot also works
            # when the data set was built from a genre subset.
            for gid, genre in zip(self.genreIDs, self.genres):
                samples = self.train_data[self.train_data.index == gid]
                ax.scatter(samples.loc[:, 'PC1'], samples.loc[:, 'PC2'],
                           samples.loc[:, 'PC3'], label=genre)
            ax.legend()
            ax.set_xlabel('PC1')
            ax.set_ylabel('PC2')
            ax.set_zlabel('PC3')
            ax.set_title('PCA analysis')
        else:
            print('Data is not 3-dimensional')

    def scree_plot(self):
        """
        A scree plot shows how much of the variance (information) in
        a data set is represented by each principal component. This
        is a good indicator of which principal components should be
        used to represent your data.
        """
        fig = plt.figure(figsize=(12, 9))
        per_var = np.round(self.pca.explained_variance_ratio_ * 100, decimals=1)
        labels = [str(x) for x in range(1, len(per_var) + 1)]
        plt.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=labels)
        plt.ylabel('Percentage of Explained Variance')
        plt.xlabel('Principal Component')
        plt.title('Scree Plot')
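

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module. The file
    # name and the feature/genre selections below are assumptions;
    # substitute the tab-separated data set and columns you actually
    # have.
    ds = Dataset('genre_data.txt', k=5,
                 features=['spectral_centroid_mean', 'tempo'],  # assumed column names
                 genres=['pop', 'metal', 'classical', 'jazz'])
    ds.scale(normalization_method='min-max')  # fit on train, apply to both splits
    error = ds.classify(method='knn', conf_matrix=True)
    print(f'KNN error rate: {error:.3f}')
    ds.two_feature_plot()  # valid here because exactly two features were selected
    plt.show()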