# svm_perceptron.py
# -*- coding: utf-8 -*-
"""SVM-Perceptron.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1PxSPdx848AJrp5vsVYbGL7F_bSiiaHSa
# Config Environment
"""
import pandas as pd
from sklearn.decomposition import PCA
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Perceptron
from sklearn.metrics.cluster import rand_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC, LinearSVC
"""# Normalizer
Encode the categorical (non-numeric) columns as integers, replace Inf values with NaN, and drop every row that contains a NaN.
Because the number of NaNs is small, dropping those rows loses very little data.
Class to help normalize data
"""
class DataNormalizer:
    """Label-encode the categorical columns of a dataframe and drop invalid rows.

    The frame is addressed by column position: the categorical columns are
    at positions 1, 2, 4 and 5 and the label column is at position 23.
    """

    # Positions of the columns to encode, in the order they are processed:
    # label, gender, customer type, type of travel, class.
    CATEGORICAL_COLUMN_INDEXES = (23, 1, 2, 4, 5)

    def __init__(self, raw_data):
        """Keep a reference to ``raw_data``; the frame is modified in place."""
        self.raw_data = raw_data

    def _encode_column(self, column_index, print_stats=False):
        """Replace each value of one column by its rank among the sorted distinct values."""
        column_name = self.raw_data.columns[column_index]
        column = self.raw_data[column_name]
        # Missing values are excluded from the mapping: mixing them with
        # strings makes sorting fail, and ``map`` leaves them as NaN so the
        # final ``dropna`` removes those rows.
        categories = sorted(set(column.dropna()))
        mapping = {value: code for code, value in enumerate(categories)}
        if print_stats:
            print(mapping)
        # One vectorised pass instead of one masked assignment per category.
        self.raw_data[column_name] = column.map(mapping)

    def normalize_raw_data(self, print_stats=False):
        """Encode the categorical columns and return the frame without NaN/Inf rows.

        Args:
            print_stats: when true, print the value -> code mapping of every
                encoded column.

        Returns:
            A dataframe in which the categorical columns hold integer codes
            and every row containing NaN or +/-Inf has been removed.
        """
        for column_index in self.CATEGORICAL_COLUMN_INDEXES:
            self._encode_column(column_index, print_stats)
        # Turn infinities into NaN so a single dropna() removes both kinds.
        self.raw_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        return self.raw_data.dropna()
"""# Load Data
Load Dataframe from csv files and provide needed data for models
Class to help extract features and labels from loaded data
"""
class DataLoader:
    """Lazily load the train/test CSV files and expose their features and labels.

    Every ``load_*`` method caches its result on the instance, so repeated
    calls return the same object without re-reading or re-slicing the data.
    """

    def __init__(self,
                 raw_data_filePath='/content/drive/MyDrive/AI/SVM-Perceptron/data/train.csv',
                 test_data_filePath='/content/drive/MyDrive/AI/SVM-Perceptron/data/test.csv'):
        """Store the CSV locations; nothing is read until a ``load_*`` call.

        Args:
            raw_data_filePath: path of the training CSV file.
            test_data_filePath: path of the test CSV file.
        """
        self.raw_data_filePath = raw_data_filePath
        self.test_data_filePath = test_data_filePath
        self.raw_data = None
        self.test_data = None
        self.normalized_raw_data = None
        self.normalized_raw_test_data = None
        self.train_data_features = None
        self.train_data_labels = None
        self.test_data_features = None
        self.test_data_labels = None

    @staticmethod
    def _read_and_normalize(file_path):
        """Read a CSV, drop its first column and run DataNormalizer on it."""
        frame = pd.read_csv(file_path)
        frame = frame.drop(frame.columns[[0]], axis=1)
        return DataNormalizer(frame).normalize_raw_data()

    def load_raw_data(self):
        """Return the normalized training dataframe, reading it on first use."""
        # ``is None`` instead of truthiness: ``not dataframe`` raises ValueError.
        if self.raw_data is None:
            self.raw_data = self._read_and_normalize(self.raw_data_filePath)
        print('Raw Data Shape: ', self.raw_data.shape)
        return self.raw_data

    def load_test_data(self):
        """Return the normalized test dataframe, reading it on first use."""
        if self.test_data is None:
            self.test_data = self._read_and_normalize(self.test_data_filePath)
        print('Test Data Shape: ', self.test_data.shape)
        return self.test_data

    def load_train_data_features(self):
        """Return columns 1..22 of the training frame as a float32 array."""
        if self.train_data_features is None:
            if self.raw_data is None:
                self.load_raw_data()
            self.train_data_features = self.raw_data.values[:, 1:23].astype('float32')
        print('Train Data Shape', self.train_data_features.shape)
        return self.train_data_features

    def load_train_data_labels(self):
        """Return column 23 of the training frame as a float32 array."""
        if self.train_data_labels is None:
            if self.raw_data is None:
                self.load_raw_data()
            self.train_data_labels = self.raw_data.values[:, 23].astype('float32')
        print('Train Data Labels Shape', self.train_data_labels.shape)
        return self.train_data_labels

    def load_test_data_features(self):
        """Return columns 1..22 of the test frame as a float32 array."""
        if self.test_data_features is None:
            if self.test_data is None:
                self.load_test_data()
            # Read the instance's own frame, not a module-level variable.
            self.test_data_features = self.test_data.values[:, 1:23].astype('float32')
        print('Test Data Shape', self.test_data_features.shape)
        return self.test_data_features

    def load_test_data_labels(self):
        """Return column 23 of the test frame as a float32 array."""
        if self.test_data_labels is None:
            if self.test_data is None:
                self.load_test_data()
            self.test_data_labels = self.test_data.values[:, 23].astype('float32')
        print('Test Data Labels Shape', self.test_data_labels.shape)
        return self.test_data_labels
# Build the loader and read both CSV files. load_raw_data / load_test_data
# run DataNormalizer on each frame before returning it.
loader = DataLoader()
raw_data = loader.load_raw_data()
test_data = loader.load_test_data()
"""Load the train and test CSV files; the loader also label-encodes the categorical columns and drops rows that contain NaN or Inf values"""
# Slice the normalized train frame into a float32 feature matrix
# (columns 1..22) and a float32 label vector (column 23).
train_data_features = loader.load_train_data_features()
train_data_labels = loader.load_train_data_labels()
"""Extract the feature matrix and the label vector from the normalized and preprocessed train dataframe"""
# Same split for the test set: float32 features and float32 labels.
test_data_features = loader.load_test_data_features()
test_data_labels = loader.load_test_data_labels()
"""Extract the feature matrix and the label vector from the normalized and preprocessed test dataframe
# Perceptron
Test perceptron models with different hyperparameters and analyze the results
"""
class PerceptronTest:
    """Fit and evaluate scikit-learn Perceptron models on one training set."""

    def __init__(self, train_data_features, train_data_labels):
        """Remember the training data used by every test method."""
        self.perceptron_model = None
        self.train_data_features = train_data_features
        self.train_data_labels = train_data_labels

    def _fit_and_report(self, model, test_data_features, test_data_labels):
        """Fit ``model`` on the training data and print train/test metrics."""
        self.perceptron_model = model
        self.perceptron_model.fit(self.train_data_features, self.train_data_labels)
        train_score = self.perceptron_model.score(self.train_data_features, self.train_data_labels)
        print(f"Perceptron model score: {train_score * 100}")
        predicted_labels = np.array(self.perceptron_model.predict(test_data_features))
        print('rand score of perdiction', rand_score(test_data_labels, predicted_labels) * 100)
        # normalize=False makes accuracy_score return the number of correct
        # predictions rather than a fraction.
        print('accuracy of perdiction', accuracy_score(test_data_labels, predicted_labels, normalize=False))

    def _grid_search_and_plot(self, model, grid, param_name, best_label, title, x_label):
        """Cross-validate ``model`` over ``grid`` and plot mean score per ``param_name`` value."""
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
        # Always fit on the data held by this instance.
        results = search.fit(self.train_data_features, self.train_data_labels)
        print('Mean Accuracy: %.3f' % results.best_score_)
        print('%s: %s' % (best_label, results.best_params_))
        means = results.cv_results_['mean_test_score']
        params = [candidate[param_name] for candidate in results.cv_results_['params']]
        plt.plot(params, means)
        plt.suptitle(title)
        plt.ylabel("Mean Test Score")
        plt.xlabel(x_label)
        plt.show()

    def simple_test(self, test_data_features, test_data_labels):
        """Fit a Perceptron with default hyperparameters and print its scores."""
        self._fit_and_report(Perceptron(random_state=1), test_data_features, test_data_labels)

    def test_best(self, eta, maxIt, test_data_features, test_data_labels):
        """Fit a Perceptron with the chosen hyperparameters and print its scores.

        Args:
            eta: learning rate, passed to the model as ``eta0``.
            maxIt: maximum number of epochs, passed as ``max_iter``.
            test_data_features: features of the evaluation set.
            test_data_labels: labels of the evaluation set.
        """
        # Use the caller's learning rate; it used to be overridden by a constant.
        model = Perceptron(random_state=20, eta0=eta, max_iter=maxIt)
        self._fit_and_report(model, test_data_features, test_data_labels)

    def learning_rate_test(self, grid=None):
        """Grid-search ``eta0`` with cross-validation and plot the mean scores.

        Args:
            grid: optional parameter grid; it must contain an ``'eta0'`` entry.
                Defaults to five learning rates between 1e-4 and 5.
        """
        if not grid:
            grid = {'eta0': [0.0001, 0.001, 0.01, 0.1, 5.0]}
        self._grid_search_and_plot(
            Perceptron(random_state=20), grid, 'eta0',
            'Best Learning Rate', 'Learning Rate On Score', "Learning Rate",
        )

    def iteration_test(self, grid=None):
        """Grid-search ``max_iter`` with cross-validation and plot the mean scores.

        Args:
            grid: optional parameter grid; it must contain a ``'max_iter'``
                entry. Defaults to powers of ten from 1 to 10000.
        """
        if not grid:
            grid = {'max_iter': [1, 10, 100, 1000, 10000]}
        self._grid_search_and_plot(
            Perceptron(eta0=0.01), grid, 'max_iter',
            'Best IterationCount', 'Iteration Rate On Score', "Iteration",
        )
# Fit a default Perceptron as a baseline, then cross-validate a grid of
# learning rates (eta0) and plot the mean score for each.
perceptron_test = PerceptronTest(train_data_features, train_data_labels)
perceptron_test.simple_test(test_data_features, test_data_labels)
perceptron_test.learning_rate_test()
"""The hyperparameters for the Perceptron algorithm must be configured for our dataset.
Perhaps the most important hyperparameter is the learning rate.
A large learning rate can cause the model to learn fast, but perhaps at the cost of lower skill. A smaller learning rate can result in a better-performing model but may take a long time to train the model.
"""
# Cross-validate a grid of max_iter values and plot the mean score for each.
perceptron_test.iteration_test()
"""Another important hyperparameter is how many epochs are used to train the model.
This may depend on the training dataset and could vary greatly. Again, we will explore configuration values on a log scale between 1 and 1e+4.
"""
# Final Perceptron run; the arguments are (eta, maxIt, test features, test labels).
perceptron_test.test_best(0.1, 100, test_data_features, test_data_labels)
"""Final result for Perceptron with the best hyperparams
# SVM Test
class for testing different kinds of svm in sklearn
"""
class SVMTest:
    """Train one SVM estimator and report its train/test scores."""

    def __init__(self, svm_model, train_data_features, train_data_labels):
        """Store the estimator together with the data it will be fitted on."""
        self.train_data_features = train_data_features
        self.train_data_labels = train_data_labels
        self.svm_model = svm_model

    def test_svm(self, test_data_features, test_data_labels):
        """Fit the model, score it on the train and test sets and print the results.

        Returns:
            ``(train_accuracy, train_adjusted_rand, test_accuracy,
            test_adjusted_rand)``, each as returned by scikit-learn.
        """
        started_at = time.time()
        model = self.svm_model
        model.fit(self.train_data_features, self.train_data_labels)

        # Score the training set first, then the test set.
        scores = []
        datasets = (
            (self.train_data_features, self.train_data_labels),
            (test_data_features, test_data_labels),
        )
        for features, labels in datasets:
            predicted = model.predict(features)
            scores.append(accuracy_score(predicted, labels))
            scores.append(adjusted_rand_score(predicted, labels))
        train_purity, train_rand_index, test_purity, test_rand_index = scores

        print(f'Train Data Accuracy : {round(train_purity * 100, 2)}%')
        print(f'Train Data Rand-Index : {round(train_rand_index * 100, 2)}%')
        print(f'Test Data Accuracy : {round(test_purity * 100, 2)}%')
        print(f'Test Data Rand-Index : {round(test_rand_index * 100, 2)}%')
        # The elapsed time covers fitting, both predictions and the printing above.
        print('Time Taken(s): ', time.time() - started_at)
        return train_purity, train_rand_index, test_purity, test_rand_index
"""# SVC vs LinearSVC
SVC has multiple kernels which allows us to classify non-linear data. <br>
the simplest kernel is the linear kernel. according to the documentation for the `SVC` on `sklearn` website,
> *The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets consider using LinearSVC* <br>
And since we are working on a dataset with 100 thousand data with 22 dimension, it would take a lot of time for fitting our dataset with the normal `SVC` class. <br>
But we have to consider that the `LinearSVC` class is just for the linear kernel, so for other kernels we have to use the `SVC` class.
"""
# Baseline 1: SVC with a linear kernel, fitted on the full train arrays.
linear_kerner_svm = SVMTest(SVC(kernel='linear'), train_data_features, train_data_labels)
linear_kerner_svm.test_svm(test_data_features, test_data_labels)
# Baseline 2: LinearSVC with default parameters on the same data, for comparison.
linear_kerner_svm = SVMTest(LinearSVC(), train_data_features, train_data_labels)
linear_kerner_svm.test_svm(test_data_features, test_data_labels)
"""In the above code I used both the `SVC` class with linear kernel and the `LinearSVC`. <br>
The execution time for the normal `SVC` is over 5 hour, but the `LinearSVC` has only taken 5 minutes with 10,000 iteration to execute. <br>
Although the accuracy of the `SVC` class is better, but it is not as much different as the execution time, and we are sure that if we improve the accuracy of `LinearSVC`, by using the same parameters we will also improve the `SVC` model too. <br>
Therefore I chose the `LinearSVC` model for the following codes. But as soon as we are done the linear kernel, we will switch back to the `SVC` class. <br>
And also the `LinearSVC` uses the one-vs-all technique and unfortunately it can not be changed, so I used the normal `SVC` for testing this parameter.
# LinearSVC
"""
# Compare LinearSVC solving the dual problem against solving the primal one.
duals = [True, False]
for dual in duals:
    print('dual: ', dual)
    linear_kerner_svm = SVMTest(
        LinearSVC(dual=dual),
        train_data_features,
        train_data_labels,
    )
    linear_kerner_svm.test_svm(test_data_features, test_data_labels)
    print(' ')
"""The first parameter that I test is the `dual` parameter. <br>
The duality optimization is a theory which divide the solution of a problem into a *'dual*' solution and a *'primal'* solution. <br>
The primal solution is the main and more complete solution which will also take longer time, but the dual solution is a solution which is one step before the main solution. <br>
The property of the dual solution is that its answer is very close to the main solution, sometimes even better, and it also produces the answer in a much shorter time. <br>
According to the documentation of the `sklearn`:
> Select the algorithm to either solve the dual or primal optimization problem. Prefer `dual=False` when `n_samples > n_features`.
`n_samples` means the number of samples of our train dataset, which is 100 thousand and the `n_features` means the dimension of our data, which is 22. <br>
So `sklearn` suggests us to set `dual=False` for our problem, and also according to the code above, the `dual=False` has a slightly more accurate answer.
"""
# Sweep the LinearSVC regularisation parameter C and keep all four scores
# returned by SVMTest.test_svm for every value.
coeffs = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5, 2]
train_purities, train_rand_indexes, test_purities, test_rand_indexes = [], [], [], []
for c in coeffs:
    linear_kerner_svm = SVMTest(LinearSVC(dual=False, C=c), train_data_features, train_data_labels)
    train_purity, train_rand_index, test_purity, test_rand_index = linear_kerner_svm.test_svm(
        test_data_features, test_data_labels
    )
    train_purities += [train_purity]
    train_rand_indexes += [train_rand_index]
    test_purities += [test_purity]
    test_rand_indexes += [test_rand_index]
    print()

# One figure per metric; the first plot uses matplotlib's default colour.
for series, line_format, heading in (
    (train_rand_indexes, (), 'C on Train Rand-Index'),
    (train_purities, ('tab:orange',), 'C on Train Purity'),
    (test_rand_indexes, ('tab:green',), 'C on Test Rand-Index'),
    (test_purities, ('tab:red',), 'C on Test Purity'),
):
    plt.plot(coeffs, series, *line_format)
    plt.suptitle(heading)
    plt.show()
"""The above code shows the effect of `C` parameter on accuracy. <br>
Parameter `C` can be viewed as a way to control overfitting. It trades off the relative importance of maximizing the margin and fitting the training data. <br>
It is for controlling the effect of slack variables. slack variables are defined for measuring the misclassification of difficult or noisy examples. <br>
As you can see in the plots, by increasing this value, we will have a better accuracy for our training data, but less accuracy for test data. <br>
This behaviour was expected, because a larger `C` penalizes the slack variables more heavily, so the model fits the training data (including its noise) more tightly, which leads to less accuracy for external data. ( data which are not in the train dataset) <br>
According to the plots the best value for this parameter is 1.5. <br>
# SVC
Up to now all our codes have used the `LinearSVC` class and we have figured out that the best parameters are `C=0.4` and `dual=False`. <br>
These values will also have the best effect on the normal `SVC`, because they are the same but with different implementation. <br>
As I said, and just like the documentations said, the execution time of `SVC` for our full dataset will be too much, therefore I chose 10% of data randomly and work with them, we can use the result for the whole dataset and get better accuracy. <br>
"""
# Draw the random row indexes from the real array lengths rather than from
# hard-coded dataset sizes, so an index can never run past the end of the
# data. Indexes are drawn with replacement, so a row may appear more than once.
seq = np.random.randint(0, len(train_data_features), 10000)
X_train_sample = train_data_features[seq]
y_train_sample = train_data_labels[seq]
print(X_train_sample.shape)
print(y_train_sample.shape)
seq = np.random.randint(0, len(test_data_features), 5000)
X_test_sample = test_data_features[seq]
y_test_sample = test_data_labels[seq]
print(X_test_sample.shape)
print(y_test_sample.shape)

# Put the label histogram of the sample next to that of the full training
# set to check that the sample keeps the class balance.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].hist(y_train_sample)
ax[0].title.set_text('sample')
ax[1].hist(train_data_labels)
ax[1].title.set_text('original')
plt.show()
"""The first thing to check is the believability of our sample dataset. <br>
In the code above, I chose 10% of the dataset randomly and show the histograms of the original and the sampled labels. We can see that they are roughly the same, which means that we can rely on the results and improvements that we obtain on the random sample.
"""
# Kernels to compare, plus one list per metric; the kernel loop further down
# appends one score per kernel to each list.
kernels = ['poly', 'rbf', 'sigmoid', 'linear']
train_purities = []
train_rand_indexes = []
test_purities = []
test_rand_indexes = []
def show_bar_chart(x, y, title, color):
    """Show a titled bar chart with one bar of height ``y[i]`` per label ``x[i]``."""
    figure = plt.figure()
    # The axes rectangle [left, bottom, width, height] spans the whole figure.
    axes = figure.add_axes([0, 0, 1, 1])
    axes.title.set_text(title)
    axes.bar(x, y, width=0.3, color=color)
    plt.show()
# Fit one SVC per kernel on the sampled data and record its four scores.
for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm_test = SVMTest(svm, X_train_sample, y_train_sample)
    train_purity, train_rand_index, test_purity, test_rand_index = svm_test.test_svm(
        X_test_sample, y_test_sample
    )
    train_purities += [train_purity]
    train_rand_indexes += [train_rand_index]
    test_purities += [test_purity]
    test_rand_indexes += [test_rand_index]
    print()

# One bar chart per metric, each with its own colour.
for scores, title, color in (
    (train_purities, 'Train purities', 'tab:blue'),
    (train_rand_indexes, 'Train rand indexes', 'tab:olive'),
    (test_purities, 'Test purities', 'tab:pink'),
    (test_rand_indexes, 'Test rand indexes', 'tab:purple'),
):
    show_bar_chart(kernels, scores, title, color)
"""In this section I used the different kernels of `sklearn` and show their effect on accuracy. <br>
As it is shown in the bar plots, for the train dataset the result of linear, polynomial and gaussian(rbf) is the same but the sigmoid kernel has lower accuracy. <br>
But on the test data the difference of gaussian(rbf) kernel is observable and it has the best accuracy over other kernels. <br>
As the rbf kernel was the best kernel, for following codes I used this kernel.
> Important Note: the default `degree` of the polynomial kernel is 3 and the default `gamma` for the rbf kernel is `1 / (n_features * X.var())`
"""
# Compare both decision-function shapes for an rbf SVC that is fitted on the
# sample and scored on the full test set.
decision_function_shape = ['ovo', 'ovr']
for dfs in decision_function_shape:
    print(dfs)
    svm = SVC(kernel='rbf', decision_function_shape=dfs)
    svm_test = SVMTest(svm, X_train_sample, y_train_sample)
    train_purity, train_rand_index, test_purity, test_rand_index = svm_test.test_svm(
        test_data_features, test_data_labels
    )
    print()
"""The default technique for multiclass classification used by `SVC` is one-vs-all or one-vs-rest. <br>
In this code block I used both techniques and show the results of them. <br>
As you can see they are the exact same and we can not say which one is better. So we keep using the default value which is one-vs-rest.
"""
# Compare the two built-in gamma heuristics for an rbf SVC that is fitted on
# the sample and scored on the full test set.
gammas = ['scale', 'auto']
for gamma in gammas:
    print(gamma)
    svm = SVC(kernel='rbf', gamma=gamma)
    svm_test = SVMTest(svm, X_train_sample, y_train_sample)
    train_purity, train_rand_index, test_purity, test_rand_index = svm_test.test_svm(
        test_data_features, test_data_labels
    )
    print()
"""The formula for the rbf or gaussian kernel is below: <br>
`K(x, x') = exp(-gamma * ||x - x'||^2)` <br>
It has a parameter called gamma. The `sklearn` library gives us two choices for setting this parameter. <br>
`scale` value, which is the default value, means `1 / (n_features * X.var())` and the `auto` value is `1 / n_features`. <br>
As the result shows, the `scale` value has better accuracy, which was expected because the `scale` method considers the values of the data, rather than just the number of features of the dataset.
"""
# Sweep C for an rbf SVC that is fitted on the sample and scored on the full
# test set, keeping all four scores for every value.
Cs = [1, 10, 100, 1000, 10000]
train_purities, train_rand_indexes, test_purities, test_rand_indexes = [], [], [], []
for C in Cs:
    svm = SVC(kernel='rbf', C=C)
    print(C)
    svm_test = SVMTest(svm, X_train_sample, y_train_sample)
    train_purity, train_rand_index, test_purity, test_rand_index = svm_test.test_svm(
        test_data_features, test_data_labels
    )
    print()
    train_purities += [train_purity]
    train_rand_indexes += [train_rand_index]
    test_purities += [test_purity]
    test_rand_indexes += [test_rand_index]

# One figure per metric; the first plot uses matplotlib's default colour.
for series, line_format, heading in (
    (train_rand_indexes, (), 'C on Train Rand-Index'),
    (train_purities, ('tab:orange',), 'C on Train Purity'),
    (test_rand_indexes, ('tab:green',), 'C on Test Rand-Index'),
    (test_purities, ('tab:red',), 'C on Test Purity'),
):
    plt.plot(Cs, series, *line_format)
    plt.suptitle(heading)
    plt.show()
"""For the linear kernel, best value for parameter C was 10000. <br>
But this value will be different for the rbf kernel since. <br>
As you can see in the plots the best value for value when using rbf kernel, is 5 and it is the value we will use in the following codes.
# Conclusion
After all the tests on different kernels and paramater changing, we saw that on 10% of the whole dataset which is selected randomly, the rbf or gaussian kernel has the best performance. <br>
And for the rbf kernel, `c=10000.0` and `gamma='scale'`, which is the default value, has better accuracy. <br>
Also we saw that the scores for both one-vs-one and one-vs-all techniques was the same. But we choose one-vs-all because it was the default value and prefered by `sklearn` community. <br>
So now we expect that if we set these parameters to the whole dataset, we would have a high accuracy. <br>
"""
# Final run: rbf-kernel SVC with C=10000, fitted on the full train arrays and
# scored on the full test arrays. The variable keeps its earlier name even
# though the kernel here is rbf, not linear.
linear_kerner_svm = SVMTest(SVC(kernel='rbf', C=10000), train_data_features, train_data_labels)
linear_kerner_svm.test_svm(test_data_features, test_data_labels)