-
Notifications
You must be signed in to change notification settings - Fork 0
/
supervised.py
105 lines (81 loc) · 3.57 KB
/
supervised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from typing import Dict, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import utils
def runTree(
    df_dict: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]],
    train_range: np.ndarray,
    seed_num: int,
) -> None:
    """Sweep the cost-complexity pruning parameter for a decision tree.

    For each named dataset, fits an entropy-criterion decision tree at each
    ``ccp_alpha`` in [0, 1], saves a learning-curve plot (score vs. number of
    training samples) per alpha, then saves a summary plot of mean/max test
    score as a function of alpha.  All figures are written as PNGs in the
    current working directory, prefixed with the dataset name.

    Parameters
    ----------
    df_dict : mapping of dataset name -> (feature DataFrame, label series)
    train_range : fractions of the training set to use for the learning curve
    seed_num : seed for reproducible shuffling and tree fitting
    """
    alpha_range = np.linspace(0, 1, 11)
    for name, (X, y) in df_dict.items():
        test_mean = []
        test_max = []
        for alpha in alpha_range:
            # Seed the tree itself as well as learning_curve's shuffle:
            # DecisionTreeClassifier breaks feature ties randomly, so
            # without random_state repeated runs produce different figures.
            dtree = DecisionTreeClassifier(
                criterion='entropy', ccp_alpha=alpha, random_state=seed_num)
            # return_times dropped: fit/score times were never used.
            train_sizes, train_scores, test_scores = learning_curve(
                dtree, X, y, train_sizes=train_range,
                shuffle=True, random_state=seed_num)
            # Average across CV folds (last axis) for each training size.
            mean_train = np.mean(train_scores, axis=-1)
            mean_test = np.mean(test_scores, axis=-1)
            test_mean.append(np.mean(test_scores))
            test_max.append(np.max(mean_test))
            plt.plot(train_sizes, mean_train, '-o', label='Mean Training score')
            plt.plot(train_sizes, mean_test, '-o', label='Mean Testing score')
            plt.legend()
            plt.xlabel('Training samples')
            plt.ylabel('Score')
            plt.savefig('{}_score_vs_train_samples_alpha_{:.2f}.png'.format(name, alpha))
            plt.clf()
        # Summary: how test performance varies with pruning strength.
        plt.plot(alpha_range, test_max, '-o', label='Max Testing Score')
        plt.plot(alpha_range, test_mean, '-o', label='Mean Testing score')
        plt.legend()
        plt.xlabel('MCCP Parameter ($\\alpha$)')
        plt.ylabel('Score')
        plt.savefig('{}_score_vs_alpha.png'.format(name))
        plt.clf()
def runKNN(
    df_dict: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]],
    train_range: np.ndarray,
    seed_num: int,
) -> None:
    """Sweep the number of neighbors for a k-nearest-neighbors classifier.

    For each named dataset, fits a KNN classifier for k = 1..10, saves a
    learning-curve plot (score vs. number of training samples) per k, then
    saves a summary plot of mean/max test score as a function of k.  All
    figures are written as PNGs in the current working directory, prefixed
    with the dataset name.

    Parameters
    ----------
    df_dict : mapping of dataset name -> (feature DataFrame, label series)
    train_range : fractions of the training set to use for the learning curve
    seed_num : seed for reproducible shuffling in learning_curve
    """
    k_range = list(range(1, 11))
    for name, (X, y) in df_dict.items():
        test_mean = []
        test_max = []
        for k in k_range:
            knn = KNeighborsClassifier(k)
            # return_times dropped: fit/score times were never used.
            train_sizes, train_scores, test_scores = learning_curve(
                knn, X, y, train_sizes=train_range,
                shuffle=True, random_state=seed_num)
            # Average across CV folds (last axis) for each training size.
            mean_train = np.mean(train_scores, axis=-1)
            mean_test = np.mean(test_scores, axis=-1)
            test_mean.append(np.mean(test_scores))
            test_max.append(np.max(mean_test))
            plt.plot(train_sizes, mean_train, '-o', label='Mean Training score')
            plt.plot(train_sizes, mean_test, '-o', label='Mean Testing score')
            plt.legend()
            plt.xlabel('Training samples')
            plt.ylabel('Score')
            plt.savefig('{}_score_vs_train_samples_k_{}.png'.format(name, k))
            plt.clf()
        # Summary: how test performance varies with neighborhood size.
        plt.plot(k_range, test_max, '-o', label='Max Testing Score')
        plt.plot(k_range, test_mean, '-o', label='Mean Testing score')
        plt.legend()
        plt.xlabel('Number of Neighbors ($k$)')
        plt.ylabel('Score')
        plt.savefig('{}_score_vs_k.png'.format(name))
        plt.clf()
if __name__ == '__main__':
    # Load the two classification datasets via project-local helpers.
    # NOTE(review): presumably these return fully-preprocessed DataFrames
    # containing the label columns dropped below — confirm in utils.py.
    credit_df = utils.getCreditRiskData()
    zoo_df = utils.getZooData()
    # Map dataset name -> (feature frame, label series); the name is used
    # as the filename prefix for every saved plot.
    df_dict = {
        'risk' : ( credit_df.drop( [ 'label', ], axis=1 ), credit_df['label'] ),
        'zoo' : ( zoo_df.drop( [ 'class_type', ], axis=1 ), zoo_df['class_type'] ),
    }
    # Fixed seed so learning-curve shuffles are reproducible across runs.
    seed_num = 1738
    # Learning-curve training sizes: 10% .. 100% of the training data.
    train_range = np.linspace(0.1, 1.0, 10)
    # NOTE(review): runTree is commented out — looks like experiments are
    # run one at a time; confirm before deleting.
    # runTree(df_dict, train_range, seed_num)
    runKNN(df_dict, train_range, seed_num)