# Project 2 - 5/4/23
# Joshua Adams, Weston Beebe, Parth Patel, Jonathan Sanderson, Samuel Sylvester
import matplotlib.pyplot as plt
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
from feature_extractor import FeatureExtractor
from Classifier_tester import ClassifierTester
from search_spaces import ClfSearchSpace
CLFS_SAVE_PATH = 'classifiers.pkl'


def main():
    clf_search_space = ClfSearchSpace()
    # create testers with different feature extractors
    clf_testers = [ClassifierTester(feature_extr_fn=FeatureExtractor.method1),
                   ClassifierTester(feature_extr_fn=FeatureExtractor.method2)]
    # create the classifiers to evaluate
    clf = {
        'KN': KNeighborsClassifier(),
        'GNB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'RF': RandomForestClassifier(),
        'Ada': AdaBoostClassifier(),
    }
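    # Illustration only (the real grids live in search_spaces.ClfSearchSpace,
    # which is not shown here): a call like clf_search_space.get_search_space('KN')
    # presumably returns an sklearn-style param grid along these hypothetical lines:
    #   {'n_neighbors': [3, 5, 7, 11], 'weights': ['uniform', 'distance']}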
    # one score dict per feature extractor; built with a comprehension because
    # [{}] * n would alias a single shared dict across every entry
    ncv_scores = [{} for _ in clf_testers]
    for tester_i, tester in enumerate(clf_testers):  # for each feature extractor
        print(f"Feature Extractor {tester_i}")
        for c in clf:  # for each classifier
            if c == 'SVM' and tester_i == 1:  # SVM is too slow/never completes for method2
                continue
            # run nested cross-validation
            ncv_score = tester.nested_cv_score(clf[c], clf_search_space.get_search_space(c),
                                               outer_scoring='f1_macro', inner_scoring='f1_macro')
            print(c, ncv_score)
            ncv_scores[tester_i][c] = ncv_score['test_score']  # record this extractor's score
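    # (Assumption, since ClassifierTester is not shown here: nested_cv_score is
    # presumably built from sklearn primitives roughly like
    #     inner = GridSearchCV(estimator, search_space, scoring=inner_scoring)
    #     scores = cross_validate(inner, X, y, scoring=outer_scoring)
    # so the outer f1_macro is measured on folds the hyperparameter tuner never saw.)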
    # average each classifier's score across the feature extractors
    # (assumes 'test_score' is a scalar; if ClassifierTester returns a per-fold
    # array from cross_validate, reduce it with .mean() before sorting below)
    best_avg_ncv_scores = {}
    for c in clf:
        best_avg_ncv_scores[c] = sum(ncv_scores[i][c] for i in range(len(clf_testers))) / len(clf_testers)
    # sort the averaged scores, best first
    best_avg_ncv_scores = {k: v for k, v in sorted(best_avg_ncv_scores.items(),
                                                   key=lambda item: item[1], reverse=True)}
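    # e.g. (illustrative values only): {'RF': 0.91, 'Ada': 0.88, 'KN': 0.85, ...}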
    # find the best 2 classifiers across all feature extractors
    # top2 = set()
    # for i in range(len(clf_testers)):
    #     top2.update(list(ncv_scores[i])[:2])
    # print the best 2 classifiers for each feature extractor
    print("\n\nBest 2 classifiers for each feature extractor")
    top2 = list(best_avg_ncv_scores)[:2]
    for i in range(len(clf_testers)):
        print(f"Feature Extractor {i}: {top2[0]} score {ncv_scores[i][top2[0]]}, "
              f"{top2[1]} score {ncv_scores[i][top2[1]]}")
print("\n\nRunning cross validation on best classifiers to find best hyperparameters")
# use cross validate to obtain best classifiers for each feature extractor-clf combination
best_clfs_metrics = [{}]*len(clf_testers)
save_clfs = [{}]*len(clf_testers)
for i in range(len(clf_testers)):
for c in list(top2)[:2]:
best_clf = clf_testers[i].cross_validate(clf[c], clf_search_space.get_search_space(c), scoring='f1_macro')
save_clfs[i][c] = best_clf['clf'] # get classifier from pipeline
best_clfs_metrics[i][c] = clf_testers[i].test_classifier(best_clf)
pickle.dump(save_clfs, open(CLFS_SAVE_PATH, 'wb'))
print(f'Saved classifiers to {CLFS_SAVE_PATH}')
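

def load_classifiers(path=CLFS_SAVE_PATH):
    # Hypothetical helper, not part of the original script: a minimal sketch of
    # how the list-of-dicts pickled by main() could be loaded back for reuse, e.g.
    #   clfs = load_classifiers(); clfs[0]['RF'].predict(X_new)
    # (X_new and the 'RF' key are assumptions for illustration only).
    with open(path, 'rb') as f:
        return pickle.load(f)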


if __name__ == '__main__':
    main()