-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbaseModel.py
120 lines (100 loc) · 4.25 KB
/
baseModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.externals import joblib
import pandas as pd
from utils import plotAuc, getNextVer
from path import *
###### tag
class Trainer:
    """Train, tune, persist and score one versioned classifier.

    The wrapped ``modelClass`` must be constructible from keyword arguments
    and expose ``fit`` and ``predict_proba``; it is also handed to
    ``GridSearchCV`` when tuning is enabled.

    Typical use: ``read_data()`` -> ``fit()`` -> ``getOutput()``.
    ``read_data()`` must be called before ``fit()`` or ``getOutput()``
    because neither of them loads the data set themselves.
    """

    def __init__(self, modelClass,
                 other_params=None, tuned_params=None,
                 modelName=0,
                 istune=False, isupdate=False,
                 dataPath=processedDataPath):
        """Configure the trainer and derive the versioned model name.

        Args:
            modelClass: Estimator class, instantiated as
                ``modelClass(**other_params)``.
            other_params: Keyword arguments for ``modelClass``. ``None`` is
                treated as "no arguments". A dict passed in is kept by
                reference and is updated in place by ``tune()``.
            tuned_params: Mapping of parameter name -> candidate values for
                the grid search. ``None`` disables tuning.
            modelName: Base name of the model; converted to ``str`` before
                it is used to build file names.
            istune: Run ``tune()`` instead of ``_fit()`` when training.
                Ignored when ``tuned_params`` is ``None``.
            isupdate: Train a new version instead of loading the previous
                one. Forced to ``True`` when ``getNextVer`` returns 1.
            dataPath: Path of the data archive read by ``read_data()``.
        """
        self.dataPath = dataPath

        # Populated by read_data().
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.test = None
        self.test_id = None
        self.feature_names = None

        # Populated by fit(); must exist up front because getOutput()
        # tests ``self.model is None``.
        self.model = None
        self.trainAuc = None
        self.valAuc = None

        self.modelClass = modelClass
        # Keep the caller's dict object (tune() updates it in place) but
        # never leave None here: it is unpacked with ** and .update()d later.
        self.other_params = {} if other_params is None else other_params
        self.tuned_params = tuned_params

        # The default modelName is the int 0; str.join() needs strings.
        modelName = str(modelName)

        # NOTE(review): this pattern expects a digit right after the base
        # name, while the names built below are "<name>-<type>-<version>".
        # Confirm against getNextVer that this is intended.
        version = getNextVer(r'{}-(\d)-.*.model'.format(modelName))
        self.isupdate = True if version == 1 else isupdate

        # NOTE(review): the prefix test is applied to the whole dataPath
        # string, not to its file name -- confirm that is what callers pass.
        if self.dataPath.startswith("pca"):
            self.modelType = 'pca'
        elif self.dataPath.startswith('rein'):
            self.modelType = 'reinbalance'
        else:
            self.modelType = 'ordinal'

        # A new version is written when updating; otherwise the name points
        # at the previous version so that fit() can load it.
        name_version = version if self.isupdate else version - 1
        self.modelName = '-'.join([modelName, self.modelType, str(name_version)])

        self.istune = False if tuned_params is None else istune

    def read_data(self):
        """Load the data archive and split off a 20% validation set."""
        # Arrays are read while the archive is open so the file handle is
        # released as soon as loading is finished.
        with np.load(self.dataPath) as data:
            features = data['X']
            labels = data['y']
            self.test = data['X_test']
            self.test_id = data['test_id']
            self.feature_names = data['feature_names']
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            features, labels, random_state=1, test_size=0.2)

    def fit(self):
        """Train (or tune) a new model, or load the previously saved one.

        Returns:
            The fitted or loaded model, also stored in ``self.model``.
        """
        if self.isupdate:
            if self.istune:
                self.tune()
            else:
                self._fit()
        else:
            # NOTE(review): models are dumped to a path relative to the
            # working directory but loaded from ``root`` -- confirm that
            # both locations coincide.
            self.model = joblib.load(os.path.join(root, self.modelName + '.model'))
        return self.model

    def _fit(self):
        """Fit a fresh model, report train/validation AUC and save it.

        Returns:
            The fitted model.
        """
        self.model = self.modelClass(**self.other_params)
        print("==================================================")
        print("Train Model {}".format(self.modelName))
        self.model.fit(self.X_train, self.y_train)

        # Column 1 of predict_proba is used as the score for the AUC.
        y_train_hat = self.model.predict_proba(self.X_train)[:, 1]
        y_val_hat = self.model.predict_proba(self.X_val)[:, 1]
        self.trainAuc = plotAuc(self.y_train, y_train_hat, "train")
        self.valAuc = plotAuc(self.y_val, y_val_hat, "val")
        print("Train auc: {} Val auc: {}".format(self.trainAuc, self.valAuc))

        joblib.dump(self.model, "{}".format(self.modelName + '.model'))
        print("Save Model {}".format(self.modelName))
        return self.model

    def tune(self):
        """Grid-search each tuned parameter in turn and save the result.

        Parameters are searched one at a time with 10-fold cross-validation
        scored by ROC AUC. The best estimator of one search becomes the base
        estimator of the next, and the best values are merged into
        ``self.other_params``.
        """
        self.model = self.modelClass(**self.other_params)
        for paramName in self.tuned_params:
            candidates = self.tuned_params[paramName]
            print("==================================================")
            print('Tuning param:{} values: {}'.format(paramName, candidates))
            clf = GridSearchCV(self.model, {paramName: candidates},
                               scoring='roc_auc', cv=10, verbose=1, n_jobs=-1)
            clf.fit(self.X_train, self.y_train)
            print("paramName: {} bestValue: {} bestScore; {}".format(
                paramName,
                clf.best_params_,
                clf.best_score_
            ))
            self.model = clf.best_estimator_
            self.other_params.update(clf.best_params_)
        print("Save Tuned Model {}".format(self.modelName))
        joblib.dump(self.model, "{}".format(self.modelName + '.model'))

    def getOutput(self):
        """Write test-set probabilities to a CSV file under ``root``.

        Calls ``fit()`` first when no model is available yet. The validation
        AUC is computed here if training did not already set it, and is
        embedded in the output file name.
        """
        if self.model is None:
            self.fit()
        output = pd.DataFrame({
            'Id': self.test_id,
            'Probability': self.model.predict_proba(self.test)[:, 1],
        })
        if self.valAuc is None:
            # tune() and the load path of fit() do not set valAuc.
            self.valAuc = plotAuc(self.y_val, self.model.predict_proba(self.X_val)[:, 1])
        outputPath = os.path.join(root, 'output-{modelName}-{valAuc:.4f}.csv'.format(
            modelName=self.modelName,
            valAuc=self.valAuc))
        output.to_csv(outputPath, index=False)
        print("output: " + outputPath)