-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmstudio_python_train.py
81 lines (67 loc) · 2.6 KB
/
mstudio_python_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pickle

import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Print numpy arrays compactly: two decimal places, no scientific notation
np.set_printoptions(precision=2, suppress=True)
#----------
# Prepare data for training
#----------
# Get interval inputs
trainX_intv = dm_traindf[dm_interval_input]
# Get class inputs and convert every value to str (a missing value becomes
# the string 'nan', so it is encoded as a level of its own).
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map. The check is made on the class rather than the instance so
# that a data column that happens to be named 'map' cannot be mistaken for
# the method.
trainX_class = dm_traindf[dm_class_input]
if hasattr(type(trainX_class), 'map'):
    trainX_class = trainX_class.map(str)
else:
    trainX_class = trainX_class.applymap(str)
# Impute interval missing values to median; the fitted imputer is kept so the
# same medians can be applied to other data later
intv_imputer = SimpleImputer(strategy='median')
intv_imputer = intv_imputer.fit(trainX_intv)
trainX_intv_imp = intv_imputer.transform(trainX_intv)
# One-hot encode class inputs, unknown levels are set to all 0s
class_ohe = OneHotEncoder(handle_unknown="ignore")
class_ohe = class_ohe.fit(trainX_class)
# The encoder output is converted to a dense array so it can be concatenated
# with the imputed interval array
trainX_class_ohe = class_ohe.transform(trainX_class).toarray()
# Concatenate interval and class input arrays (interval columns first)
trainX = np.concatenate((trainX_intv_imp, trainX_class_ohe), axis=1)
trainy = dm_traindf[dm_dec_target]
#print(trainX)
#print(trainy)
#----------
# Train a model
#----------
# Random Forest settings: 100 trees, depth capped at 20, at least 5 samples per leaf
params = dict(n_estimators=100, max_depth=20, min_samples_leaf=5)
# Build the classifier and fit it on the training data in one step
# (fit returns the estimator itself)
dm_model = ensemble.RandomForestClassifier(**params).fit(trainX, trainy)
#print(dm_model)
#----------
# Create dm_scoreddf
#----------
# Run the full input data through the already-fitted imputer and encoder
fullX_intv = dm_inputdf[dm_interval_input]
fullX_intv_imp = intv_imputer.transform(fullX_intv)
# Convert every class value to str before encoding.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map. The check is made on the class rather than the instance so
# that a data column that happens to be named 'map' cannot be mistaken for
# the method.
fullX_class = dm_inputdf[dm_class_input]
if hasattr(type(fullX_class), 'map'):
    fullX_class = fullX_class.map(str)
else:
    fullX_class = fullX_class.applymap(str)
fullX_class_ohe = class_ohe.transform(fullX_class).toarray()
# Interval columns first, then one-hot columns
fullX = np.concatenate((fullX_intv_imp, fullX_class_ohe), axis=1)
# Score full data: posterior probabilities
# NOTE(review): predict_proba returns one column per entry of
# dm_model.classes_, in that order; this assumes dm_predictionvar lists the
# prediction variables in the same order -- confirm against the caller.
dm_scoreddf_prob = pd.DataFrame(dm_model.predict_proba(fullX), columns=dm_predictionvar)
# Score full data: class prediction
dm_scoreddf_class = pd.DataFrame(dm_model.predict(fullX), columns=[dm_classtarget_intovar])
# Column merge posterior probabilities and class prediction
dm_scoreddf = pd.concat([dm_scoreddf_prob, dm_scoreddf_class], axis=1)
print('***** 5 rows from dm_scoreddf *****')
print(dm_scoreddf.head(5))
print(dm_input)
print(', '.join(dm_input))
#----------
# Results
#----------
# Save VariableImportance to CSV
# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out is its replacement. The old method is
# only used on releases that predate the new one.
# NOTE(review): get_feature_names_out prefixes each level with the fitted
# column name, while the old no-argument call used generic x0, x1, ...
# prefixes -- confirm nothing downstream depends on the old labels.
if hasattr(class_ohe, 'get_feature_names_out'):
    class_feature_names = class_ohe.get_feature_names_out()
else:
    class_feature_names = class_ohe.get_feature_names()
# One name per model input column: interval inputs first, then the one-hot
# encoded class levels
full_input_vars = list(dm_interval_input) + list(class_feature_names)
varimp = pd.DataFrame(list(zip(full_input_vars, dm_model.feature_importances_)), columns=['Variable Name', 'Importance'])
varimp.to_csv(dm_nodedir + '/rpt_var_imp.csv', index=False)
#----------
# Build composite pickle file
#----------
# The three fitted objects are written back-to-back into a single file, so a
# reader has to call pickle.load three times in this same order:
# imputer, one-hot encoder, model.
with open(dm_pklpath, 'wb') as f:
    pickle.dump(intv_imputer, f)
    pickle.dump(class_ohe, f)
    pickle.dump(dm_model, f)