This repository has been archived by the owner on Oct 4, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
pipeline.py
113 lines (94 loc) · 3.77 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import math
import numpy as np
import sys
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
import pandas as pd
from os.path import join
from src.dump_results import dump, DummyModel
from sklearn.tree import DecisionTreeRegressor
from src.pre_process import process_level2, load_binary
def run():
    """Train one grid-searched Lasso model per symptom and dump its output.

    Steps, in order:

    1. Load the raw data with ``load_binary`` and build the feature/target
       tables with ``process_level2``.
    2. Drop the ``user_id`` column from the ``'X'`` and ``'X_all'`` tables
       and replace missing values with 0 in ``X``, ``X_all`` and ``Y``.
    3. (Re)create ``result.txt`` containing only the CSV header line.
    4. For every symptom, select that symptom's target columns, tune the
       Lasso ``alpha`` with ``GridSearchCV`` and hand the fitted search
       object to ``dump`` together with ``X_all`` and the per-user
       expected cycle lengths read from ``data/cycles0.csv``.

    Returns:
        None.
    """
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)

    # Training features: the identifier column is not a feature, and NaNs
    # are zero-filled before fitting.
    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    # Targets for all symptoms; missing values are zero-filled in place.
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    # Feature matrix handed to ``dump`` for every symptom.
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']

    # Start from a fresh result file that holds only the header. The handle
    # is closed before any model is trained.
    # NOTE(review): presumably ``dump`` appends its rows to this file --
    # confirm in src.dump_results.
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    # The expected cycle length per user does not depend on the symptom,
    # so it is loaded once instead of once per loop iteration.
    data_dir = 'data'
    cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
    c_length = dict(zip(cycles0.user_id.values, cycles0.expected_cycle_length))

    for symptom in symptoms:
        print(symptom)

        # Select the target columns of the current symptom. This assignment
        # was missing, so ``model.fit`` below failed with NameError on
        # ``s_Y``.
        # NOTE(review): assumes each column label of Y is a tuple-like whose
        # second element is the symptom name -- confirm against
        # process_level2.
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            ('estimator', Lasso()),
        ])

        param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
        model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                             verbose=2)
        model.fit(X, s_Y.values)

        print("dumping...")
        dump(symptom, model, X_all, c_length, data['users'].user_id)
# Script entry point: execute the training pipeline only when this file is
# run directly, not when it is imported as a module.
if __name__ == "__main__":
    run()
"""
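const -- archived experiment (not executed): constant-prediction baseline using DummyModel, grid-searched over `constant`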
s_Y = Y[[x for x in cols if x[1] == symptom]]
pipeline = DummyModel()
param_grid = {'constant': [math.pow(10, x) for x in range(-10, -1)]}
model = GridSearchCV(pipeline, param_grid = param_grid,
verbose=2)
model.fit(X, s_Y.values.astype(int))
print(model.best_params_)
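LASSOGS -- archived experiment (not executed): Lasso pipeline with grid search over alpha, same setup as in run()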
pipeline = Pipeline([
('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
#('standard_scale', StandardScaler()),
('estimator', Lasso()),
])
param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
model = GridSearchCV(pipeline, param_grid = param_grid, n_jobs = 4,
verbose=2)
model.fit(X, s_Y.values)
symptoms=['happy']
for symptom in symptoms:
print(symptom)
s_Y = Y[[x for x in cols if x[1] == symptom]]
print("Lasso")
pipeline = Pipeline([
('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
('standard_scale', MaxAbsScaler()),
#('pca', PCA()),
('estimator', SVR(kernel='rbf')),
])
param_grid = {'estimator__gamma': [math.pow(2, x) for x in range(-5, 2)],
'estimator__C': [math.pow(2, x) for x in range(-5, 2)]}
#'pca__n_components': [5]}
model = GridSearchCV(pipeline,
param_grid,
verbose=10,
cv=5,
n_jobs=1
)
model.fit(X, s_Y.values)
model.best_score_
"""