all_data_analysis.py
# Importing the libraries
import argparse
import json
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
# =================================================
col2drop = ["LTDP10", "REST10", "ESS_s1", "nsrrid", "TIMEINBED_mins"]  # all of the response variables plus ID columns, excluded from the feature matrix
# =================================================
# import wandb
# wandb.init(project="visualize-sklearn")
argparser = argparse.ArgumentParser()
argparser.add_argument('--wasoint', type=int, default=2)  # WASO threshold; selects the input CSV
argparser.add_argument('--threads', type=int, default=32)  # parallel workers for the hyperparameter search
argparser.add_argument('--targetcol', type=str, required=True)  # response column to predict
args = argparser.parse_args()
wasointerval = args.wasoint
threads = args.threads
targetcol = args.targetcol
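# Example invocation (column name is hypothetical; any response column works):
#   python all_data_analysis.py --wasoint 2 --threads 32 --targetcol ESS_s1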
def rf_optimizer(xtrain, ytrain):
    # Build the random search grid.
    # Number of trees in the forest
    # n_estimators = [int(x) for x in np.arange(start=50, stop=200, step=50)]
    n_estimators = [200, 300, 500]
    # Number of features to consider at every split
    max_features = [1.0, "sqrt", 0.5]
    # Maximum depth of each tree (None lets nodes expand until pure)
    max_depth = [int(x) for x in np.arange(2, 100, step=10)]
    max_depth.append(None)
    # Minimum number of samples required to split an internal node
    min_samples_split = [2, 3, 4, 5, 7, 8]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [2, 4, 8, 12]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf}
    # Use the random grid to search for the best hyperparameters:
    # sample 50 combinations, score each with 5-fold cross-validation,
    # and parallelize across `threads` workers.
    rf_regressor = RandomizedSearchCV(estimator=RandomForestRegressor(bootstrap=True),
                                      param_distributions=random_grid,
                                      n_iter=50, cv=5, verbose=3,
                                      random_state=42, n_jobs=threads)
    # Fit the random search model
    rf_regressor.fit(xtrain, ytrain)
    pprint(rf_regressor.best_params_)
    print("CV RESULTS")
    cv_results_df = pd.DataFrame(rf_regressor.cv_results_)
    # Sort by rank_test_score descending (rank 1, the best configuration, ends up last)
    cv_results_df.sort_values(by=['rank_test_score'], inplace=True, ascending=False)
    # Export the full CV results to a CSV file
    cv_results_df.to_csv('cv_results.csv', index=False)
    # wandb.sklearn.plot_learning_curve(rf_regressor.best_estimator_, xtrain, ytrain)
    return rf_regressor.best_params_
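# Note: with scikit-learn's default refit=True, RandomizedSearchCV also refits the
# best configuration on all of xtrain/ytrain and exposes it as
# rf_regressor.best_estimator_; rf_optimizer above returns only the parameter dict.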
# Import the dataset; the filename encodes the WASO threshold
filename = "csvdata/datafullnight2_SE_waso" + str(wasointerval) + ".csv"
if wasointerval == 0:
    filename = "csvdata/datafullnight2_SE.csv"
df = pd.read_csv(filename)
# print(df)
# Features are everything except the response/ID columns; the target is selected by name
X = df.drop(columns=col2drop)
y = df[targetcol].values
xtr, xtest, ytr, ytest = train_test_split(X.values, y, test_size=0.25, random_state=0)
print("RANDOM FOREST PARAMETER OPTIMIZATION")
rf_params = rf_optimizer(xtr, ytr)
# rf_params is json
# write to file
with open('optimized_params/rf_params_wasothreshold' + str(wasointerval) + '.json', 'w') as fp:
json.dump(rf_params, fp)
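# Possible follow-up (a sketch, not part of the original workflow): evaluate the
# tuned configuration on the held-out split, since xtest/ytest are otherwise unused.
# from sklearn.metrics import mean_squared_error
# best_rf = RandomForestRegressor(bootstrap=True, **rf_params).fit(xtr, ytr)
# print("Test MSE:", mean_squared_error(ytest, best_rf.predict(xtest)))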