import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split  # sklearn.grid_search / sklearn.cross_validation were removed in sklearn 0.20
from sklearn.metrics import accuracy_score, make_scorer, f1_score, classification_report, average_precision_score
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, normalize
import multiprocessing
import datetime
from time import gmtime, strftime
from grid_search_funs import *

| 14 | + |
| 15 | +def print_log(log_str): |
| 16 | + |
| 17 | + log_file_name = "jvap_log.txt" |
| 18 | + |
| 19 | + with open(log_file_name, "a") as f: |
| 20 | + |
| 21 | + entry = strftime("%Y-%m-%d %H:%M:%S") + '\t' + str(log_str) + '\n' |
| 22 | + |
| 23 | + f.write(entry) |
| 24 | + |
| 25 | + print(entry[:-1]) |
| 26 | + |
| 27 | + |
df_x, df_y = get_data(1000)

df_x, df_y = remove_bad_rows(df_x, df_y)  # drop rows with codej1 == codej2 or codej2 == NaN
df_x = drop_unneeded_cols(df_x)           # drop unneeded columns
df_x = drop_dissent(df_x)                 # drop dissent, concur columns

print_log((df_x.shape, df_y.shape))

df_x = dummify(df_x)

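# get_data, remove_bad_rows, drop_unneeded_cols, drop_dissent, and dummify all
# come from grid_search_funs (star import above). dummify is assumed to one-hot
# encode the categorical columns, roughly:
#
# df_x = pd.get_dummies(df_x)
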
# GET X, Y AS NUMPY ARRAYS

X = df_x.values
y = df_y.iloc[:, 0].values  # .ix is deprecated; use positional indexing

# MAKE SURE Y LOOKS LIKE [1 1 1 ... 1 1] (SOMETIMES IT CAN STORE INDICES)

print_log((X.shape, y.shape))

print_log(X[:10])
print_log(y[:10])

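# Optional sanity check: y should hold a small set of class labels
# (e.g. {0, 1} or {-1, 1}), not row indices, as the comment above warns.
print_log(("unique labels in y:", np.unique(y)))
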
#############################################
# Split into training and test set
#############################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# look at size of df_x and X to make sure you have enough RAM

df_x.info()  # .info() prints to stdout and returns None, so don't pass it to print_log
print_log(("Size of X in GB: ", (X.nbytes * 1.0) / (1024 * 1024 * 1024)))

# check sizes match

print_log((X_train.shape, y_train.shape))
print_log((X_test.shape, y_test.shape))

# DON'T DO FOR RANDOM FOREST: tree-based models split on thresholds,
# so they are insensitive to monotonic feature scaling.

# #############################################
# # Standard scale
# #############################################

# scaler = StandardScaler()
# scaler.fit(X_train)

# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)


# DON'T DO FOR RANDOM FOREST

# #############################################
# # Min-Max scale
# #############################################

# scaler = MinMaxScaler()
# scaler.fit(X_train)

# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

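# A minimal sketch of the SVM path this script alludes to (SVC is imported but
# never used here). The kernel and C value are illustrative assumptions, and one
# of the scaler blocks above should be enabled first, since SVMs are sensitive
# to feature magnitudes:
#
# svm_clf = SVC(kernel='rbf', C=1.0, random_state=42)
# svm_clf.fit(X_train, y_train)
# print_log(classification_report(y_test, svm_clf.predict(X_test)))
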
#############################################
# [OPTIONAL]
# Random Forest Grid Search
#############################################

num_cores = multiprocessing.cpu_count()

print_log(("numcores = ", num_cores))

# modify/add params here you want to search over
param_grid = {'n_estimators': [10, 50, 100, 150, 200], 'max_depth': [1, 5, 10, 15, 20, 25]}


rf_clf = RandomForestClassifier(random_state=42)

gridclf = grid_search(X=X_train, y=y_train, clf=rf_clf, param_grid=param_grid, n_jobs=num_cores)

print_log(gridclf.best_params_)
print_log(gridclf.best_score_)

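# grid_search comes from grid_search_funs; a rough sketch of what it is assumed
# to do (the real implementation lives in that module) is wrap GridSearchCV and
# return the fitted search object:
#
# def grid_search(X, y, clf, param_grid, n_jobs):
#     gs = GridSearchCV(clf, param_grid, scoring=make_scorer(f1_score), n_jobs=n_jobs)
#     gs.fit(X, y)
#     return gs
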
#############################################
# [OPTIONAL] Random Forest (RUN OVER BEST MODEL FROM GRID SEARCH)
#############################################

# Replace labels (in case SVM was run with -1/1 labels)
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.


rf_clf = RandomForestClassifier(random_state=42, **gridclf.best_params_)
# e.g. add class_weight={1.0: 1, -1.0: 150} above to upweight the rare class

rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_log(classification_report(y_test, y_pred))

#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print_log(t)
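
# get_top_n_feats also comes from grid_search_funs; it is assumed to pair each
# importance with its column name and return the n largest, roughly:
#
# def get_top_n_feats(n, importances, col_names):
#     return sorted(zip(col_names, importances), key=lambda p: p[1], reverse=True)[:n]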