
Commit 47ea180

Adding JVAP model runner
1 parent ec07018 commit 47ea180

File tree

3 files changed: +137, -535

CitationGraph/Untitled.ipynb (-140)

This file was deleted.

jvap.py (+137)

@@ -0,0 +1,137 @@
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# sklearn.grid_search and sklearn.cross_validation were removed in later
# scikit-learn releases; model_selection provides both pieces
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, make_scorer, f1_score, classification_report, average_precision_score
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, normalize
import multiprocessing
import datetime
from time import gmtime, strftime
from grid_search_funs import *

def print_log(log_str):
    """Append a timestamped entry to the log file and echo it to stdout."""
    log_file_name = "jvap_log.txt"
    with open(log_file_name, "a") as f:
        entry = strftime("%Y-%m-%d %H:%M:%S") + '\t' + str(log_str) + '\n'
        f.write(entry)
    print(entry[:-1])

df_x, df_y = get_data(1000)

df_x, df_y = remove_bad_rows(df_x, df_y)  # drops rows with codej1=codej2, codej2=nan
df_x = drop_unneeded_cols(df_x)  # drops unneeded cols
df_x = drop_dissent(df_x)  # drops dissent, concur columns

print_log((df_x.shape, df_y.shape))

df_x = dummify(df_x)
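
# dummify() lives in grid_search_funs, which is not part of this diff. A
# minimal sketch of what it plausibly does, assuming it one-hot encodes the
# categorical (object-dtype) columns with pandas (hypothetical implementation):
#
# def dummify(df):
#     obj_cols = df.select_dtypes(include=['object']).columns
#     return pd.get_dummies(df, columns=obj_cols)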

# GET X, Y AS NUMPY ARRAYS
X = df_x.values
y = df_y.iloc[:, 0].values  # .ix is long deprecated; .iloc selects the first column by position

# MAKE SURE Y LOOKS LIKE [1 1 1 ... 1 1] (SOMETIMES IT CAN STORE INDICES)
print_log((X.shape, y.shape))

print_log(X[:10])
print_log(y[:10])

#############################################
# Split into training and test set
#############################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# look at the size of df_x and X to make sure you have enough RAM
# (note: df.info() prints directly to stdout and returns None, so the log
# file itself will record "None" for this line)
print_log(df_x.info())
print_log(("Size of X in GB: ", (X.nbytes * 1.0) / (1024 * 1024 * 1024)))  # size of X in GB

# check sizes match
print_log((X_train.shape, y_train.shape))
print_log((X_test.shape, y_test.shape))

# DON'T DO FOR RANDOM FOREST (trees are invariant to monotonic feature scaling)
# #############################################
# # Standard scale
# #############################################

# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)


# DON'T DO FOR RANDOM FOREST
# #############################################
# # Min-Max scale
# #############################################

# scaler = MinMaxScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

#############################################
# [OPTIONAL]
# Random Forest Grid Search
#############################################

num_cores = multiprocessing.cpu_count()

print("numcores = ", num_cores)

# modify/add params here you want to search over
param_grid = {'n_estimators': [10, 50, 100, 150, 200], 'max_depth': [1, 5, 10, 15, 20, 25]}

rf_clf = RandomForestClassifier(random_state=42)

gridclf = grid_search(X=X_train, y=y_train, clf=rf_clf, param_grid=param_grid, n_jobs=num_cores)

print_log(gridclf.best_params_)
print_log(gridclf.best_score_)
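
# grid_search() comes from grid_search_funs, which this diff does not show.
# A minimal sketch, assuming it simply wraps the GridSearchCV imported above
# (the scoring metric and cv count here are guesses, not the project's values):
#
# def grid_search(X, y, clf, param_grid, n_jobs=1):
#     gridclf = GridSearchCV(clf, param_grid, scoring='f1', n_jobs=n_jobs, cv=3)
#     gridclf.fit(X, y)
#     return gridclf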

#############################################
# [OPTIONAL] Random Forest (RUN OVER BEST MODEL FROM GRID SEARCH)
#############################################

# Replace labels (in case SVM was run)
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.

rf_clf = RandomForestClassifier(random_state=42, **gridclf.best_params_)
# class_weight={1.0: 1, -1.0: 150})

rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_log(classification_report(y_test, y_pred))

#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print_log(t)
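
# get_top_n_feats() is also defined in grid_search_funs and is not shown in
# this diff. A plausible sketch, assuming it pairs feature names with their
# importances and returns the n largest (hypothetical implementation):
#
# def get_top_n_feats(n, importances, col_names):
#     ranked = sorted(zip(col_names, importances), key=lambda t: t[1], reverse=True)
#     return ranked[:n]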
