-
Notifications
You must be signed in to change notification settings - Fork 14
/
RFTest.py
96 lines (74 loc) · 3.41 KB
/
RFTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import confusion_matrix
from constructors.ensemble import RFClassification
from data.load_all_datasets import load_all_datasets
import numpy as np
from decisiontree import DecisionTree
from refined_rf import RefinedRandomForest
rf = RFClassification()
NR_FOLDS = 5
def _convert_to_tree(dt, features):
"""Convert a sklearn object to a `decisiontree.decisiontree` object"""
n_nodes = dt.tree_.node_count
children_left = dt.tree_.children_left
children_right = dt.tree_.children_right
feature = dt.tree_.feature
threshold = dt.tree_.threshold
classes = dt.classes_
# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes)
decision_trees = [None] * n_nodes
for i in range(n_nodes):
decision_trees[i] = DecisionTree()
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
node_depth[node_id] = parent_depth + 1
# If we have a test node
if children_left[node_id] != children_right[node_id]:
stack.append((children_left[node_id], parent_depth + 1))
stack.append((children_right[node_id], parent_depth + 1))
else:
is_leaves[node_id] = True
for i in range(n_nodes):
if children_left[i] > 0:
decision_trees[i].left = decision_trees[children_left[i]]
if children_right[i] > 0:
decision_trees[i].right = decision_trees[children_right[i]]
if is_leaves[i]:
decision_trees[i].label = dt.classes_[np.argmax(dt.tree_.value[i][0])]
decision_trees[i].value = None
else:
decision_trees[i].label = features[feature[i]]
decision_trees[i].value = threshold[i]
return decision_trees[0]
for dataset in load_all_datasets():
df = dataset['dataframe']
label_col = dataset['label_col']
feature_cols = dataset['feature_cols']
skf = StratifiedKFold(df[label_col], n_folds=NR_FOLDS, shuffle=True, random_state=1337)
for fold, (train_idx, test_idx) in enumerate(skf):
print 'Fold', fold+1, '/', NR_FOLDS, 'for dataset', dataset['name']
train = df.iloc[train_idx, :].reset_index(drop=True)
X_train = train.drop(label_col, axis=1)
y_train = train[label_col]
test = df.iloc[test_idx, :].reset_index(drop=True)
X_test = test.drop(label_col, axis=1)
y_test = test[label_col]
rf.construct_classifier(train, feature_cols, label_col)
for estimator in rf.clf.estimators_:
print estimator.tree_
print _convert_to_tree(estimator, feature_cols)
predictions = rf.evaluate_multiple(X_test).astype(int)
conf_matrix = confusion_matrix(y_test, predictions)
print conf_matrix
diagonal_sum = sum(
[conf_matrix[i][i] for i in range(len(conf_matrix))])
norm_diagonal_sum = sum(
[float(conf_matrix[i][i]) / float(sum(conf_matrix[i])) for i in
range(len(conf_matrix))])
total_count = np.sum(conf_matrix)
print 'Accuracy:', float(diagonal_sum) / float(total_count)
print 'Balanced accuracy:', float(norm_diagonal_sum) / float(conf_matrix.shape[0])