decision_tree.py
import numpy as np


class Tree():
def __init__(self, value=None, attribute_name="root", attribute_index=None, branches=None):
"""
This class implements a tree structure with multiple branches at each node.
If self.branches is an empty list, this is a leaf node and what is contained in
self.value is the predicted class.
The defaults for this are for a root node in the tree.
Arguments:
branches (list): List of Tree classes. Used to traverse the tree. In a
binary decision tree, the length of this list is either 2 (for left and
right branches) or 0 (at a leaf node).
attribute_name (str): Contains name of attribute that the tree splits the data
on. Used for visualization (see `DecisionTree.visualize`).
            attribute_index (int): Contains the index of the feature vector for the
given attribute. Should match with self.attribute_name.
value (number): Contains the value that data should be compared to along the
given attribute.
"""
self.branches = [] if branches is None else branches
self.attribute_name = attribute_name
self.attribute_index = attribute_index
self.value = value
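

# Example (hypothetical usage, not part of the original file): an internal node that splits
# on feature column 0, with two leaf branches whose `value` holds the predicted class.
#
#   node = Tree(attribute_name="outlook", attribute_index=0)
#   node.branches = [Tree(value=False), Tree(value=True)]

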
class DecisionTree():
def __init__(self, attribute_names):
"""
This class creates a DecisionTree using the Id3 algorithm.
Input:
            attribute_names: list of strings of the attribute names
Data Members:
attribute_names : stores the attribute_names
            tree: contains the nested tree structure created from the Tree class.
                New Tree nodes are appended as new branches are needed.
"""
self.attribute_names = attribute_names
self.tree = None
def _check_input(self, features):
if features.shape[1] != len(self.attribute_names):
raise ValueError(
"Number of features and number of attribute names must match!"
)
def addnewtree(self, root, name, col, val):
"""
Creates a new branch, which is itself a Tree, given a root to append it to.
Inputs:
root: Which previous node to append the branch to
name: stores the attribute name that this node is split on
col: stores the attribute column that this node is split on
            val: most common class; this is the classification for this branch if it's a leaf
        """
        branch = Tree(attribute_name=name, attribute_index=col, value=val)
root.branches.append(branch)
def pickatt(self, features, targets, attributes):
"""
Loops through remaining attributes to figure out which one has the highest information gain.
Inputs:
features: NxK array of features and examples to loop through
targets: Nx1 array of classifications for each example
attributes: list of attribute names to loop through
"""
        max_information_gain = -.1
        node_attribute_name = ''
        node_att_col = 0
        if len(attributes) == 1:
            # Only one attribute remains, so there is nothing to compare against.
            node_attribute_name = attributes[0]
            node_att_col = 0
        else:
            # Column y of `features` corresponds to attributes[y], because feature columns
            # are deleted in step with attribute names as the tree is built.
            for y in range(len(attributes)):
                info_gain = information_gain(features, y, targets)
                if info_gain > max_information_gain:
                    max_information_gain = info_gain
                    node_attribute_name = attributes[y]
                    node_att_col = y
        return node_attribute_name, node_att_col
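
    # For example (illustrative only): with features = [[0, 0], [0, 1], [1, 0], [1, 1]],
    # targets = [0, 0, 1, 1], and attributes = ["a", "b"], column 0 predicts the target
    # perfectly, so pickatt returns ("a", 0).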
    def splitvar(self, features, targets, column, value):
        """
        Returns the subset of examples whose `column` attribute equals `value`,
        with that column removed from the returned feature array.
        """
        rows = np.where(features[:, column] == value)
        newfeatures = features[rows[0], :]
        newfeatures = np.delete(newfeatures, column, 1)
        newtargets = targets[rows]
        return newfeatures, newtargets
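
    # For example (illustrative only): with features = [[0, 1], [1, 0], [1, 1]] and
    # targets = [0, 1, 1], splitvar(features, targets, column=0, value=1) keeps rows 1
    # and 2 and drops column 0, returning ([[0], [1]], [1, 1]).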
    def most_common_class(self, targets):
        """Returns True if 1s are at least as common as 0s in `targets`, otherwise False."""
        count_true = np.count_nonzero(targets == 1)
        count_false = np.count_nonzero(targets == 0)
        most_common_class = True if count_true >= count_false else False
        return most_common_class
    def ID3(self, features, targets, attributes, root, default=None):
        """
        Recursively builds the tree. At each step the attribute with the highest
        information gain is chosen, the data are split on its 0/1 values, and a leaf
        is added whenever a split is pure or no attributes remain. (`default` is kept
        for API compatibility but is not used directly.)
        """
        node_attribute_name, node_att_col = self.pickatt(features, targets, attributes)
        for x in range(2):
            newfeatures, newtargets = self.splitvar(features, targets, node_att_col, x)
            if newfeatures.shape[0] == 0:
                # No examples take this value: add a leaf with the parent's majority class
                # so that prediction never walks off the end of root.branches.
                self.addnewtree(root, node_attribute_name + "leaf", node_att_col, self.most_common_class(targets))
                continue
            value = self.most_common_class(newtargets)
            if (newfeatures.shape[1] == 0) or (len(np.unique(newtargets)) == 1):
                # Pure split or no attributes left: add a leaf holding the predicted class.
                leaf_name = node_attribute_name + "leaf"
                self.addnewtree(root, leaf_name, node_att_col, value)
            else:
                # Otherwise add an internal node and recurse on the remaining attributes.
                self.addnewtree(root, node_attribute_name, node_att_col, x)
                newroot = root.branches[x]
                newattributes = attributes.copy()
                newattributes.remove(node_attribute_name)
                self.ID3(newfeatures, newtargets, newattributes, newroot, self.most_common_class(newtargets))
def fit(self, features, targets):
"""
Takes in the features as a numpy array and fits a decision tree to the targets.
Args:
features (np.array): numpy array of size NxF containing features, where N is
number of examples and F is number of features.
targets (np.array): numpy array containing class labels for each of the N
examples.
"""
        self._check_input(features)
        if self.tree is None:
            self.tree = Tree()
        if len(self.attribute_names) == 0:  # check there are attributes left
            return self
        if len(np.unique(targets)) == 1:
            # All examples share one class: make the root a leaf predicting that class.
            most_common_class = self.most_common_class(targets)
            self.tree.attribute_name = most_common_class
            self.tree.value = most_common_class
            return self
        self.ID3(features, targets, self.attribute_names, self.tree, default=None)
        return self
    def findbranches(self, root, example):
        """
        Walks the tree for a single example and returns the predicted class stored at
        the leaf the example reaches.
        """
        if root.branches == []:
            # Leaf node: the stored value is the prediction.
            return root.value
        else:
            attribute_index = root.branches[0].attribute_index
            example_val = example[attribute_index]
            if example_val > 1:
                # Values greater than 1 are clipped so they follow the "1" branch.
                example_val = 1
            newroot = root.branches[int(example_val)]
            newexample = np.delete(example, attribute_index)
            return self.findbranches(newroot, newexample)
def predict(self, features):
"""
Takes in features as a numpy array and predicts classes for each point using
the trained model.
Args:
features (np.array): numpy array of size NxF containing features, where N is
                number of examples and F is number of features.
        Returns:
            np.array of length N with the predicted class (0 or 1) for each example.
        """
self._check_input(features)
numexamples = features.shape[0]
targets = np.zeros(numexamples)
for row in range(numexamples):
example = features[row,:]
prediction = self.findbranches(self.tree, example)
targets[row] = int(prediction)
return targets
def entropy(positives, negatives):
"""
The difference in entropy is the characteristic used to measure information gain.
This function will calculate entropy for an array of outputs. This is a helper function for
the information gain function below.
Inputs:
positives: int, the number of examples classified as 1/True in your targets array
negatives: int, the number of examples classified as 0/False in your targets array
entropy = sum_t[-P_t*log_2(P_t)]
"""
    total = positives + negatives
    if total == 0:
        entropy = 0
    else:
        pos_percent = positives / total
        neg_percent = negatives / total
        if (pos_percent > 0) and (neg_percent > 0):
            entropy = -(pos_percent * np.log2(pos_percent)) - (neg_percent * np.log2(neg_percent))
        elif (pos_percent > 0) and (neg_percent == 0):
            entropy = -(pos_percent * np.log2(pos_percent))
        else:
            entropy = -(neg_percent * np.log2(neg_percent))
return entropy
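
# A few illustrative values for entropy (not part of the original file):
#   entropy(5, 5)  -> 1.0    (a 50/50 split is maximally uncertain)
#   entropy(10, 0) -> -0.0   (a pure node carries no uncertainty)
#   entropy(1, 3)  -> ~0.811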
def information_gain(features, attribute_index, targets):
"""
Calculates the information gain for each possible attribute. The attribute with the highest
information gain will be chosen to be the next split in the decision tree. This algorithm
    uses entropy to calculate information gain. This splits each attribute on every value it
    takes on, so it is targeted at binary/categorical variables rather than continuous ones.
Inputs:
features: NxK array of data of examples
        attribute_index: int, column number of the attribute to calculate information gain for
targets: Nx1 array of classifications for examples
Outputs:
information_gain: float, information gained for that variable if it were to be split on
"""
attribute = features[:,attribute_index]
levels = len(np.unique(attribute))
total = len(attribute)
entropy_after = 0
if (total == 0):
entropy_after = 0
else:
        # Assumes categorical attributes coded as 0, 1, ..., levels-1.
        for x in range(levels):
            total_level = np.count_nonzero(attribute == x)  # number of examples with this attribute value
            positives = np.count_nonzero((attribute == x) & (targets == 1))  # examples with this value and a positive label
            negatives = np.count_nonzero((attribute == x) & (targets == 0))  # examples with this value and a negative label
            assert total_level == positives + negatives  # check positives and negatives add up to the total for this value
            part_entropy = (total_level / total) * entropy(positives, negatives)  # weight this value's entropy by its frequency
            entropy_after = entropy_after + part_entropy  # accumulate the weighted entropy for this attribute
    entropy_before = entropy(np.count_nonzero(targets == 1), np.count_nonzero(targets == 0))
information_gain = entropy_before-entropy_after
return information_gain
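

# Minimal smoke test (illustrative only, not part of the original assignment). It assumes
# binary 0/1 features and targets; the tree should learn the logical AND of the two attributes.
if __name__ == "__main__":
    toy_features = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    toy_targets = np.array([0, 0, 0, 1])  # label is attribute_a AND attribute_b
    model = DecisionTree(attribute_names=["attribute_a", "attribute_b"])
    model.fit(toy_features, toy_targets)
    print(model.predict(toy_features))  # expected: [0. 0. 0. 1.]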