dtree.py
"""
Functions to build the decision tree including the ID3 heuristic
"""
import math
def entropy(data, targetAttr):
    """
    Given the target attribute and the data, calculates the entropy used by the ID3 heuristic.
    """
    class_freq = {}
    data_entropy = 0.0
    # Count the frequency of each class in the data set.
    for row in data:
        # If we already created an entry for the current class, increment it;
        if row[targetAttr] in class_freq:
            class_freq[row[targetAttr]] += 1.0
        else:
            # otherwise create an entry for the current class.
            class_freq[row[targetAttr]] = 1.0
    # Entropy of the data is the sum over all classes of -p * log2(p).
    for freq in class_freq.values():
        data_entropy += (-freq / len(data)) * math.log(freq / len(data), 2)
    return data_entropy
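# A minimal sanity check of entropy(), runnable as a script. The records
# below are invented for illustration: an even positive/negative split
# should give entropy 1.0, and a pure data set entropy 0.0.
if __name__ == "__main__":
    toy = [{"label": "yes"}, {"label": "no"}]
    assert abs(entropy(toy, "label") - 1.0) < 1e-9
    assert entropy([{"label": "yes"}] * 4, "label") == 0.0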
def gain(data, attribute, targetAttr):
    """
    Calculates the information gain from using the attribute to classify the data.
    """
    attribute_freq = {}
    subset_entropy = 0.0
    # Count the frequency of each value of the chosen attribute.
    for row in data:
        if row[attribute] in attribute_freq:
            attribute_freq[row[attribute]] += 1.0
        else:
            attribute_freq[row[attribute]] = 1.0
    # Weighted entropy of the subsets induced by splitting on the attribute.
    for attr in attribute_freq.keys():
        attr_prob = attribute_freq[attr] / sum(attribute_freq.values())
        data_subset = [row for row in data if row[attribute] == attr]
        subset_entropy += attr_prob * entropy(data_subset, targetAttr)
    # Gain is the overall entropy minus the weighted subset entropy.
    return entropy(data, targetAttr) - subset_entropy
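# A quick smoke test of gain(), again on invented data: splitting on an
# attribute that perfectly separates the two classes should recover the
# full entropy of the data set (1.0 here).
if __name__ == "__main__":
    toy = [{"outlook": "sunny", "label": "yes"},
           {"outlook": "rainy", "label": "no"}]
    assert abs(gain(toy, "outlook", "label") - 1.0) < 1e-9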
def create_decision_tree(examples, attributes, target_attribute, heuristic_function):
    """
    Creates the decision tree based on the ID3 heuristic passed in as heuristic_function.
    """
    # Make a copy of the data, since Python lists are mutable.
    examples = examples[:]
    # Values of the target attribute observed in the examples.
    vals = [record[target_attribute] for record in examples]
    # If we reach a leaf node with no examples left, or with the examples
    # split among multiple classes and no attributes left to split on,
    # fall back to the most frequent class.
    default = majority_value(examples, target_attribute)
    # If there are no examples left, or the attributes are exhausted
    # (attributes still includes the target attribute, hence the -1),
    # return the default classification.
    if not examples or (len(attributes) - 1) <= 0:
        return default
    # If all records in the data set have the same classification
    # (positive 1 or negative 0), return that classification.
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    else:
        # Choose the best attribute to classify the data by, using the ID3 gain function.
        best = choose_attribute(examples, attributes, target_attribute, heuristic_function)
        # The tree is represented as a dict of dicts; instantiate the root with no children.
        tree = {best: {}}
        # For each possible value of the best attribute, create a subtree (child node).
        for val in get_values(examples, best):
            subtree = create_decision_tree(
                get_examples(examples, best, val),
                [attr for attr in attributes if attr != best],  # drop the chosen attribute
                target_attribute,
                heuristic_function)
            tree[best][val] = subtree
        return tree
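# For the toy weather data used in the demos below, the resulting tree is a
# nested dict like the following (attribute names and values are invented
# for illustration): each top-level key is an attribute, each second-level
# key is one of its values, and each leaf is a class label.
#
#   {"outlook": {"sunny": "no",
#                "overcast": "yes",
#                "rainy": {"windy": {"true": "no", "false": "yes"}}}}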
########################## Helper Functions #####################
"""
The following helper functions are used in creating the decision tree and
in using the tree for data classification. These are courtesy of
http://onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html
"""
def majority_value(data, target_attr):
    """
    Creates a list of all values in the target attribute for each record
    in the data list object, and returns the value that appears in this
    list most frequently.
    """
    data = data[:]
    return most_frequent([record[target_attr] for record in data])
def most_frequent(lst):
    """
    Returns the item that appears most frequently in the given list.
    """
    lst = lst[:]
    highest_freq = 0
    most_freq = None
    for val in unique(lst):
        if lst.count(val) > highest_freq:
            most_freq = val
            highest_freq = lst.count(val)
    return most_freq
def unique(lst):
    """
    Returns a list made up of the unique values found in lst, i.e. it
    removes the redundant values in lst.
    """
    lst = lst[:]
    unique_lst = []
    # Cycle through the list and add each value to the unique list only once.
    for item in lst:
        if item not in unique_lst:
            unique_lst.append(item)
    # Return the list with all redundant values removed.
    return unique_lst
def get_values(data, attr):
    """
    Creates a list of the values of the chosen attribute for each record
    in data, prunes out all of the redundant values, and returns the list.
    """
    data = data[:]
    return unique([record[attr] for record in data])
def choose_attribute(data, attributes, target_attr, fitness):
    """
    Cycles through all the attributes and returns the attribute with the
    highest information gain (equivalently, the lowest weighted subset entropy).
    """
    data = data[:]
    best_gain = 0.0
    best_attr = None
    for attr in attributes:
        gain = fitness(data, attr, target_attr)
        if gain >= best_gain and attr != target_attr:
            best_gain = gain
            best_attr = attr
    return best_attr
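# choose_attribute() simply scans the candidate attributes with the supplied
# fitness function; the records below are fabricated for the sake of a
# runnable example. "outlook" separates the classes perfectly, so it wins.
if __name__ == "__main__":
    toy = [{"outlook": "sunny", "windy": "true", "label": "yes"},
           {"outlook": "rainy", "windy": "true", "label": "no"}]
    assert choose_attribute(toy, ["outlook", "windy", "label"], "label", gain) == "outlook"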
def get_examples(data, attr, value):
    """
    Returns a list of all the records in <data> whose value of <attr>
    matches the given value.
    """
    data = data[:]
    rtn_lst = []
    if not data:
        return rtn_lst
    else:
        record = data.pop()
        if record[attr] == value:
            rtn_lst.append(record)
        rtn_lst.extend(get_examples(data, attr, value))
        return rtn_lst
def get_classification(record, tree):
    """
    Recursively traverses the decision tree and returns a classification
    for the given record.
    """
    # If the current node is a string, we have reached a leaf node and
    # can return it as our answer.
    if isinstance(tree, str):
        return tree
    # Otherwise traverse the tree further until a leaf node is found.
    else:
        attr = next(iter(tree))
        t = tree[attr][record[attr]]
        return get_classification(record, t)
def classify(tree, data):
    """
    Returns a list of classifications, one for each of the records in the
    data list, as determined by the given decision tree.
    """
    data = data[:]
    return [get_classification(record, tree) for record in data]
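# An end-to-end sketch: build a tree on a tiny, invented weather data set
# and classify the training records with it. Note that get_classification()
# assumes every attribute value seen at prediction time was also seen during
# training, since the tree has no branch for unseen values.
if __name__ == "__main__":
    training = [
        {"outlook": "sunny", "windy": "false", "play": "no"},
        {"outlook": "sunny", "windy": "true", "play": "no"},
        {"outlook": "overcast", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "windy": "true", "play": "no"},
    ]
    tree = create_decision_tree(training, ["outlook", "windy", "play"], "play", gain)
    print(tree)
    print(classify(tree, training))  # should reproduce the training labels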