import json
import os
import sys
import warnings
from math import log

import numpy as np


class Node(object):
    """Data structure representing a single node in the decision tree."""
    def __init__(self, right_tree=None, left_tree=None, isleaf=None, attribute=None,
                 attribute_idx=None, attribute_split_val=None,
                 positive_class_count=None, negative_class_count=None, depth=0):
        self.right_tree = right_tree                      # pointer to the right subtree
        self.left_tree = left_tree                        # pointer to the left subtree
        self.isleaf = isleaf                              # output class label if this is a leaf node, None otherwise
        self.attribute = attribute                        # attribute (word) to split on
        self.attribute_idx = attribute_idx                # column index of the attribute this node splits on
        self.attribute_split_val = attribute_split_val    # attribute value at which the split is made
        self.positive_class_count = positive_class_count  # positive class count at this node
        self.negative_class_count = negative_class_count  # negative class count at this node
        self.depth = depth                                # depth of this node in the tree

class DecisionTreeSolver:
    '''
    Class that implements the decision tree learning algorithm.
    '''
    # Stop-word list used to filter tokens out of the documents; email
    # messages are cluttered with markup and other noise, so this list
    # can be expanded as needed.
    stopwords = ['|', ' <a', '-->', '<!--']
    # dictionary mapping class names to numeric labels
    class_labels = {'spam': 1, 'notspam': 0}

    def __init__(self, max_depth=5):
        self.doc_vectors_binary = None
        self.doc_vectors_frequencies = None
        self.features = None
        self.max_depth = max_depth
        self.tree = None
        self.class_counts = {}
        # Example class_counts dictionary:
        # {
        #     'spam':    {'total_count': 100, 'word_counts': {'this': 4, 'is': 4, 'test': 4}},
        #     'notspam': {'total_count': 200, 'word_counts': {'YOU': 4, 'ARE': 4, 'SELECTED': 4}},
        # }

    def compute_entropy(self, data_array):
        '''
        Compute the entropy of a set of class labels.
        :param data_array: array whose last column holds the output class labels
        :return: entropy value in bits
        '''
        entropy_val = 0
        for c in (self.class_labels['spam'], self.class_labels['notspam']):
            p_x = float(len(data_array[data_array[:, -1] == c])) / len(data_array)
            if p_x != 0:
                entropy_val -= p_x * log(p_x, 2)  # Shannon entropy: -sum(p * log2(p))
        return entropy_val
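
    # Worked example (hypothetical labels): for labels [1, 1, 0, 0] the class
    # probabilities are p(spam) = p(notspam) = 0.5, so the entropy is
    #   -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0 bit,
    # the maximum for two classes; a node containing a single class has
    # entropy 0, which is what the splitting loop below tries to approach.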
    def build_word_counts_model(self, files_path):
        '''
        Build a word-counts model that counts, per class, the number of
        documents each word occurs in, populating class_counts from the
        training data.
        :param files_path: path to the directory with the training files
        :return: None; updates the class_counts dictionary on the object
        '''
        dirs = os.listdir(files_path)
        for class_dir_name in dirs:
            for f in os.listdir(os.path.join(files_path, class_dir_name)):
                document = os.path.join(files_path, class_dir_name, f)
                with open(document, 'r') as file:
                    contents = file.read()
                words = set(contents.split())  # each word counted at most once per document
                counts = self.class_counts.setdefault(class_dir_name, {})
                word_counts = counts.setdefault('word_counts', {})
                for word in words:
                    if word not in self.stopwords:
                        word_counts[word] = word_counts.get(word, 0) + 1
                counts['total_count'] = counts.get('total_count', 0) + 1

    def build_doc_to_words_vector(self, num_top_words, files_path):
        '''
        Build the feature vectors for every training document.
        :param num_top_words: number of top words, per class, used to build the
            feature vectors; that many words are taken from both the spam and
            notspam word-count lists
        :param files_path: path to the training corpus
        :return: None; populates self.features, self.doc_vectors_binary and
            self.doc_vectors_frequencies
        '''
        # build the top_words_list from the spam and notspam dictionaries
        spam_word_counts = self.class_counts.get('spam').get('word_counts')
        notspam_word_counts = self.class_counts.get('notspam').get('word_counts')
        num_spam_words = min(len(spam_word_counts), num_top_words)
        num_notspam_words = min(len(notspam_word_counts), num_top_words)
        top_spam_words_list = sorted(spam_word_counts, key=spam_word_counts.get, reverse=True)[:num_spam_words]
        top_notspam_words_list = sorted(notspam_word_counts, key=notspam_word_counts.get, reverse=True)[:num_notspam_words]
        top_words_list = list(set(top_spam_words_list + top_notspam_words_list))
        self.features = top_words_list
        # initialize empty feature matrices (the extra column holds the class label)
        self.doc_vectors_binary = np.empty((0, len(top_words_list) + 1), dtype=int)       # binary features
        self.doc_vectors_frequencies = np.empty((0, len(top_words_list) + 1), dtype=int)  # frequency features
        # process the input files and build the feature matrices
        dirs = os.listdir(files_path)
        for class_dir_name in dirs:
            print("Processing files in " + class_dir_name + " folder...")
            for f in os.listdir(os.path.join(files_path, class_dir_name)):
                document = os.path.join(files_path, class_dir_name, f)
                with open(document, 'r') as file:
                    contents = file.read()
                words1 = set(contents.split())
                bin_vec = [1 if top_word in words1 else 0 for top_word in top_words_list]
                self.doc_vectors_binary = np.append(
                    self.doc_vectors_binary,
                    np.reshape(bin_vec + [self.class_labels[class_dir_name]], (1, len(bin_vec) + 1)),
                    axis=0)
                words2 = contents.split()
                freq_vec = [words2.count(top_word) for top_word in top_words_list]
                self.doc_vectors_frequencies = np.append(
                    self.doc_vectors_frequencies,
                    np.reshape(freq_vec + [self.class_labels[class_dir_name]], (1, len(freq_vec) + 1)),
                    axis=0)
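
    # Example rows (hypothetical features): with features ['free', 'money',
    # 'meeting'] and a spam document whose text is "free money free", the
    # binary row is [1, 1, 0, 1] and the frequency row is [2, 1, 0, 1],
    # where the final column is the numeric class label (spam = 1, notspam = 0).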
    def get_majority_class(self, labels):
        # returns the majority class and its count for the labels array
        majority_class = None
        majority_count = -sys.maxsize
        for value in (self.class_labels['spam'], self.class_labels['notspam']):
            count = len(labels[labels == value])
            if count > majority_count:
                majority_class = value
                majority_count = count
        return majority_class, majority_count

    def decision_tree_learner(self, data_array, indices, parent_major_class, depth):
        # Recursive decision tree learner; follows the decision-tree learning
        # pseudocode from the Russell and Norvig AI textbook.
        if len(data_array) == 0:
            return Node(isleaf=parent_major_class, positive_class_count=0, negative_class_count=0, depth=depth)
        majority_class, majority_count = self.get_majority_class(data_array[:, -1])
        positive_class_count = len(data_array[data_array[:, -1] == self.class_labels['spam']])
        negative_class_count = len(data_array[data_array[:, -1] == self.class_labels['notspam']])
        # all examples have the same class: make a leaf
        if majority_count == len(data_array):
            return Node(isleaf=majority_class, positive_class_count=positive_class_count,
                        negative_class_count=negative_class_count, depth=depth)
        # no attributes left to split on: make a leaf with the majority class
        if len(indices) == 0:
            return Node(isleaf=majority_class, positive_class_count=positive_class_count,
                        negative_class_count=negative_class_count, depth=depth)
        # max_depth reached: stop growing the tree
        if depth == self.max_depth:
            return Node(isleaf=majority_class, positive_class_count=positive_class_count,
                        negative_class_count=negative_class_count, depth=depth)
        # Find the best attribute and value to split on
        lowest_entropy = sys.maxsize
        best_attr_index = None
        best_split_val = None
        # loop over all remaining word features (the last column holds the class label)
        for column in indices:
            unique_values = set(data_array[:, column])  # all unique values in this column
            for value in unique_values:
                cum_entropy = 0
                # split the rows on this value and compute the weighted entropy
                left_branch = data_array[data_array[:, column] >= value]
                right_branch = data_array[data_array[:, column] < value]
                if len(left_branch) > 0 and len(right_branch) > 0:
                    x1 = float(left_branch.shape[0]) / data_array.shape[0]
                    cum_entropy += x1 * self.compute_entropy(left_branch[:, [-1]])
                    x1 = float(right_branch.shape[0]) / data_array.shape[0]
                    cum_entropy += x1 * self.compute_entropy(right_branch[:, [-1]])
                    # keep the split with the lowest weighted entropy
                    if cum_entropy < lowest_entropy:
                        lowest_entropy = cum_entropy
                        best_attr_index = column
                        best_split_val = value
        # no value splits the data into two non-empty branches: make a leaf
        if best_attr_index is None:
            return Node(isleaf=majority_class, positive_class_count=positive_class_count,
                        negative_class_count=negative_class_count, depth=depth)
        tree = Node(attribute=self.features[best_attr_index], attribute_idx=best_attr_index,
                    attribute_split_val=best_split_val,
                    positive_class_count=positive_class_count, negative_class_count=negative_class_count,
                    depth=depth)
        # give each branch its own copy of the remaining indices so the
        # recursive calls do not interfere with each other
        remaining_indices = [i for i in indices if i != best_attr_index]
        left_branch = data_array[data_array[:, best_attr_index] >= best_split_val]
        tree.left_tree = self.decision_tree_learner(left_branch, list(remaining_indices), majority_class, depth + 1)
        right_branch = data_array[data_array[:, best_attr_index] < best_split_val]
        tree.right_tree = self.decision_tree_learner(right_branch, list(remaining_indices), majority_class, depth + 1)
        return tree
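
    # Worked split example (hypothetical numbers): suppose a column splits 10
    # rows into a left branch of 4 rows that are all spam (entropy 0) and a
    # right branch of 6 rows with 3 spam / 3 notspam (entropy 1). The weighted
    # entropy is 0.4 * 0 + 0.6 * 1 = 0.6, and the split with the smallest such
    # value across all columns and values is the one chosen above.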
    def print_tree(self, tree, side=None):
        # pretty-prints the tree, indenting each level by its depth
        print_string = ""
        if side == "Left":
            print_string = "(value == True )"
        if side == "Right":
            print_string = "(value == False)"
        if tree.isleaf is None:
            print("=====" * tree.depth + print_string + " :: Split on :: " + tree.attribute + " >= " + str(tree.attribute_split_val))
            self.print_tree(tree.left_tree, side="Left")
            self.print_tree(tree.right_tree, side="Right")
        else:
            print("=====" * tree.depth + print_string + " :: Classify as :: " + str(tree.isleaf))

    def convert_tree_to_json(self, tree):
        # recursively converts the tree into nested dictionaries for JSON
        # serialization; numpy integers are cast to plain ints because the
        # json module cannot serialize them
        node_dict = {
            "isleaf": None if tree.isleaf is None else int(tree.isleaf),
            "attribute": tree.attribute,
            "attribute_idx": tree.attribute_idx,
            "attribute_split_val": None if tree.attribute_split_val is None else int(tree.attribute_split_val),
            "positive_class_count": tree.positive_class_count,
            "negative_class_count": tree.negative_class_count,
            "depth": tree.depth,
        }
        if tree.isleaf is not None:
            # leaf node: both subtrees are None
            node_dict["left_tree"] = tree.left_tree
            node_dict["right_tree"] = tree.right_tree
        else:
            node_dict["left_tree"] = self.convert_tree_to_json(tree.left_tree)
            node_dict["right_tree"] = self.convert_tree_to_json(tree.right_tree)
        return node_dict

    def train(self, files_path, model_file):
        print("Training the decision tree algorithm on the given dataset...")
        self.build_word_counts_model(files_path)
        self.build_doc_to_words_vector(1000, files_path)
        print("Training a model using binary features...")
        tree_binary = self.decision_tree_learner(data_array=self.doc_vectors_binary,
                                                 indices=list(range(self.doc_vectors_binary.shape[1] - 1)),
                                                 parent_major_class=None, depth=1)
        print("Training a model using frequency features...")
        tree_freq = self.decision_tree_learner(data_array=self.doc_vectors_frequencies,
                                               indices=list(range(self.doc_vectors_frequencies.shape[1] - 1)),
                                               parent_major_class=None, depth=1)
        print("Decision tree learnt using binary features...")
        print("#####################################################")
        self.print_tree(tree_binary)
        print("#####################################################")
        print("Decision tree learnt using frequency-based features...")
        print("#####################################################")
        self.print_tree(tree_freq)
        print("#####################################################")
        self.save_model_to_file(model_file, tree_binary, tree_freq)

    def save_model_to_file(self, file_name, tree_binary, tree_freq):
        '''
        Saves both models to a file so they can be loaded later for prediction.
        :param file_name: path of the file to write the JSON models to
        :param tree_binary: tree trained on binary features
        :param tree_freq: tree trained on frequency features
        :return:
        '''
        print("Saving the models to the specified file...")
        model_dir = os.path.dirname(file_name)
        if model_dir and not os.path.exists(model_dir):
            os.makedirs(model_dir)
        json_tree_bin = self.convert_tree_to_json(tree_binary)
        json_tree_freq = self.convert_tree_to_json(tree_freq)
        with open(file_name, 'w+') as filehandle:
            json.dump({"bin_tree": json_tree_bin, "freq_tree": json_tree_freq},
                      filehandle, sort_keys=True, indent=4, ensure_ascii=False)

    def convert_json_to_tree(self, json_tree):
        # recursively converts the model read from the file back into the Node
        # structure used at prediction time
        if json_tree.get("isleaf") in (0, 1):
            # leaf node: its subtrees are None
            return Node(
                right_tree=json_tree.get("right_tree"),
                left_tree=json_tree.get("left_tree"),
                isleaf=json_tree.get("isleaf"),
                attribute=json_tree.get("attribute"),
                attribute_idx=json_tree.get("attribute_idx"),
                attribute_split_val=json_tree.get("attribute_split_val"),
                positive_class_count=json_tree.get("positive_class_count"),
                negative_class_count=json_tree.get("negative_class_count"),
                depth=json_tree.get("depth")
            )
        else:
            left_tree = self.convert_json_to_tree(json_tree.get("left_tree"))
            right_tree = self.convert_json_to_tree(json_tree.get("right_tree"))
            return Node(
                right_tree=right_tree,
                left_tree=left_tree,
                isleaf=json_tree.get("isleaf"),
                attribute=json_tree.get("attribute"),
                attribute_idx=json_tree.get("attribute_idx"),
                attribute_split_val=json_tree.get("attribute_split_val"),
                positive_class_count=json_tree.get("positive_class_count"),
                negative_class_count=json_tree.get("negative_class_count"),
                depth=json_tree.get("depth")
            )

    def load_model_from_file(self, file_name):
        '''
        Loads a previously saved model.
        :param file_name: path of the JSON model file
        :return: tuple of (binary-feature tree, frequency-feature tree)
        '''
        with open(file_name, 'r') as filehandle:
            json_tree = json.load(filehandle)
        tree_bin = self.convert_json_to_tree(json_tree["bin_tree"])
        tree_freq = self.convert_json_to_tree(json_tree["freq_tree"])
        return (tree_bin, tree_freq)

    def predict_for_input(self, tree, word_vector):
        # walk down the tree until a leaf node is reached
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            while tree.isleaf is None:
                if word_vector.count(tree.attribute) >= tree.attribute_split_val:
                    tree = tree.left_tree
                else:
                    tree = tree.right_tree
        return tree.isleaf
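
    # Traversal example (hypothetical tree): for a root that splits on the
    # word 'free' at value 1, a document containing 'free' follows the left
    # subtree (count >= split value) and one without it follows the right
    # subtree, repeating until a leaf supplies the class label.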
    def predict(self, files_path, model_file):
        print("Predicting with the decision tree models on the given test set...")
        (tree_bin, tree_freq) = self.load_model_from_file(model_file)
        dirs = os.listdir(files_path)
        print("Predicting using the binary features model...")
        print("#############################################################")
        for class_dir_name in dirs:
            total_test_cases = 0
            correct_predictions = 0
            for f in os.listdir(os.path.join(files_path, class_dir_name)):
                document = os.path.join(files_path, class_dir_name, f)
                total_test_cases += 1
                with open(document, 'r') as file:
                    contents = file.read()
                words = list(set(contents.split()))  # unique words, so counts are 0/1
                predicted_class = self.predict_for_input(tree_bin, words)
                if predicted_class == self.class_labels[class_dir_name]:
                    correct_predictions += 1
            print("Prediction accuracy for class: %s" % class_dir_name)
            print("### Total test cases: %d" % total_test_cases)
            print("### Correct classifications: %d" % correct_predictions)
            print("### Accuracy: %.2f" % (float(correct_predictions) / total_test_cases))
            print("#############################################################")
        print("Predicting using the frequency-based features model...")
        print("#############################################################")
        for class_dir_name in dirs:
            total_test_cases = 0
            correct_predictions = 0
            for f in os.listdir(os.path.join(files_path, class_dir_name)):
                document = os.path.join(files_path, class_dir_name, f)
                total_test_cases += 1
                with open(document, 'r') as file:
                    contents = file.read()
                words = contents.split()  # keep duplicates so counts are true frequencies
                predicted_class = self.predict_for_input(tree_freq, words)
                if predicted_class == self.class_labels[class_dir_name]:
                    correct_predictions += 1
            print("Prediction accuracy for class: %s" % class_dir_name)
            print("### Total test cases: %d" % total_test_cases)
            print("### Correct classifications: %d" % correct_predictions)
            print("### Accuracy: %.2f" % (float(correct_predictions) / total_test_cases))
            print("#############################################################")