-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnaive_bayes_baseline.py
executable file
·155 lines (134 loc) · 4.53 KB
/
naive_bayes_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/python
import pandas as pd
import numpy as np
import json
from collections import Counter
import re
import math
def process_text(text_data):
    """Count the whitespace-separated tokens of one review.

    text_data is split with str.split() (any run of whitespace is a
    separator) and the result is a collections.Counter mapping each token
    to the number of times it occurs.
    """
    return Counter(text_data.split())
print("Start loading data")
# Reviews are read one JSON object per line and consumed in chunks of 1000.
# Chunks completed within the first 2,000,000 lines are used for training;
# every row that is never flushed into a training chunk (the tail of the
# file) stays in rows_dict and becomes the test set.
# This code is so that we can work with a small dataset first.
# Will need to be updated
ind = 1
total_reviews = 0
# category_reviews[k] is the number of training reviews with k + 1 stars.
category_reviews = [0, 0, 0, 0, 0]
rows_dict = []
# summary_words[k] accumulates the token counts of all (k + 1)-star reviews.
summary_words = [Counter() for _ in category_reviews]
with open('yelp_academic_dataset_review.json', 'r') as f:
    for line in f:
        rows_dict.append(json.loads(line))
        if ind % 1000 == 0 and ind <= 2000000:
            training_data = pd.DataFrame(rows_dict)
            rows_dict = []
            for name, group in training_data.groupby('stars'):
                # Index by the star value itself, not by the position of the
                # group: a chunk that happens to contain no reviews for some
                # rating must not shift the remaining ratings into the
                # wrong slot.
                idx = int(name) - 1
                category_reviews[idx] += len(group['text'])
                total_reviews += len(group['text'])
                for text in group['text']:
                    summary_words[idx].update(process_text(text))
        ind += 1
print(ind)
print("End loading data")
test_data = pd.DataFrame(rows_dict)
print("Compute priors from number of samples from each class")
# prior[k] is the fraction of all training reviews that fall in class k.
prior = []
for class_count in category_reviews:
    prior.append(float(class_count) / total_reviews)
    print(prior[-1])
print("Remove all words that don't start with a character, and contain values other than characters or digits as well as some special characters. Print size of word cloud")
# A token is kept only if it starts with a letter and is followed by at
# least one more letter, digit, apostrophe or exclamation mark.
token_pattern = re.compile(r'^[A-Za-z][A-Za-z\'!0-9]+$')
curated_summary = []
for class_counts in summary_words:
    word_cloud = dict(
        (word, count)
        for word, count in class_counts.items()
        if token_pattern.match(word)
    )
    print(len(word_cloud))
    curated_summary.append(word_cloud)
# Remove stop words and create a stop words dict
# stop_words maps every line of stopwords_en.txt (line ending stripped) to 1;
# it is used purely for membership tests.
stop_words = {}
# The with-block guarantees the file is closed even if reading fails.
with open('stopwords_en.txt', 'r') as f:
    for line in f:
        w = line.rstrip('\r\n')
        stop_words[w] = 1
# Drop every stop word from each class dictionary.
for word_cloud in curated_summary:
    for w in stop_words:
        word_cloud.pop(w, None)
print("Remove stop words. Print dictionary size")
for word_cloud in curated_summary:
    print(len(word_cloud))
print("For common feature set, generate union of words in all categories")
# Collect every word seen in any class, then make sure each class dictionary
# has an entry for it; words a class never saw get a count of 0.
vocabulary = set()
for word_cloud in curated_summary:
    vocabulary.update(word_cloud)
for word_cloud in curated_summary:
    for word in vocabulary:
        word_cloud.setdefault(word, 0)
print("Number of words considered")
print(len(curated_summary[1]))
print("Compute conditionals. Use one-laplace smoothing")
# Perform one laplace smoothing
# Each count becomes (count + 1) / (vocabulary size + total count of the
# class), so words with a count of 0 still get a non-zero probability.
total_word_counts = []
for ind, word_cloud in enumerate(curated_summary):
    denominator = len(word_cloud) + sum(word_cloud.values())
    curated_summary[ind] = dict(
        (word, float(count + 1) / denominator)
        for word, count in word_cloud.items()
    )
print("Training phase complete")
print("Test phase")
# Tokens must start with a letter followed by at least one letter, digit,
# apostrophe or exclamation mark; compiled once instead of once per word.
_TOKEN_RE = re.compile(r'^[A-Za-z][A-Za-z\'!0-9]+$')


def predict_text(grp):
    """Predict a 1-based class label for every review in grp.

    grp is an iterable of review strings. For each review the score of a
    class starts at the log of its prior (module-level ``prior``) and the
    log conditional probability (module-level ``curated_summary``) of every
    usable word is added. Words are skipped when they do not match the
    token pattern, are in ``stop_words``, or are not in the vocabulary.

    Returns a list with one label per review: the index of the
    highest-scoring class plus one. Ties go to the lowest class index.
    """
    predictions = []
    for review in grp:
        # A class with no training reviews has prior 0; log(0) is undefined
        # (math.log raises ValueError), so score it as -inf, which means it
        # can never beat a class that was actually observed.
        classes = [math.log(p) if p > 0 else float('-inf') for p in prior]
        for word in review.split():
            if not _TOKEN_RE.match(word):
                continue
            if word in stop_words:
                continue
            # All class dictionaries share one vocabulary, so checking the
            # first one is enough.
            if word not in curated_summary[0]:
                continue
            for ind, word_cloud in enumerate(curated_summary):
                classes[ind] += math.log(word_cloud[word])
        # Arg-max with a strict comparison so the first maximum wins.
        best = 0
        for ind in range(1, len(curated_summary)):
            if classes[ind] > classes[best]:
                best = ind
        predictions.append(best + 1)
    return predictions
# Score the held-out reviews one star rating at a time.
grouped = test_data.groupby('stars')
correct = 0
for stars, reviews in grouped['text']:
    # The group key is the true rating. Comparing against it, rather than
    # against a running counter, stays correct when some rating does not
    # occur in the test set.
    expected = int(stars)
    correct += sum(1 for value in predict_text(reviews) if value == expected)
print("%s %s" % (correct, len(test_data)))
print("Accuracy: " + str(float(correct)/len(test_data)))
print("Test phase complete")