import sys
import re
import math

from Vector import Vector

# ==== Cleaning Text ====
# Load stopwords, adding apostrophe-free variants ("dont" for "don't")
# so that informally spelled tweets are still filtered.
from stopwords import stopwords

new_stop_words = set()
for word in stopwords:
    if "'" in word:
        new_stop_words.add(word.replace("'", ""))
stopwords = stopwords.union(new_stop_words)
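# Vector (imported above) is not defined in this file. Judging from how it
# is used below, it behaves like a dict with a mutable fallback value --
# roughly the following sketch (an assumption, not the project's actual class):
#
#     class Vector(dict):
#         default = 0
#         def __missing__(self, key):
#             return self.default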
# Sanitizer
def sanitize(text):
"""
clean up:
1. split a string into a list of words
2. remove all @ handles
3. remove all hash tags
4. remove all links
5. remove all stopwords
6. throw away punctuation, except smiley faces
7. make the final vector of words a set
@args:
text --> a string
"""
words = text.split()
words = [word for word in words if "@" not in word]
words = [word for word in words if "#" not in word]
words = [word.lower() for word in words if not word.startswith("http")]
words = [word.lower() for word in words if word not in stopwords]
text = " ".join(words)
words = re.findall("\w+", text)
words.extend(re.findall("['\-/()=:;]['\-/()=:;]+", text))
words = {word for word in words if
len(word) > 1
and word.lower() != "rt"}
return words
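# Illustrative example (exact output depends on the imported stopword list):
#   sanitize("RT @bob Loving the new phone!! :) http://t.co/x #tech")
# drops the handle, hashtag, link, "rt", and stopwords, leaving something
# like {"loving", "new", "phone", ":)"}.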
# ===============
# ==== Feature Selection ====
# Entropy of a Bernoulli variable, used in the info-gain computation below
def info_bernoulli(p):
    """
    Computes the entropy of a Bernoulli(p) distribution (natural log).
    @args:
        p --> probability p in Bernoulli(p)
    """
    if p <= 0.0 or p >= 1.0:
        # Entropy goes to 0 at the boundaries; avoid math.log(0).
        return 0.0
    return -p * math.log(p) - (1 - p) * math.log(1 - p)
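# Sanity check: a fair coin is maximally uncertain,
#   info_bernoulli(0.5) == -0.5*log(0.5) - 0.5*log(0.5) == log(2) ~ 0.693,
# while info_bernoulli(0.0) == info_bernoulli(1.0) == 0 (no uncertainty).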
# 250 features with the highest info gain
def select_features():
    """
    Selects the 250 words with the highest information gain. Refer to
    the slides for details.
    """
    positives = Vector()
    negatives = Vector()
    both = Vector()
    documents = 0.0
    p = 0.0
    n = 0.0
    with open("data/train_pos.txt") as f:
        for line in f:
            for word in sanitize(line):
                positives[word] += 1
                both[word] += 1
            documents += 1
            p += 1
    with open("data/train_neg.txt") as f:
        for line in f:
            for word in sanitize(line):
                negatives[word] += 1
                both[word] += 1
            documents += 1
            n += 1
    features = []
    for word in both:
        p_both = both[word] / documents
        # Back off to a tiny pseudo-count so the entropy never sees 0.
        p_pos = positives[word] / p or 0.001 / p
        p_neg = negatives[word] / n or 0.001 / n
        gain = info_bernoulli(p_both) \
            - p / documents * info_bernoulli(p_pos) \
            - n / documents * info_bernoulli(p_neg)
        features.append((word, gain))
    for word in [w[0] for w in sorted(features, key=lambda x: -x[1])[:250]]:
        print(word)
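# The printed list is presumably redirected into the feature file that
# NB.__init__ reads back in, e.g.:
#   python answer.py --features > data/features.txt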
# ==================
class NB(object):
    def __init__(self):
        """
        Sets up the model:
          1. define the possible classes
          2. load the selected features
          3. initialize the CPDs
          4. set the priors
        """
        self.classes = ["+", "-"]
        self.features = set()
        with open("data/features.txt") as feature_list:
            for line in feature_list:
                self.features.add(line.strip().lower())
        self.cpds = {"+": Vector(),
                     "-": Vector()}
        for vector in self.cpds.values():
            vector.default = 1
        # Hand-picked class priors (not estimated from the training data).
        self.priors = {"+": 0.6, "-": 0.4}
    def learn_cpd(self, cls, tweets):
        """
        Learns the CPD for a given class.
        @args:
            cls --> a string, "+" or "-"
            tweets --> an iterable of tweets (a file object, list, etc.)
        """
        counter = self.cpds[cls]
        total = 0.0
        for tweet in tweets:
            total += 1
            tweet = sanitize(tweet)
            for word in tweet:
                if word in self.features:
                    counter[word] += 1
        for key in counter:
            counter[key] = counter[key] / total
        # Features never seen with this class get a small nonzero
        # probability instead of 0.
        counter.default = 1 / total
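    # After learning, self.cpds[cls][word] is the fraction of class-cls
    # training tweets containing word: if "love" appeared in 300 of 1000
    # positive tweets, cpds["+"]["love"] would be 0.3 (hypothetical numbers).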
    def posterior(self, cls, sanitized_tweet):
        """
        Computes the log of the unnormalized posterior of a sanitized
        tweet, log P(C) + log P(tweet|C), under a Bernoulli naive Bayes
        model over the selected features.
        @args:
            cls --> a string, "+" or "-"; determines which CPD to use
            sanitized_tweet --> a set of words in the tweet
        """
        p = math.log(self.priors[cls])
        cpd = self.cpds[cls]
        # Assumes no feature occurs in every training tweet of a class,
        # otherwise cpd[feature] == 1 and math.log(0) would blow up below.
        for feature in self.features:
            if feature in sanitized_tweet:
                p += math.log(cpd[feature])
            else:
                p += math.log(1 - cpd[feature])
        return p
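    # Working in log space turns the Bernoulli naive Bayes product
    #   P(C) * prod_f P(f|C)^[f in tweet] * (1 - P(f|C))^[f not in tweet]
    # into a sum, which avoids floating-point underflow with 250 features.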
    def classify(self, tweet):
        """
        Given a text, classify its sentiment by comparing class posteriors.
        To avoid low-confidence calls, "+" is returned only when the
        positive posterior is more than twice the negative one, and "-"
        only when the negative posterior is more than three times the
        positive one; otherwise we refuse to commit and return neutral, "~".
        @args:
            tweet --> a string, the text of the tweet
        """
        tweet = sanitize(tweet)
        posteriors = {}
        for cls in self.classes:
            posteriors[cls] = self.posterior(cls, tweet)
        pos = posteriors["+"]
        neg = posteriors["-"]
        # In log space, "P(+|t) > 2 * P(-|t)" becomes "pos > log(2) + neg".
        if pos > math.log(2) + neg:
            return "+"
        elif neg > math.log(3) + pos:
            return "-"
        else:
            return "~"
def eval_performance(n):
    """
    Reports the error rate of classifier n on the held-out verify sets.
    """
    w = 0
    t = 0
    for tweet in open("data/verify_pos.txt"):
        t += 1.0
        if "+" != n.classify(tweet):
            w += 1.0
    for tweet in open("data/verify_neg.txt"):
        t += 1.0
        if "-" != n.classify(tweet):
            w += 1.0
    for tweet in open("data/verify_neutral.txt"):
        t += 1.0
        if "~" != n.classify(tweet):
            w += 1.0
    print("Error: %s" % (w / t))
def classify_text(n, txt):
    print("That text is: %s" % n.classify(txt))
def main():
    n = NB()
    n.learn_cpd("+", open("data/train_pos.txt"))
    n.learn_cpd("-", open("data/train_neg.txt"))
    if "--verify" in sys.argv:
        eval_performance(n)
    elif "--features" in sys.argv:
        select_features()
    elif len(sys.argv) > 1:
        classify_text(n, sys.argv[1])
    else:
        print("usage: answer.py [--verify | --features | TEXT]")

if __name__ == "__main__":
    main()
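# Example invocations (paths are the ones hard-coded above):
#   python answer.py --verify             # error rate on the verify sets
#   python answer.py --features           # print the 250 selected features
#   python answer.py "I love this song"   # classify a single piece of text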