-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfill_in.py
203 lines (161 loc) · 4.79 KB
/
fill_in.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import sys
import re
import math
from Vector import Vector
# ==== Cleaning Text ===
# Load stopwords
from stopwords import stopwords
# Tweets often drop apostrophes ("dont" for "don't"), so extend the
# stopword set with apostrophe-stripped variants of each entry.
stopwords = stopwords.union(
    {word.replace("'", "") for word in stopwords if "'" in word}
)
# Sanitizer
def sanitize(text):
    """
    Clean up a tweet's text:
        1. split the string into a list of words
        2. remove all @ handles
        3. remove all hash tags
        4. remove all links
        5. lowercase everything and remove all stopwords
        6. throw away punctuation, except smiley faces
        7. return the final collection of words as a set
    @args:
        text --> a string
    """
    words = text.split()
    words = [word for word in words if "@" not in word]
    words = [word for word in words if "#" not in word]
    # Drop links before lowercasing (startswith check is on the raw word);
    # everything surviving this line is lowercase from here on.
    words = [word.lower() for word in words if not word.startswith("http")]
    words = [word for word in words if word not in stopwords]
    text = " ".join(words)
    # Raw strings: \w and \- are regex escapes, not string escapes.
    words = re.findall(r"\w+", text)
    # Keep runs of 2+ punctuation chars so smileys like :) or =( survive.
    words.extend(re.findall(r"['\-/()=:;]['\-/()=:;]+", text))
    # Words are already lowercased, so a plain != "rt" comparison suffices.
    words = {word for word in words if
             len(word) > 1
             and word != "rt"}
    return words
# ===============
# ==== Feature Selection ====
# Info gain formula
def info_bernuilli(p):
    """
    Computes the entropy (in nats) of a Bernuilli distr.
    @args:
        p --> probability p in Bernuilli(p)
    @returns:
        -p*log(p) - (1-p)*log(1-p); 0.0 at the degenerate points p=0
        and p=1 (the limit value), instead of raising a math domain
        error from log(0).
    """
    if p <= 0.0 or p >= 1.0:
        # lim_{p->0} p*log(p) = 0, so a deterministic variable has
        # zero entropy; without this guard math.log(0) raises.
        return 0.0
    return -p*math.log(p)-(1-p)*math.log(1-p)
# 250 features with highest info gain
def select_features():
    """
    Selects the 250 words with the highest information gain over the
    positive/negative training sets and prints them, one per line.
    Refer to slides for details.
    """
    positives = Vector()
    negatives = Vector()
    both = Vector()
    documents = 0.0
    p = 0.0
    n = 0.0
    # Count, per word, how many positive / negative tweets contain it.
    with open("data/train_pos.txt") as f:
        for line in f:
            for word in sanitize(line):
                positives[word] += 1
                both[word] += 1
            documents += 1
            p += 1
    with open("data/train_neg.txt") as f:
        for line in f:
            for word in sanitize(line):
                negatives[word] += 1
                both[word] += 1
            documents += 1
            n += 1
    features = []
    for word in both:
        p_both = both[word] / documents
        # `or` swaps in a tiny pseudo-count when a word never occurs in
        # a class, keeping log() inside info_bernuilli well-defined.
        p_pos = positives[word] / p or 0.001/p
        p_neg = negatives[word] / n or 0.001/n
        gain = info_bernuilli(p_both) \
            - p/documents * info_bernuilli(p_pos) \
            - n/documents * info_bernuilli(p_neg)
        features.append((word, gain))
    # Highest gain first; print the top 250 words.
    for word, _gain in sorted(features, key=lambda x: -x[1])[:250]:
        print(word)
#==================
#
# HEY LOOK AT ME I AM THE MOST INTERESTING PART
#
class NB(object):
    """
    Naive Bayes sentiment classifier over sanitized tweet word sets.

    Classes are "+" (positive) and "-" (negative); classify() returns
    "~" (neutral) when neither class is confidently ahead.
    """
    def __init__(self):
        """
        1. define the possible classes
        2. define features (word presence, populated by learn_cpd)
        3. define CPDs (per-class word -> tweet-count maps)
        4. define priors (per-class tweet counts)
        """
        self.classes = ("+", "-")
        # counts[cls][word] = number of cls-tweets containing word
        self.counts = {c: {} for c in self.classes}
        # totals[cls] = number of training tweets seen for cls
        self.totals = {c: 0.0 for c in self.classes}
    def learn_cpd(self, cls, tweets):
        """
        Learn the CPD for a given class by counting, per word, how many
        tweets of that class contain it (sanitize() returns a set, so
        each tweet contributes at most 1 per word).
        @args:
            cls --> a string, "+" or "-"
            tweets --> an iterable of tweets (a file object, list, etc)
        """
        counts = self.counts[cls]
        for tweet in tweets:
            self.totals[cls] += 1
            for word in sanitize(tweet):
                counts[word] = counts.get(word, 0) + 1
    def posterior(self, cls, sanitized_tweet):
        """
        Computes the (log, unnormalized) posterior of a sanitized tweet:
        log P(C) + sum_w log P(w|C), with add-one (Laplace) smoothing
        so unseen words do not zero the product.
        @args:
            cls --> a string, "+" or "-". determines CPD to use
            sanitized_tweet --> a set of words in the tweet
        """
        total = self.totals[cls]
        all_docs = sum(self.totals.values()) or 1.0
        # Smoothed prior and per-word likelihood denominators.
        logp = math.log((total + 1.0) / (all_docs + len(self.classes)))
        denom = total + 2.0
        counts = self.counts[cls]
        for word in sanitized_tweet:
            logp += math.log((counts.get(word, 0) + 1.0) / denom)
        return logp
    def classify(self, tweet):
        """
        Given a text, classify its sentiment. Picks the class with the largest posterior.
        However, if we are not confident, ie if not P(best|tweet) >= 2*P(other|tweet),
        then we refuse to classify, and return neutral, "~".
        @args:
            tweet --> a string, text of the tweet
        """
        words = sanitize(tweet)
        scored = sorted(
            ((self.posterior(c, words), c) for c in self.classes),
            reverse=True,
        )
        (best_lp, best_c), (other_lp, _other_c) = scored
        # P(best) >= 2*P(other)  <=>  best_lp - other_lp >= log(2)
        # (posteriors share the same normalizing constant).
        if best_lp - other_lp < math.log(2):
            return "~"
        return best_c
# ===================
def eval_performance(n):
    """
    Measure classification error on the three verification sets and
    print it as a fraction of tweets misclassified.
    @args:
        n --> a trained classifier exposing classify(tweet)
    """
    w = 0
    t = 0
    # (path, expected label) for each verification file.
    cases = [
        ("data/verify_pos.txt", "+"),
        ("data/verify_neg.txt", "-"),
        ("data/verify_neutral.txt", "~"),
    ]
    for path, expected in cases:
        # `with` ensures the handle is closed (the original leaked all three).
        with open(path) as f:
            for tweet in f:
                t += 1.0
                if expected != n.classify(tweet):
                    w += 1.0
    print("Error: %s" % (w/t))
def classify_text(n, txt):
    """
    Classify a single text and print the resulting label.
    @args:
        n --> a trained classifier exposing classify(tweet)
        txt --> the text to classify
    """
    # print() with a single argument is valid Python 2 and 3.
    print("That text is: %s" % n.classify(txt))
def main():
    """
    Entry point: train the classifier on both training sets, then run
    the mode selected by command-line flags:
        --verify   --> report error on the verification sets
        --features --> print the 250 highest-info-gain words
        (default)  --> classify the text given as argv[1]
    @returns: -1 on missing argument, None otherwise
    """
    n = NB()
    # `with` closes the training files (the original never closed them).
    with open("data/train_pos.txt") as f:
        n.learn_cpd("+", f)
    with open("data/train_neg.txt") as f:
        n.learn_cpd("-", f)
    if "--verify" in sys.argv:
        eval_performance(n)
    elif "--features" in sys.argv:
        select_features()
    else:
        if len(sys.argv) < 2:
            print("Not enough args. Provide text as argument")
            return -1
        classify_text(n, sys.argv[1])
if __name__ == "__main__":
    main()