-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentiment.py
167 lines (136 loc) · 5.23 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Author: Jimmy
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import nltk
import numpy as np
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
# Single lemmatizer instance, reused for every token that gets normalized.
wordnet_lemmatizer = WordNetLemmatizer()

# from http://www.lextek.com/manuals/onix/stopwords1.html
# Each line of the file contributes one entry (trailing whitespace stripped).
# The context manager closes the handle as soon as the set has been built.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

# note: an alternative source of stopwords
# from nltk.corpus import stopwords
# stopwords.words('english')

# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
# Each file is parsed as markup and only the <review_text> elements are kept.
with open('electronics/positive.review') as review_file:
    positive_reviews = BeautifulSoup(review_file.read(), features="html5lib")
# find_all is the current BeautifulSoup name; findAll is a deprecated alias.
positive_reviews = positive_reviews.find_all('review_text')
print(positive_reviews)

with open('electronics/negative.review') as review_file:
    negative_reviews = BeautifulSoup(review_file.read(), features="html5lib")
negative_reviews = negative_reviews.find_all('review_text')
print(negative_reviews)
# first let's just try to tokenize the text using nltk's tokenizer
# let's take the first review for example:
# t = positive_reviews[0]
# nltk.tokenize.word_tokenize(t.text)
#
# notice how it doesn't downcase, so It != it
# not only that, but do we really want to include the word "it" anyway?
# you can imagine it wouldn't be any more common in a positive review than a negative review
# so it might only add noise to our model.
# so let's create a function that does all this pre-processing for us
def my_tokenizer(s):
    """Return the cleaned-up tokens of the string ``s``.

    The text is lowercased and split with nltk's word tokenizer; tokens of
    two characters or fewer are dropped, the survivors are lemmatized, and
    any lemma found in the module-level ``stopwords`` set is removed.
    """
    words = nltk.tokenize.word_tokenize(s.lower())
    # short words are filtered *before* lemmatizing, stopwords *after*
    lemmas = (wordnet_lemmatizer.lemmatize(word) for word in words if len(word) > 2)
    return [lemma for lemma in lemmas if lemma not in stopwords]
# create a word-to-index map so that we can create our word-frequency vectors later
# let's also save the tokenized versions so we don't have to tokenize again later
word_index_map = {}
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

# Walk both corpora with the same logic: positive reviews first, then
# negative, so indices are handed out in the order tokens are first seen.
for reviews, tokenized in ((positive_reviews, positive_tokenized),
                           (negative_reviews, negative_tokenized)):
    for review in reviews:
        text = review.text
        orig_reviews.append(text)
        tokens = my_tokenizer(text)
        tokenized.append(tokens)
        for token in tokens:
            # a token seen for the first time takes the next free index,
            # which is always the current size of the map
            word_index_map.setdefault(token, len(word_index_map))

# next unused index == vocabulary size
current_index = len(word_index_map)
print("len(word_index_map):", len(word_index_map))
# now let's create our input matrices
def tokens_to_vector(tokens, label, word_index=None):
    """Build a normalized word-frequency vector with the label appended.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review. Every token must be a key of the index map.
    label : number
        Class label; stored in the last element of the result.
    word_index : dict, optional
        Mapping from token to column index. Defaults to the module-level
        ``word_index_map``. Indices are expected to lie in
        ``range(len(word_index))``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(word_index) + 1``. The leading elements are
        token counts divided by the total count (all zeros when ``tokens`` is
        empty); the final element is ``label``.

    Raises
    ------
    KeyError
        If a token is missing from the index map.
    """
    if word_index is None:
        word_index = word_index_map

    x = np.zeros(len(word_index) + 1)  # last element is for the label
    for t in tokens:
        x[word_index[t]] += 1

    # A review can end up with no tokens once short words and stopwords are
    # removed. Dividing by a zero sum would fill the vector with NaN, so the
    # normalization is skipped and the features stay at zero in that case.
    total = x.sum()
    if total > 0:
        x = x / total  # normalize it before setting label

    x[-1] = label
    return x
N = len(positive_tokenized) + len(negative_tokenized)

# N x (D+1) matrix: features and label are kept side by side for now so
# that a single shuffle keeps every row's label attached to its features
data = np.zeros((N, len(word_index_map) + 1))

# positive reviews (label 1) fill the first rows, negative (label 0) the rest
labelled_reviews = (
    [(review_tokens, 1) for review_tokens in positive_tokenized]
    + [(review_tokens, 0) for review_tokens in negative_tokenized]
)
for i, (tokens, label) in enumerate(labelled_reviews):
    data[i, :] = tokens_to_vector(tokens, label)
# shuffle the data and create train/test splits
# try it multiple times!
# shuffle the raw texts and the feature rows together so they stay aligned
orig_reviews, data = shuffle(orig_reviews, data)

# every column but the last holds features; the last column is the label
X, Y = data[:, :-1], data[:, -1]

# the final 100 rows are held out for testing, everything before is training
Xtrain, Ytrain = X[:-100], Y[:-100]
Xtest, Ytest = X[-100:], Y[-100:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)

train_accuracy = model.score(Xtrain, Ytrain)
print("Train accuracy:", train_accuracy)
test_accuracy = model.score(Xtest, Ytest)
print("Test accuracy:", test_accuracy)
# let's look at the weights for each word
# try it with different threshold values!
# only words whose learned weight exceeds this magnitude are reported
threshold = 0.5
coefficients = model.coef_[0]
for word, index in iteritems(word_index_map):
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)
# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:, 1]  # p(y = 1 | x)

# There can be many misclassified samples, so only the single "most wrong"
# one per class is kept: the positive review with the lowest p(y = 1 | x)
# and the negative review with the highest.
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = wrong_negative_review = None
wrong_positive_prediction = wrong_negative_prediction = None

for review_text, y, p, pred in zip(orig_reviews, Y, P, preds):
    if y == 1 and p < 0.5 and p < minP_whenYis1:
        # positive review scored as negative, worse than any seen so far
        minP_whenYis1 = p
        wrong_positive_review = review_text
        wrong_positive_prediction = pred
    elif y == 0 and p > 0.5 and p > maxP_whenYis0:
        # negative review scored as positive, worse than any seen so far
        maxP_whenYis0 = p
        wrong_negative_review = review_text
        wrong_negative_prediction = pred

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)