-
Notifications
You must be signed in to change notification settings - Fork 2
/
NaiveBayesClassifier.py
97 lines (72 loc) · 2.68 KB
/
NaiveBayesClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import nltk
import re
import numpy as np
import pandas
from nltk.corpus import stopwords
import pickle
import os
import argparse
import re
import pandas as pd
# English stopwords to exclude from the feature vocabulary.
stopWords = set(stopwords.words('english'))
# Matches every character that is NOT alphanumeric, space, newline, or dot;
# used to scrub tokens before feature extraction.
specialCharacters = re.compile('[^a-zA-Z0-9 \n\.]')
# Vocabulary of candidate feature words; populated by trainData() and read
# by extract_features().
word_features=None
def parse2(x, typ):
    """Strip HTML tags from a review line, tokenize it, and pair with *typ*.

    Tokens shorter than 4 characters are dropped; the rest are scrubbed of
    special characters and lowercased.
    """
    # Remove anything that looks like an HTML tag before splitting into words.
    without_tags = re.sub(re.compile('<.*?>'), '', x)
    raw_tokens = without_tags.strip().split()
    cleaned = []
    for token in raw_tokens:
        if len(token) > 3:  # skip very short tokens
            cleaned.append(specialCharacters.sub('', token).lower())
    return (cleaned, typ)
def parse(x):
    """Split a '<review>~<label>' line into cleaned tokens and the label.

    Tokens shorter than 4 characters are dropped; the rest are scrubbed,
    UTF-8 encoded, and lowercased. The trailing newline is stripped from
    the label.
    """
    parts = x.split('~')
    # Collapse runs of non-word characters in the review text into spaces.
    normalized = re.sub('\W+', " ", parts[0])
    tokens = []
    for word in normalized.split():
        if len(word) > 3:  # skip very short tokens
            tokens.append(specialCharacters.sub('', word).encode('utf-8').lower())
    # parts[1] is the label; drop anything after the first newline.
    return (tokens, parts[1].split('\n')[0])
def filterValues(x):
    """Return *x* with the literal token 'i' removed and every remaining
    token stripped of characters other than alphanumerics, space, newline,
    and dot."""
    return [re.sub('[^a-zA-Z0-9 \n\.]', '', token) for token in x if token != 'i']
def extract_features(document):
    """Build an NLTK feature dict for *document* (a list of tokens).

    For each non-stopword in the module-level ``word_features`` vocabulary,
    records whether it occurs in the document as ``contains(<word>)``.
    """
    global word_features
    present = set(document)  # O(1) membership tests below
    return {
        'contains(%s)' % word: (word in present)
        for word in word_features
        if word not in stopWords
    }
def trainData():
    """Train an NLTK Naive Bayes sentiment classifier from labelled reviews.

    Reads negative examples from data/IntegratedCons.txt, positive examples
    from data/IntegratedPros.txt, and additional '~'-labelled reviews from
    data/reviewsR.txt. As a side effect, populates the module-level
    ``word_features`` vocabulary used by ``extract_features``.

    Returns the trained ``nltk.NaiveBayesClassifier``.
    """
    # Negative examples come from the "Cons" file.
    with open("data/IntegratedCons.txt") as cons:
        # list(...) so .extend below is valid on both Py2 and Py3.
        combinedData = list(map(lambda x: parse2(x, 'negative'), cons.readlines()))
    # BUG FIX: the positive examples previously re-read IntegratedCons.txt,
    # training the classifier on identical text with opposite labels.
    with open("data/IntegratedPros.txt") as pros:
        combinedData.extend(map(lambda x: parse2(x, 'positive'), pros.readlines()))
    # Pre-labelled reviews in '<text>~<label>' form.
    with open("data/reviewsR.txt") as reviews:
        combinedData.extend(map(parse, reviews.readlines()))
    # Flatten all token lists to build the feature vocabulary.
    words = []
    for tokens, _label in combinedData:
        words.extend(tokens)
    dataFreq = nltk.FreqDist(tuple(words))
    global word_features
    word_features = dataFreq.keys()
    print(dataFreq.most_common(10))
    training_set = nltk.classify.apply_features(extract_features, combinedData)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    return classifier
# --- Script entry: train (or load a cached) classifier, then label reviews ---
parser = argparse.ArgumentParser()
parser.add_argument("--cached",help="wanted to use cache solution",action="store_true")
args = parser.parse_args()
# Retrain unless a cached model exists AND --cached was requested.
if not os.path.isfile('output/NaiveBayesClassifier') or not args.cached:
    naiveBayesClassifier = trainData()
    with open('output/NaiveBayesClassifier','wb') as classifier:
        pickle.dump(naiveBayesClassifier,classifier)
else:
    with open('output/NaiveBayesClassifier','rb') as classifier:
        # NOTE(review): unpickling our own cache file is fine, but never
        # pickle.load untrusted data.
        naiveBayesClassifier = pickle.load(classifier)
# NOTE: show_most_informative_features prints its table itself and returns
# None, so this line also prints "None".
print naiveBayesClassifier.show_most_informative_features(32)
# '~'-delimited file of restaurant reviews; tokenize, scrub, and classify
# each ReviewText row, then persist the predictions.
restaurantData = pd.read_csv('data/restaurantReviews.csv',delimiter="~")
restaurantData["tokenizedData"]=restaurantData["ReviewText"].str.lower().str.split().apply(lambda x: filterValues(x))
restaurantData["sentiment"]=restaurantData["tokenizedData"].apply(lambda x :naiveBayesClassifier.classify(extract_features(x)))
print restaurantData[:5]
restaurantData.to_csv('output/predicted.csv')