# bag_of_words.py
# (Executable script; 103 lines, 86 loc, 3.53 KB in the repository listing.)
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
# Load the pre-cleaned data set from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted 'data/clean_data_string'.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open('data/clean_data_string', 'rb') as f:
    clean_data = pickle.load(f)
print('The total size of data set is', len(clean_data))

# Splitting training and testing data.
# The last 5000 rows are held out for testing; everything before them is used
# for training.
# NOTE(review): assumes the data set has more than 5000 rows; with fewer,
# train_size becomes negative and the slices below no longer mean
# "all but the last 5000" -- confirm against the real data size.
data_array = np.array(clean_data)
data_size = data_array.shape[0]
train_size = data_size - 5000  # select a test size of 5000
data_train = data_array[:train_size]
data_test = data_array[train_size:]

# Column 0 is used as the review text and column 1 as the label.
# Labels are stored as fixed-width 6-byte strings ("|S6").
review_to_train = data_train[:, 0]
label_to_train = np.asarray(data_train[:, 1], dtype="|S6")
review_to_test = data_test[:, 0]
label_to_test = np.asarray(data_test[:, 1], dtype="|S6")
def train_random_forest(if_train=True, bag_of_words_len=5000, n_estimators=100):
    """Train (or load) a random forest on bag-of-words features and print its test accuracy.

    The vocabulary is always learned from ``review_to_train`` only, and the
    test reviews are projected onto that same vocabulary so that the feature
    columns seen at prediction time match the ones the forest was fitted on.

    Args:
        if_train: When true, fit a new forest and pickle it under ``model/``.
            When false, load a previously pickled forest instead.
        bag_of_words_len: Maximum vocabulary size passed to
            ``CountVectorizer(max_features=...)``.
        n_estimators: Number of trees in the forest.

    The model file name is ``model/trained_forest_<bag_of_words_len>_<n_estimators>``
    in both modes, so a loaded model always corresponds to the vocabulary size
    in use (e.g. pass ``bag_of_words_len=2000, n_estimators=500`` to load
    ``model/trained_forest_2000_500``).

    Reads the module-level ``review_to_train``, ``label_to_train``,
    ``review_to_test``, ``label_to_test`` and ``clean_data``.
    """
    model_path = ('model/trained_forest_' + str(bag_of_words_len)
                  + '_' + str(n_estimators))
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None,
                                 preprocessor=None, stop_words=None,
                                 max_features=bag_of_words_len)

    if if_train:
        # Prepare train data: learn the vocabulary and count words in one pass.
        train_data_features = vectorizer.fit_transform(review_to_train)
        train_data_features = train_data_features.toarray()

        print("Starting training Random Forest")
        forest = RandomForestClassifier(n_estimators=n_estimators)
        forest = forest.fit(train_data_features, label_to_train)
        print("Finish training Random Forest")

        with open(model_path, 'wb') as model_file:
            pickle.dump(forest, model_file)
        print('The total size of data set is', len(clean_data))
    else:
        # The vectorizer itself is not pickled, so its vocabulary has to be
        # rebuilt from the training reviews before test data can be encoded.
        vectorizer.fit(review_to_train)
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files produced by this script.
        with open(model_path, 'rb') as model_file:
            forest = pickle.load(model_file)

    # Testing:
    # Prepare test data. Use transform (not fit_transform): re-fitting here
    # would build a different vocabulary from the test reviews and silently
    # misalign the feature columns with those used for training.
    test_data_features = vectorizer.transform(review_to_test)
    test_data_features = test_data_features.toarray()

    print("Starting testing")
    test_result = forest.predict(test_data_features)

    # Accuracy = fraction of test labels predicted exactly.
    correct = int(np.count_nonzero(label_to_test == test_result))
    print(correct / len(label_to_test))
def train_naive_bayes():
    """Train multinomial naive Bayes on TF and TF-IDF features and print both accuracies.

    Word counts are computed once for the training and test reviews (the
    vocabulary is fitted on the training reviews only). Two weightings of
    those counts are then evaluated in turn:

    * term frequency only (``TfidfTransformer(use_idf=False)``), printed as
      ``TF Accuracy``;
    * term frequency with inverse document frequency (``TfidfTransformer()``),
      printed as ``TF-IDF Accuracy``.

    Reads the module-level ``review_to_train``, ``label_to_train``,
    ``review_to_test`` and ``label_to_test``.
    """
    count_vect = CountVectorizer()
    train_counts = count_vect.fit_transform(review_to_train)
    # The raw test counts do not depend on the weighting scheme, so they are
    # computed a single time and shared by both models.
    test_counts = count_vect.transform(review_to_test)

    variants = (
        ('TF Accuracy', TfidfTransformer(use_idf=False)),  # TF model
        ('TF-IDF Accuracy', TfidfTransformer()),           # TF IDF model
    )
    for title, transformer in variants:
        train_weighted = transformer.fit_transform(train_counts)
        classifier = MultinomialNB().fit(train_weighted, label_to_train)

        test_weighted = transformer.transform(test_counts)
        predictions = classifier.predict(test_weighted)

        # Get accuracy: fraction of test labels predicted exactly.
        correct = int(np.count_nonzero(label_to_test == predictions))
        print(title, correct / len(label_to_test))
# Run both experiments; each one prints its accuracy on the held-out test split.
train_random_forest()
train_naive_bayes()