# Tweet_sentiment.py
# Natural Language Processing
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the datasets
testdata = pd.read_csv('Trump.txt', delimiter='\t',
                       names=["a", "text", "c", "d", "e", "f", "g"],
                       quoting=3, nrows=3000)
# on_bad_lines='skip' replaces error_bad_lines=False, which was removed in pandas 2.0
traindata = pd.read_csv('train_twt.csv', delimiter=',', encoding="ISO-8859-1",
                        names=["label", "b", "c", "d", "e", "f", "tweets"],
                        quoting=3, on_bad_lines='skip')
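# Sanity-check sketch (assumes the column names assigned above match these
# files; adjust if the raw layout differs):
assert {'label', 'tweets'} <= set(traindata.columns)
assert 'text' in testdata.columns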
traindata.head(10)
# Checking the training dataset column names and values
traindata['label'].value_counts()
# traindata.columns
# traindata.shape
# Creating a subset of the training data
traindata = traindata.drop(["b", "c", "d", "e", "f"], axis=1)
traindata.dropna(subset=['tweets'], inplace=True)
traindata = traindata.sample(frac=0.4, random_state=42)
traindata['label'].value_counts()
# selectedone = np.random.choice(366850, replace=False, size=150000)
# traindata = traindata.iloc[selectedone]
# traindata.groupby('label').count()
# Fixing the index: sampling leaves gaps, so reset it for positional access
min(traindata.index)
max(traindata.index)
traindata.head()
traindata = traindata.reset_index(drop=True)
# Cleaning the texts for training data
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # build the set once, not per tweet
corpus_train = []
for i in range(len(traindata)):  # iterate over the actual row count, not a hardcoded 146740
    text = re.sub('[^a-zA-Z]', ' ', traindata['tweets'][i])
    text = text.lower().split()
    text = [ps.stem(word) for word in text if word not in stop_words]
    corpus_train.append(' '.join(text))
# Cleaning the texts for test data, reusing the stemmer and stopword set
corpus_test = []
for i in range(testdata.shape[0]):
    text_test = re.sub('[^a-zA-Z]', ' ', testdata['text'][i])
    text_test = text_test.lower().split()
    text_test = [ps.stem(word) for word in text_test if word not in stop_words]
    corpus_test.append(' '.join(text_test))
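# Possible refactor (sketch): a single helper keeps the two cleaning passes in
# sync. clean_tweet is a hypothetical name, not part of the original script.
def clean_tweet(raw):
    """Strip non-letters, lowercase, remove stopwords, and stem."""
    words = re.sub('[^a-zA-Z]', ' ', raw).lower().split()
    return ' '.join(ps.stem(w) for w in words if w not in stop_words)
# Usage: corpus_train = [clean_tweet(t) for t in traindata['tweets']]
#        corpus_test = [clean_tweet(t) for t in testdata['text']]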
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=6000)
cv.fit(corpus_train)
X_train = cv.transform(corpus_train).toarray()
y_train = traindata.iloc[:, 0].values
# The test corpus is transformed with the vocabulary fitted on the training
# corpus; a second, separately fitted CountVectorizer would produce
# incompatible feature columns
X_test = cv.transform(corpus_test).toarray()
# Fitting Gaussian Naive Bayes to the training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
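# Note on scale: GaussianNB needs dense input, and .toarray() on ~147k x 6000
# features allocates several gigabytes. A common alternative (a sketch, not the
# original approach) is MultinomialNB, which accepts the sparse matrix directly:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB()
# clf.fit(cv.transform(corpus_train), y_train)  # no .toarray() needed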
# Predicting the Test set results
y_pred = classifier.predict(X_test)
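# The Trump tweets carry no ground-truth labels, so accuracy can only be
# estimated on the labeled training data, e.g. with a held-out split
# (a sketch; the 80/20 split is an assumption, not from the original script):
# from sklearn.model_selection import train_test_split
# X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
#                                             test_size=0.2, random_state=42)
# GaussianNB().fit(X_tr, y_tr).score(X_val, y_val)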
from collections import Counter
Counter(y_pred)
1839 / (1839 + 1161)
# Roughly 61% of the 3000 sampled Trump tweets are classified as negative in this run
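# Sketch: compute the negative share from the predictions instead of hardcoding
# the counts. This assumes a Sentiment140-style encoding where label 0 means
# negative (an assumption about train_twt.csv, not confirmed by this script).
counts = Counter(y_pred)
negative_share = counts.get(0, 0) / len(y_pred)
print(f"Predicted negative share: {negative_share:.1%}")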