-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathproject2.py
62 lines (49 loc) · 1.78 KB
/
project2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 2 20:45:23 2019
@author: Taimoor
"""
# Read the raw review dataset, lowercase it, and split it into one document per line.
import nltk

# A context manager closes the file handle as soon as the contents are read.
# NOTE(review): no encoding is passed, so the platform default applies -- confirm
# the dataset's encoding and pass it explicitly if it differs.
with open('E:\\dataset2.csv') as dataset_file:
    corpus = dataset_file.read()
corpus = corpus.lower()
# Splitting on '\n' keeps a trailing empty document when the file ends with a newline.
docs = corpus.split('\n')
# The only POS tags we are interested in; tokens with any other tag are dropped.
allowed_tags = ['VBP', 'VB', 'VBG', 'JJ', 'NN', 'RB']
# ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text because the
# sklearn.feature_extraction.stop_words module is not available in current
# scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from string import punctuation as punc


def _keep_token(word, tag):
    """Return True when a tagged token should stay in the preprocessed document.

    A token is kept only if it is not punctuation, is not an English stop
    word, and carries one of the tags listed in ``allowed_tags``.
    """
    # `punc` is a str, so `word in punc` is a substring test against it.
    return (
        word not in punc
        and word not in ENGLISH_STOP_WORDS
        and tag in allowed_tags
    )


# Build one cleaned, space-joined string per input document.
pdocs = []
for d in docs:
    words_tags = nltk.pos_tag(nltk.tokenize.word_tokenize(d))
    # Filter the (word, tag) pairs directly so each decision applies to the exact
    # occurrence that was tagged; removing by value from the token list would drop
    # the first matching occurrence instead and rescan the list on every removal.
    words = [w for w, t in words_tags if _keep_token(w, t)]
    pdocs.append(' '.join(words))
# Structure the preprocessed documents as a document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(pdocs)

# LDA (topic modeling)
# Vocabulary terms, indexed by matrix column. Newer scikit-learn releases expose
# them through get_feature_names_out() (an array, converted here to a plain list
# of str); older releases only provide get_feature_names().
if hasattr(vec, 'get_feature_names_out'):
    features = vec.get_feature_names_out().tolist()
else:
    features = vec.get_feature_names()
from sklearn.decomposition import LatentDirichletAllocation
# Five topics; no random_state is passed, so the library default is used.
lda = LatentDirichletAllocation(n_components = 5)
lda.fit(matrix_X)
# Sentiment analysis: print each topic's top terms and a score summing, over those
# terms, the SentiWordNet positive score minus the negative score.
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
for tid, topics in enumerate(lda.components_):
    print('topic ID: ', tid)
    # argsort() is ascending, so the reversed slice walks from the largest weight
    # downwards and stops before position -10: the nine highest-weighted terms.
    top_words = [features[i] for i in topics.argsort()[:-10:-1]]
    print(top_words)
    score = 0
    for w in top_words:
        synsets = wn.synsets(w)
        if not synsets:
            # The term has no WordNet synset; skip it instead of raising
            # IndexError on synsets[0]. It contributes nothing to the score.
            continue
        # Only the first synset returned for the term is scored.
        senti_synset = swn.senti_synset(synsets[0].name())
        score += senti_synset.pos_score() - senti_synset.neg_score()
    print('Sentiment Score: ', score)