correlated_words_0.py
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 9 12:09:55 2021
@author: Souparno
"""
import pandas as pd
import string
import spacy
import re
from collections import Counter
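
# NOTE: en_core_web_lg ships word vectors, which the similarity() calls below
# depend on. Install the model once beforehand with:
#   python -m spacy download en_core_web_lg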
nlp = spacy.load("en_core_web_lg")
# data=pd.read_csv('methodology smallpox vaccine data.csv')
#text=data['Methodology'][5]

def remove_special_chars(x):
    x = re.sub(r'\[.*\]', '', x)     # removes bracketed spans, e.g. citations
    x = re.sub(r'<[^>]*>', '', x)    # removes html tags
    x = re.sub(r'\([^)]*\)', '', x)  # removes parentheses '()'
    x = ' '.join(x.split())          # collapses runs of whitespace
    # splits any joint words ('FollowUp' -> 'Follow Up')
    x = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', x)
    return x
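
# Illustrative check of remove_special_chars (the sentence is a made-up
# example, not from the original dataset):
#   remove_special_chars('Mice were vaccinated[1] <b>twice</b> (n=10).FollowUp at day 7.')
#   -> 'Mice were vaccinated twice .Follow Up at day 7.'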

def keywordExtraction(filename, search_query):
    data = pd.read_csv(filename)
    data['keywords'] = ''
    for index, text in enumerate(data['Methodology']):
        new = remove_special_chars(text).lower()
        doc = nlp(new)
        # keep lemmas of tokens that are neither stop words nor punctuation
        words = []
        for token in doc:
            if not token.is_stop and token.text not in string.punctuation:
                words.append(token.lemma_)
        wordfreq = Counter(words)
        mostcommon = wordfreq.most_common(15)  # 15 most frequent words
        # discard words that occur more than 10 times; they are too common
        # to be distinctive keywords
        removewords = [word for word, count in mostcommon if count > 10]
        words = list(set(words) - set(removewords))
        doc = nlp(' '.join(words))
        # pair each token with the later tokens whose vector similarity
        # exceeds 0.5; checking token2 against the keys seen so far records
        # every pair only once
        correlated_words = {}
        for token1 in doc:
            correlated_words[token1.text] = []
            for token2 in doc:
                if token2.text not in correlated_words:
                    if token1.similarity(token2) > 0.5 and token1 != token2:
                        correlated_words[token1.text].append(token2.text)
        # keep only tokens that found at least one correlated word
        correlated_words = {k: v for k, v in correlated_words.items() if v}
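        # illustrative shape of correlated_words at this point (the entries
        # are invented for illustration):
        #   {'vaccine': ['vaccination', 'immunization'], 'dose': ['dosage']}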
        # score each candidate key by its average similarity to the tokens
        # of the search query
        query = nlp(search_query)
        finalkeys = []
        for key in correlated_words:
            key_doc = nlp(key)
            avgsim = sum(key_doc.similarity(qtoken) for qtoken in query) / len(query)
            finalkeys.append((key, avgsim))
        finalkeys.sort(reverse=True, key=lambda x: x[1])
        # keep only keys whose average similarity to the query exceeds 0.5
        final = [key for key, avgsim in finalkeys if avgsim > 0.5]
        unwanted = set(correlated_words) - set(final)
        for unwanted_key in unwanted:
            del correlated_words[unwanted_key]
        # the keywords are the deduplicated correlated words of the
        # surviving keys
        finalkeywords = set()
        for correlated in correlated_words.values():
            finalkeywords.update(correlated)
        print('index :', index)
        print(finalkeywords)
        data.at[index, 'keywords'] = ' '.join(finalkeywords)
    # data.to_csv('keywords.csv')
    return data
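

# Minimal usage sketch. The file name comes from the commented-out line near
# the top of this script; the search query is an assumed placeholder. The CSV
# is expected to contain a 'Methodology' column.
if __name__ == '__main__':
    result = keywordExtraction('methodology smallpox vaccine data.csv',
                               'smallpox vaccine')  # query is illustrative
    result.to_csv('keywords.csv', index=False)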