forked from waleking/AutoPhrase
prepare_for_topicmodeling.py · executable file · 96 lines (85 loc) · 3.47 KB
#!/usr/bin/env python
import re


def extract_phrases(content):
    """Split one segmented line into the phrases tagged by AutoPhrase
    and the remaining plain-word content."""
    phrase_pattern = re.compile(r"<phrase>([\w\s]*)</phrase>")
    matched_phrases = phrase_pattern.findall(content)
    # replace each tagged phrase with a space, then collapse whitespace
    words_content = phrase_pattern.sub(" ", content)
    words_content = re.sub(r"\s+", " ", words_content)
    return matched_phrases, words_content
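
# A small sketch of the behavior (the sample line is hypothetical; AutoPhrase
# marks mined phrases with <phrase>...</phrase> tags in its segmentation output):
#   extract_phrases("we study <phrase>topic modeling</phrase> on short text")
#   returns (["topic modeling"], "we study on short text")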

def _get_stopwords():
    """Return the set of stopwords read from data/stopwords.txt."""
    stopwords = set()
    with open("data/stopwords.txt") as f:
        for line in f:
            stopwords.add(line.rstrip())
    return stopwords

def word_count(documents):
    """Count, separately for single words and for phrases, how many
    documents each term appears in (terms are comma-separated per document)."""
    word_freq = dict()
    phrase_freq = dict()
    for document in documents:
        if len(document) > 0:
            # count each term at most once per document
            for term in set(document.split(",")):
                if " " in term:
                    # multi-word terms are phrases
                    phrase_freq[term] = phrase_freq.get(term, 0) + 1
                else:
                    word_freq[term] = word_freq.get(term, 0) + 1
    return word_freq, phrase_freq
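
# For example (hypothetical input): given the two comma-separated documents
#   ["topic modeling,text,data", "topic modeling,data"]
# word_count returns word_freq == {"text": 1, "data": 2} and
# phrase_freq == {"topic modeling": 2}; any term containing a space
# counts as a phrase.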

if __name__ == "__main__":
    stopwords = _get_stopwords()
    documents = []
    with open("results/segmentation.txt", "r") as f:
        for line in f:
            line_lowercase = line.lower()
            line_lowercase = re.sub("\t", " ", line_lowercase)
            line_lowercase = re.sub(r"\s+", " ", line_lowercase)
            matched_phrases, words_content = extract_phrases(line_lowercase)
            # remove punctuation and any leftover phrase tags
            sentences_no_punc = re.split(r"</phrase>|<phrase>|[.,:;!?()\[\]<>]", words_content)
            stripped_sentences = []
            for sentence in sentences_no_punc:
                if len(sentence) > 0:
                    # drop every token that contains a non-alphabetic character
                    stripped_sentences.append(re.sub(r"\w*[^A-Za-z ]\w*", " ", sentence))
            sentences_no_punc = " ".join(stripped_sentences)
            # collapse extra spaces
            sentences_no_punc = re.sub(r"\s+", " ", sentences_no_punc)
            # remove stopwords and short words whose length is <= 2
            document_without_stopwords = ",".join(
                word for word in sentences_no_punc.split()
                if word not in stopwords and len(word) > 2)
            if len(matched_phrases) >= 1:
                # append the extracted phrases to the document
                document_without_stopwords = document_without_stopwords + "," + ",".join(matched_phrases)
            documents.append(document_without_stopwords)
    # do the word count over all documents
    word_freq, phrase_freq = word_count(documents)
    # output the final result: keep all phrases, and keep only words that
    # appear in more than 3 documents
    filtered_documents = []
    for document in documents:
        if len(document) > 0:
            filtered_document = []
            for term in document.split(","):
                if " " in term or word_freq[term] > 3:
                    filtered_document.append(term)
            filtered_documents.append(",".join(filtered_document))
        else:
            filtered_documents.append(document)
    with open("results/input_forTopicModel.txt", "w") as f_writer:
        for filtered_document in filtered_documents:
            f_writer.write("%s\n" % filtered_document)
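
# Usage note (an assumption about the surrounding pipeline, not part of the
# original file): run this script from the AutoPhrase root after phrasal
# segmentation has produced results/segmentation.txt; the output file
# results/input_forTopicModel.txt then holds one document per line as
# comma-separated terms, e.g. "data,text,topic modeling".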