-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: preparser.py
152 lines (129 loc) · 6.19 KB
/
preparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from document import Document
import re
from stemmer import Stemmer
from gensim.corpora import Dictionary
import utils
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
def get_continuous_chunks(text):
    """Extract named-entity phrases from *text* with NLTK's NE chunker.

    Tokenizes, POS-tags and NE-chunks the text; each ``Tree`` node in the
    chunk result is a named entity whose leaf tokens are joined into one
    phrase string.

    :param text: raw text to scan for named entities.
    :return: list of unique named-entity phrase strings, in first-seen order.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for node in chunked:
        # Named-entity subtrees are Tree nodes; plain leaves are (word, tag)
        # tuples. (Fix: isinstance instead of type(...) == Tree.)
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                # NOTE(review): the reset is nested under the "not seen yet"
                # check, so adjacent repeats of an already-seen entity merge
                # into the running chunk — preserved as-is; confirm intent.
                current_chunk = []
        else:
            continue
    return continuous_chunk
def deEmojify(inputString):
    """Strip every non-ASCII character (emoji, accents, symbols) from *inputString*.

    :param inputString: text that may contain non-ASCII characters.
    :return: the same text with all non-ASCII characters dropped.
    """
    ascii_bytes = inputString.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
class Preparse:
    """Tokenizes tweet text, filters stop words and tracks capitalized phrases.

    ``personadic`` counts capitalized multi-word phrases (candidate
    persona / named-entity strings, at most one count per tweet) and
    ``terms`` records tokens that were seen in lowercase, so a word that
    also appears uncapitalized is demoted from persona status.
    """

    def __init__(self, stmmer=False):
        # NLTK English stop words plus twitter-specific noise tokens.
        self.stop_words = stopwords.words('english')
        self.secondStop_word = ['rt', 'i', 'p', 'etc', 'oh', 'im', 'also']
        #,'0','1','2','3','4','5','6','7','8','9'
        self.stop_words = self.stop_words + self.secondStop_word
        # phrase -> number of tweets in which it appeared capitalized
        self.personadic = {}
        # lowercase token -> True once the token was seen uncapitalized
        self.terms = {}

    def parse_sentence(self, text, tweetId=""):
        """Tokenize *text*, skip stop words, and record capitalized phrases.

        Updates ``self.personadic`` (capitalized phrase counts, once per
        tweet) and ``self.terms`` (tokens seen in lowercase). Returns None.

        :param text: tweet text to parse.
        :param tweetId: tweet id, used only for a leftover debug trace.
        """
        nonBreakSpace = u'\xa0'
        text = text.replace(nonBreakSpace, ' ')
        # Leftover debug trace for one specific tweet.
        if ('1280975647146496009' == tweetId):
            print(text)
            print(tweetId)
        text = deEmojify(text)
        return_parse = []
        # Phrases already counted for THIS tweet (count each once per tweet).
        ifprsona = {}
        text = re.sub('\.\.+', '.', text)  # collapse ellipses to one dot
        text = text.replace('\r', '')
        text_tokens = re.split("[ \-!?:=\n()$&^\+\"';~*\|“…”{}\[\]‘]+", text)
        #text_tokens= self.stemmer.stem_term(text_tokens)
        word = 0
        # BUG FIX: the original cached len(text_tokens) before the loop, so
        # tokens appended by the '.'/'/' splitting below (and the original
        # trailing tokens pushed past the cached length) were never visited.
        while word < len(text_tokens):
            if text_tokens[word].count('.') == 1:
                split = text_tokens[word].split('.')
                # Split on '.' unless the token is a decimal number.
                # BUG FIX: the third clause tested split[1] twice; it now
                # checks both halves (equivalent to the original's effective
                # behavior, without the copy-paste duplication).
                if (split[0].replace(',', "").isnumeric() and not split[1].isnumeric()) or \
                        (not split[0].isnumeric() and split[1].replace(',', "").isnumeric()) or \
                        (not split[0].isnumeric() and not split[1].isnumeric()):
                    text_tokens[word] = split[0]
                    text_tokens.insert(word + 1, split[1])
            if text_tokens[word].count('/') == 1:
                split = text_tokens[word].split('/')
                # NOTE(review): the original tested split[1] twice here;
                # split[0] was probably intended as well, but the original
                # single-check behavior is preserved — confirm intent.
                if not split[1].isnumeric():
                    text_tokens[word] = split[0]
                    text_tokens.insert(word + 1, split[1])
            if len(text_tokens[word].replace("#", "")) < 2 and not text_tokens[word].isnumeric():
                # Drop one-character leftovers (but keep single digits).
                word += 1
            elif text_tokens[word].lower() in self.stop_words:
                word += 1
            elif text_tokens[word][0].isupper():
                # Collect a run of consecutive capitalized tokens as one
                # candidate persona phrase.
                tempprona = text_tokens[word]
                temp = re.sub("[,/.’#'\"]+", '', text_tokens[word])
                #return_parse+=[temp]
                word += 1
                while word < len(text_tokens) and text_tokens[word] != "" and text_tokens[word][0].isupper():
                    temp = re.sub("[ ,/.’#'\"]+", '', text_tokens[word])
                    #return_parse += [temp]
                    tempprona += " " + text_tokens[word]
                    word += 1
                # Count each comma-separated phrase once per tweet, unless it
                # was already seen in lowercase (then it is a plain term).
                # (Fix: loop variable renamed from 'text', which shadowed the
                # parameter; it is not read again afterwards.)
                for phrase in tempprona.split(","):
                    if len(phrase) < 1:
                        continue
                    if phrase[0] == " ":
                        phrase = phrase[1:]
                    if len(phrase) < 2:
                        continue
                    if phrase.isnumeric():
                        continue
                    if not self.terms.get(phrase.lower()):
                        if self.personadic.get(phrase):
                            if not ifprsona.get(phrase):
                                self.personadic[phrase] += 1
                                ifprsona[phrase] = True
                        else:
                            self.personadic[phrase] = 1
                            ifprsona[phrase] = True
            else:
                # Lowercase token: remember it and demote any persona entry
                # that matches it when capitalized.
                self.terms[text_tokens[word]] = True
                tempp = text_tokens[word][0].upper() + text_tokens[word][1:]
                if self.personadic.get(tempp):
                    self.personadic.pop(tempp)
                word += 1

    def parse_doc(self, doc_as_list):
        """Break a tweet record (list) into fields and parse its full text.

        :param doc_as_list: list representing the tweet
            [id, date, full_text, url, retweet_text, retweet_url,
             quote_text, quote_url].
        :return: None (see NOTE below).
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        retweet_text = doc_as_list[4]
        retweet_url = doc_as_list[5]
        quote_text = doc_as_list[6]
        quote_url = doc_as_list[7]
        term_dict = {}
        tokenized_text = self.parse_sentence(full_text, tweet_id)
        # NOTE(review): parse_sentence returns None and no Document is
        # constructed or returned here, although the original docstring
        # promised one — confirm what callers expect before adding it.