#!/usr/bin/env python
# coding: utf-8
# In[47]:
import re
import os
import nltk
import unidecode
import numpy as np
import codecs
# In[2]:
# ## Text Cleaning
# 1. The text is first split on '\t'.
# 2. Remove extra whitespace.
# 3. Tokenize the string.
# 4. Remove punctuation from the tokenized words.
# 5. Drop tokens that are pure numbers.
# 6. Remove double quotation marks from tokens.
# 7. Replace URLs with a <url> tag.
# 8. Remove accents from strings via unidecode (e.g. 'Å' -> 'A').
# 9. Split camelCase words into 'camel' and 'Case'.
# 10. Strip any remaining digits from tokens.
# (A few hand-traced examples of these steps follow this list.)
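# Illustrations of the steps above (hand-traced for this write-up, not produced
# by running the file; the sample strings are made up):
#   step 7:  'nltk.org'      -> '<url>'               (later reduced to 'url')
#   step 9:  'camelCaseWord' -> 'camel', 'Case', 'Word' (then lowercased)
#   step 5:  '123'           -> dropped entirely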
# In[3]:
def isExtraSpace(text):
    return len(text) == 0
# In[4]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens
# In[5]:
def isASCII(token):
    # Despite the name, this flags single punctuation / regex-metacharacter tokens.
    if token in ['.', '+', '*', '?', '[', '/', '//', '\\', '^', '%', ']', '$', '(', ')', '{', '}', '=', '!', '|', ':', '-']:
        return True
    return False
# In[6]:
def ignore_special_token(token):
    if token in ['^^^^', '==']:
        return True
    return False
# In[7]:
def no_change(token):
    # Tokens kept verbatim, with no further cleaning.
    if token in ['Objective-J', 'I/O']:
        return True
    return False
# In[8]:
def is_number(token):
    # Matches optionally signed integers and decimals, e.g. '-42' or '3.14'.
    num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')
    return bool(num_regex.match(token))
# In[9]:
def replace_url(token):
    # Only .com, .org and .com.cn hosts are covered by this pattern.
    replaced_text = re.sub(r'(http[s]?://)?((www)\.)?([a-zA-Z0-9]+)\.{1}((com)(\.(cn))?|(org))', '<url>', token)
    return replaced_text
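# Illustrative calls (hand-checked against the pattern above, not from a test run):
#   replace_url('https://www.example.com') -> '<url>'
#   replace_url('google.com.cn')           -> '<url>'
#   replace_url('example.net')             -> 'example.net'  (TLD not covered)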
# In[10]:
def split_camelCase(token):
    return re.findall(r'[a-zA-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', token)
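# Illustrative splits (hand-traced against the regex, not from a test run);
# note that all-caps runs such as acronyms are kept together:
#   split_camelCase('camelCase')    -> ['camel', 'Case']
#   split_camelCase('parseURLFast') -> ['parse', 'URL', 'Fast']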
# In[11]:
def remove_numbers_from_token(token):
    return re.sub(r'[0-9]+', '', token)
# In[41]:
def get_stemmer(stemmer_type):
    if stemmer_type == 'porter_stemmer':
        stemmer = nltk.PorterStemmer()
    elif stemmer_type == 'snowball_stemmer':
        stemmer = nltk.SnowballStemmer(language='english')
    else:
        # Without this branch an unknown type hit an UnboundLocalError below.
        raise ValueError('unknown stemmer_type: ' + stemmer_type)
    return stemmer
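# Example (assumes NLTK is installed; the Snowball stemmer reduces inflected forms):
#   get_stemmer('snowball_stemmer').stem('running') -> 'run'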
# In[34]:
def text_cleaning(tokens):
    new_tokens = []
    for token in tokens:
        token = token.replace('"', '')
        token = replace_url(token)
        token = unidecode.unidecode(token)
        if isASCII(token) or is_number(token) or len(token) == 0:
            continue
        if no_change(token):
            new_tokens.append(token)
            continue
        # Split on common separators. '$' and '|' are escaped here; the original
        # pattern used a bare '$' (which matches end-of-string) and '\|/' (the
        # two-character sequence '|/'), which looks like an escaping bug.
        tt = re.split(r', |:|\+|%|\||/|\*|\$|&|@|_|-|!|;|,', token)
        for t in tt:
            t = t.replace('"', '')
            t = unidecode.unidecode(t)
            # Removing ASCII character and extra space after splitting
            if isASCII(t) or len(t) <= 1 or is_number(t):
                continue
            # Removing '.' from the word: keep the longest dot-separated piece
            split_by_dot = t.split('.')
            final_token = sorted(split_by_dot, key=len, reverse=True)[0]
            # If some non-alphanumeric chars are still left in the token
            if not final_token.isalnum():
                final_token = re.sub('[^A-Za-z0-9]+', '', final_token)
            # Remove digits from the token
            final_token = remove_numbers_from_token(final_token)
            # camelCase is only split when the first char is lowercase
            if final_token and final_token[0].islower():
                camel_tokens = split_camelCase(final_token)
                # The original reused the name 'tt' for this loop variable,
                # shadowing the list being iterated; renamed to 'ct'.
                for ct in camel_tokens:
                    if len(ct) <= 1:
                        continue
                    new_tokens.append(ct.lower())
            else:
                if len(final_token) <= 1:
                    continue
                new_tokens.append(final_token.lower())
    return new_tokens
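# A hand-traced example of text_cleaning (deterministic: no NLTK call is
# involved once the input is already tokenized; the sample tokens are made up):
#   text_cleaning(['The', 'camelCaseWord', '"quoted"', '123', 'nltk.org'])
#   -> ['the', 'camel', 'case', 'word', 'quoted', 'url']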
def create_vocab_from_corpus(file_path):
    vocab_doc_wise_tokenization = {}
    vocab_doc_wise_stemming = {}
    i = 0
    for file in os.listdir(file_path):
        print(i, end=' ')
        print(file)
        with codecs.open(os.path.join(file_path, file), mode='r', encoding='utf-8') as input_file:
            next(input_file)  # skip the header line
            d = []
            for line in input_file:
                text = line.strip().split('\t')[0]
                if isExtraSpace(text):
                    continue
                text = text.replace('"', '')
                ####################################### TOKENIZATION ################################################
                tokens = tokenize(text)
                pun_free_token = text_cleaning(tokens)
                if pun_free_token:
                    # print("Final: ", pun_free_token)
                    d.extend(pun_free_token)  # original: d = sum([], d + pun_free_token)
            ## d = sorted(set(d), key=lambda x: d.index(x))
            doc_name = os.path.splitext(file)[0]
            vocab_doc_wise_tokenization[doc_name] = d
            ####################################### STEMMING ################################################
            stemmer = get_stemmer('snowball_stemmer')
            st = [stemmer.stem(word) for word in d]
            vocab_doc_wise_stemming[doc_name] = st
        i += 1
        # if i >= 10: break
    # np.save('vocab_doc_wise_tokenization.npy', vocab_doc_wise_tokenization)
    # np.save('vocab_doc_wise_stemming.npy', vocab_doc_wise_stemming)
    # vocab_doc_wise_tokenization = np.load('vocab_doc_wise_tokenization.npy', allow_pickle='TRUE').item()
    # vocab_doc_wise_stemming = np.load('vocab_doc_wise_stemming.npy', allow_pickle='TRUE').item()
    ## print(vocab_doc_wise_tokenization)
    # print(vocab_doc_wise_stemming)
    return vocab_doc_wise_stemming
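# A minimal usage sketch (the corpus directory name is hypothetical; this
# assumes each file is tab-separated with one header line, as the reader above
# expects, and that NLTK's 'punkt' tokenizer data is installed):
if __name__ == '__main__':
    # nltk.download('punkt')  # uncomment on first run if punkt data is missing
    vocab = create_vocab_from_corpus('corpus/')  # hypothetical path
    for doc, stems in list(vocab.items())[:3]:
        print(doc, stems[:10])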