-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathnormalizer.py
46 lines (31 loc) · 1011 Bytes
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
import tokenizer
'''
Perform basic text normalization: convert every word to lower case,
except abbreviations (words that contain no lowercase letters).
'''
# Matches words that contain no lowercase ASCII letter (abbreviations such as
# "AKA", numbers, punctuation, the empty string); such words are kept as-is.
_NO_LOWERCASE_RE = re.compile(r'^[^a-z]*$')

try:
    # Python 2: basestring covers both str and unicode.
    _STRING_TYPES = (basestring,)  # noqa: F821
except NameError:
    # Python 3: basestring no longer exists; str is the only text type.
    _STRING_TYPES = (str,)


def _lowercase_words(words):
    """Lower-case, in place, each word that has a lowercase letter.

    Returns the same list object that was passed in.
    """
    for index, word in enumerate(words):
        # A word without any lowercase letter is treated as an abbreviation
        # (or some other important token) and keeps its original casing.
        if not _NO_LOWERCASE_RE.match(word):
            words[index] = word.lower()
    return words


def normalize(input, multiList=False):
    """Lower-case every word except those written without lowercase letters.

    Args:
        input: One of
            * a string, which is first split with ``tokenizer.tokenize``
              (its result is assumed to be a mutable list of words);
            * a list of words (when ``multiList`` is false);
            * a list of word lists (when ``multiList`` is true).
        multiList: Set to true when ``input`` is a list of word lists.
            Ignored for string input.

    Returns:
        For string input, the list of tokens after normalization. For list
        input, the very same list object, which is modified in place.
    """
    # Strings are tokenized first; the freshly built token list is normalized.
    if isinstance(input, _STRING_TYPES):
        return _lowercase_words(tokenizer.tokenize(input))

    # Nested input: normalize each inner list in place.
    if multiList:
        for words in input:
            _lowercase_words(words)
        return input

    # Flat list of words.
    return _lowercase_words(input)
#print normalize('Hello ThEre AKA Hahahah JJk')