-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathtext_cleaner.py
executable file
·74 lines (62 loc) · 1.66 KB
/
text_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# text_cleaner.py
# preprocesses text so that it preserves only lowercase latin letters and spaces
# Johan and Aditya, Fall 2014
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import string
import codecs
import re
from unidecode import unidecode
include = string.lowercase
exclude = string.punctuation
try:
if len(sys.argv)> 1 and len(sys.argv) < 3 or len(sys.argv)>3:
print "Usage: python script.py <inputFile.txt> <outputFile.txt>"
exit()
run = 1
inputfn = sys.argv[1]
outputfn = sys.argv[2]
except IndexError:
run = 0
def cleaner_fn(inputfn, outputfn):
inputFile = codecs.open(str(sys.argv[1]), encoding='utf-8')
outputFile = open(str(sys.argv[2]), 'w')
for s in inputFile:
t = cleaner(s)
outputFile.write(t+'\n')
inputFile.close()
outputFile.close()
def cleaner(s):
s = re.sub(u'ø', 'oe', s) # ø -> oe
s = re.sub(u'å', 'aa', s) # å -> aa
s = re.sub(u'æ', 'ae', s) # æ -> ae
s = re.sub(u'ä', 'ae', s) # ä -> ae
s = re.sub(u'ö', 'oe', s) # ö -> oe
s = re.sub(u'ü', 'ue', s) # ü -> ue
s = re.sub(u'ß', 'ss', s) # ß -> ss
s = unidecode(s)
string_split = s.split()
new_s = ''
# check for all uppercase
for word in string_split:
for ch in include:
lowercase_exist = 0
if ch in word:
lowercase_exist = 1
break
num_exist = 0
for num in '0123456789':
if num in word:
num_exist = 1
if lowercase_exist and not num_exist:
new_s += word + ' '
else:
new_s += ' '
# make all letters lowercase
new_s = new_s.lower()
# remove punctuation
t = ''.join(ch for ch in new_s if ch not in exclude)
return t
if run:
cleaner_fn(inputfn,outputfn)