forked from attardi/wikiextractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremoveSymbols.py
21 lines (19 loc) · 889 Bytes
/
removeSymbols.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Characters that are replaced by a single space. string.punctuation is not
# used because it also includes "-", which we want to treat differently
# (it is deleted, not replaced), and because some additional symbols and the
# ASCII digits have to go as well. The end of the string consists of
# different typographical sorts of dashes.
_SYMBOLS_TO_SPACE = "<>|[]{}()@#$€£%&*_=+…,.?!\"«»“”‘’;:/\\~`0123456789—‒–―"

# Characters that are deleted without leaving a gap:
#   "'" so that we end up with "dont", "wont", "im", "isnt", "arent",
#       "peters" and so forth;
#   "-" so that hyphenated words are joined, e.g. "nonnotable".
_SYMBOLS_TO_DELETE = "'-"

# Built once at import time so that every call is a single pass over the
# text instead of one membership test plus one replace per symbol.
_SYMBOL_TABLE = str.maketrans(
    _SYMBOLS_TO_SPACE,
    " " * len(_SYMBOLS_TO_SPACE),
    _SYMBOLS_TO_DELETE,
)


def removeSymbols(text: str) -> str:
    """Strip symbols and digits from *text*.

    Every character listed in ``_SYMBOLS_TO_SPACE`` (punctuation, currency
    signs, typographic quotes and dashes, ASCII digits) is replaced by one
    space each; runs of symbols therefore become runs of spaces and are not
    collapsed. The ASCII apostrophe and the ASCII hyphen are deleted
    outright, joining the surrounding letters ("don't" -> "dont",
    "non-notable" -> "nonnotable").

    Note that the typographic apostrophe "’" is in the replace-by-space set,
    so "don’t" becomes "don t", unlike its ASCII counterpart.

    Args:
        text: The string to clean.

    Returns:
        A new string with the symbols replaced or removed. All other
        characters, including existing whitespace, are left unchanged.
    """
    return text.translate(_SYMBOL_TABLE)