-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess_arabic.py
35 lines (30 loc) · 1.21 KB
/
preprocess_arabic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re,sys,codecs
import json
# Matches the tashkeel code points U+0617-U+061A and U+064B-U+0652.
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

# (typo, correction) pairs for other typos in the conll files.
# They are applied in this order, after tashkeel has been stripped.
_CONLL_TYPO_FIXES = (
    ('ه`ذا', 'هذا'),
    ('ه`ذه', 'هذه'),
    ('ه`ذين', 'هذين'),
    ('الل`ه', 'الله'),
    ('ذ`لك', 'ذلك'),
    ('إل`ه', 'إله'),
)


def clean_text(text):
    """Return *text* with braces mapped to alef, tashkeel removed and typos fixed.

    The steps run in a fixed order:

    1. every ``{`` and ``}`` is replaced by the letter alef;
    2. every character matched by the tashkeel pattern is deleted;
    3. each known typo (a stray backtick inside a word) is replaced by its
       corrected spelling.
    """
    for brace in ('{', '}'):
        text = text.replace(brace, 'ا')

    text = _TASHKEEL_RE.sub("", text)

    for typo, corrected in _CONLL_TYPO_FIXES:
        text = text.replace(typo, corrected)
    return text
if __name__ == '__main__':
    # Command-line entry point: read one JSON document per line from
    # <input_path>, run clean_text over every token of doc['sentences'], and
    # write each document back as one JSON line to <output_path>.
    if len(sys.argv) < 3:
        sys.exit("Usage: {} <input_path> <output_path>.".format(sys.argv[0]))

    # Both files are managed by `with` so they are flushed and closed even if
    # a line fails to parse. The input is opened first so that a bad input
    # path does not create or truncate the output file.
    with codecs.open(sys.argv[1], encoding='utf-8') as reader, \
            codecs.open(sys.argv[2], 'wb', encoding='utf-8') as writer:
        for line in reader:
            # A whitespace-only line (e.g. a trailing newline at the end of
            # the file) is not a JSON document; skip it instead of crashing.
            if not line.strip():
                continue
            doc = json.loads(line)
            doc['sentences'] = [
                [clean_text(token) for token in sentence]
                for sentence in doc['sentences']
            ]
            writer.write(json.dumps(doc) + '\n')