forked from anlausch/XWEAT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvocab_dict Pickle maker_noClean.py
69 lines (60 loc) · 2.27 KB
/
vocab_dict Pickle maker_noClean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import codecs
import os
import pickle
import re
import json
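'''
This script builds an English-to-Arabic vocabulary dictionary from a CSV file and saves it as a pickle.
This is the "no clean" variant: the entries are stored without the AraVec-style text cleaning.
'''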
# Arabic diacritics (tashkeel) that are removed from the text.
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

# Any character (except a newline) repeated two or more times in a row.
_LONGATION_RE = re.compile(r'(.)\1+')

# Doubled long vowels that are reduced to a single letter after the
# elongation pass has shortened every run to at most two characters.
_DOUBLED_LETTERS = (
    ('وو', 'و'),
    ('يي', 'ي'),
    ('اا', 'ا'),
)

# Ordered (search, replacement) pairs. They are applied one after another,
# so the order is significant (e.g. "_" becomes " " before the " و " rule runs).
_REPLACEMENTS = (
    ("أ", "ا"),
    ("إ", "ا"),
    ("آ", "ا"),
    ("ة", "ه"),
    ("_", " "),
    ("-", " "),
    ("/", ""),
    (".", ""),
    ("،", ""),
    (" و ", " و"),
    (" يا ", " يا"),
    ('"', ""),
    ("ـ", ""),
    ("'", ""),
    ("ى", "ي"),
    ("\\", ""),
    ('\n', ' '),
    ('\t', ' '),
    # NOTE(review): this rule can never match, because every '"' has already
    # been deleted by the earlier ('"', "") rule. Possibly a different token
    # was intended here -- confirm against the source this table came from.
    ('"', ' '),
    ('?', ' ? '),
    ('؟', ' ؟ '),
    ('!', ' ! '),
)


def clean_arabic_str(text):
    '''
    Normalize an Arabic string.

    The steps run in this order:
    1. remove tashkeel (diacritics);
    2. shorten every run of a repeated character to two characters;
    3. reduce doubled waw / ya / alef to a single letter;
    4. apply the replacement table: unify alef variants, ta marbuta -> ha,
       alef maqsura -> ya, drop or space out punctuation, turn newlines and
       tabs into spaces, and pad question/exclamation marks with spaces;
    5. strip leading and trailing whitespace.

    :param text: the text to clean
    :type text: str
    :return: the cleaned text
    :rtype: str
    '''
    text = _TASHKEEL_RE.sub("", text)
    text = _LONGATION_RE.sub(r"\1\1", text)
    for doubled, single in _DOUBLED_LETTERS:
        text = text.replace(doubled, single)
    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)
    return text.strip()
# When True, a row that has a non-empty fourth column takes its pair of
# forms from columns 2 and 3 instead of columns 1 and 2.
is_russian = False

# Maps the English term (column 0) to a tuple (other_m, other_f); other_f is
# None when the row has no usable second form.
# presumably other_m / other_f are masculine / feminine forms -- TODO confirm
translation_dict = {}

with codecs.open('data/vocab_en_ar.csv', "r", "utf8") as f:
    for line in f.readlines():
        parts = line.split(",")
        en = parts[0]
        # Skip rows with an empty English term or one starting with an
        # uppercase letter.
        if en == "" or en[0].isupper():
            continue
        # Skip rows with no translation column at all (e.g. a blank line);
        # indexing parts[1] on them would raise IndexError.
        if len(parts) < 2:
            continue

        # The values are stored as read (only stripped): the
        # clean_arabic_str(...).replace(" ", "_") normalization is not
        # applied in this "NoClean" variant.
        if is_russian and len(parts) > 3 and parts[3] not in ("\n", "\r\n", "\r"):
            # NOTE(review): unlike every other field, parts[2] is stored
            # without strip() here -- confirm whether that is intended.
            other_m = parts[2]
            other_f = parts[3].strip()
        else:
            other_m = parts[1].strip()
            other_f = None
            # A third column counts only if it is more than a bare line ending.
            if len(parts) > 2 and parts[2] not in ("\n", "\r\n", "\r", ""):
                other_f = parts[2].strip()
        translation_dict[en] = (other_m, other_f)

# Write through a context manager so the output file is flushed and closed.
with open("data/vocab_dict_en_arNoClean.p", "wb") as out:
    pickle.dump(translation_dict, out)

print('done')
print(translation_dict)