File tree 3 files changed +5
-6
lines changed
3 files changed +5
-6
lines changed Original file line number Diff line number Diff line change 1
- import tokenizers
2
- from utils import *
1
+ from .util import remove_tashkeel
3
2
import unittest
4
3
5
4
class TestUnit (unittest .TestCase ):
Original file line number Diff line number Diff line change @@ -88,7 +88,7 @@ def process_data(self, file_path):
88
88
89
89
if self .normalize :
90
90
print ("Normalizing the data ..." )
91
- self .corpus = normalize_data (self .corpus )
91
+ self .corpus = normalize_data (self .corpus , self . norm_dict )
92
92
93
93
if self .split :
94
94
print ("Splitting the data ..." )
Original file line number Diff line number Diff line change @@ -27,10 +27,10 @@ def remove_tashkeel(text):
27
27
text = re .sub (r"[ًٌٍَََُِّْ]" , "" , text )
28
28
return text
29
29
30
- def normalize_data (text ):
30
+ def normalize_data (text , norm_dict ):
31
31
# use a mapping dictionary
32
- regex = re .compile ("|" .join (map (re .escape , self . norm_dict .keys ())))
33
- text = regex .sub (lambda match : self . norm_dict [match .group (0 )], text )
32
+ regex = re .compile ("|" .join (map (re .escape , norm_dict .keys ())))
33
+ text = regex .sub (lambda match : norm_dict [match .group (0 )], text )
34
34
return text
35
35
36
36
def remove_english_chars (text ):
You can’t perform that action at this time.
0 commit comments