
Commit 116d3e0
fix some bugs
1 parent 11e8224

3 files changed, +5 -6 lines changed

tkseem/test.py (+1 -2)

@@ -1,5 +1,4 @@
-import tokenizers
-from utils import *
+from .util import remove_tashkeel
 import unittest
 
 class TestUnit(unittest.TestCase):
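
The test module now imports remove_tashkeel from the package's own util module instead of relying on a wildcard import, and drops the unused tokenizers import. A minimal sketch of a test that exercises the helper (written with an absolute import so it runs on its own; the sample string and expected result are illustrative assumptions, not taken from the repository's test suite):

import unittest

from tkseem.util import remove_tashkeel


class TestRemoveTashkeel(unittest.TestCase):
    def test_strips_diacritics(self):
        # "كَتَبَ" carries three fatha marks; stripping them leaves "كتب"
        self.assertEqual(remove_tashkeel("كَتَبَ"), "كتب")


if __name__ == "__main__":
    unittest.main()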

tkseem/tokenizers.py (+1 -1)

@@ -88,7 +88,7 @@ def process_data(self, file_path):
 
         if self.normalize:
            print("Normalizing the data ...")
-            self.corpus = normalize_data(self.corpus)
+            self.corpus = normalize_data(self.corpus, self.norm_dict)
 
         if self.split:
            print("Splitting the data ...")

tkseem/util.py (+3 -3)

@@ -27,10 +27,10 @@ def remove_tashkeel(text):
     text = re.sub(r"[ًٌٍَََُِّْ]", "", text)
     return text
 
-def normalize_data(text):
+def normalize_data(text, norm_dict):
     # use a mapping dictionary
-    regex = re.compile("|".join(map(re.escape, self.norm_dict.keys())))
-    text = regex.sub(lambda match: self.norm_dict[match.group(0)], text)
+    regex = re.compile("|".join(map(re.escape, norm_dict.keys())))
+    text = regex.sub(lambda match: norm_dict[match.group(0)], text)
     return text
 
 def remove_english_chars(text):
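
The util.py and tokenizers.py changes go together: normalize_data is a module-level helper, so the old references to self.norm_dict would raise a NameError as soon as a tokenizer normalized its corpus. The mapping is now passed in explicitly, and the call site supplies self.norm_dict. A minimal sketch of the fixed helper in isolation (the sample norm_dict entries are illustrative assumptions about folding Arabic letter variants, not the dictionary shipped with tkseem):

import re


def normalize_data(text, norm_dict):
    # Build one alternation over all mapping keys and replace each
    # match with its mapped value.
    regex = re.compile("|".join(map(re.escape, norm_dict.keys())))
    return regex.sub(lambda match: norm_dict[match.group(0)], text)


# Hypothetical mapping: fold alef, teh marbuta, and alef maqsura variants.
norm_dict = {"أ": "ا", "إ": "ا", "آ": "ا", "ة": "ه", "ى": "ي"}
print(normalize_data("إلى المدرسة", norm_dict))  # -> "الي المدرسه"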
