
Commit 116d3e0
fix some bugs
1 parent 11e8224

3 files changed, +5 -6 lines changed

tkseem/test.py (+1 -2)

@@ -1,5 +1,4 @@
-import tokenizers
-from utils import *
+from .util import remove_tashkeel
 import unittest
 
 class TestUnit(unittest.TestCase):
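
The test module now imports remove_tashkeel from the package's own util module instead of relying on a wildcard import, and drops the unused tokenizers import. A minimal sketch of a test that exercises the helper (written with an absolute import so it runs on its own; the sample string and expected result are illustrative assumptions, not taken from the repository's test suite):

import unittest

from tkseem.util import remove_tashkeel


class TestRemoveTashkeel(unittest.TestCase):
    def test_strips_diacritics(self):
        # "كَتَبَ" carries three fatha marks; stripping them leaves "كتب"
        self.assertEqual(remove_tashkeel("كَتَبَ"), "كتب")


if __name__ == "__main__":
    unittest.main()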

tkseem/tokenizers.py (+1 -1)

@@ -88,7 +88,7 @@ def process_data(self, file_path):
 
         if self.normalize:
            print("Normalizing the data ...")
-            self.corpus = normalize_data(self.corpus)
+            self.corpus = normalize_data(self.corpus, self.norm_dict)
 
         if self.split:
            print("Splitting the data ...")

tkseem/util.py (+3 -3)

@@ -27,10 +27,10 @@ def remove_tashkeel(text):
     text = re.sub(r"[ًٌٍَََُِّْ]", "", text)
     return text
 
-def normalize_data(text):
+def normalize_data(text, norm_dict):
     # use a mapping dictionary
-    regex = re.compile("|".join(map(re.escape, self.norm_dict.keys())))
-    text = regex.sub(lambda match: self.norm_dict[match.group(0)], text)
+    regex = re.compile("|".join(map(re.escape, norm_dict.keys())))
+    text = regex.sub(lambda match: norm_dict[match.group(0)], text)
     return text
 
 def remove_english_chars(text):
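
The util.py and tokenizers.py changes go together: normalize_data is a module-level helper, so the old references to self.norm_dict would raise a NameError as soon as a tokenizer normalized its corpus. The mapping is now passed in explicitly, and the call site supplies self.norm_dict. A minimal sketch of the fixed helper in isolation (the sample norm_dict entries are illustrative assumptions about folding Arabic letter variants, not the dictionary shipped with tkseem):

import re


def normalize_data(text, norm_dict):
    # Build one alternation over all mapping keys and replace each
    # match with its mapped value.
    regex = re.compile("|".join(map(re.escape, norm_dict.keys())))
    return regex.sub(lambda match: norm_dict[match.group(0)], text)


# Hypothetical mapping: fold alef, teh marbuta, and alef maqsura variants.
norm_dict = {"أ": "ا", "إ": "ا", "آ": "ا", "ة": "ه", "ى": "ي"}
print(normalize_data("إلى المدرسة", norm_dict))  # -> "الي المدرسه"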
