Commit 5d85214

committed
adding tests
1 parent f0cc7cf commit 5d85214

File tree

1 file changed: +53 −21 lines changed

test.py

+53 −21
@@ -1,45 +1,77 @@
 import unittest
-from tkseem.util import remove_tashkeel
-import tkseem as tk
-
 
-class TestUnit(unittest.TestCase):
-    def test_tashkeel(self):
-        self.assertEqual(
-            remove_tashkeel("مِكَرٍّ مِفَرٍّ مُقبِلٍ مُدبِرٍ مَعًا"),
-            "مكر مفر مقبل مدبر معا",
-            "Remove Tashkeel is not working",
-        )
+import tkseem as tk
 
 
 class TokenizersTestUnit(unittest.TestCase):
     sample_text = "مرحبا أيها الأصدقاء"
     token = "نص"
+    chars = False
     tokenizer = None
-    token_id = None
+
+    def print_string(self, method_name, output, method_arguments=None):
+        return f'{self.tokenizer} method {method_name}() output {f"on arguments: [{method_arguments}]" if method_arguments else ""} is: {output}'
 
     def test_tokenize(self):
         tokenized = self.tokenizer.tokenize(self.sample_text)
-        print(f"{self.tokenizer} tokenize() output:", tokenized)
+        print(self.print_string("tokenize", tokenized, method_arguments="sample_text"))
         return self.assertIsNotNone(tokenized)
 
     def test_detokenize(self):
         tokenized = self.tokenizer.tokenize(self.sample_text)
         detokenized = self.tokenizer.detokenize(tokenized)
         print(
-            f"{self.tokenizer} detokenize() output on the previously tokenized text:",
-            detokenized,
+            self.print_string(
+                "detokenize", detokenized, method_arguments="tokenized sample_text"
+            )
         )
         return self.assertIsNotNone(detokenized)
 
     def test_token_to_id(self):
-        self.token_id = self.tokenizer.token_to_id(self.token)
-        print(f"{self.tokenizer} token_to_id() output:", self.token_id)
-        return self.assertIsNotNone(self.token_id)
+        token = self.token if not self.chars else self.token[0]
+        token_id = self.tokenizer.token_to_id(token)
+        print(self.print_string("token_to_id", token_id, method_arguments=token))
+        return self.assertIsNotNone(token_id)
 
+    def test_id_to_token(self):
+        token = self.token if not self.chars else self.token[0]
+        token_id = self.tokenizer.token_to_id(token)
+        matched_token = self.tokenizer.id_to_token(token_id)
+        print(
+            self.print_string(
+                "id_to_token", matched_token, method_arguments=f"'{token}' id"
+            )
+        )
+        return self.assertEqual(matched_token, token)
+
+    def test_encode(self):
+        encoded = self.tokenizer.encode(self.sample_text)
+        print(self.print_string("encode", encoded, method_arguments="sample_text"))
+        return self.assertIsNotNone(encoded)
+
+    def test_decode(self):
+        encoded = self.tokenizer.encode(self.sample_text)
+        decoded = self.tokenizer.decode(encoded)
+        print(
+            self.print_string("decode", decoded, method_arguments="encoded sample_text")
+        )
+        return self.assertIsNotNone(decoded)
 
-word_tokenizer = tk.WordTokenizer()
-word_tokenizer.train("tasks/samples/data.txt")
-TokenizersTestUnit.tokenizer = word_tokenizer
-unittest.main(argv=["first-arg-is-ignored"], exit=False)
 
+for tokenizer in (
+    tk.SentencePieceTokenizer(),
+    tk.WordTokenizer(),
+    tk.MorphologicalTokenizer(),
+    tk.CharacterTokenizer(),
+    tk.DisjointLetterTokenizer(),
+    tk.RandomTokenizer(),
+):
+    try:
+        tokenizer.train("tasks/samples/data.txt")
+    except TypeError as type_error:
+        print(f"{tokenizer} does not need file_path to train")
+        tokenizer.train()
+    if isinstance(tokenizer, tk.CharacterTokenizer):
+        TokenizersTestUnit.chars = True
+    TokenizersTestUnit.tokenizer = tokenizer
+    unittest.main(argv=["first-arg-is-ignored"], exit=False)
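
Note on the harness pattern: the loop at the bottom of the new test.py re-runs the entire TokenizersTestUnit suite once per tokenizer by mutating class attributes before each unittest.main(..., exit=False) call. Below is a minimal, self-contained sketch of that pattern; the ParametrizedTest class and the toy UpperBackend/ReverseBackend stand-ins are illustrative assumptions, not part of tkseem.

import unittest


class ParametrizedTest(unittest.TestCase):
    # Injected at module level before each unittest.main() run,
    # mirroring how the commit injects TokenizersTestUnit.tokenizer.
    backend = None

    def test_roundtrip(self):
        text = "abc"
        # Every injected backend must survive an encode/decode round trip.
        self.assertEqual(self.backend.decode(self.backend.encode(text)), text)


# Toy stand-ins (hypothetical, for illustration only).
class UpperBackend:
    def encode(self, text):
        return text.upper()

    def decode(self, text):
        return text.lower()


class ReverseBackend:
    def encode(self, text):
        return text[::-1]

    def decode(self, text):
        return text[::-1]


for backend in (UpperBackend(), ReverseBackend()):
    ParametrizedTest.backend = backend
    # argv=["first-arg-is-ignored"]: unittest treats argv[0] as the program
    # name, so passing a dummy value stops it from parsing the real command line.
    # exit=False: skip the implicit sys.exit() so the loop can continue.
    unittest.main(argv=["first-arg-is-ignored"], exit=False)

The same two keyword arguments in the commit serve the same purpose: without exit=False, the first unittest.main() call would terminate the process and only the first tokenizer would ever be tested.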
