|
1 | 1 | import unittest
|
2 |
| -from tkseem.util import remove_tashkeel |
3 |
| -import tkseem as tk |
4 |
| - |
5 | 2 |
|
6 |
| -class TestUnit(unittest.TestCase): |
7 |
| - def test_tashkeel(self): |
8 |
| - self.assertEqual( |
9 |
| - remove_tashkeel("مِكَرٍّ مِفَرٍّ مُقبِلٍ مُدبِرٍ مَعًا"), |
10 |
| - "مكر مفر مقبل مدبر معا", |
11 |
| - "Remove Tashkeel is not working", |
12 |
| - ) |
| 3 | +import tkseem as tk |
13 | 4 |
|
14 | 5 |
|
15 | 6 | class TokenizersTestUnit(unittest.TestCase):
|
16 | 7 | sample_text = "مرحبا أيها الأصدقاء"
|
17 | 8 | token = "نص"
|
| 9 | + chars = False |
18 | 10 | tokenizer = None
|
19 |
| - token_id = None |
| 11 | + |
| 12 | + def print_string(self, method_name, output, method_arguments=None): |
| 13 | + return f'{self.tokenizer} method {method_name}() output {f"on arguments: [{method_arguments}]" if method_arguments else ""} is: {output}' |
20 | 14 |
|
21 | 15 | def test_tokenize(self):
|
22 | 16 | tokenized = self.tokenizer.tokenize(self.sample_text)
|
23 |
| - print(f"{self.tokenizer} tokenize() output:", tokenized) |
| 17 | + print(self.print_string("tokenize", tokenized, method_arguments="sample_text")) |
24 | 18 | return self.assertIsNotNone(tokenized)
|
25 | 19 |
|
26 | 20 | def test_detokenize(self):
|
27 | 21 | tokenized = self.tokenizer.tokenize(self.sample_text)
|
28 | 22 | detokenized = self.tokenizer.detokenize(tokenized)
|
29 | 23 | print(
|
30 |
| - f"{self.tokenizer} detokenize() output on the previously tokenized text:", |
31 |
| - detokenized, |
| 24 | + self.print_string( |
| 25 | + "detokenize", detokenized, method_arguments="tokenized sample_text" |
| 26 | + ) |
32 | 27 | )
|
33 | 28 | return self.assertIsNotNone(detokenized)
|
34 | 29 |
|
35 | 30 | def test_token_to_id(self):
|
36 |
| - self.token_id = self.tokenizer.token_to_id(self.token) |
37 |
| - print(f"{self.tokenizer} token_to_id() output:", self.token_id) |
38 |
| - return self.assertIsNotNone(self.token_id) |
| 31 | + token = self.token if not self.chars else self.token[0] |
| 32 | + token_id = self.tokenizer.token_to_id(token) |
| 33 | + print(self.print_string("token_to_id", token_id, method_arguments=token)) |
| 34 | + return self.assertIsNotNone(token_id) |
39 | 35 |
|
| 36 | + def test_id_to_token(self): |
| 37 | + token = self.token if not self.chars else self.token[0] |
| 38 | + token_id = self.tokenizer.token_to_id(token) |
| 39 | + matched_token = self.tokenizer.id_to_token(token_id) |
| 40 | + print( |
| 41 | + self.print_string( |
| 42 | + "id_to_token", matched_token, method_arguments=f"'{token}' id" |
| 43 | + ) |
| 44 | + ) |
| 45 | + return self.assertEqual(matched_token, token) |
| 46 | + |
| 47 | + def test_encode(self): |
| 48 | + encoded = self.tokenizer.encode(self.sample_text) |
| 49 | + print(self.print_string("encode", encoded, method_arguments="sample_text")) |
| 50 | + return self.assertIsNotNone(encoded) |
| 51 | + |
| 52 | + def test_decode(self): |
| 53 | + encoded = self.tokenizer.encode(self.sample_text) |
| 54 | + decoded = self.tokenizer.decode(encoded) |
| 55 | + print( |
| 56 | + self.print_string("decode", decoded, method_arguments="encoded sample_text") |
| 57 | + ) |
| 58 | + return self.assertIsNotNone(decoded) |
40 | 59 |
|
41 |
# Driver: train every tokenizer flavour on the sample corpus, inject it into
# the shared test class, and run the suite once per tokenizer.
# NOTE(review): this runs at import time, mirroring the original script; it
# is intended to be executed directly, not imported.
for tokenizer in (
    tk.SentencePieceTokenizer(),
    tk.WordTokenizer(),
    tk.MorphologicalTokenizer(),
    tk.CharacterTokenizer(),
    tk.DisjointLetterTokenizer(),
    tk.RandomTokenizer(),
):
    try:
        tokenizer.train("tasks/samples/data.txt")
    except TypeError:
        # Some tokenizers' train() takes no file path — retry without it.
        print(f"{tokenizer} does not need file_path to train")
        tokenizer.train()
    # Bug fix: recompute `chars` on EVERY iteration.  Previously it was only
    # ever set to True for CharacterTokenizer and never reset, so the
    # tokenizers iterated after it (DisjointLetterTokenizer, RandomTokenizer)
    # wrongly ran the token_to_id/id_to_token tests in single-character mode.
    TokenizersTestUnit.chars = isinstance(tokenizer, tk.CharacterTokenizer)
    TokenizersTestUnit.tokenizer = tokenizer
    # exit=False so the loop continues to the next tokenizer after each run.
    unittest.main(argv=["first-arg-is-ignored"], exit=False)
0 commit comments