|
| 1 | +<?php |
| 2 | + |
| 3 | +declare(strict_types=1); |
| 4 | + |
| 5 | +namespace Phpml\Tests\Tokenization; |
| 6 | + |
| 7 | +use Phpml\Exception\InvalidArgumentException; |
| 8 | +use Phpml\Tokenization\NGramWordTokenizer; |
| 9 | + |
/**
 * Tests for the word-level n-gram tokenizer.
 *
 * Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
 */
class NGramWordTokenizerTest extends TokenizerTest
{
    /**
     * Checks that tokenizing $text with the given gram bounds yields exactly
     * the expected list of tokens.
     *
     * @dataProvider textDataProvider
     *
     * @param int      $minGram first constructor argument of the tokenizer
     * @param int      $maxGram second constructor argument of the tokenizer
     * @param string   $text    input passed to tokenize()
     * @param string[] $tokens  expected result of tokenize()
     */
    public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
    {
        $tokenizer = new NGramWordTokenizer($minGram, $maxGram);

        self::assertEquals($tokens, $tokenizer->tokenize($text));
    }

    /**
     * Constructing the tokenizer with (5, 2) must throw InvalidArgumentException.
     */
    public function testMinGramGreaterThanMaxGramNotAllowed(): void
    {
        // expectException() is an instance method of the PHPUnit test case, so call it on $this.
        $this->expectException(InvalidArgumentException::class);

        new NGramWordTokenizer(5, 2);
    }

    /**
     * Constructing the tokenizer with (0, 2) must throw InvalidArgumentException.
     */
    public function testMinGramValueTooSmall(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramWordTokenizer(0, 2);
    }

    /**
     * Constructing the tokenizer with (1, 0) must throw InvalidArgumentException.
     */
    public function testMaxGramValueTooSmall(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramWordTokenizer(1, 0);
    }

    /**
     * Datasets for testNGramTokenization().
     *
     * Each dataset is [minGram, maxGram, input text, expected tokens]. The keys
     * are dataset names, which PHPUnit prints when a case fails.
     *
     * NOTE(review): getSimpleText() and getUtf8Text() are not defined in this
     * class; presumably they are fixtures inherited from TokenizerTest. That is
     * also why this provider is an instance method rather than a static one.
     *
     * @return array<string, array{0: int, 1: int, 2: string, 3: string[]}>
     */
    public function textDataProvider(): array
    {
        return [
            'min 1, max 1, four english words' => [
                1, 1,
                'one two three four',
                ['one', 'two', 'three', 'four'],
            ],
            'min 1, max 2, four english words' => [
                1, 2,
                'one two three four',
                ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four'],
            ],
            'min 1, max 3, four english words' => [
                1, 3,
                'one two three four',
                ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four'],
            ],
            'min 2, max 3, four english words' => [
                2, 3,
                'one two three four',
                ['one two', 'two three', 'three four', 'one two three', 'two three four'],
            ],
            'min 1, max 2, two multibyte words' => [
                1, 2,
                '快狐跑过 边缘跑',
                ['快狐跑过', '边缘跑', '快狐跑过 边缘跑'],
            ],
            'min 2, max 4, simple text fixture' => [
                2, 4,
                $this->getSimpleText(),
                [
                    'Lorem ipsum', 'ipsum dolor', 'dolor sit', 'sit amet', 'amet consectetur', 'consectetur adipiscing',
                    'adipiscing elit', 'elit Cras', 'Cras consectetur', 'consectetur dui', 'dui et', 'et lobortis',
                    'lobortis auctor', 'auctor Nulla', 'Nulla vitae', 'vitae congue', 'congue lorem', 'Lorem ipsum dolor',
                    'ipsum dolor sit', 'dolor sit amet', 'sit amet consectetur', 'amet consectetur adipiscing',
                    'consectetur adipiscing elit', 'adipiscing elit Cras', 'elit Cras consectetur', 'Cras consectetur dui',
                    'consectetur dui et', 'dui et lobortis', 'et lobortis auctor', 'lobortis auctor Nulla', 'auctor Nulla vitae',
                    'Nulla vitae congue', 'vitae congue lorem', 'Lorem ipsum dolor sit', 'ipsum dolor sit amet',
                    'dolor sit amet consectetur', 'sit amet consectetur adipiscing', 'amet consectetur adipiscing elit',
                    'consectetur adipiscing elit Cras', 'adipiscing elit Cras consectetur', 'elit Cras consectetur dui',
                    'Cras consectetur dui et', 'consectetur dui et lobortis', 'dui et lobortis auctor', 'et lobortis auctor Nulla',
                    'lobortis auctor Nulla vitae', 'auctor Nulla vitae congue', 'Nulla vitae congue lorem',
                ],
            ],
            'min 2, max 4, utf-8 text fixture' => [
                2, 4,
                $this->getUtf8Text(),
                [
                    '鋍鞎 鞮鞢騉', '鞮鞢騉 袟袘觕', '袟袘觕 炟砏', '炟砏 謺貙蹖', '謺貙蹖 偢偣唲', '偢偣唲 箷箯緷', '箷箯緷 鑴鱱爧', '鑴鱱爧 覮轀',
                    '覮轀 剆坲', '剆坲 煘煓瑐', '煘煓瑐 鬐鶤鶐', '鬐鶤鶐 飹勫嫢', '飹勫嫢 枲柊氠', '枲柊氠 鍎鞚韕', '鍎鞚韕 焲犈', '焲犈 殍涾烰',
                    '殍涾烰 齞齝囃', '齞齝囃 蹅輶', '蹅輶 孻憵', '孻憵 擙樲橚', '擙樲橚 藒襓謥', '藒襓謥 岯岪弨', '岯岪弨 廞徲', '廞徲 孻憵懥',
                    '孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕', '鞮鞢騉 袟袘觕 炟砏', '袟袘觕 炟砏 謺貙蹖', '炟砏 謺貙蹖 偢偣唲', '謺貙蹖 偢偣唲 箷箯緷',
                    '偢偣唲 箷箯緷 鑴鱱爧', '箷箯緷 鑴鱱爧 覮轀', '鑴鱱爧 覮轀 剆坲', '覮轀 剆坲 煘煓瑐', '剆坲 煘煓瑐 鬐鶤鶐', '煘煓瑐 鬐鶤鶐 飹勫嫢',
                    '鬐鶤鶐 飹勫嫢 枲柊氠', '飹勫嫢 枲柊氠 鍎鞚韕', '枲柊氠 鍎鞚韕 焲犈', '鍎鞚韕 焲犈 殍涾烰', '焲犈 殍涾烰 齞齝囃', '殍涾烰 齞齝囃 蹅輶',
                    '齞齝囃 蹅輶 孻憵', '蹅輶 孻憵 擙樲橚', '孻憵 擙樲橚 藒襓謥', '擙樲橚 藒襓謥 岯岪弨', '藒襓謥 岯岪弨 廞徲', '岯岪弨 廞徲 孻憵懥',
                    '廞徲 孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕 炟砏', '鞮鞢騉 袟袘觕 炟砏 謺貙蹖', '袟袘觕 炟砏 謺貙蹖 偢偣唲', '炟砏 謺貙蹖 偢偣唲 箷箯緷',
                    '謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧', '偢偣唲 箷箯緷 鑴鱱爧 覮轀', '箷箯緷 鑴鱱爧 覮轀 剆坲', '鑴鱱爧 覮轀 剆坲 煘煓瑐',
                    '覮轀 剆坲 煘煓瑐 鬐鶤鶐', '剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢', '煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠', '鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕',
                    '飹勫嫢 枲柊氠 鍎鞚韕 焲犈', '枲柊氠 鍎鞚韕 焲犈 殍涾烰', '鍎鞚韕 焲犈 殍涾烰 齞齝囃', '焲犈 殍涾烰 齞齝囃 蹅輶',
                    '殍涾烰 齞齝囃 蹅輶 孻憵', '齞齝囃 蹅輶 孻憵 擙樲橚', '蹅輶 孻憵 擙樲橚 藒襓謥', '孻憵 擙樲橚 藒襓謥 岯岪弨', '擙樲橚 藒襓謥 岯岪弨 廞徲',
                    '藒襓謥 岯岪弨 廞徲 孻憵懥', '岯岪弨 廞徲 孻憵懥 趡趛踠',
                ],
            ],
        ];
    }
}
0 commit comments