Skip to content

Commit cefb4fc

Browse files
KenorFRakondas
KenorFR
authored and committed Apr 5, 2019
Ngram word (#370)
* Add NGramWordTokenizer * Update doc Add test Check coding standards
1 parent dbbce0e commit cefb4fc

File tree

3 files changed

+193
-0
lines changed

3 files changed

+193
-0
lines changed
 

Diff for: ‎docs/machine-learning/feature-extraction/token-count-vectorizer.md

+17
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,20 @@ $tokenizer->tokenize('Quick Fox');
7171

7272
// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox']
7373
```
74+
75+
**NGramWordTokenizer**
76+
77+
The NGramWordTokenizer accepts the following parameters:
78+
79+
`$minGram` - minimum number of words in a gram. Defaults to 1.
80+
`$maxGram` - maximum number of words in a gram. Defaults to 2.
81+
82+
```php
83+
use Phpml\Tokenization\NGramWordTokenizer;
84+
85+
$tokenizer = new NGramWordTokenizer(1, 2);
86+
87+
$tokenizer->tokenize('very quick fox');
88+
89+
// returns ['very', 'quick', 'fox', 'very quick', 'quick fox']
90+
```

Diff for: ‎src/Tokenization/NGramWordTokenizer.php

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Phpml\Tokenization;
6+
7+
use Phpml\Exception\InvalidArgumentException;
8+
9+
/**
 * Tokenizer that emits word-level n-grams: runs of adjacent words joined by a single space.
 *
 * Words are extracted with the Unicode-aware pattern `\w\w+`, so only tokens made of
 * at least two word characters are kept. For every gram size from $minGram up to
 * $maxGram (inclusive) all windows of that many consecutive words are produced;
 * grams of a smaller size come first in the result.
 */
class NGramWordTokenizer extends WordTokenizer
{
    /**
     * @var int
     */
    private $minGram;

    /**
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram smallest number of words per gram, must be at least 1
     * @param int $maxGram largest number of words per gram, must not be below $minGram
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Splits $text into words and returns all word n-grams for the configured size range.
     *
     * @return string[] grams ordered by size (ascending), then by position in the text
     */
    public function tokenize(string $text): array
    {
        preg_match_all('/\w\w+/u', $text, $matches);

        // Fall back to an empty word list rather than indexing a group that may be absent.
        $words = $matches[0] ?? [];

        // Collect one batch per gram size and merge once, instead of re-copying the
        // accumulated result on every iteration.
        $batches = [];
        for ($size = $this->minGram; $size <= $this->maxGram; ++$size) {
            $batches[] = $this->getNgrams($words, $size);
        }

        // The leading [] keeps array_merge valid regardless of how many batches are spread in.
        return array_merge([], ...$batches);
    }

    /**
     * Builds every gram consisting of exactly $n consecutive words.
     *
     * @param string[] $match words in their original order
     * @param int      $n     number of words per gram
     *
     * @return string[] grams in left-to-right order; empty when fewer than $n words are given
     */
    private function getNgrams(array $match, int $n = 2): array
    {
        $ngrams = [];

        // Index of the last window start; negative when there are not enough words.
        $lastStart = count($match) - $n;
        for ($start = 0; $start <= $lastStart; ++$start) {
            $ngrams[] = implode(' ', array_slice($match, $start, $n));
        }

        return $ngrams;
    }
}

Diff for: ‎tests/Tokenization/NGramWordTokenizerTest.php

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Phpml\Tests\Tokenization;
6+
7+
use Phpml\Exception\InvalidArgumentException;
8+
use Phpml\Tokenization\NGramWordTokenizer;
9+
10+
/**
 * Tests for NGramWordTokenizer: the grams produced for several (minGram, maxGram)
 * ranges and the rejection of invalid range combinations.
 *
 * Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
 */
class NGramWordTokenizerTest extends TokenizerTest
{
    /**
     * Tokenizing $text with the given gram range must yield exactly $tokens, in order.
     *
     * @dataProvider textDataProvider
     */
    public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
    {
        $tokenizer = new NGramWordTokenizer($minGram, $maxGram);

        // Strict comparison: the result must be the same list of strings in the same order.
        self::assertSame($tokens, $tokenizer->tokenize($text));
    }

    public function testMinGramGreaterThanMaxGramNotAllowed(): void
    {
        // expectException() is an instance method, so it is called on $this.
        $this->expectException(InvalidArgumentException::class);

        new NGramWordTokenizer(5, 2);
    }

    public function testMinGramValueTooSmall(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramWordTokenizer(0, 2);
    }

    public function testMaxGramValueTooSmall(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramWordTokenizer(1, 0);
    }

    /**
     * Data sets of the form [minGram, maxGram, input text, expected grams].
     *
     * Kept as an instance method because it reads the sample texts through
     * $this->getSimpleText() and $this->getUtf8Text().
     */
    public function textDataProvider(): array
    {
        return [
            [
                1, 1,
                'one two three four',
                ['one', 'two', 'three', 'four'],
            ],
            [
                1, 2,
                'one two three four',
                ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four'],
            ],
            [
                1, 3,
                'one two three four',
                ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four'],
            ],
            [
                2, 3,
                'one two three four',
                ['one two', 'two three', 'three four', 'one two three', 'two three four'],
            ],
            [
                1, 2,
                '快狐跑过 边缘跑',
                ['快狐跑过', '边缘跑', '快狐跑过 边缘跑'],
            ],
            [
                2, 4,
                $this->getSimpleText(),
                [
                    'Lorem ipsum', 'ipsum dolor', 'dolor sit', 'sit amet', 'amet consectetur', 'consectetur adipiscing',
                    'adipiscing elit', 'elit Cras', 'Cras consectetur', 'consectetur dui', 'dui et', 'et lobortis',
                    'lobortis auctor', 'auctor Nulla', 'Nulla vitae', 'vitae congue', 'congue lorem', 'Lorem ipsum dolor',
                    'ipsum dolor sit', 'dolor sit amet', 'sit amet consectetur', 'amet consectetur adipiscing',
                    'consectetur adipiscing elit', 'adipiscing elit Cras', 'elit Cras consectetur', 'Cras consectetur dui',
                    'consectetur dui et', 'dui et lobortis', 'et lobortis auctor', 'lobortis auctor Nulla', 'auctor Nulla vitae',
                    'Nulla vitae congue', 'vitae congue lorem', 'Lorem ipsum dolor sit', 'ipsum dolor sit amet',
                    'dolor sit amet consectetur', 'sit amet consectetur adipiscing', 'amet consectetur adipiscing elit',
                    'consectetur adipiscing elit Cras', 'adipiscing elit Cras consectetur', 'elit Cras consectetur dui',
                    'Cras consectetur dui et', 'consectetur dui et lobortis', 'dui et lobortis auctor', 'et lobortis auctor Nulla',
                    'lobortis auctor Nulla vitae', 'auctor Nulla vitae congue', 'Nulla vitae congue lorem',
                ],
            ],
            [
                2, 4,
                $this->getUtf8Text(),
                [
                    '鋍鞎 鞮鞢騉', '鞮鞢騉 袟袘觕', '袟袘觕 炟砏', '炟砏 謺貙蹖', '謺貙蹖 偢偣唲', '偢偣唲 箷箯緷', '箷箯緷 鑴鱱爧', '鑴鱱爧 覮轀',
                    '覮轀 剆坲', '剆坲 煘煓瑐', '煘煓瑐 鬐鶤鶐', '鬐鶤鶐 飹勫嫢', '飹勫嫢 枲柊氠', '枲柊氠 鍎鞚韕', '鍎鞚韕 焲犈', '焲犈 殍涾烰',
                    '殍涾烰 齞齝囃', '齞齝囃 蹅輶', '蹅輶 孻憵', '孻憵 擙樲橚', '擙樲橚 藒襓謥', '藒襓謥 岯岪弨', '岯岪弨 廞徲', '廞徲 孻憵懥',
                    '孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕', '鞮鞢騉 袟袘觕 炟砏', '袟袘觕 炟砏 謺貙蹖', '炟砏 謺貙蹖 偢偣唲', '謺貙蹖 偢偣唲 箷箯緷',
                    '偢偣唲 箷箯緷 鑴鱱爧', '箷箯緷 鑴鱱爧 覮轀', '鑴鱱爧 覮轀 剆坲', '覮轀 剆坲 煘煓瑐', '剆坲 煘煓瑐 鬐鶤鶐', '煘煓瑐 鬐鶤鶐 飹勫嫢',
                    '鬐鶤鶐 飹勫嫢 枲柊氠', '飹勫嫢 枲柊氠 鍎鞚韕', '枲柊氠 鍎鞚韕 焲犈', '鍎鞚韕 焲犈 殍涾烰', '焲犈 殍涾烰 齞齝囃', '殍涾烰 齞齝囃 蹅輶',
                    '齞齝囃 蹅輶 孻憵', '蹅輶 孻憵 擙樲橚', '孻憵 擙樲橚 藒襓謥', '擙樲橚 藒襓謥 岯岪弨', '藒襓謥 岯岪弨 廞徲', '岯岪弨 廞徲 孻憵懥',
                    '廞徲 孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕 炟砏', '鞮鞢騉 袟袘觕 炟砏 謺貙蹖', '袟袘觕 炟砏 謺貙蹖 偢偣唲', '炟砏 謺貙蹖 偢偣唲 箷箯緷',
                    '謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧', '偢偣唲 箷箯緷 鑴鱱爧 覮轀', '箷箯緷 鑴鱱爧 覮轀 剆坲', '鑴鱱爧 覮轀 剆坲 煘煓瑐',
                    '覮轀 剆坲 煘煓瑐 鬐鶤鶐', '剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢', '煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠', '鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕',
                    '飹勫嫢 枲柊氠 鍎鞚韕 焲犈', '枲柊氠 鍎鞚韕 焲犈 殍涾烰', '鍎鞚韕 焲犈 殍涾烰 齞齝囃', '焲犈 殍涾烰 齞齝囃 蹅輶',
                    '殍涾烰 齞齝囃 蹅輶 孻憵', '齞齝囃 蹅輶 孻憵 擙樲橚', '蹅輶 孻憵 擙樲橚 藒襓謥', '孻憵 擙樲橚 藒襓謥 岯岪弨', '擙樲橚 藒襓謥 岯岪弨 廞徲',
                    '藒襓謥 岯岪弨 廞徲 孻憵懥', '岯岪弨 廞徲 孻憵懥 趡趛踠',
                ],
            ],
        ];
    }
}

0 commit comments

Comments
 (0)