
Commit 5e02b89

Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363)
1 parent 02dab41 commit 5e02b89

File tree

5 files changed: +23 −5 lines changed


CHANGELOG.md

Lines changed: 13 additions & 0 deletions
@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.0] - 2019-03-20
+### Added
+- [Tokenization] Added NGramTokenizer (#350)
+- editorconfig file (#355)
+### Fixed
+- [Dataset] FilesDataset read samples without additional array (#363)
+- [Tokenization] fixed error with numeric token values (#363)
+### Changed
+- [Math] improved performance with pow and sqrt replacement (#350)
+- [Math] reduce duplicated code in distance metrics (#348)
+- update phpunit to 7.5.1 (#335)
+- code style fixes (#334)
+
 ## [0.7.0] - 2018-11-07
 ### Added
 - [Clustering] added KMeans associative clustering (#262)

src/Dataset/FilesDataset.php

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ private function scanDir(string $dir): void
         $target = basename($dir);
 
         foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
-            $this->samples[] = [file_get_contents($file)];
+            $this->samples[] = file_get_contents($file);
             $this->targets[] = $target;
         }
     }
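
The practical effect of the one-line change above: FilesDataset now stores each file's contents directly as the sample instead of wrapping it in a single-element array. A minimal usage sketch, assuming a root directory laid out as category/file.txt (the 'data/bbc' path is illustrative, not from this commit):

use Phpml\Dataset\FilesDataset;

// Each subdirectory name becomes the target label for the files inside it.
$dataset = new FilesDataset('data/bbc');

// Before this commit: [['contents of 001.txt'], ...]
// After this commit:  ['contents of 001.txt', ...]
$samples = $dataset->getSamples();
$targets = $dataset->getTargets(); // e.g. ['business', 'business', ..., 'tech']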

src/FeatureExtraction/TokenCountVectorizer.php

Lines changed: 1 addition & 1 deletion
@@ -157,7 +157,7 @@ private function getBeyondMinimumIndexes(int $samplesCount): array
         $indexes = [];
         foreach ($this->frequencies as $token => $frequency) {
             if (($frequency / $samplesCount) < $this->minDF) {
-                $indexes[] = $this->getTokenIndex($token);
+                $indexes[] = $this->getTokenIndex((string) $token);
             }
         }
 
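The cast is needed because PHP coerces integer-like string keys to integers when they are stored as array keys, so a purely numeric token such as '1550' comes back out of $this->frequencies as int(1550); since getTokenIndex() type-hints its parameter as string and the php-ml source declares strict_types=1, passing that integer throws a TypeError. A standalone sketch of the coercion (plain PHP, not php-ml code; requiresString is a hypothetical stand-in for getTokenIndex):

<?php
declare(strict_types=1);

// Numeric string keys are silently coerced to integers on insertion.
$frequencies = ['lorem' => 2, '1550' => 1];

foreach ($frequencies as $token => $frequency) {
    var_dump($token); // string(5) "lorem", then int(1550)
}

function requiresString(string $token): string
{
    return $token;
}

// requiresString(1550);       // TypeError under strict_types
requiresString((string) 1550); // OK: "1550"
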
tests/Dataset/FilesDatasetTest.php

Lines changed: 2 additions & 2 deletions
@@ -29,13 +29,13 @@ public function testLoadFilesDatasetWithBBCData(): void
         self::assertEquals($targets, array_values(array_unique($dataset->getTargets())));
 
         $firstSample = file_get_contents($rootPath.'/business/001.txt');
-        self::assertEquals($firstSample, $dataset->getSamples()[0][0]);
+        self::assertEquals($firstSample, $dataset->getSamples()[0]);
 
         $firstTarget = 'business';
         self::assertEquals($firstTarget, $dataset->getTargets()[0]);
 
         $lastSample = file_get_contents($rootPath.'/tech/010.txt');
-        self::assertEquals($lastSample, $dataset->getSamples()[49][0]);
+        self::assertEquals($lastSample, $dataset->getSamples()[49]);
 
         $lastTarget = 'tech';
         self::assertEquals($lastTarget, $dataset->getTargets()[49]);

tests/FeatureExtraction/TokenCountVectorizerTest.php

Lines changed: 6 additions & 1 deletion
@@ -84,7 +84,7 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
     {
         // word at least in half samples
         $samples = [
-            'Lorem ipsum dolor sit amet',
+            'Lorem ipsum dolor sit amet 1550',
             'Lorem ipsum sit amet',
             'ipsum sit amet',
             'ipsum sit amet',
@@ -96,6 +96,7 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
             2 => 'dolor',
             3 => 'sit',
             4 => 'amet',
+            5 => 1550,
         ];
 
         $tokensCounts = [
@@ -105,27 +106,31 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
             [
                 0 => 1,
                 1 => 1,
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
             [
                 0 => 0,
                 1 => 1,
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
             [
                 0 => 0,
                 1 => 1,
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
         ];
 
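For context, this is roughly how the amended test exercises the fix, assuming php-ml's TokenCountVectorizer API at the time (constructor takes a tokenizer, optional stop words, and a minimum document frequency; transform() modifies the samples array in place):

use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WordTokenizer;

// minDF = 0.5: tokens appearing in fewer than half the samples are zeroed out.
$vectorizer = new TokenCountVectorizer(new WordTokenizer(), null, 0.5);

$samples = [
    'Lorem ipsum dolor sit amet 1550',
    'Lorem ipsum sit amet',
];

$vectorizer->fit($samples);       // builds the vocabulary, including the numeric token 1550
$vectorizer->transform($samples); // previously failed here: getTokenIndex() received int(1550)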