Skip to content

Commit

Permalink
feat(CSV): allow iterating from an SplFileObject
Browse files Browse the repository at this point in the history
  • Loading branch information
bpolaszek committed Dec 21, 2023
1 parent fff38db commit 6f322b5
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 61 deletions.
82 changes: 66 additions & 16 deletions src/Iterator/CSVIterator.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use BenTools\ETL\Normalizer\NumericStringToNumberNormalizer;
use BenTools\ETL\Normalizer\ValueNormalizerInterface;
use IteratorAggregate;
use SplFileObject;
use Symfony\Component\OptionsResolver\OptionsResolver;
use Traversable;

Expand Down Expand Up @@ -61,39 +62,88 @@ public function __construct(
$this->options = $resolver->resolve($options);
}

/**
 * Passes every field of a row through the configured normalizers, in order,
 * then maps the row onto the column names when some were provided.
 *
 * @param array<int|string, mixed> $data    fields of a single row
 * @param list<string>|null        $columns column names; null or empty leaves the row untouched
 *
 * @return array|string[]
 */
private function extract(array $data, ?array $columns): array
{
    $normalizers = $this->options['normalizers'];

    if ($normalizers) {
        foreach ($data as $index => $field) {
            // Each normalizer receives the output of the previous one.
            foreach ($normalizers as $normalizer) {
                $field = $normalizer->normalize($field);
            }
            $data[$index] = $field;
        }
    }

    if (empty($columns)) {
        return $data;
    }

    return self::combine($columns, $data);
}

/**
 * Chooses the reading strategy for the wrapped source: an SplFileObject is
 * read through iterateFromFile(), anything else through iterateFromContent().
 */
public function getIterator(): Traversable
{
    $source = $this->text;

    return $source instanceof SplFileObject
        ? $this->iterateFromFile($source)
        : $this->iterateFromContent($source);
}

/**
 * Reads CSV rows directly from a file object.
 *
 * When the `columns` option is 'auto', the row read while the file key is
 * still 0 supplies the column names and is not yielded. Rows parsed as a
 * single null field are skipped. Every other row goes through extract().
 *
 * @return Traversable<mixed>
 */
private function iterateFromFile(SplFileObject $file): Traversable
{
    // Enable READ_CSV on top of whatever flags the caller already set.
    $file->setFlags($file->getFlags() | SplFileObject::READ_CSV);

    $autoColumns = 'auto' === $this->options['columns'];
    $columns = $autoColumns ? null : $this->options['columns'];

    while (!$file->eof()) {
        $fields = $file->fgetcsv(
            $this->options['delimiter'],
            $this->options['enclosure'],
            $this->options['escapeString'],
        );
        if ([null] === $fields) {
            // fgetcsv() reports a blank line as an array holding a single null.
            continue;
        }
        // NOTE(review): relies on key() still being 0 right after the first
        // fgetcsv() call — confirm on every supported PHP version.
        if ($autoColumns && 0 === $file->key()) {
            $columns ??= $fields;
            continue;
        }

        yield $this->extract($fields, $columns);
    }
}

/**
 * Parses each string produced by $content as one CSV row.
 *
 * When the `columns` option is 'auto', the row whose key is 0 supplies the
 * column names and is not yielded. Every other row is yielded exactly once,
 * after going through extract().
 *
 * @param Traversable<string> $content
 *
 * @return Traversable<mixed>
 */
private function iterateFromContent(Traversable $content): Traversable
{
    $autoColumns = 'auto' === $this->options['columns'];
    $columns = $autoColumns ? null : $this->options['columns'];

    foreach ($content as $r => $row) {
        $fields = str_getcsv(
            $row,
            $this->options['delimiter'],
            $this->options['enclosure'],
            $this->options['escapeString'],
        );
        if ($autoColumns && 0 === $r) {
            $columns ??= $fields;
            continue;
        }

        // Yield only the processed row; also yielding the raw $fields would
        // emit every row twice.
        yield $this->extract($fields, $columns);
    }
}

Expand Down
97 changes: 52 additions & 45 deletions tests/Unit/Iterator/CSVIteratorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@

use BenTools\ETL\Iterator\CSVIterator;
use BenTools\ETL\Iterator\StrTokIterator;
use SplFileObject;

use function dirname;
use function expect;
use function Safe\file_get_contents;

it('iterates over CSV data', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [...new CSVIterator(new StrTokIterator($content))];
it('iterates over CSV data', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows)->toHaveCount(11)
->and($rows[0])->toBe([
Expand All @@ -30,11 +30,14 @@
3 => 'Asia',
4 => 13929286,
]);
})->with(function () {
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)));
yield 'file' => new CSVIterator(new SplFileObject($filename));
});

it('can make columns automatically', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [...new CSVIterator(new StrTokIterator($content), ['columns' => 'auto'])];
it('can make columns automatically', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows)->toHaveCount(10)
->and($rows[0])->toBe([
Expand All @@ -51,21 +54,14 @@
'continent' => 'Asia',
'population' => 13929286,
]);
})->with(function () {
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => 'auto']);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => 'auto']);
});

it('can map user-defined columns', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [
...new CSVIterator(new StrTokIterator($content), [
'columns' => [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
],
]),
];
it('can map user-defined columns', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows[1])->toBe([
'cityEnglishName' => 'New York',
Expand All @@ -81,22 +77,21 @@
'continent' => 'Asia',
'population' => 13929286,
]);
})->with(function () {
$columns = [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
];
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]);
});

it('adds fields when the row has not enough columns', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [
...new CSVIterator(new StrTokIterator($content), [
'columns' => [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
'misc',
],
]),
];
it('adds fields when the row has not enough columns', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows[1])->toBe([
'cityEnglishName' => 'New York',
Expand All @@ -114,20 +109,22 @@
'population' => 13929286,
'misc' => null,
]);
})->with(function () {
$columns = [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
'misc',
];
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]);
});

it('removes extra data whenever there are more fields than columns', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [
...new CSVIterator(new StrTokIterator($content), [
'columns' => [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
],
]),
];
it('removes extra data whenever there are more fields than columns', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows[1])->toBe([
'cityEnglishName' => 'New York',
Expand All @@ -141,4 +138,14 @@
'countryIsoCode' => 'JP',
'continent' => 'Asia',
]);
})->with(function () {
$columns = [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
];
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]);
});

0 comments on commit 6f322b5

Please sign in to comment.