Skip to content

Commit

Permalink
fix(rust,python): incomplete reading of list types from parquet (#11578)
Browse files Browse the repository at this point in the history
  • Loading branch information
nameexhaustion authored Oct 7, 2023
1 parent 4d53ae2 commit 6cd1825
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -498,9 +498,6 @@ where
if items.len() > 1 {
return MaybeNext::Some(Ok(items.pop_front().unwrap()));
}
if (items.len() == 1) && items.front().unwrap().0.len() == chunk_size.unwrap_or(usize::MAX) {
return MaybeNext::Some(Ok(items.pop_front().unwrap()));
}
if *remaining == 0 {
return match items.pop_front() {
Some(decoded) => MaybeNext::Some(Ok(decoded)),
Expand Down Expand Up @@ -541,11 +538,11 @@ where
};

if (items.len() == 1)
&& items.front().unwrap().0.len() < chunk_size.unwrap_or(usize::MAX)
&& items.front().unwrap().0.len() > chunk_size.unwrap_or(usize::MAX)
{
MaybeNext::More
} else {
MaybeNext::Some(Ok(items.pop_front().unwrap()))
} else {
MaybeNext::More
}
},
}
Expand Down
20 changes: 20 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pytest

import polars as pl
Expand Down Expand Up @@ -494,3 +495,22 @@ def test_tz_aware_parquet_9586(io_files_path: Path) -> None:
{"UTC_DATETIME_ID": [datetime(2023, 6, 26, 14, 15, 0, tzinfo=timezone.utc)]}
).select(pl.col("*").cast(pl.Datetime("ns", "UTC")))
assert_frame_equal(result, expected)


def test_nested_list_page_reads_to_end_11548() -> None:
    """Check that list columns written with a tiny page size are read in full.

    A two-row frame whose rows are each a 2048-element UInt64 list is written
    to an in-memory parquet file through pyarrow and read back with polars;
    both lists must come back with all 2048 elements.
    """
    row_values = pl.arange(0, 2048, dtype=pl.UInt64).implode()
    frame = pl.select(pl.repeat(row_values, 2).alias("x"))

    buffer = io.BytesIO()
    # NOTE(review): data_page_size=1 presumably forces pyarrow to spread the
    # column over many small data pages -- confirm against the pyarrow docs.
    pq.write_table(frame.to_arrow(), buffer, data_page_size=1)
    buffer.seek(0)

    list_lengths = pl.read_parquet(buffer).select(pl.col("x").list.lengths())
    assert list_lengths.to_series().to_list() == [2048, 2048]

0 comments on commit 6cd1825

Please sign in to comment.