diff --git a/crates/polars-io/src/parquet/read_impl.rs b/crates/polars-io/src/parquet/read_impl.rs
index 66a84fcb114a..5f99e770b343 100644
--- a/crates/polars-io/src/parquet/read_impl.rs
+++ b/crates/polars-io/src/parquet/read_impl.rs
@@ -95,13 +95,10 @@ pub(super) fn array_iter_to_series(
 }
 
 /// Materializes hive partitions.
-/// We have a special num_rows arg, as df can be empty.
-fn materialize_hive_partitions(
-    df: &mut DataFrame,
-    hive_partition_columns: Option<&[Series]>,
-    num_rows: usize,
-) {
+fn materialize_hive_partitions(df: &mut DataFrame, hive_partition_columns: Option<&[Series]>) {
     if let Some(hive_columns) = hive_partition_columns {
+        let num_rows = df.height();
+
         for s in hive_columns {
             unsafe { df.with_column_unchecked(s.new_from_index(0, num_rows)) };
         }
@@ -223,7 +220,7 @@ fn rg_to_dfs_optionally_par_over_columns(
         if let Some(rc) = &row_count {
             df.with_row_count_mut(&rc.name, Some(*previous_row_count + rc.offset));
         }
-        materialize_hive_partitions(&mut df, hive_partition_columns, md.num_rows());
+        materialize_hive_partitions(&mut df, hive_partition_columns);
 
         apply_predicate(&mut df, predicate.as_deref(), true)?;
 
@@ -299,7 +296,7 @@ fn rg_to_dfs_par_over_rg(
             if let Some(rc) = &row_count {
                 df.with_row_count_mut(&rc.name, Some(row_count_start as IdxSize + rc.offset));
             }
-            materialize_hive_partitions(&mut df, hive_partition_columns, md.num_rows());
+            materialize_hive_partitions(&mut df, hive_partition_columns);
 
             apply_predicate(&mut df, predicate.as_deref(), false)?;
 
diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py
index 9dc016120619..83fce2da1194 100644
--- a/py-polars/tests/unit/io/test_hive.py
+++ b/py-polars/tests/unit/io/test_hive.py
@@ -55,6 +55,28 @@ def test_hive_partitioned_predicate_pushdown(
     assert q.filter(pl.col("sugars_g") == 25).collect().shape == (1, 4)
 
 
+@pytest.mark.write_disk()
+def test_hive_partitioned_slice_pushdown(io_files_path: Path, tmp_path: Path) -> None:
+    df = pl.read_ipc(io_files_path / "*.ipc")
+
+    root = tmp_path / "partitioned_data"
+
+    # Ignore the pyarrow legacy warning until we can write properly with new settings.
+    warnings.filterwarnings("ignore")
+    pq.write_to_dataset(
+        df.to_arrow(),
+        root_path=root,
+        partition_cols=["category", "fats_g"],
+        use_legacy_dataset=True,
+    )
+
+    q = pl.scan_parquet(root / "**/*.parquet", hive_partitioning=True)
+
+    # tests: 11682
+    assert q.head(1).collect().select(pl.all_horizontal(pl.all().count() == 1)).item()
+    assert q.head(0).collect().columns == ["calories", "sugars_g", "category", "fats_g"]
+
+
 @pytest.mark.write_disk()
 def test_hive_partitioned_projection_pushdown(
     io_files_path: Path, tmp_path: Path