From 9df2ca13737532b460cc422a637dbb48e381a7b3 Mon Sep 17 00:00:00 2001 From: Dean MacGregor Date: Thu, 11 Apr 2024 13:02:29 -0400 Subject: [PATCH] feat: change default to write parquet statistics --- crates/polars-io/src/parquet/write.rs | 2 +- py-polars/polars/dataframe/frame.py | 4 ++-- py-polars/polars/lazyframe/frame.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/polars-io/src/parquet/write.rs b/crates/polars-io/src/parquet/write.rs index 8149f7d5e128..c3aa6e85fb6f 100644 --- a/crates/polars-io/src/parquet/write.rs +++ b/crates/polars-io/src/parquet/write.rs @@ -121,7 +121,7 @@ where ParquetWriter { writer, compression: ParquetCompression::default().into(), - statistics: false, + statistics: true, row_group_size: None, data_page_size: None, parallel: true, diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 8b9caf5a3a85..a2aeb5b05b30 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3302,7 +3302,7 @@ def write_parquet( *, compression: ParquetCompression = "zstd", compression_level: int | None = None, - statistics: bool = False, + statistics: bool = True, row_group_size: int | None = None, data_page_size: int | None = None, use_pyarrow: bool = False, @@ -3329,7 +3329,7 @@ def write_parquet( - "zstd" : min-level: 1, max-level: 22. statistics - Write statistics to the parquet headers. This requires extra compute. + Write statistics to the parquet headers. This is the default behavior. row_group_size Size of the row groups in number of rows. Defaults to 512^2 rows. data_page_size diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index f96adfd0bbc4..687af4a10d69 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2116,7 +2116,7 @@ def sink_parquet( *, compression: str = "zstd", compression_level: int | None = None, - statistics: bool = False, + statistics: bool = True, row_group_size: int | None = None, data_pagesize_limit: int | None = None, maintain_order: bool = True, @@ -2126,7 +2126,7 @@ def sink_parquet( simplify_expression: bool = True, slice_pushdown: bool = True, no_optimization: bool = False, - ) -> DataFrame: + ) -> None: """ Evaluate the query in streaming mode and write to a Parquet file. @@ -2153,7 +2153,7 @@ def sink_parquet( - "brotli" : min-level: 0, max-level: 11. - "zstd" : min-level: 1, max-level: 22. statistics - Write statistics to the parquet headers. This requires extra compute. + Write statistics to the parquet headers. This is the default behavior. row_group_size Size of the row groups in number of rows. If None (default), the chunks of the `DataFrame` are