Deprecate max statistics size properly (#14188)

logan-keede · alamb · web-flow · commit 18f14abf3fce · 2025-01-25T10:02:03.000-05:00
* deprecating max_statisics_size

* formatting by rustfmt

* Sprinkle more #[allow(deprecated)]

* update expected configs

* fix: add deprecation warning to information_schema

---------

Co-authored-by: Andrew Lamb &lt;andrew@nerdnetworks.org&gt;
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -108,38 +108,51 @@ use crate::{DataFusionError, Result};
 /// ```
 ///
 /// NB: Misplaced commas may result in nonsensical errors
-#[macro_export]
 macro_rules! config_namespace {
     (
-     $(#[doc = $struct_d:tt])*
-     $vis:vis struct $struct_name:ident {
-        $(
-        $(#[doc = $d:tt])*
-        $field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr
-        )*$(,)*
-    }
+        $(#[doc = $struct_d:tt])* // Struct-level documentation attributes
+        $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
+        $(#[allow($($struct_de:tt)*)])?
+        $vis:vis struct $struct_name:ident {
+            $(
+                $(#[doc = $d:tt])* // Field-level documentation attributes
+                $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
+                $(#[allow($($field_de:tt)*)])?
+                $field_vis:vis $field_name:ident : $field_type:ty,
+                $(warn = $warn:expr,)?
+                $(transform = $transform:expr,)?
+                default = $default:expr
+            )*$(,)*
+        }
     ) => {
-
-        $(#[doc = $struct_d])*
+        $(#[doc = $struct_d])* // Apply struct documentation
+        $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
+        $(#[allow($($struct_de)*)])?
         #[derive(Debug, Clone, PartialEq)]
-        $vis struct $struct_name{
+        $vis struct $struct_name {
             $(
-            $(#[doc = $d])*
-            $field_vis $field_name : $field_type,
+                $(#[doc = $d])* // Apply field documentation
+                $(#[deprecated($($field_depr)*)])? // Apply field deprecation
+                $(#[allow($($field_de)*)])?
+                $field_vis $field_name: $field_type,
             )*
         }
 
         impl ConfigField for $struct_name {
             fn set(&mut self, key: &str, value: &str) -> Result<()> {
                 let (key, rem) = key.split_once('.').unwrap_or((key, ""));
-
                 match key {
                     $(
-                       stringify!($field_name) => {
-                           $(let value = $transform(value);)?
-                           $(log::warn!($warn);)?
-                           self.$field_name.set(rem, value.as_ref())
-                       },
+                        stringify!($field_name) => {
+                            // Safely apply deprecated attribute if present
+                            // $(#[allow(deprecated)])?
+                            {
+                                $(let value = $transform(value);)? // Apply transformation if specified
+                                $(log::warn!($warn);)? // Log warning if specified
+                                #[allow(deprecated)]
+                                self.$field_name.set(rem, value.as_ref())
+                            }
+                        },
                     )*
                     _ => return _config_err!(
                         "Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -149,15 +162,16 @@ macro_rules! config_namespace {
 
             fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                 $(
-                let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
-                let desc = concat!($($d),*).trim();
-                self.$field_name.visit(v, key.as_str(), desc);
+                    let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
+                    let desc = concat!($($d),*).trim();
+                    #[allow(deprecated)]
+                    self.$field_name.visit(v, key.as_str(), desc);
                 )*
             }
         }
-
         impl Default for $struct_name {
             fn default() -> Self {
+                #[allow(deprecated)]
                 Self {
                     $($field_name: $default),*
                 }
@@ -467,6 +481,9 @@ config_namespace! {
 
         /// (writing) Sets max statistics size for any column. If NULL, uses
         /// default parquet writer setting
+        /// max_statistics_size is deprecated, currently it is not being used
+        // TODO: remove once deprecated
+        #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
         pub max_statistics_size: Option<usize>, default = Some(4096)
 
         /// (writing) Target maximum number of rows in each row group (defaults to 1M
@@ -1598,19 +1615,23 @@ impl ConfigField for TableParquetOptions {
 macro_rules! config_namespace_with_hashmap {
     (
      $(#[doc = $struct_d:tt])*
+     $(#[deprecated($($struct_depr:tt)*)])?  // Optional struct-level deprecated attribute
      $vis:vis struct $struct_name:ident {
         $(
         $(#[doc = $d:tt])*
+        $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
         $field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr
         )*$(,)*
     }
     ) => {
 
         $(#[doc = $struct_d])*
+        $(#[deprecated($($struct_depr)*)])?  // Apply struct deprecation
         #[derive(Debug, Clone, PartialEq)]
         $vis struct $struct_name{
             $(
             $(#[doc = $d])*
+            $(#[deprecated($($field_depr)*)])? // Apply field deprecation
             $field_vis $field_name : $field_type,
             )*
         }
@@ -1621,6 +1642,8 @@ macro_rules! config_namespace_with_hashmap {
                 match key {
                     $(
                        stringify!($field_name) => {
+                           // Handle deprecated fields
+                           #[allow(deprecated)] // Allow deprecated fields
                            $(let value = $transform(value);)?
                            self.$field_name.set(rem, value.as_ref())
                        },
@@ -1635,13 +1658,16 @@ macro_rules! config_namespace_with_hashmap {
                 $(
                 let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                 let desc = concat!($($d),*).trim();
+                // Handle deprecated fields
+                #[allow(deprecated)]
                 self.$field_name.visit(v, key.as_str(), desc);
                 )*
             }
         }
 
         impl Default for $struct_name {
             fn default() -> Self {
+                #[allow(deprecated)]
                 Self {
                     $($field_name: $default),*
                 }
@@ -1653,7 +1679,7 @@ macro_rules! config_namespace_with_hashmap {
                 let parts: Vec<&str> = key.splitn(2, "::").collect();
                 match parts.as_slice() {
                     [inner_key, hashmap_key] => {
-                        // Get or create the ColumnOptions for the specified column
+                        // Get or create the struct for the specified key
                         let inner_value = self
                             .entry((*hashmap_key).to_owned())
                             .or_insert_with($struct_name::default);
@@ -1669,6 +1695,7 @@ macro_rules! config_namespace_with_hashmap {
                     $(
                     let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
                     let desc = concat!($($d),*).trim();
+                    #[allow(deprecated)]
                     col_options.$field_name.visit(v, key.as_str(), desc);
                     )*
                 }
@@ -1720,6 +1747,9 @@ config_namespace_with_hashmap! {
 
         /// Sets max statistics size for the column path. If NULL, uses
         /// default parquet options
+        /// max_statistics_size is deprecated, currently it is not being used
+        // TODO: remove once deprecated
+        #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
         pub max_statistics_size: Option<usize>, default = None
     }
 }
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
@@ -26,6 +26,7 @@ use crate::{
 };
 
 use arrow_schema::Schema;
+// TODO: handle once deprecated
 #[allow(deprecated)]
 use parquet::{
     arrow::ARROW_SCHEMA_META_KEY,
@@ -157,6 +158,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
                     builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
             }
 
+            // max_statistics_size is deprecated, currently it is not being used
+            // TODO: remove once deprecated
+            #[allow(deprecated)]
             if let Some(max_statistics_size) = options.max_statistics_size {
                 builder = {
                     #[allow(deprecated)]
@@ -202,6 +206,7 @@ impl ParquetOptions {
     ///
     /// Note that this method does not include the key_value_metadata from [`TableParquetOptions`].
     pub fn into_writer_properties_builder(&self) -> Result<WriterPropertiesBuilder> {
+        #[allow(deprecated)]
         let ParquetOptions {
             data_pagesize_limit,
             write_batch_size,
@@ -452,6 +457,7 @@ mod tests {
     fn column_options_with_non_defaults(
         src_col_defaults: &ParquetOptions,
     ) -> ParquetColumnOptions {
+        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             compression: Some("zstd(22)".into()),
             dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
@@ -472,6 +478,7 @@ mod tests {
             "1.0"
         };
 
+        #[allow(deprecated)] // max_statistics_size
         ParquetOptions {
             data_pagesize_limit: 42,
             write_batch_size: 42,
@@ -515,6 +522,7 @@ mod tests {
     ) -> ParquetColumnOptions {
         let bloom_filter_default_props = props.bloom_filter_properties(&col);
 
+        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             bloom_filter_enabled: Some(bloom_filter_default_props.is_some()),
             encoding: props.encoding(&col).map(|s| s.to_string()),
@@ -535,7 +543,6 @@ mod tests {
             ),
             bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
             bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
-            #[allow(deprecated)]
             max_statistics_size: Some(props.max_statistics_size(&col)),
         }
     }
@@ -569,6 +576,7 @@ mod tests {
             HashMap::from([(COL_NAME.into(), configured_col_props)])
         };
 
+        #[allow(deprecated)] // max_statistics_size
         TableParquetOptions {
             global: ParquetOptions {
                 // global options
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
@@ -936,6 +936,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
     fn try_from(
         value: &protobuf::ParquetOptions,
     ) -> datafusion_common::Result<Self, Self::Error> {
+        #[allow(deprecated)] // max_statistics_size
         Ok(ParquetOptions {
             enable_page_index: value.enable_page_index,
             pruning: value.pruning,
@@ -1013,6 +1014,7 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
     fn try_from(
         value: &protobuf::ParquetColumnOptions,
     ) -> datafusion_common::Result<Self, Self::Error> {
+        #[allow(deprecated)] // max_statistics_size
         Ok(ParquetColumnOptions {
             compression: value.compression_opt.clone().map(|opt| match opt {
                 protobuf::parquet_column_options::CompressionOpt::Compression(v) => Some(v),
diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs
@@ -820,6 +820,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
             dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled),
             dictionary_page_size_limit: value.dictionary_page_size_limit as u64,
             statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled),
+            #[allow(deprecated)]
             max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)),
             max_row_group_size: value.max_row_group_size as u64,
             created_by: value.created_by.clone(),
@@ -858,6 +859,7 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions {
                 .statistics_enabled
                 .clone()
                 .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled),
+            #[allow(deprecated)]
             max_statistics_size_opt: value.max_statistics_size.map(|v| {
                 protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(
                     v as u32,
diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs
@@ -362,6 +362,7 @@ impl TableParquetOptionsProto {
         };
 
         let column_specific_options = global_options.column_specific_options;
+        #[allow(deprecated)] // max_statistics_size
         TableParquetOptionsProto {
             global: Some(ParquetOptionsProto {
                 enable_page_index: global_options.global.enable_page_index,
@@ -455,6 +456,7 @@ impl TableParquetOptionsProto {
 
 impl From<&ParquetOptionsProto> for ParquetOptions {
     fn from(proto: &ParquetOptionsProto) -> Self {
+        #[allow(deprecated)] // max_statistics_size
         ParquetOptions {
             enable_page_index: proto.enable_page_index,
             pruning: proto.pruning,
@@ -509,6 +511,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
 
 impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
     fn from(proto: ParquetColumnOptionsProto) -> Self {
+        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             bloom_filter_enabled: proto.bloom_filter_enabled_opt.map(
                 |parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v,
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -301,7 +301,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b
 datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
 datafusion.execution.parquet.encoding NULL (writing)  Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
-datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting
+datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md