Skip to content

Commit 18f14ab

Browse files
logan-keedealamb
andauthored
Deprecate max statistics size properly (#14188)
* deprecating max_statisics_size * formatting by rustfmt * Sprinkle more #[allow(deprecated)] * update expected configs * fix: add deprecation warning to information_schema --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 1dad545 commit 18f14ab

File tree

7 files changed

+72
-27
lines changed

7 files changed

+72
-27
lines changed

datafusion/common/src/config.rs

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -108,38 +108,51 @@ use crate::{DataFusionError, Result};
108108
/// ```
109109
///
110110
/// NB: Misplaced commas may result in nonsensical errors
111-
#[macro_export]
112111
macro_rules! config_namespace {
113112
(
114-
$(#[doc = $struct_d:tt])*
115-
$vis:vis struct $struct_name:ident {
116-
$(
117-
$(#[doc = $d:tt])*
118-
$field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr
119-
)*$(,)*
120-
}
113+
$(#[doc = $struct_d:tt])* // Struct-level documentation attributes
114+
$(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
115+
$(#[allow($($struct_de:tt)*)])?
116+
$vis:vis struct $struct_name:ident {
117+
$(
118+
$(#[doc = $d:tt])* // Field-level documentation attributes
119+
$(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
120+
$(#[allow($($field_de:tt)*)])?
121+
$field_vis:vis $field_name:ident : $field_type:ty,
122+
$(warn = $warn:expr,)?
123+
$(transform = $transform:expr,)?
124+
default = $default:expr
125+
)*$(,)*
126+
}
121127
) => {
122-
123-
$(#[doc = $struct_d])*
128+
$(#[doc = $struct_d])* // Apply struct documentation
129+
$(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
130+
$(#[allow($($struct_de)*)])?
124131
#[derive(Debug, Clone, PartialEq)]
125-
$vis struct $struct_name{
132+
$vis struct $struct_name {
126133
$(
127-
$(#[doc = $d])*
128-
$field_vis $field_name : $field_type,
134+
$(#[doc = $d])* // Apply field documentation
135+
$(#[deprecated($($field_depr)*)])? // Apply field deprecation
136+
$(#[allow($($field_de)*)])?
137+
$field_vis $field_name: $field_type,
129138
)*
130139
}
131140

132141
impl ConfigField for $struct_name {
133142
fn set(&mut self, key: &str, value: &str) -> Result<()> {
134143
let (key, rem) = key.split_once('.').unwrap_or((key, ""));
135-
136144
match key {
137145
$(
138-
stringify!($field_name) => {
139-
$(let value = $transform(value);)?
140-
$(log::warn!($warn);)?
141-
self.$field_name.set(rem, value.as_ref())
142-
},
146+
stringify!($field_name) => {
147+
// Safely apply deprecated attribute if present
148+
// $(#[allow(deprecated)])?
149+
{
150+
$(let value = $transform(value);)? // Apply transformation if specified
151+
$(log::warn!($warn);)? // Log warning if specified
152+
#[allow(deprecated)]
153+
self.$field_name.set(rem, value.as_ref())
154+
}
155+
},
143156
)*
144157
_ => return _config_err!(
145158
"Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -149,15 +162,16 @@ macro_rules! config_namespace {
149162

150163
fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
151164
$(
152-
let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
153-
let desc = concat!($($d),*).trim();
154-
self.$field_name.visit(v, key.as_str(), desc);
165+
let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
166+
let desc = concat!($($d),*).trim();
167+
#[allow(deprecated)]
168+
self.$field_name.visit(v, key.as_str(), desc);
155169
)*
156170
}
157171
}
158-
159172
impl Default for $struct_name {
160173
fn default() -> Self {
174+
#[allow(deprecated)]
161175
Self {
162176
$($field_name: $default),*
163177
}
@@ -467,6 +481,9 @@ config_namespace! {
467481

468482
/// (writing) Sets max statistics size for any column. If NULL, uses
469483
/// default parquet writer setting
484+
/// max_statistics_size is deprecated, currently it is not being used
485+
// TODO: remove once deprecated
486+
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
470487
pub max_statistics_size: Option<usize>, default = Some(4096)
471488

472489
/// (writing) Target maximum number of rows in each row group (defaults to 1M
@@ -1598,19 +1615,23 @@ impl ConfigField for TableParquetOptions {
15981615
macro_rules! config_namespace_with_hashmap {
15991616
(
16001617
$(#[doc = $struct_d:tt])*
1618+
$(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
16011619
$vis:vis struct $struct_name:ident {
16021620
$(
16031621
$(#[doc = $d:tt])*
1622+
$(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
16041623
$field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr
16051624
)*$(,)*
16061625
}
16071626
) => {
16081627

16091628
$(#[doc = $struct_d])*
1629+
$(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
16101630
#[derive(Debug, Clone, PartialEq)]
16111631
$vis struct $struct_name{
16121632
$(
16131633
$(#[doc = $d])*
1634+
$(#[deprecated($($field_depr)*)])? // Apply field deprecation
16141635
$field_vis $field_name : $field_type,
16151636
)*
16161637
}
@@ -1621,6 +1642,8 @@ macro_rules! config_namespace_with_hashmap {
16211642
match key {
16221643
$(
16231644
stringify!($field_name) => {
1645+
// Handle deprecated fields
1646+
#[allow(deprecated)] // Allow deprecated fields
16241647
$(let value = $transform(value);)?
16251648
self.$field_name.set(rem, value.as_ref())
16261649
},
@@ -1635,13 +1658,16 @@ macro_rules! config_namespace_with_hashmap {
16351658
$(
16361659
let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
16371660
let desc = concat!($($d),*).trim();
1661+
// Handle deprecated fields
1662+
#[allow(deprecated)]
16381663
self.$field_name.visit(v, key.as_str(), desc);
16391664
)*
16401665
}
16411666
}
16421667

16431668
impl Default for $struct_name {
16441669
fn default() -> Self {
1670+
#[allow(deprecated)]
16451671
Self {
16461672
$($field_name: $default),*
16471673
}
@@ -1653,7 +1679,7 @@ macro_rules! config_namespace_with_hashmap {
16531679
let parts: Vec<&str> = key.splitn(2, "::").collect();
16541680
match parts.as_slice() {
16551681
[inner_key, hashmap_key] => {
1656-
// Get or create the ColumnOptions for the specified column
1682+
// Get or create the struct for the specified key
16571683
let inner_value = self
16581684
.entry((*hashmap_key).to_owned())
16591685
.or_insert_with($struct_name::default);
@@ -1669,6 +1695,7 @@ macro_rules! config_namespace_with_hashmap {
16691695
$(
16701696
let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
16711697
let desc = concat!($($d),*).trim();
1698+
#[allow(deprecated)]
16721699
col_options.$field_name.visit(v, key.as_str(), desc);
16731700
)*
16741701
}
@@ -1720,6 +1747,9 @@ config_namespace_with_hashmap! {
17201747

17211748
/// Sets max statistics size for the column path. If NULL, uses
17221749
/// default parquet options
1750+
/// max_statistics_size is deprecated, currently it is not being used
1751+
// TODO: remove once deprecated
1752+
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
17231753
pub max_statistics_size: Option<usize>, default = None
17241754
}
17251755
}

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use crate::{
2626
};
2727

2828
use arrow_schema::Schema;
29+
// TODO: handle once deprecated
2930
#[allow(deprecated)]
3031
use parquet::{
3132
arrow::ARROW_SCHEMA_META_KEY,
@@ -157,6 +158,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
157158
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
158159
}
159160

161+
// max_statistics_size is deprecated, currently it is not being used
162+
// TODO: remove once deprecated
163+
#[allow(deprecated)]
160164
if let Some(max_statistics_size) = options.max_statistics_size {
161165
builder = {
162166
#[allow(deprecated)]
@@ -202,6 +206,7 @@ impl ParquetOptions {
202206
///
203207
/// Note that this method does not include the key_value_metadata from [`TableParquetOptions`].
204208
pub fn into_writer_properties_builder(&self) -> Result<WriterPropertiesBuilder> {
209+
#[allow(deprecated)]
205210
let ParquetOptions {
206211
data_pagesize_limit,
207212
write_batch_size,
@@ -452,6 +457,7 @@ mod tests {
452457
fn column_options_with_non_defaults(
453458
src_col_defaults: &ParquetOptions,
454459
) -> ParquetColumnOptions {
460+
#[allow(deprecated)] // max_statistics_size
455461
ParquetColumnOptions {
456462
compression: Some("zstd(22)".into()),
457463
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
@@ -472,6 +478,7 @@ mod tests {
472478
"1.0"
473479
};
474480

481+
#[allow(deprecated)] // max_statistics_size
475482
ParquetOptions {
476483
data_pagesize_limit: 42,
477484
write_batch_size: 42,
@@ -515,6 +522,7 @@ mod tests {
515522
) -> ParquetColumnOptions {
516523
let bloom_filter_default_props = props.bloom_filter_properties(&col);
517524

525+
#[allow(deprecated)] // max_statistics_size
518526
ParquetColumnOptions {
519527
bloom_filter_enabled: Some(bloom_filter_default_props.is_some()),
520528
encoding: props.encoding(&col).map(|s| s.to_string()),
@@ -535,7 +543,6 @@ mod tests {
535543
),
536544
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
537545
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
538-
#[allow(deprecated)]
539546
max_statistics_size: Some(props.max_statistics_size(&col)),
540547
}
541548
}
@@ -569,6 +576,7 @@ mod tests {
569576
HashMap::from([(COL_NAME.into(), configured_col_props)])
570577
};
571578

579+
#[allow(deprecated)] // max_statistics_size
572580
TableParquetOptions {
573581
global: ParquetOptions {
574582
// global options

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
936936
fn try_from(
937937
value: &protobuf::ParquetOptions,
938938
) -> datafusion_common::Result<Self, Self::Error> {
939+
#[allow(deprecated)] // max_statistics_size
939940
Ok(ParquetOptions {
940941
enable_page_index: value.enable_page_index,
941942
pruning: value.pruning,
@@ -1013,6 +1014,7 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
10131014
fn try_from(
10141015
value: &protobuf::ParquetColumnOptions,
10151016
) -> datafusion_common::Result<Self, Self::Error> {
1017+
#[allow(deprecated)] // max_statistics_size
10161018
Ok(ParquetColumnOptions {
10171019
compression: value.compression_opt.clone().map(|opt| match opt {
10181020
protobuf::parquet_column_options::CompressionOpt::Compression(v) => Some(v),

datafusion/proto-common/src/to_proto/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
820820
dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled),
821821
dictionary_page_size_limit: value.dictionary_page_size_limit as u64,
822822
statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled),
823+
#[allow(deprecated)]
823824
max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)),
824825
max_row_group_size: value.max_row_group_size as u64,
825826
created_by: value.created_by.clone(),
@@ -858,6 +859,7 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions {
858859
.statistics_enabled
859860
.clone()
860861
.map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled),
862+
#[allow(deprecated)]
861863
max_statistics_size_opt: value.max_statistics_size.map(|v| {
862864
protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(
863865
v as u32,

datafusion/proto/src/logical_plan/file_formats.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ impl TableParquetOptionsProto {
362362
};
363363

364364
let column_specific_options = global_options.column_specific_options;
365+
#[allow(deprecated)] // max_statistics_size
365366
TableParquetOptionsProto {
366367
global: Some(ParquetOptionsProto {
367368
enable_page_index: global_options.global.enable_page_index,
@@ -455,6 +456,7 @@ impl TableParquetOptionsProto {
455456

456457
impl From<&ParquetOptionsProto> for ParquetOptions {
457458
fn from(proto: &ParquetOptionsProto) -> Self {
459+
#[allow(deprecated)] // max_statistics_size
458460
ParquetOptions {
459461
enable_page_index: proto.enable_page_index,
460462
pruning: proto.pruning,
@@ -509,6 +511,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
509511

510512
impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
511513
fn from(proto: ParquetColumnOptionsProto) -> Self {
514+
#[allow(deprecated)] // max_statistics_size
512515
ParquetColumnOptions {
513516
bloom_filter_enabled: proto.bloom_filter_enabled_opt.map(
514517
|parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v,

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b
301301
datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
302302
datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
303303
datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
304-
datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting
304+
datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used
305305
datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
306306
datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
307307
datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer

0 commit comments

Comments
 (0)