Skip to content

Commit dff2f3c

Browse files
authored
Minor: Clarify which parquet options are used for reading/writing (#11511)
1 parent d91a03f commit dff2f3c

File tree

4 files changed

+87
-81
lines changed

4 files changed

+87
-81
lines changed

datafusion/common/src/config.rs

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -315,121 +315,124 @@ config_namespace! {
315315
}
316316

317317
config_namespace! {
318-
/// Options related to parquet files
318+
/// Options for reading and writing parquet files
319319
///
320320
/// See also: [`SessionConfig`]
321321
///
322322
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
323323
pub struct ParquetOptions {
324-
/// If true, reads the Parquet data page level metadata (the
324+
// The following options affect reading parquet files
325+
326+
/// (reading) If true, reads the Parquet data page level metadata (the
325327
/// Page Index), if present, to reduce the I/O and number of
326328
/// rows decoded.
327329
pub enable_page_index: bool, default = true
328330

329-
/// If true, the parquet reader attempts to skip entire row groups based
331+
/// (reading) If true, the parquet reader attempts to skip entire row groups based
330332
/// on the predicate in the query and the metadata (min/max values) stored in
331333
/// the parquet file
332334
pub pruning: bool, default = true
333335

334-
/// If true, the parquet reader skip the optional embedded metadata that may be in
336+
/// (reading) If true, the parquet reader skips the optional embedded metadata that may be in
335337
/// the file Schema. This setting can help avoid schema conflicts when querying
336338
/// multiple parquet files with schemas containing compatible types but different metadata
337339
pub skip_metadata: bool, default = true
338340

339-
/// If specified, the parquet reader will try and fetch the last `size_hint`
341+
/// (reading) If specified, the parquet reader will try to fetch the last `size_hint`
340342
/// bytes of the parquet file optimistically. If not specified, two reads are required:
341343
/// One read to fetch the 8-byte parquet footer and
342344
/// another to fetch the metadata length encoded in the footer
343345
pub metadata_size_hint: Option<usize>, default = None
344346

345-
/// If true, filter expressions are be applied during the parquet decoding operation to
347+
/// (reading) If true, filter expressions are applied during the parquet decoding operation to
346348
/// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
347349
pub pushdown_filters: bool, default = false
348350

349-
/// If true, filter expressions evaluated during the parquet decoding operation
351+
/// (reading) If true, filter expressions evaluated during the parquet decoding operation
350352
/// will be reordered heuristically to minimize the cost of evaluation. If false,
351353
/// the filters are applied in the same order as written in the query
352354
pub reorder_filters: bool, default = false
353355

354-
// The following map to parquet::file::properties::WriterProperties
356+
// The following options affect writing to parquet files
357+
// and map to parquet::file::properties::WriterProperties
355358

356-
/// Sets best effort maximum size of data page in bytes
359+
/// (writing) Sets best effort maximum size of data page in bytes
357360
pub data_pagesize_limit: usize, default = 1024 * 1024
358361

359-
/// Sets write_batch_size in bytes
362+
/// (writing) Sets write_batch_size in bytes
360363
pub write_batch_size: usize, default = 1024
361364

362-
/// Sets parquet writer version
365+
/// (writing) Sets parquet writer version
363366
/// valid values are "1.0" and "2.0"
364367
pub writer_version: String, default = "1.0".into()
365368

366-
/// Sets default parquet compression codec
369+
/// (writing) Sets default parquet compression codec.
367370
/// Valid values are: uncompressed, snappy, gzip(level),
368371
/// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
369372
/// These values are not case sensitive. If NULL, uses
370373
/// default parquet writer setting
371374
pub compression: Option<String>, default = Some("zstd(3)".into())
372375

373-
/// Sets if dictionary encoding is enabled. If NULL, uses
376+
/// (writing) Sets if dictionary encoding is enabled. If NULL, uses
374377
/// default parquet writer setting
375378
pub dictionary_enabled: Option<bool>, default = None
376379

377-
/// Sets best effort maximum dictionary page size, in bytes
380+
/// (writing) Sets best effort maximum dictionary page size, in bytes
378381
pub dictionary_page_size_limit: usize, default = 1024 * 1024
379382

380-
/// Sets if statistics are enabled for any column
383+
/// (writing) Sets if statistics are enabled for any column
381384
/// Valid values are: "none", "chunk", and "page"
382385
/// These values are not case sensitive. If NULL, uses
383386
/// default parquet writer setting
384387
pub statistics_enabled: Option<String>, default = None
385388

386-
/// Sets max statistics size for any column. If NULL, uses
389+
/// (writing) Sets max statistics size for any column. If NULL, uses
387390
/// default parquet writer setting
388391
pub max_statistics_size: Option<usize>, default = None
389392

390-
/// Target maximum number of rows in each row group (defaults to 1M
393+
/// (writing) Target maximum number of rows in each row group (defaults to 1M
391394
/// rows). Writing larger row groups requires more memory to write, but
392395
/// can get better compression and be faster to read.
393396
pub max_row_group_size: usize, default = 1024 * 1024
394397

395-
/// Sets "created by" property
398+
/// (writing) Sets "created by" property
396399
pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into()
397400

398-
/// Sets column index truncate length
401+
/// (writing) Sets column index truncate length
399402
pub column_index_truncate_length: Option<usize>, default = None
400403

401-
/// Sets best effort maximum number of rows in data page
404+
/// (writing) Sets best effort maximum number of rows in data page
402405
pub data_page_row_count_limit: usize, default = usize::MAX
403406

404-
/// Sets default encoding for any column
407+
/// (writing) Sets default encoding for any column.
405408
/// Valid values are: plain, plain_dictionary, rle,
406409
/// bit_packed, delta_binary_packed, delta_length_byte_array,
407410
/// delta_byte_array, rle_dictionary, and byte_stream_split.
408411
/// These values are not case sensitive. If NULL, uses
409412
/// default parquet writer setting
410413
pub encoding: Option<String>, default = None
411414

412-
/// Use any available bloom filters when reading parquet files
415+
/// (reading) Use any available bloom filters when reading parquet files
413416
pub bloom_filter_on_read: bool, default = true
414417

415-
/// Write bloom filters for all columns when creating parquet files
418+
/// (writing) Write bloom filters for all columns when creating parquet files
416419
pub bloom_filter_on_write: bool, default = false
417420

418-
/// Sets bloom filter false positive probability. If NULL, uses
421+
/// (writing) Sets bloom filter false positive probability. If NULL, uses
419422
/// default parquet writer setting
420423
pub bloom_filter_fpp: Option<f64>, default = None
421424

422-
/// Sets bloom filter number of distinct values. If NULL, uses
425+
/// (writing) Sets bloom filter number of distinct values. If NULL, uses
423426
/// default parquet writer setting
424427
pub bloom_filter_ndv: Option<u64>, default = None
425428

426-
/// Controls whether DataFusion will attempt to speed up writing
429+
/// (writing) Controls whether DataFusion will attempt to speed up writing
427430
/// parquet files by serializing them in parallel. Each column
428431
/// in each row group in each output file are serialized in parallel
429432
/// leveraging a maximum possible core count of n_files*n_row_groups*n_columns.
430433
pub allow_single_file_parallelism: bool, default = true
431434

432-
/// By default parallel parquet writer is tuned for minimum
435+
/// (writing) By default parallel parquet writer is tuned for minimum
433436
/// memory usage in a streaming execution plan. You may see
434437
/// a performance benefit when writing large parquet files
435438
/// by increasing maximum_parallel_row_group_writers and
@@ -440,7 +443,7 @@ config_namespace! {
440443
/// data frame.
441444
pub maximum_parallel_row_group_writers: usize, default = 1
442445

443-
/// By default parallel parquet writer is tuned for minimum
446+
/// (writing) By default parallel parquet writer is tuned for minimum
444447
/// memory usage in a streaming execution plan. You may see
445448
/// a performance benefit when writing large parquet files
446449
/// by increasing maximum_parallel_row_group_writers and
@@ -450,7 +453,6 @@ config_namespace! {
450453
/// writing out already in-memory data, such as from a cached
451454
/// data frame.
452455
pub maximum_buffered_record_batches_per_stream: usize, default = 2
453-
454456
}
455457
}
456458

@@ -1534,6 +1536,9 @@ macro_rules! config_namespace_with_hashmap {
15341536
}
15351537

15361538
config_namespace_with_hashmap! {
1539+
/// Options controlling parquet format for individual columns.
1540+
///
1541+
/// See [`ParquetOptions`] for more details
15371542
pub struct ColumnOptions {
15381543
/// Sets if bloom filter is enabled for the column path.
15391544
pub bloom_filter_enabled: Option<bool>, default = None

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ use parquet::{
3535
/// Options for writing parquet files
3636
#[derive(Clone, Debug)]
3737
pub struct ParquetWriterOptions {
38+
/// parquet-rs writer properties
3839
pub writer_options: WriterProperties,
3940
}
4041

0 commit comments

Comments
 (0)