@@ -315,121 +315,124 @@ config_namespace! {
 }

 config_namespace! {
-    /// Options related to parquet files
+    /// Options for reading and writing parquet files
     ///
     /// See also: [`SessionConfig`]
     ///
     /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ParquetOptions {
-        /// If true, reads the Parquet data page level metadata (the
+        // The following options affect reading parquet files
+
+        /// (reading) If true, reads the Parquet data page level metadata (the
         /// Page Index), if present, to reduce the I/O and number of
         /// rows decoded.
         pub enable_page_index: bool, default = true

-        /// If true, the parquet reader attempts to skip entire row groups based
+        /// (reading) If true, the parquet reader attempts to skip entire row groups based
         /// on the predicate in the query and the metadata (min/max values) stored in
         /// the parquet file
         pub pruning: bool, default = true

-        /// If true, the parquet reader skip the optional embedded metadata that may be in
+        /// (reading) If true, the parquet reader skips the optional embedded metadata that may be in
         /// the file Schema. This setting can help avoid schema conflicts when querying
         /// multiple parquet files with schemas containing compatible types but different metadata
         pub skip_metadata: bool, default = true

-        /// If specified, the parquet reader will try and fetch the last `size_hint`
+        /// (reading) If specified, the parquet reader will try and fetch the last `size_hint`
         /// bytes of the parquet file optimistically. If not specified, two reads are required:
         /// One read to fetch the 8-byte parquet footer and
         /// another to fetch the metadata length encoded in the footer
         pub metadata_size_hint: Option<usize>, default = None

-        /// If true, filter expressions are be applied during the parquet decoding operation to
+        /// (reading) If true, filter expressions are applied during the parquet decoding operation to
         /// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
         pub pushdown_filters: bool, default = false

-        /// If true, filter expressions evaluated during the parquet decoding operation
+        /// (reading) If true, filter expressions evaluated during the parquet decoding operation
         /// will be reordered heuristically to minimize the cost of evaluation. If false,
         /// the filters are applied in the same order as written in the query
         pub reorder_filters: bool, default = false

-        // The following map to parquet::file::properties::WriterProperties
+        // The following options affect writing to parquet files
+        // and map to parquet::file::properties::WriterProperties

-        /// Sets best effort maximum size of data page in bytes
+        /// (writing) Sets best effort maximum size of data page in bytes
         pub data_pagesize_limit: usize, default = 1024 * 1024

-        /// Sets write_batch_size in bytes
+        /// (writing) Sets write_batch_size in bytes
         pub write_batch_size: usize, default = 1024

-        /// Sets parquet writer version
+        /// (writing) Sets parquet writer version
         /// valid values are "1.0" and "2.0"
         pub writer_version: String, default = "1.0".into()

-        /// Sets default parquet compression codec
+        /// (writing) Sets default parquet compression codec.
         /// Valid values are: uncompressed, snappy, gzip(level),
         /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
         /// These values are not case sensitive. If NULL, uses
         /// default parquet writer setting
         pub compression: Option<String>, default = Some("zstd(3)".into())

-        /// Sets if dictionary encoding is enabled. If NULL, uses
+        /// (writing) Sets if dictionary encoding is enabled. If NULL, uses
         /// default parquet writer setting
         pub dictionary_enabled: Option<bool>, default = None

-        /// Sets best effort maximum dictionary page size, in bytes
+        /// (writing) Sets best effort maximum dictionary page size, in bytes
         pub dictionary_page_size_limit: usize, default = 1024 * 1024

-        /// Sets if statistics are enabled for any column
+        /// (writing) Sets if statistics are enabled for any column
         /// Valid values are: "none", "chunk", and "page"
         /// These values are not case sensitive. If NULL, uses
         /// default parquet writer setting
         pub statistics_enabled: Option<String>, default = None

-        /// Sets max statistics size for any column. If NULL, uses
+        /// (writing) Sets max statistics size for any column. If NULL, uses
         /// default parquet writer setting
         pub max_statistics_size: Option<usize>, default = None

-        /// Target maximum number of rows in each row group (defaults to 1M
+        /// (writing) Target maximum number of rows in each row group (defaults to 1M
         /// rows). Writing larger row groups requires more memory to write, but
         /// can get better compression and be faster to read.
         pub max_row_group_size: usize, default = 1024 * 1024

-        /// Sets "created by" property
+        /// (writing) Sets "created by" property
         pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into()

-        /// Sets column index truncate length
+        /// (writing) Sets column index truncate length
         pub column_index_truncate_length: Option<usize>, default = None

-        /// Sets best effort maximum number of rows in data page
+        /// (writing) Sets best effort maximum number of rows in data page
         pub data_page_row_count_limit: usize, default = usize::MAX

-        /// Sets default encoding for any column
+        /// (writing) Sets default encoding for any column.
         /// Valid values are: plain, plain_dictionary, rle,
         /// bit_packed, delta_binary_packed, delta_length_byte_array,
         /// delta_byte_array, rle_dictionary, and byte_stream_split.
         /// These values are not case sensitive. If NULL, uses
         /// default parquet writer setting
         pub encoding: Option<String>, default = None

-        /// Use any available bloom filters when reading parquet files
+        /// (reading) Use any available bloom filters when reading parquet files
         pub bloom_filter_on_read: bool, default = true

-        /// Write bloom filters for all columns when creating parquet files
+        /// (writing) Write bloom filters for all columns when creating parquet files
         pub bloom_filter_on_write: bool, default = false

-        /// Sets bloom filter false positive probability. If NULL, uses
+        /// (writing) Sets bloom filter false positive probability. If NULL, uses
         /// default parquet writer setting
         pub bloom_filter_fpp: Option<f64>, default = None

-        /// Sets bloom filter number of distinct values. If NULL, uses
+        /// (writing) Sets bloom filter number of distinct values. If NULL, uses
         /// default parquet writer setting
         pub bloom_filter_ndv: Option<u64>, default = None

-        /// Controls whether DataFusion will attempt to speed up writing
+        /// (writing) Controls whether DataFusion will attempt to speed up writing
         /// parquet files by serializing them in parallel. Each column
         /// in each row group in each output file is serialized in parallel
         /// leveraging a maximum possible core count of n_files*n_row_groups*n_columns.
         pub allow_single_file_parallelism: bool, default = true

-        /// By default parallel parquet writer is tuned for minimum
+        /// (writing) By default parallel parquet writer is tuned for minimum
         /// memory usage in a streaming execution plan. You may see
         /// a performance benefit when writing large parquet files
         /// by increasing maximum_parallel_row_group_writers and
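For context, the reading options in this hunk are typically toggled through the `SessionConfig` that the doc comment links to. A minimal sketch of opting into the two decode-time filter optimizations, assuming the `datafusion.execution.parquet.` key prefix generated by `config_namespace!` and the `set_bool` / `new_with_config` builder APIs:

use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Sketch: enable the opt-in (reading) optimizations described above.
    // The key prefix is an assumption based on this struct's namespace.
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
        .set_bool("datafusion.execution.parquet.reorder_filters", true);
    let _ctx = SessionContext::new_with_config(config);
}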
@@ -440,7 +443,7 @@ config_namespace! {
         /// data frame.
         pub maximum_parallel_row_group_writers: usize, default = 1

-        /// By default parallel parquet writer is tuned for minimum
+        /// (writing) By default parallel parquet writer is tuned for minimum
         /// memory usage in a streaming execution plan. You may see
         /// a performance benefit when writing large parquet files
         /// by increasing maximum_parallel_row_group_writers and
@@ -450,7 +453,6 @@ config_namespace! {
         /// writing out already in-memory data, such as from a cached
         /// data frame.
         pub maximum_buffered_record_batches_per_stream: usize, default = 2
-
     }
 }

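The writing options follow the same key scheme. A sketch of tuning a couple of them per session, assuming `set_str` and `set_usize` builders exist on `SessionConfig` alongside `set_bool`:

use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Sketch: trade write-side memory for compression by doubling the
    // row group size, and switch the codec from the zstd(3) default.
    let config = SessionConfig::new()
        .set_str("datafusion.execution.parquet.compression", "snappy")
        .set_usize(
            "datafusion.execution.parquet.max_row_group_size",
            2 * 1024 * 1024,
        );
    let _ctx = SessionContext::new_with_config(config);
}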
@@ -1534,6 +1536,9 @@ macro_rules! config_namespace_with_hashmap {
 }

 config_namespace_with_hashmap! {
+    /// Options controlling parquet format for individual columns.
+    ///
+    /// See [`ParquetOptions`] for more details
     pub struct ColumnOptions {
         /// Sets if bloom filter is enabled for the column path.
         pub bloom_filter_enabled: Option<bool>, default = None
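Finally, the string keys that `config_namespace!` generates map onto typed fields, so settings can be read back for verification. A minimal sketch, assuming the accessor path `options().execution.parquet`:

use datafusion::prelude::SessionConfig;

fn main() {
    // Sketch: each key surfaces as a typed field, e.g.
    // `datafusion.execution.parquet.enable_page_index` is
    // `options().execution.parquet.enable_page_index`.
    let config = SessionConfig::new();
    let parquet = &config.options().execution.parquet;
    assert!(parquet.enable_page_index);
    assert!(parquet.pruning);
    assert!(!parquet.pushdown_filters); // opt-in, defaults to false
}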