@@ -43,6 +43,8 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Pag
43
43
pub const DEFAULT_MAX_STATISTICS_SIZE : usize = 4096 ;
44
44
/// Default value for [`WriterProperties::max_row_group_size`]
45
45
pub const DEFAULT_MAX_ROW_GROUP_SIZE : usize = 1024 * 1024 ;
46
+ /// Default value for [`WriterProperties::bloom_filter_position`]
47
+ pub const DEFAULT_BLOOM_FILTER_POSITION : BloomFilterPosition = BloomFilterPosition :: AfterRowGroup ;
46
48
/// Default value for [`WriterProperties::created_by`]
47
49
pub const DEFAULT_CREATED_BY : & str = concat ! ( "parquet-rs version " , env!( "CARGO_PKG_VERSION" ) ) ;
48
50
/// Default value for [`WriterProperties::column_index_truncate_length`]
@@ -86,6 +88,24 @@ impl FromStr for WriterVersion {
86
88
}
87
89
}
88
90
91
+ /// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
92
+ /// write Bloom filters.
93
+ ///
94
+ /// Basic constant, which is not part of the Thrift definition.
95
+ #[ derive( Debug , Clone , Copy , PartialEq , Eq ) ]
96
+ pub enum BloomFilterPosition {
97
+ /// Write the Bloom filters of each row group right after that row group.
98
+ ///
99
+ /// This saves memory by writing them as soon as they are computed, at the cost
100
+ /// of data locality for readers.
101
+ AfterRowGroup ,
102
+ /// Write Bloom filters at the end of the file.
103
+ ///
104
+ /// This allows better data locality for readers, at the cost of memory usage
105
+ /// for writers.
106
+ End ,
107
+ }
108
+
89
109
/// Reference counted writer properties.
90
110
pub type WriterPropertiesPtr = Arc < WriterProperties > ;
91
111
@@ -131,6 +151,7 @@ pub struct WriterProperties {
131
151
data_page_row_count_limit : usize ,
132
152
write_batch_size : usize ,
133
153
max_row_group_size : usize ,
154
+ bloom_filter_position : BloomFilterPosition ,
134
155
writer_version : WriterVersion ,
135
156
created_by : String ,
136
157
pub ( crate ) key_value_metadata : Option < Vec < KeyValue > > ,
@@ -217,6 +238,11 @@ impl WriterProperties {
217
238
self . max_row_group_size
218
239
}
219
240
241
+ /// Returns where in the file Bloom filters are written.
242
+ pub fn bloom_filter_position ( & self ) -> BloomFilterPosition {
243
+ self . bloom_filter_position
244
+ }
245
+
220
246
/// Returns configured writer version.
221
247
pub fn writer_version ( & self ) -> WriterVersion {
222
248
self . writer_version
@@ -337,6 +363,7 @@ pub struct WriterPropertiesBuilder {
337
363
data_page_row_count_limit : usize ,
338
364
write_batch_size : usize ,
339
365
max_row_group_size : usize ,
366
+ bloom_filter_position : BloomFilterPosition ,
340
367
writer_version : WriterVersion ,
341
368
created_by : String ,
342
369
key_value_metadata : Option < Vec < KeyValue > > ,
@@ -356,6 +383,7 @@ impl WriterPropertiesBuilder {
356
383
data_page_row_count_limit : usize:: MAX ,
357
384
write_batch_size : DEFAULT_WRITE_BATCH_SIZE ,
358
385
max_row_group_size : DEFAULT_MAX_ROW_GROUP_SIZE ,
386
+ bloom_filter_position : DEFAULT_BLOOM_FILTER_POSITION ,
359
387
writer_version : DEFAULT_WRITER_VERSION ,
360
388
created_by : DEFAULT_CREATED_BY . to_string ( ) ,
361
389
key_value_metadata : None ,
@@ -375,6 +403,7 @@ impl WriterPropertiesBuilder {
375
403
data_page_row_count_limit : self . data_page_row_count_limit ,
376
404
write_batch_size : self . write_batch_size ,
377
405
max_row_group_size : self . max_row_group_size ,
406
+ bloom_filter_position : self . bloom_filter_position ,
378
407
writer_version : self . writer_version ,
379
408
created_by : self . created_by ,
380
409
key_value_metadata : self . key_value_metadata ,
@@ -479,6 +508,12 @@ impl WriterPropertiesBuilder {
479
508
self
480
509
}
481
510
511
+ /// Sets where in the final file Bloom filters are written (defaults to [`DEFAULT_BLOOM_FILTER_POSITION`]).
512
+ pub fn set_bloom_filter_position ( mut self , value : BloomFilterPosition ) -> Self {
513
+ self . bloom_filter_position = value;
514
+ self
515
+ }
516
+
482
517
/// Sets "created by" property.
483
518
pub fn set_created_by ( mut self , value : String ) -> Self {
484
519
self . created_by = value;
@@ -991,6 +1026,7 @@ mod tests {
991
1026
) ;
992
1027
assert_eq ! ( props. write_batch_size( ) , DEFAULT_WRITE_BATCH_SIZE ) ;
993
1028
assert_eq ! ( props. max_row_group_size( ) , DEFAULT_MAX_ROW_GROUP_SIZE ) ;
1029
+ assert_eq ! ( props. bloom_filter_position( ) , DEFAULT_BLOOM_FILTER_POSITION ) ;
994
1030
assert_eq ! ( props. writer_version( ) , DEFAULT_WRITER_VERSION ) ;
995
1031
assert_eq ! ( props. created_by( ) , DEFAULT_CREATED_BY ) ;
996
1032
assert_eq ! ( props. key_value_metadata( ) , None ) ;
0 commit comments