@@ -45,6 +45,8 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Pag
45
45
pub const DEFAULT_MAX_STATISTICS_SIZE : usize = 4096 ;
46
46
/// Default value for [`WriterProperties::max_row_group_size`]
47
47
pub const DEFAULT_MAX_ROW_GROUP_SIZE : usize = 1024 * 1024 ;
48
+ /// Default value for [`WriterProperties::bloom_filter_position`]
49
+ pub const DEFAULT_BLOOM_FILTER_POSITION : BloomFilterPosition = BloomFilterPosition :: AfterRowGroup ;
48
50
/// Default value for [`WriterProperties::created_by`]
49
51
pub const DEFAULT_CREATED_BY : & str = concat ! ( "parquet-rs version " , env!( "CARGO_PKG_VERSION" ) ) ;
50
52
/// Default value for [`WriterProperties::column_index_truncate_length`]
@@ -88,6 +90,24 @@ impl FromStr for WriterVersion {
88
90
}
89
91
}
90
92
93
+ /// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
94
+ /// write Bloom filters
95
+ ///
96
+ /// Basic constant, which is not part of the Thrift definition.
97
+ #[ derive( Debug , Clone , Copy , PartialEq , Eq ) ]
98
+ pub enum BloomFilterPosition {
99
+ /// Write the Bloom filters of each row group right after that row group
100
+ ///
101
+ /// This saves memory by writing it as soon as it is computed, at the cost
102
+ /// of data locality for readers. This is the value of [`DEFAULT_BLOOM_FILTER_POSITION`].
103
+ AfterRowGroup ,
104
+ /// Write all Bloom filters at the end of the file
105
+ ///
106
+ /// This allows better data locality for readers, at the cost of memory usage
107
+ /// for writers.
108
+ End ,
109
+ }
110
+
91
111
/// Reference counted writer properties.
92
112
pub type WriterPropertiesPtr = Arc < WriterProperties > ;
93
113
@@ -132,6 +152,7 @@ pub struct WriterProperties {
132
152
data_page_row_count_limit : usize ,
133
153
write_batch_size : usize ,
134
154
max_row_group_size : usize ,
155
+ bloom_filter_position : BloomFilterPosition ,
135
156
writer_version : WriterVersion ,
136
157
created_by : String ,
137
158
pub ( crate ) key_value_metadata : Option < Vec < KeyValue > > ,
@@ -219,6 +240,11 @@ impl WriterProperties {
219
240
self . max_row_group_size
220
241
}
221
242
243
+ /// Returns the configured position of Bloom filters in the written file (see [`BloomFilterPosition`]).
244
+ pub fn bloom_filter_position ( & self ) -> BloomFilterPosition {
245
+ self . bloom_filter_position
246
+ }
247
+
222
248
/// Returns configured writer version.
223
249
pub fn writer_version ( & self ) -> WriterVersion {
224
250
self . writer_version
@@ -340,6 +366,7 @@ pub struct WriterPropertiesBuilder {
340
366
data_page_row_count_limit : usize ,
341
367
write_batch_size : usize ,
342
368
max_row_group_size : usize ,
369
+ bloom_filter_position : BloomFilterPosition ,
343
370
writer_version : WriterVersion ,
344
371
created_by : String ,
345
372
key_value_metadata : Option < Vec < KeyValue > > ,
@@ -359,6 +386,7 @@ impl WriterPropertiesBuilder {
359
386
data_page_row_count_limit : DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT ,
360
387
write_batch_size : DEFAULT_WRITE_BATCH_SIZE ,
361
388
max_row_group_size : DEFAULT_MAX_ROW_GROUP_SIZE ,
389
+ bloom_filter_position : DEFAULT_BLOOM_FILTER_POSITION ,
362
390
writer_version : DEFAULT_WRITER_VERSION ,
363
391
created_by : DEFAULT_CREATED_BY . to_string ( ) ,
364
392
key_value_metadata : None ,
@@ -378,6 +406,7 @@ impl WriterPropertiesBuilder {
378
406
data_page_row_count_limit : self . data_page_row_count_limit ,
379
407
write_batch_size : self . write_batch_size ,
380
408
max_row_group_size : self . max_row_group_size ,
409
+ bloom_filter_position : self . bloom_filter_position ,
381
410
writer_version : self . writer_version ,
382
411
created_by : self . created_by ,
383
412
key_value_metadata : self . key_value_metadata ,
@@ -489,6 +518,12 @@ impl WriterPropertiesBuilder {
489
518
self
490
519
}
491
520
521
+ /// Sets where in the final file Bloom filters are written (defaults to [`BloomFilterPosition::AfterRowGroup`]).
522
+ pub fn set_bloom_filter_position ( mut self , value : BloomFilterPosition ) -> Self {
523
+ self . bloom_filter_position = value;
524
+ self
525
+ }
526
+
492
527
/// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
493
528
pub fn set_created_by ( mut self , value : String ) -> Self {
494
529
self . created_by = value;
@@ -1054,6 +1089,7 @@ mod tests {
1054
1089
) ;
1055
1090
assert_eq ! ( props. write_batch_size( ) , DEFAULT_WRITE_BATCH_SIZE ) ;
1056
1091
assert_eq ! ( props. max_row_group_size( ) , DEFAULT_MAX_ROW_GROUP_SIZE ) ;
1092
+ assert_eq ! ( props. bloom_filter_position( ) , DEFAULT_BLOOM_FILTER_POSITION ) ;
1057
1093
assert_eq ! ( props. writer_version( ) , DEFAULT_WRITER_VERSION ) ;
1058
1094
assert_eq ! ( props. created_by( ) , DEFAULT_CREATED_BY ) ;
1059
1095
assert_eq ! ( props. key_value_metadata( ) , None ) ;
0 commit comments