@@ -34,8 +34,9 @@ use crate::column::{
34
34
} ;
35
35
use crate :: data_type:: DataType ;
36
36
use crate :: errors:: { ParquetError , Result } ;
37
+ use crate :: file:: properties:: { BloomFilterPosition , WriterPropertiesPtr } ;
37
38
use crate :: file:: reader:: ChunkReader ;
38
- use crate :: file:: { metadata:: * , properties :: WriterPropertiesPtr , PARQUET_MAGIC } ;
39
+ use crate :: file:: { metadata:: * , PARQUET_MAGIC } ;
39
40
use crate :: schema:: types:: { self , ColumnDescPtr , SchemaDescPtr , SchemaDescriptor , TypePtr } ;
40
41
41
42
/// A wrapper around a [`Write`] that keeps track of the number
@@ -115,9 +116,10 @@ pub type OnCloseColumnChunk<'a> = Box<dyn FnOnce(ColumnCloseResult) -> Result<()
115
116
/// - the row group metadata
116
117
/// - the column index for each column chunk
117
118
/// - the offset index for each column chunk
118
- pub type OnCloseRowGroup < ' a > = Box <
119
+ pub type OnCloseRowGroup < ' a , W > = Box <
119
120
dyn FnOnce (
120
- RowGroupMetaDataPtr ,
121
+ & ' a mut TrackedWrite < W > ,
122
+ RowGroupMetaData ,
121
123
Vec < Option < Sbbf > > ,
122
124
Vec < Option < ColumnIndex > > ,
123
125
Vec < Option < OffsetIndex > > ,
@@ -143,7 +145,7 @@ pub struct SerializedFileWriter<W: Write> {
143
145
schema : TypePtr ,
144
146
descr : SchemaDescPtr ,
145
147
props : WriterPropertiesPtr ,
146
- row_groups : Vec < RowGroupMetaDataPtr > ,
148
+ row_groups : Vec < RowGroupMetaData > ,
147
149
bloom_filters : Vec < Vec < Option < Sbbf > > > ,
148
150
column_indexes : Vec < Vec < Option < ColumnIndex > > > ,
149
151
offset_indexes : Vec < Vec < Option < OffsetIndex > > > ,
@@ -197,18 +199,28 @@ impl<W: Write + Send> SerializedFileWriter<W> {
197
199
198
200
self . row_group_index += 1 ;
199
201
202
+ let bloom_filter_position = self . properties ( ) . bloom_filter_position ( ) ;
200
203
let row_groups = & mut self . row_groups ;
201
204
let row_bloom_filters = & mut self . bloom_filters ;
202
205
let row_column_indexes = & mut self . column_indexes ;
203
206
let row_offset_indexes = & mut self . offset_indexes ;
204
- let on_close =
205
- |metadata, row_group_bloom_filter, row_group_column_index, row_group_offset_index| {
206
- row_groups. push ( metadata) ;
207
- row_bloom_filters. push ( row_group_bloom_filter) ;
208
- row_column_indexes. push ( row_group_column_index) ;
209
- row_offset_indexes. push ( row_group_offset_index) ;
210
- Ok ( ( ) )
207
+ let on_close = move |buf,
208
+ mut metadata,
209
+ row_group_bloom_filter,
210
+ row_group_column_index,
211
+ row_group_offset_index| {
212
+ row_bloom_filters. push ( row_group_bloom_filter) ;
213
+ row_column_indexes. push ( row_group_column_index) ;
214
+ row_offset_indexes. push ( row_group_offset_index) ;
215
+ match bloom_filter_position {
216
+ BloomFilterPosition :: AfterRowGroup => {
217
+ write_bloom_filters ( buf, row_bloom_filters, & mut metadata) ?
218
+ }
219
+ BloomFilterPosition :: End => ( ) ,
211
220
} ;
221
+ row_groups. push ( metadata) ;
222
+ Ok ( ( ) )
223
+ } ;
212
224
213
225
let row_group_writer = SerializedRowGroupWriter :: new (
214
226
self . descr . clone ( ) ,
@@ -221,7 +233,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
221
233
}
222
234
223
235
/// Returns metadata for any flushed row groups
224
- pub fn flushed_row_groups ( & self ) -> & [ RowGroupMetaDataPtr ] {
236
+ pub fn flushed_row_groups ( & self ) -> & [ RowGroupMetaData ] {
225
237
& self . row_groups
226
238
}
227
239
@@ -273,40 +285,6 @@ impl<W: Write + Send> SerializedFileWriter<W> {
273
285
Ok ( ( ) )
274
286
}
275
287
276
- /// Serialize all the bloom filter to the file
277
- pub fn write_bloom_filters ( & mut self , row_groups : & mut [ RowGroup ] ) -> Result < ( ) > {
278
- // iter row group
279
- // iter each column
280
- // write bloom filter to the file
281
- for row_group in row_groups. iter_mut ( ) {
282
- let row_group_idx: u16 = row_group
283
- . ordinal
284
- . expect ( "Missing row group ordinal" )
285
- . try_into ( )
286
- . expect ( "Negative row group ordinal" ) ;
287
- let row_group_idx = row_group_idx as usize ;
288
- for ( column_idx, column_chunk) in row_group. columns . iter_mut ( ) . enumerate ( ) {
289
- match self . bloom_filters [ row_group_idx] [ column_idx] . take ( ) {
290
- Some ( bloom_filter) => {
291
- let start_offset = self . buf . bytes_written ( ) ;
292
- bloom_filter. write ( & mut self . buf ) ?;
293
- let end_offset = self . buf . bytes_written ( ) ;
294
- // set offset and index for bloom filter
295
- let column_chunk_meta = column_chunk
296
- . meta_data
297
- . as_mut ( )
298
- . expect ( "can't have bloom filter without column metadata" ) ;
299
- column_chunk_meta. bloom_filter_offset = Some ( start_offset as i64 ) ;
300
- column_chunk_meta. bloom_filter_length =
301
- Some ( ( end_offset - start_offset) as i32 ) ;
302
- }
303
- None => { }
304
- }
305
- }
306
- }
307
- Ok ( ( ) )
308
- }
309
-
310
288
/// Serialize all the column index to the file
311
289
fn write_column_indexes ( & mut self , row_groups : & mut [ RowGroup ] ) -> Result < ( ) > {
312
290
// iter row group
@@ -337,14 +315,17 @@ impl<W: Write + Send> SerializedFileWriter<W> {
337
315
self . finished = true ;
338
316
let num_rows = self . row_groups . iter ( ) . map ( |x| x. num_rows ( ) ) . sum ( ) ;
339
317
318
+ for row_group in & mut self . row_groups {
319
+ write_bloom_filters ( & mut self . buf , & mut self . bloom_filters , row_group) ?;
320
+ }
321
+
340
322
let mut row_groups = self
341
323
. row_groups
342
324
. as_slice ( )
343
325
. iter ( )
344
326
. map ( |v| v. to_thrift ( ) )
345
327
. collect :: < Vec < _ > > ( ) ;
346
328
347
- self . write_bloom_filters ( & mut row_groups) ?;
348
329
// Write column indexes and offset indexes
349
330
self . write_column_indexes ( & mut row_groups) ?;
350
331
self . write_offset_indexes ( & mut row_groups) ?;
@@ -449,6 +430,43 @@ impl<W: Write + Send> SerializedFileWriter<W> {
449
430
}
450
431
}
451
432
433
+ /// Serialize all the bloom filters of the given row group to the given buffer,
434
+ /// and returns the updated row group metadata.
435
+ fn write_bloom_filters < W : Write + Send > (
436
+ buf : & mut TrackedWrite < W > ,
437
+ bloom_filters : & mut Vec < Vec < Option < Sbbf > > > ,
438
+ row_group : & mut RowGroupMetaData ,
439
+ ) -> Result < ( ) > {
440
+ // iter row group
441
+ // iter each column
442
+ // write bloom filter to the file
443
+
444
+ let row_group_idx: u16 = row_group
445
+ . ordinal ( )
446
+ . expect ( "Missing row group ordinal" )
447
+ . try_into ( )
448
+ . expect ( "Negative row group ordinal" ) ;
449
+ let row_group_idx = row_group_idx as usize ;
450
+ for ( column_idx, column_chunk) in row_group. columns_mut ( ) . iter_mut ( ) . enumerate ( ) {
451
+ match bloom_filters[ row_group_idx] [ column_idx] . take ( ) {
452
+ Some ( bloom_filter) => {
453
+ let start_offset = buf. bytes_written ( ) ;
454
+ bloom_filter. write ( & mut * buf) ?;
455
+ let end_offset = buf. bytes_written ( ) ;
456
+ // set offset and index for bloom filter
457
+ * column_chunk = column_chunk
458
+ . clone ( )
459
+ . into_builder ( )
460
+ . set_bloom_filter_offset ( Some ( start_offset as i64 ) )
461
+ . set_bloom_filter_length ( Some ( ( end_offset - start_offset) as i32 ) )
462
+ . build ( ) ?;
463
+ }
464
+ None => { }
465
+ }
466
+ }
467
+ Ok ( ( ) )
468
+ }
469
+
452
470
/// Parquet row group writer API.
453
471
/// Provides methods to access column writers in an iterator-like fashion, order is
454
472
/// guaranteed to match the order of schema leaves (column descriptors).
@@ -474,7 +492,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> {
474
492
offset_indexes : Vec < Option < OffsetIndex > > ,
475
493
row_group_index : i16 ,
476
494
file_offset : i64 ,
477
- on_close : Option < OnCloseRowGroup < ' a > > ,
495
+ on_close : Option < OnCloseRowGroup < ' a , W > > ,
478
496
}
479
497
480
498
impl < ' a , W : Write + Send > SerializedRowGroupWriter < ' a , W > {
@@ -491,7 +509,7 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
491
509
properties : WriterPropertiesPtr ,
492
510
buf : & ' a mut TrackedWrite < W > ,
493
511
row_group_index : i16 ,
494
- on_close : Option < OnCloseRowGroup < ' a > > ,
512
+ on_close : Option < OnCloseRowGroup < ' a , W > > ,
495
513
) -> Self {
496
514
let num_columns = schema_descr. num_columns ( ) ;
497
515
let file_offset = buf. bytes_written ( ) as i64 ;
@@ -675,12 +693,12 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
675
693
. set_file_offset ( self . file_offset )
676
694
. build ( ) ?;
677
695
678
- let metadata = Arc :: new ( row_group_metadata) ;
679
- self . row_group_metadata = Some ( metadata. clone ( ) ) ;
696
+ self . row_group_metadata = Some ( Arc :: new ( row_group_metadata. clone ( ) ) ) ;
680
697
681
698
if let Some ( on_close) = self . on_close . take ( ) {
682
699
on_close (
683
- metadata,
700
+ self . buf ,
701
+ row_group_metadata,
684
702
self . bloom_filters ,
685
703
self . column_indexes ,
686
704
self . offset_indexes ,
@@ -1452,7 +1470,7 @@ mod tests {
1452
1470
assert_eq ! ( flushed. len( ) , idx + 1 ) ;
1453
1471
assert_eq ! ( Some ( idx as i16 ) , last_group. ordinal( ) ) ;
1454
1472
assert_eq ! ( Some ( row_group_file_offset as i64 ) , last_group. file_offset( ) ) ;
1455
- assert_eq ! ( flushed[ idx] . as_ref ( ) , last_group . as_ref ( ) ) ;
1473
+ assert_eq ! ( flushed[ idx] , Arc :: unwrap_or_clone ( last_group ) ) ;
1456
1474
}
1457
1475
let file_metadata = file_writer. close ( ) . unwrap ( ) ;
1458
1476
0 commit comments