@@ -282,7 +282,7 @@ impl ArrowReader {
282
282
let delete_row_selection = Self :: build_deletes_row_selection (
283
283
record_batch_stream_builder. metadata ( ) . row_groups ( ) ,
284
284
& selected_row_group_indices,
285
- positional_delete_indexes,
285
+ positional_delete_indexes. as_ref ( ) ,
286
286
) ?;
287
287
288
288
// merge the row selection from the delete files with the row selection
@@ -345,17 +345,19 @@ impl ArrowReader {
345
345
/// as having been deleted by a positional delete, taking into account any row groups that have
346
346
/// been skipped entirely by the filter predicate
347
347
fn build_deletes_row_selection (
348
- row_group_metadata : & [ RowGroupMetaData ] ,
348
+ row_group_metadata_list : & [ RowGroupMetaData ] ,
349
349
selected_row_groups : & Option < Vec < usize > > ,
350
- mut positional_deletes : DeleteVector ,
350
+ positional_deletes : & DeleteVector ,
351
351
) -> Result < RowSelection > {
352
352
let mut results: Vec < RowSelector > = Vec :: new ( ) ;
353
353
let mut selected_row_groups_idx = 0 ;
354
- let mut current_page_base_idx: u64 = 0 ;
354
+ let mut current_row_group_base_idx: u64 = 0 ;
355
+ let mut delete_vector_iter = positional_deletes. iter ( ) ;
356
+ let mut next_deleted_row_idx_opt = delete_vector_iter. next ( ) ;
355
357
356
- for ( idx, row_group_metadata) in row_group_metadata . iter ( ) . enumerate ( ) {
357
- let page_num_rows = row_group_metadata. num_rows ( ) as u64 ;
358
- let next_page_base_idx = current_page_base_idx + page_num_rows ;
358
+ for ( idx, row_group_metadata) in row_group_metadata_list . iter ( ) . enumerate ( ) {
359
+ let row_group_num_rows = row_group_metadata. num_rows ( ) as u64 ;
360
+ let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows ;
359
361
360
362
// if row group selection is enabled,
361
363
if let Some ( selected_row_groups) = selected_row_groups {
@@ -372,36 +374,37 @@ impl ArrowReader {
372
374
} else {
373
375
// remove any positional deletes from the skipped page so that
374
376
// `positional.deletes.min()` can be used
375
- positional_deletes. remove_range ( current_page_base_idx..next_page_base_idx) ;
377
+ delete_vector_iter. advance_to ( next_row_group_base_idx) ;
378
+ next_deleted_row_idx_opt = delete_vector_iter. next ( ) ;
376
379
377
380
// still increment the current page base index but then skip to the next row group
378
381
// in the file
379
- current_page_base_idx += page_num_rows ;
382
+ current_row_group_base_idx += row_group_num_rows ;
380
383
continue ;
381
384
}
382
385
}
383
386
384
- let mut next_deleted_row_idx = match positional_deletes . min ( ) {
387
+ let mut next_deleted_row_idx = match next_deleted_row_idx_opt {
385
388
Some ( next_deleted_row_idx) => {
386
- // if the index of the next deleted row is beyond this page , add a selection for
387
- // the remainder of this page and skip to the next page
388
- if next_deleted_row_idx >= next_page_base_idx {
389
- results. push ( RowSelector :: select ( page_num_rows as usize ) ) ;
389
+ // if the index of the next deleted row is beyond this row group , add a selection for
390
+ // the remainder of this row group and skip to the next row group
391
+ if next_deleted_row_idx >= next_row_group_base_idx {
392
+ results. push ( RowSelector :: select ( row_group_num_rows as usize ) ) ;
390
393
continue ;
391
394
}
392
395
393
396
next_deleted_row_idx
394
397
}
395
398
396
- // If there are no more pos deletes, add a selector for the entirety of this page .
399
+ // If there are no more pos deletes, add a selector for the entirety of this row group .
397
400
_ => {
398
- results. push ( RowSelector :: select ( page_num_rows as usize ) ) ;
401
+ results. push ( RowSelector :: select ( row_group_num_rows as usize ) ) ;
399
402
continue ;
400
403
}
401
404
} ;
402
405
403
- let mut current_idx = current_page_base_idx ;
404
- ' chunks: while next_deleted_row_idx < next_page_base_idx {
406
+ let mut current_idx = current_row_group_base_idx ;
407
+ ' chunks: while next_deleted_row_idx < next_row_group_base_idx {
405
408
// `select` all rows that precede the next delete index
406
409
if current_idx < next_deleted_row_idx {
407
410
let run_length = next_deleted_row_idx - current_idx;
@@ -412,18 +415,18 @@ impl ArrowReader {
412
415
// `skip` all consecutive deleted rows in the current row group
413
416
let mut run_length = 0 ;
414
417
while next_deleted_row_idx == current_idx
415
- && next_deleted_row_idx < next_page_base_idx
418
+ && next_deleted_row_idx < next_row_group_base_idx
416
419
{
417
420
run_length += 1 ;
418
421
current_idx += 1 ;
419
- positional_deletes. remove ( next_deleted_row_idx) ;
420
422
421
- next_deleted_row_idx = match positional_deletes. min ( ) {
423
+ next_deleted_row_idx_opt = delete_vector_iter. next ( ) ;
424
+ next_deleted_row_idx = match next_deleted_row_idx_opt {
422
425
Some ( next_deleted_row_idx) => next_deleted_row_idx,
423
426
_ => {
424
427
// We've processed the final positional delete.
425
428
// Conclude the skip and then break so that we select the remaining
426
- // rows in the page and move on to the next row group
429
+ // rows in the row group and move on to the next row group
427
430
results. push ( RowSelector :: skip ( run_length) ) ;
428
431
break ' chunks;
429
432
}
@@ -432,13 +435,13 @@ impl ArrowReader {
432
435
results. push ( RowSelector :: skip ( run_length) ) ;
433
436
}
434
437
435
- if current_idx < next_page_base_idx {
438
+ if current_idx < next_row_group_base_idx {
436
439
results. push ( RowSelector :: select (
437
- ( next_page_base_idx - current_idx) as usize ,
440
+ ( next_row_group_base_idx - current_idx) as usize ,
438
441
) ) ;
439
442
}
440
443
441
- current_page_base_idx += page_num_rows ;
444
+ current_row_group_base_idx += row_group_num_rows ;
442
445
}
443
446
444
447
Ok ( results. into ( ) )
@@ -1375,18 +1378,19 @@ mod tests {
1375
1378
use arrow_array:: { ArrayRef , RecordBatch , StringArray } ;
1376
1379
use arrow_schema:: { DataType , Field , Schema as ArrowSchema , TimeUnit } ;
1377
1380
use futures:: TryStreamExt ;
1381
+ use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
1378
1382
use parquet:: arrow:: { ArrowWriter , ProjectionMask } ;
1379
1383
use parquet:: basic:: Compression ;
1380
- use parquet:: file:: properties:: WriterProperties ;
1381
- use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
1382
1384
use parquet:: file:: metadata:: { ColumnChunkMetaData , RowGroupMetaData } ;
1385
+ use parquet:: file:: properties:: WriterProperties ;
1383
1386
use parquet:: schema:: parser:: parse_message_type;
1384
- use tempfile:: TempDir ;
1385
1387
use parquet:: schema:: types:: { SchemaDescPtr , SchemaDescriptor } ;
1386
1388
use roaring:: RoaringTreemap ;
1389
+ use tempfile:: TempDir ;
1387
1390
1388
1391
use crate :: arrow:: reader:: { CollectFieldIdVisitor , PARQUET_FIELD_ID_META_KEY } ;
1389
1392
use crate :: arrow:: { ArrowReader , ArrowReaderBuilder } ;
1393
+ use crate :: delete_vector:: DeleteVector ;
1390
1394
use crate :: expr:: visitors:: bound_predicate_visitor:: visit;
1391
1395
use crate :: expr:: { Bind , Predicate , Reference } ;
1392
1396
use crate :: io:: FileIO ;
@@ -1733,16 +1737,14 @@ message schema {
1733
1737
2999 , // single item at end of selected rg3 (1)
1734
1738
3000 , // single item at start of skipped rg4
1735
1739
] ) ;
1736
-
1737
- let positional_deletes = DeleteVector {
1738
- inner : positional_deletes
1739
- } ;
1740
+
1741
+ let positional_deletes = DeleteVector :: new ( positional_deletes) ;
1740
1742
1741
1743
// using selected row groups 1 and 3
1742
1744
let result = ArrowReader :: build_deletes_row_selection (
1743
1745
& row_groups_metadata,
1744
1746
& selected_row_groups,
1745
- positional_deletes. clone ( ) ,
1747
+ & positional_deletes,
1746
1748
)
1747
1749
. unwrap ( ) ;
1748
1750
@@ -1766,7 +1768,7 @@ message schema {
1766
1768
let result = ArrowReader :: build_deletes_row_selection (
1767
1769
& row_groups_metadata,
1768
1770
& None ,
1769
- positional_deletes,
1771
+ & positional_deletes,
1770
1772
)
1771
1773
. unwrap ( ) ;
1772
1774
0 commit comments