@@ -33,6 +33,7 @@ use super::schema::{
33
33
decimal_length_from_precision,
34
34
} ;
35
35
36
+ use crate :: arrow:: levels:: calculate_array_levels;
36
37
use crate :: column:: writer:: ColumnWriter ;
37
38
use crate :: errors:: { ParquetError , Result } ;
38
39
use crate :: file:: properties:: WriterProperties ;
@@ -173,16 +174,15 @@ impl<W: Write> ArrowWriter<W> {
173
174
}
174
175
}
175
176
176
- let mut levels: Vec < _ > = arrays
177
+ let mut levels = arrays
177
178
. iter ( )
178
179
. map ( |array| {
179
- let batch_level = LevelInfo :: new ( 0 , array. len ( ) ) ;
180
- let mut levels = batch_level. calculate_array_levels ( array, field) ;
180
+ let mut levels = calculate_array_levels ( array, field) ?;
181
181
// Reverse levels as we pop() them when writing arrays
182
182
levels. reverse ( ) ;
183
- levels
183
+ Ok ( levels)
184
184
} )
185
- . collect ( ) ;
185
+ . collect :: < Result < Vec < _ > > > ( ) ? ;
186
186
187
187
write_leaves ( & mut row_group_writer, & arrays, & mut levels) ?;
188
188
}
@@ -341,26 +341,23 @@ fn write_leaf(
341
341
column : & ArrayRef ,
342
342
levels : LevelInfo ,
343
343
) -> Result < i64 > {
344
- let indices = levels. filter_array_indices ( ) ;
345
- // Slice array according to computed offset and length
346
- let column = column. slice ( levels. offset , levels. length ) ;
344
+ let indices = levels. non_null_indices ( ) ;
347
345
let written = match writer {
348
346
ColumnWriter :: Int32ColumnWriter ( ref mut typed) => {
349
347
let values = match column. data_type ( ) {
350
348
ArrowDataType :: Date64 => {
351
349
// If the column is a Date64, we cast it to a Date32, and then interpret that as Int32
352
350
let array = if let ArrowDataType :: Date64 = column. data_type ( ) {
353
- let array =
354
- arrow:: compute:: cast ( & column, & ArrowDataType :: Date32 ) ?;
351
+ let array = arrow:: compute:: cast ( column, & ArrowDataType :: Date32 ) ?;
355
352
arrow:: compute:: cast ( & array, & ArrowDataType :: Int32 ) ?
356
353
} else {
357
- arrow:: compute:: cast ( & column, & ArrowDataType :: Int32 ) ?
354
+ arrow:: compute:: cast ( column, & ArrowDataType :: Int32 ) ?
358
355
} ;
359
356
let array = array
360
357
. as_any ( )
361
358
. downcast_ref :: < arrow_array:: Int32Array > ( )
362
359
. expect ( "Unable to get int32 array" ) ;
363
- get_numeric_array_slice :: < Int32Type , _ > ( array, & indices)
360
+ get_numeric_array_slice :: < Int32Type , _ > ( array, indices)
364
361
}
365
362
ArrowDataType :: UInt32 => {
366
363
// follow C++ implementation and use overflow/reinterpret cast from u32 to i32 which will map
@@ -373,21 +370,21 @@ fn write_leaf(
373
370
array,
374
371
|x| x as i32 ,
375
372
) ;
376
- get_numeric_array_slice :: < Int32Type , _ > ( & array, & indices)
373
+ get_numeric_array_slice :: < Int32Type , _ > ( & array, indices)
377
374
}
378
375
_ => {
379
- let array = arrow:: compute:: cast ( & column, & ArrowDataType :: Int32 ) ?;
376
+ let array = arrow:: compute:: cast ( column, & ArrowDataType :: Int32 ) ?;
380
377
let array = array
381
378
. as_any ( )
382
379
. downcast_ref :: < arrow_array:: Int32Array > ( )
383
380
. expect ( "Unable to get i32 array" ) ;
384
- get_numeric_array_slice :: < Int32Type , _ > ( array, & indices)
381
+ get_numeric_array_slice :: < Int32Type , _ > ( array, indices)
385
382
}
386
383
} ;
387
384
typed. write_batch (
388
385
values. as_slice ( ) ,
389
- Some ( levels. definition . as_slice ( ) ) ,
390
- levels. repetition . as_deref ( ) ,
386
+ levels. def_levels ( ) ,
387
+ levels. rep_levels ( ) ,
391
388
) ?
392
389
}
393
390
ColumnWriter :: BoolColumnWriter ( ref mut typed) => {
@@ -396,9 +393,9 @@ fn write_leaf(
396
393
. downcast_ref :: < arrow_array:: BooleanArray > ( )
397
394
. expect ( "Unable to get boolean array" ) ;
398
395
typed. write_batch (
399
- get_bool_array_slice ( array, & indices) . as_slice ( ) ,
400
- Some ( levels. definition . as_slice ( ) ) ,
401
- levels. repetition . as_deref ( ) ,
396
+ get_bool_array_slice ( array, indices) . as_slice ( ) ,
397
+ levels. def_levels ( ) ,
398
+ levels. rep_levels ( ) ,
402
399
) ?
403
400
}
404
401
ColumnWriter :: Int64ColumnWriter ( ref mut typed) => {
@@ -408,7 +405,7 @@ fn write_leaf(
408
405
. as_any ( )
409
406
. downcast_ref :: < arrow_array:: Int64Array > ( )
410
407
. expect ( "Unable to get i64 array" ) ;
411
- get_numeric_array_slice :: < Int64Type , _ > ( array, & indices)
408
+ get_numeric_array_slice :: < Int64Type , _ > ( array, indices)
412
409
}
413
410
ArrowDataType :: UInt64 => {
414
411
// follow C++ implementation and use overflow/reinterpret cast from u64 to i64 which will map
@@ -421,21 +418,21 @@ fn write_leaf(
421
418
array,
422
419
|x| x as i64 ,
423
420
) ;
424
- get_numeric_array_slice :: < Int64Type , _ > ( & array, & indices)
421
+ get_numeric_array_slice :: < Int64Type , _ > ( & array, indices)
425
422
}
426
423
_ => {
427
- let array = arrow:: compute:: cast ( & column, & ArrowDataType :: Int64 ) ?;
424
+ let array = arrow:: compute:: cast ( column, & ArrowDataType :: Int64 ) ?;
428
425
let array = array
429
426
. as_any ( )
430
427
. downcast_ref :: < arrow_array:: Int64Array > ( )
431
428
. expect ( "Unable to get i64 array" ) ;
432
- get_numeric_array_slice :: < Int64Type , _ > ( array, & indices)
429
+ get_numeric_array_slice :: < Int64Type , _ > ( array, indices)
433
430
}
434
431
} ;
435
432
typed. write_batch (
436
433
values. as_slice ( ) ,
437
- Some ( levels. definition . as_slice ( ) ) ,
438
- levels. repetition . as_deref ( ) ,
434
+ levels. def_levels ( ) ,
435
+ levels. rep_levels ( ) ,
439
436
) ?
440
437
}
441
438
ColumnWriter :: Int96ColumnWriter ( ref mut _typed) => {
@@ -447,9 +444,9 @@ fn write_leaf(
447
444
. downcast_ref :: < arrow_array:: Float32Array > ( )
448
445
. expect ( "Unable to get Float32 array" ) ;
449
446
typed. write_batch (
450
- get_numeric_array_slice :: < FloatType , _ > ( array, & indices) . as_slice ( ) ,
451
- Some ( levels. definition . as_slice ( ) ) ,
452
- levels. repetition . as_deref ( ) ,
447
+ get_numeric_array_slice :: < FloatType , _ > ( array, indices) . as_slice ( ) ,
448
+ levels. def_levels ( ) ,
449
+ levels. rep_levels ( ) ,
453
450
) ?
454
451
}
455
452
ColumnWriter :: DoubleColumnWriter ( ref mut typed) => {
@@ -458,9 +455,9 @@ fn write_leaf(
458
455
. downcast_ref :: < arrow_array:: Float64Array > ( )
459
456
. expect ( "Unable to get Float64 array" ) ;
460
457
typed. write_batch (
461
- get_numeric_array_slice :: < DoubleType , _ > ( array, & indices) . as_slice ( ) ,
462
- Some ( levels. definition . as_slice ( ) ) ,
463
- levels. repetition . as_deref ( ) ,
458
+ get_numeric_array_slice :: < DoubleType , _ > ( array, indices) . as_slice ( ) ,
459
+ levels. def_levels ( ) ,
460
+ levels. rep_levels ( ) ,
464
461
) ?
465
462
}
466
463
ColumnWriter :: ByteArrayColumnWriter ( ref mut typed) => match column. data_type ( ) {
@@ -471,8 +468,8 @@ fn write_leaf(
471
468
. expect ( "Unable to get BinaryArray array" ) ;
472
469
typed. write_batch (
473
470
get_binary_array ( array) . as_slice ( ) ,
474
- Some ( levels. definition . as_slice ( ) ) ,
475
- levels. repetition . as_deref ( ) ,
471
+ levels. def_levels ( ) ,
472
+ levels. rep_levels ( ) ,
476
473
) ?
477
474
}
478
475
ArrowDataType :: Utf8 => {
@@ -482,8 +479,8 @@ fn write_leaf(
482
479
. expect ( "Unable to get LargeBinaryArray array" ) ;
483
480
typed. write_batch (
484
481
get_string_array ( array) . as_slice ( ) ,
485
- Some ( levels. definition . as_slice ( ) ) ,
486
- levels. repetition . as_deref ( ) ,
482
+ levels. def_levels ( ) ,
483
+ levels. rep_levels ( ) ,
487
484
) ?
488
485
}
489
486
ArrowDataType :: LargeBinary => {
@@ -493,8 +490,8 @@ fn write_leaf(
493
490
. expect ( "Unable to get LargeBinaryArray array" ) ;
494
491
typed. write_batch (
495
492
get_large_binary_array ( array) . as_slice ( ) ,
496
- Some ( levels. definition . as_slice ( ) ) ,
497
- levels. repetition . as_deref ( ) ,
493
+ levels. def_levels ( ) ,
494
+ levels. rep_levels ( ) ,
498
495
) ?
499
496
}
500
497
ArrowDataType :: LargeUtf8 => {
@@ -504,8 +501,8 @@ fn write_leaf(
504
501
. expect ( "Unable to get LargeUtf8 array" ) ;
505
502
typed. write_batch (
506
503
get_large_string_array ( array) . as_slice ( ) ,
507
- Some ( levels. definition . as_slice ( ) ) ,
508
- levels. repetition . as_deref ( ) ,
504
+ levels. def_levels ( ) ,
505
+ levels. rep_levels ( ) ,
509
506
) ?
510
507
}
511
508
_ => unreachable ! ( "Currently unreachable because data type not supported" ) ,
@@ -518,14 +515,14 @@ fn write_leaf(
518
515
. as_any ( )
519
516
. downcast_ref :: < arrow_array:: IntervalYearMonthArray > ( )
520
517
. unwrap ( ) ;
521
- get_interval_ym_array_slice ( array, & indices)
518
+ get_interval_ym_array_slice ( array, indices)
522
519
}
523
520
IntervalUnit :: DayTime => {
524
521
let array = column
525
522
. as_any ( )
526
523
. downcast_ref :: < arrow_array:: IntervalDayTimeArray > ( )
527
524
. unwrap ( ) ;
528
- get_interval_dt_array_slice ( array, & indices)
525
+ get_interval_dt_array_slice ( array, indices)
529
526
}
530
527
_ => {
531
528
return Err ( ParquetError :: NYI (
@@ -541,14 +538,14 @@ fn write_leaf(
541
538
. as_any ( )
542
539
. downcast_ref :: < arrow_array:: FixedSizeBinaryArray > ( )
543
540
. unwrap ( ) ;
544
- get_fsb_array_slice ( array, & indices)
541
+ get_fsb_array_slice ( array, indices)
545
542
}
546
543
ArrowDataType :: Decimal ( _, _) => {
547
544
let array = column
548
545
. as_any ( )
549
546
. downcast_ref :: < arrow_array:: DecimalArray > ( )
550
547
. unwrap ( ) ;
551
- get_decimal_array_slice ( array, & indices)
548
+ get_decimal_array_slice ( array, indices)
552
549
}
553
550
_ => {
554
551
return Err ( ParquetError :: NYI (
@@ -559,8 +556,8 @@ fn write_leaf(
559
556
} ;
560
557
typed. write_batch (
561
558
bytes. as_slice ( ) ,
562
- Some ( levels. definition . as_slice ( ) ) ,
563
- levels. repetition . as_deref ( ) ,
559
+ levels. def_levels ( ) ,
560
+ levels. rep_levels ( ) ,
564
561
) ?
565
562
}
566
563
} ;
@@ -593,6 +590,7 @@ macro_rules! def_get_binary_array_fn {
593
590
} ;
594
591
}
595
592
593
+ // TODO: These methods don't handle non null indices correctly (#1753)
596
594
def_get_binary_array_fn ! ( get_binary_array, arrow_array:: BinaryArray ) ;
597
595
def_get_binary_array_fn ! ( get_string_array, arrow_array:: StringArray ) ;
598
596
def_get_binary_array_fn ! ( get_large_binary_array, arrow_array:: LargeBinaryArray ) ;
0 commit comments