19
19
20
20
// TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
21
21
22
- use arrow:: array:: builder:: FixedSizeBinaryBuilder ;
22
+ use arrow:: array:: {
23
+ BooleanBuilder , FixedSizeBinaryBuilder , LargeStringBuilder , StringBuilder ,
24
+ } ;
23
25
use arrow:: datatypes:: i256;
24
26
use arrow:: { array:: ArrayRef , datatypes:: DataType } ;
25
27
use arrow_array:: {
26
28
new_empty_array, new_null_array, BinaryArray , BooleanArray , Date32Array , Date64Array ,
27
- Decimal128Array , Decimal256Array , FixedSizeBinaryArray , Float16Array , Float32Array ,
28
- Float64Array , Int16Array , Int32Array , Int64Array , Int8Array , LargeBinaryArray ,
29
- LargeStringArray , StringArray , Time32MillisecondArray , Time32SecondArray ,
30
- Time64MicrosecondArray , Time64NanosecondArray , TimestampMicrosecondArray ,
31
- TimestampMillisecondArray , TimestampNanosecondArray , TimestampSecondArray ,
32
- UInt16Array , UInt32Array , UInt64Array , UInt8Array ,
29
+ Decimal128Array , Decimal256Array , Float16Array , Float32Array , Float64Array ,
30
+ Int16Array , Int32Array , Int64Array , Int8Array , LargeBinaryArray ,
31
+ Time32MillisecondArray , Time32SecondArray , Time64MicrosecondArray ,
32
+ Time64NanosecondArray , TimestampMicrosecondArray , TimestampMillisecondArray ,
33
+ TimestampNanosecondArray , TimestampSecondArray , UInt16Array , UInt32Array ,
34
+ UInt64Array , UInt8Array ,
33
35
} ;
34
36
use arrow_schema:: { Field , FieldRef , Schema , TimeUnit } ;
35
37
use datafusion_common:: { internal_datafusion_err, internal_err, plan_err, Result } ;
@@ -393,51 +395,73 @@ macro_rules! get_statistics {
393
395
} )
394
396
} ,
395
397
DataType :: Binary => Ok ( Arc :: new( BinaryArray :: from_iter(
396
- [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) . map ( |x| x . map ( |x| x . to_vec ( ) ) ) ,
398
+ [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator)
397
399
) ) ) ,
398
400
DataType :: LargeBinary => Ok ( Arc :: new( LargeBinaryArray :: from_iter(
399
- [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) . map( |x| x. map( |x|x. to_vec( ) ) ) ,
400
- ) ) ) ,
401
- DataType :: Utf8 => Ok ( Arc :: new( StringArray :: from_iter(
402
- [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) . map( |x| {
403
- x. and_then( |x| {
404
- let res = std:: str :: from_utf8( x) . map( |s| s. to_string( ) ) . ok( ) ;
405
- if res. is_none( ) {
406
- log:: debug!( "Utf8 statistics is a non-UTF8 value, ignoring it." ) ;
407
- }
408
- res
409
- } )
410
- } ) ,
401
+ [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator)
411
402
) ) ) ,
403
+ DataType :: Utf8 => {
404
+ let iterator = [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) ;
405
+ let mut builder = StringBuilder :: new( ) ;
406
+ for x in iterator {
407
+ let Some ( x) = x else {
408
+ builder. append_null( ) ; // no statistics value
409
+ continue ;
410
+ } ;
411
+
412
+ let Ok ( x) = std:: str :: from_utf8( x) else {
413
+ log:: debug!( "Utf8 statistics is a non-UTF8 value, ignoring it." ) ;
414
+ builder. append_null( ) ;
415
+ continue ;
416
+ } ;
417
+
418
+ builder. append_value( x) ;
419
+ }
420
+ Ok ( Arc :: new( builder. finish( ) ) )
421
+ } ,
412
422
DataType :: LargeUtf8 => {
413
- Ok ( Arc :: new( LargeStringArray :: from_iter(
414
- [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) . map( |x| {
415
- x. and_then( |x| {
416
- let res = std:: str :: from_utf8( x) . map( |s| s. to_string( ) ) . ok( ) ;
417
- if res. is_none( ) {
418
- log:: debug!( "LargeUtf8 statistics is a non-UTF8 value, ignoring it." ) ;
419
- }
420
- res
421
- } )
422
- } ) ,
423
- ) ) )
424
- }
425
- DataType :: FixedSizeBinary ( size) => Ok ( Arc :: new( FixedSizeBinaryArray :: from(
426
- [ <$stat_type_prefix FixedLenByteArrayStatsIterator >] :: new( $iterator) . map( |x| {
427
- x. and_then( |x| {
428
- if x. len( ) . try_into( ) == Ok ( * size) {
429
- Some ( x)
430
- } else {
431
- log:: debug!(
432
- "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it." ,
433
- size,
434
- x. len( ) ,
435
- ) ;
436
- None
437
- }
438
- } )
439
- } ) . collect:: <Vec <_>>( ) ,
440
- ) ) ) ,
423
+ let iterator = [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) ;
424
+ let mut builder = LargeStringBuilder :: new( ) ;
425
+ for x in iterator {
426
+ let Some ( x) = x else {
427
+ builder. append_null( ) ; // no statistics value
428
+ continue ;
429
+ } ;
430
+
431
+ let Ok ( x) = std:: str :: from_utf8( x) else {
432
+ log:: debug!( "Utf8 statistics is a non-UTF8 value, ignoring it." ) ;
433
+ builder. append_null( ) ;
434
+ continue ;
435
+ } ;
436
+
437
+ builder. append_value( x) ;
438
+ }
439
+ Ok ( Arc :: new( builder. finish( ) ) )
440
+ } ,
441
+ DataType :: FixedSizeBinary ( size) => {
442
+ let iterator = [ <$stat_type_prefix FixedLenByteArrayStatsIterator >] :: new( $iterator) ;
443
+ let mut builder = FixedSizeBinaryBuilder :: new( * size) ;
444
+ for x in iterator {
445
+ let Some ( x) = x else {
446
+ builder. append_null( ) ; // no statistics value
447
+ continue ;
448
+ } ;
449
+
450
+ // ignore invalid values
451
+ if x. len( ) . try_into( ) != Ok ( * size) {
452
+ log:: debug!(
453
+ "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it." ,
454
+ size,
455
+ x. len( ) ,
456
+ ) ;
457
+ builder. append_null( ) ;
458
+ continue ;
459
+ }
460
+
461
+ builder. append_value( x) . expect( "ensure to append successfully here, because size have been checked before" ) ;
462
+ }
463
+ Ok ( Arc :: new( builder. finish( ) ) )
464
+ } ,
441
465
DataType :: Decimal128 ( precision, scale) => {
442
466
let arr = Decimal128Array :: from_iter(
443
467
[ <$stat_type_prefix Decimal128StatsIterator >] :: new( $iterator)
@@ -740,15 +764,20 @@ macro_rules! get_data_page_statistics {
740
764
( $stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
741
765
paste! {
742
766
match $data_type {
743
- Some ( DataType :: Boolean ) => Ok ( Arc :: new(
744
- BooleanArray :: from_iter(
745
- [ <$stat_type_prefix BooleanDataPageStatsIterator >] :: new( $iterator)
746
- . flatten( )
747
- // BooleanArray::from_iter required a sized iterator, so collect into Vec first
748
- . collect:: <Vec <_>>( )
749
- . into_iter( )
750
- )
751
- ) ) ,
767
+ Some ( DataType :: Boolean ) => {
768
+ let iterator = [ <$stat_type_prefix BooleanDataPageStatsIterator >] :: new( $iterator) ;
769
+ let mut builder = BooleanBuilder :: new( ) ;
770
+ for x in iterator {
771
+ for x in x. into_iter( ) {
772
+ let Some ( x) = x else {
773
+ builder. append_null( ) ; // no statistics value
774
+ continue ;
775
+ } ;
776
+ builder. append_value( x) ;
777
+ }
778
+ }
779
+ Ok ( Arc :: new( builder. finish( ) ) )
780
+ } ,
752
781
Some ( DataType :: UInt8 ) => Ok ( Arc :: new(
753
782
UInt8Array :: from_iter(
754
783
[ <$stat_type_prefix Int32DataPageStatsIterator >] :: new( $iterator)
@@ -830,32 +859,48 @@ macro_rules! get_data_page_statistics {
830
859
Some ( DataType :: Float64 ) => Ok ( Arc :: new( Float64Array :: from_iter( [ <$stat_type_prefix Float64DataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
831
860
Some ( DataType :: Binary ) => Ok ( Arc :: new( BinaryArray :: from_iter( [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
832
861
Some ( DataType :: LargeBinary ) => Ok ( Arc :: new( LargeBinaryArray :: from_iter( [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
833
- Some ( DataType :: Utf8 ) => Ok ( Arc :: new( StringArray :: from(
834
- [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . map( |x| {
835
- x. into_iter( ) . map( |x| {
836
- x. and_then( |x| {
837
- let res = std:: str :: from_utf8( x. data( ) ) . map( |s| s. to_string( ) ) . ok( ) ;
838
- if res. is_none( ) {
862
+ Some ( DataType :: Utf8 ) => {
863
+ let mut builder = StringBuilder :: new( ) ;
864
+ let iterator = [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) ;
865
+ for x in iterator {
866
+ for x in x. into_iter( ) {
867
+ let Some ( x) = x else {
868
+ builder. append_null( ) ; // no statistics value
869
+ continue ;
870
+ } ;
871
+
872
+ let Ok ( x) = std:: str :: from_utf8( x. data( ) ) else {
839
873
log:: debug!( "Utf8 statistics is a non-UTF8 value, ignoring it." ) ;
840
- }
841
- res
842
- } )
843
- } )
844
- } ) . flatten( ) . collect:: <Vec <_>>( ) ,
845
- ) ) ) ,
846
- Some ( DataType :: LargeUtf8 ) => Ok ( Arc :: new( LargeStringArray :: from(
847
- [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . map( |x| {
848
- x. into_iter( ) . map( |x| {
849
- x. and_then( |x| {
850
- let res = std:: str :: from_utf8( x. data( ) ) . map( |s| s. to_string( ) ) . ok( ) ;
851
- if res. is_none( ) {
852
- log:: debug!( "LargeUtf8 statistics is a non-UTF8 value, ignoring it." ) ;
853
- }
854
- res
855
- } )
856
- } )
857
- } ) . flatten( ) . collect:: <Vec <_>>( ) ,
858
- ) ) ) ,
874
+ builder. append_null( ) ;
875
+ continue ;
876
+ } ;
877
+
878
+ builder. append_value( x) ;
879
+ }
880
+ }
881
+ Ok ( Arc :: new( builder. finish( ) ) )
882
+ } ,
883
+ Some ( DataType :: LargeUtf8 ) => {
884
+ let mut builder = LargeStringBuilder :: new( ) ;
885
+ let iterator = [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) ;
886
+ for x in iterator {
887
+ for x in x. into_iter( ) {
888
+ let Some ( x) = x else {
889
+ builder. append_null( ) ; // no statistics value
890
+ continue ;
891
+ } ;
892
+
893
+ let Ok ( x) = std:: str :: from_utf8( x. data( ) ) else {
894
+ log:: debug!( "LargeUtf8 statistics is a non-UTF8 value, ignoring it." ) ;
895
+ builder. append_null( ) ;
896
+ continue ;
897
+ } ;
898
+
899
+ builder. append_value( x) ;
900
+ }
901
+ }
902
+ Ok ( Arc :: new( builder. finish( ) ) )
903
+ } ,
859
904
Some ( DataType :: Dictionary ( _, value_type) ) => {
860
905
[ <$stat_type_prefix: lower _ page_statistics>] ( Some ( value_type) , $iterator)
861
906
} ,
@@ -871,14 +916,14 @@ macro_rules! get_data_page_statistics {
871
916
Some ( DataType :: Date32 ) => Ok ( Arc :: new( Date32Array :: from_iter( [ <$stat_type_prefix Int32DataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
872
917
Some ( DataType :: Date64 ) => Ok (
873
918
Arc :: new(
874
- Date64Array :: from ( [ <$stat_type_prefix Int32DataPageStatsIterator >] :: new( $iterator)
919
+ Date64Array :: from_iter ( [ <$stat_type_prefix Int32DataPageStatsIterator >] :: new( $iterator)
875
920
. map( |x| {
876
921
x. into_iter( )
877
922
. map( |x| {
878
923
x. and_then( |x| i64 :: try_from( x) . ok( ) )
879
- . map( |x| x * 24 * 60 * 60 * 1000 )
880
924
} )
881
- } ) . flatten( ) . collect:: <Vec <_>>( )
925
+ . map( |x| x. map( |x| x * 24 * 60 * 60 * 1000 ) )
926
+ } ) . flatten( )
882
927
)
883
928
)
884
929
) ,
0 commit comments