@@ -33,7 +33,7 @@ use arrow_array::{
33
33
use arrow_schema:: { Field , FieldRef , Schema , TimeUnit } ;
34
34
use datafusion_common:: { internal_datafusion_err, internal_err, plan_err, Result } ;
35
35
use half:: f16;
36
- use parquet:: data_type:: FixedLenByteArray ;
36
+ use parquet:: data_type:: { ByteArray , FixedLenByteArray } ;
37
37
use parquet:: file:: metadata:: { ParquetColumnIndex , ParquetOffsetIndex , RowGroupMetaData } ;
38
38
use parquet:: file:: page_index:: index:: { Index , PageIndex } ;
39
39
use parquet:: file:: statistics:: Statistics as ParquetStatistics ;
@@ -600,6 +600,18 @@ make_data_page_stats_iterator!(
600
600
Index :: DOUBLE ,
601
601
f64
602
602
) ;
603
// Invokes `make_data_page_stats_iterator!` to declare `MinByteArrayDataPageStatsIterator`.
// Arguments, in order: the generated type's name, a closure that clones the `min`
// field of a `PageIndex<ByteArray>`, the `Index::BYTE_ARRAY` variant, and the
// `ByteArray` value type.
// NOTE(review): the macro definition is not visible in this hunk; presumably the
// generated iterator yields per-page minimum values the same way as the sibling
// invocations (e.g. the `Index::DOUBLE` / `f64` one just above) — confirm.
+ make_data_page_stats_iterator ! (
604
+ MinByteArrayDataPageStatsIterator ,
605
+ |x: & PageIndex <ByteArray >| { x. min. clone( ) } ,
606
+ Index :: BYTE_ARRAY ,
607
+ ByteArray
608
+ ) ;
609
// Invokes `make_data_page_stats_iterator!` to declare `MaxByteArrayDataPageStatsIterator`.
// Arguments, in order: the generated type's name, a closure that clones the `max`
// field of a `PageIndex<ByteArray>`, the `Index::BYTE_ARRAY` variant, and the
// `ByteArray` value type.
// NOTE(review): the macro definition is not visible in this hunk; presumably the
// generated iterator yields per-page maximum values — confirm against the macro body.
+ make_data_page_stats_iterator ! (
610
+ MaxByteArrayDataPageStatsIterator ,
611
+ |x: & PageIndex <ByteArray >| { x. max. clone( ) } ,
612
+ Index :: BYTE_ARRAY ,
613
+ ByteArray
614
+ ) ;
603
615
macro_rules! get_data_page_statistics {
604
616
( $stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
605
617
paste! {
@@ -692,6 +704,34 @@ macro_rules! get_data_page_statistics {
692
704
) ) ,
693
705
Some ( DataType :: Float32 ) => Ok ( Arc :: new( Float32Array :: from_iter( [ <$stat_type_prefix Float32DataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
694
706
Some ( DataType :: Float64 ) => Ok ( Arc :: new( Float64Array :: from_iter( [ <$stat_type_prefix Float64DataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
707
+ Some ( DataType :: Binary ) => Ok ( Arc :: new( BinaryArray :: from_iter( [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
708
+ Some ( DataType :: LargeBinary ) => Ok ( Arc :: new( LargeBinaryArray :: from_iter( [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . flatten( ) ) ) ) ,
709
+ Some ( DataType :: Utf8 ) => Ok ( Arc :: new( StringArray :: from(
710
+ [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . map( |x| {
711
+ x. into_iter( ) . filter_map( |x| {
712
+ x. and_then( |x| {
713
+ let res = std:: str :: from_utf8( x. data( ) ) . map( |s| s. to_string( ) ) . ok( ) ;
714
+ if res. is_none( ) {
715
+ log:: debug!( "Utf8 statistics is a non-UTF8 value, ignoring it." ) ;
716
+ }
717
+ res
718
+ } )
719
+ } )
720
+ } ) . flatten( ) . collect:: <Vec <_>>( ) ,
721
+ ) ) ) ,
722
+ Some ( DataType :: LargeUtf8 ) => Ok ( Arc :: new( LargeStringArray :: from(
723
+ [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) . map( |x| {
724
+ x. into_iter( ) . filter_map( |x| {
725
+ x. and_then( |x| {
726
+ let res = std:: str :: from_utf8( x. data( ) ) . map( |s| s. to_string( ) ) . ok( ) ;
727
+ if res. is_none( ) {
728
+ log:: debug!( "LargeUtf8 statistics is a non-UTF8 value, ignoring it." ) ;
729
+ }
730
+ res
731
+ } )
732
+ } )
733
+ } ) . flatten( ) . collect:: <Vec <_>>( ) ,
734
+ ) ) ) ,
695
735
Some ( DataType :: Timestamp ( unit, timezone) ) => {
696
736
let iter = [ <$stat_type_prefix Int64DataPageStatsIterator >] :: new( $iterator) . flatten( ) ;
697
737
Ok ( match unit {
@@ -831,6 +871,11 @@ where
831
871
. iter ( )
832
872
. map ( |x| x. null_count . map ( |x| x as u64 ) )
833
873
. collect :: < Vec < _ > > ( ) ,
874
+ Index :: BYTE_ARRAY ( native_index) => native_index
875
+ . indexes
876
+ . iter ( )
877
+ . map ( |x| x. null_count . map ( |x| x as u64 ) )
878
+ . collect :: < Vec < _ > > ( ) ,
834
879
_ => unimplemented ! ( ) ,
835
880
} ) ;
836
881
0 commit comments