@@ -30,9 +30,9 @@ pub use self::csv::CsvExec;
30
30
pub ( crate ) use self :: parquet:: plan_to_parquet;
31
31
pub use self :: parquet:: { ParquetExec , ParquetFileMetrics , ParquetFileReaderFactory } ;
32
32
use arrow:: {
33
- array:: { ArrayData , ArrayRef , DictionaryArray } ,
33
+ array:: { ArrayData , ArrayRef , BufferBuilder , DictionaryArray } ,
34
34
buffer:: Buffer ,
35
- datatypes:: { DataType , Field , Schema , SchemaRef , UInt16Type } ,
35
+ datatypes:: { ArrowNativeType , DataType , Field , Schema , SchemaRef , UInt16Type } ,
36
36
record_batch:: RecordBatch ,
37
37
} ;
38
38
pub use avro:: AvroExec ;
@@ -49,25 +49,21 @@ use crate::{
49
49
error:: { DataFusionError , Result } ,
50
50
scalar:: ScalarValue ,
51
51
} ;
52
- use arrow:: array:: { new_null_array, UInt16BufferBuilder } ;
52
+ use arrow:: array:: new_null_array;
53
53
use arrow:: record_batch:: RecordBatchOptions ;
54
54
use log:: { debug, info} ;
55
55
use object_store:: path:: Path ;
56
56
use object_store:: ObjectMeta ;
57
57
use std:: {
58
58
collections:: HashMap ,
59
59
fmt:: { Display , Formatter , Result as FmtResult } ,
60
+ marker:: PhantomData ,
60
61
sync:: Arc ,
61
62
vec,
62
63
} ;
63
64
64
65
use super :: { ColumnStatistics , Statistics } ;
65
66
66
- /// Convert logical type of partition column to physical type: `Dictionary(UInt16, val_type)`
67
- pub fn partition_type_wrap ( val_type : DataType ) -> DataType {
68
- DataType :: Dictionary ( Box :: new ( DataType :: UInt16 ) , Box :: new ( val_type) )
69
- }
70
-
71
67
/// The base configurations to provide when creating a physical plan for
72
68
/// any given file format.
73
69
#[ derive( Debug , Clone ) ]
@@ -346,7 +342,7 @@ struct PartitionColumnProjector {
346
342
/// An Arrow buffer initialized to zeros that represents the key array of all partition
347
343
/// columns (partition columns are materialized by dictionary arrays with only one
348
344
/// value in the dictionary, thus all the keys are equal to zero).
349
- key_buffer_cache : Option < Buffer > ,
345
+ key_buffer_cache : ZeroBufferGenerators ,
350
346
/// Mapping between the indexes in the list of partition columns and the target
351
347
/// schema. Sorted by index in the target schema so that we can iterate on it to
352
348
/// insert the partition columns in the target record batch.
@@ -372,7 +368,7 @@ impl PartitionColumnProjector {
372
368
373
369
Self {
374
370
projected_partition_indexes,
375
- key_buffer_cache : None ,
371
+ key_buffer_cache : Default :: default ( ) ,
376
372
projected_schema,
377
373
}
378
374
}
@@ -400,7 +396,7 @@ impl PartitionColumnProjector {
400
396
for & ( pidx, sidx) in & self . projected_partition_indexes {
401
397
cols. insert (
402
398
sidx,
403
- create_dict_array (
399
+ create_output_array (
404
400
& mut self . key_buffer_cache ,
405
401
& partition_values[ pidx] ,
406
402
file_batch. num_rows ( ) ,
@@ -411,26 +407,60 @@ impl PartitionColumnProjector {
411
407
}
412
408
}
413
409
414
- fn create_dict_array (
415
- key_buffer_cache : & mut Option < Buffer > ,
416
- val : & ScalarValue ,
417
- len : usize ,
418
- ) -> ArrayRef {
419
- // build value dictionary
420
- let dict_vals = val. to_array ( ) ;
421
-
422
- // build keys array
423
- let sliced_key_buffer = match key_buffer_cache {
424
- Some ( buf) if buf. len ( ) >= len * 2 => buf. slice ( buf. len ( ) - len * 2 ) ,
425
- _ => {
426
- let mut key_buffer_builder = UInt16BufferBuilder :: new ( len * 2 ) ;
427
- key_buffer_builder. advance ( len * 2 ) ; // keys are all 0
428
- key_buffer_cache. insert ( key_buffer_builder. finish ( ) ) . clone ( )
410
+ #[ derive( Debug , Default ) ]
411
+ struct ZeroBufferGenerators {
412
+ gen_i8 : ZeroBufferGenerator < i8 > ,
413
+ gen_i16 : ZeroBufferGenerator < i16 > ,
414
+ gen_i32 : ZeroBufferGenerator < i32 > ,
415
+ gen_i64 : ZeroBufferGenerator < i64 > ,
416
+ gen_u8 : ZeroBufferGenerator < u8 > ,
417
+ gen_u16 : ZeroBufferGenerator < u16 > ,
418
+ gen_u32 : ZeroBufferGenerator < u32 > ,
419
+ gen_u64 : ZeroBufferGenerator < u64 > ,
420
+ }
421
+
422
+ /// Generate a arrow [`Buffer`] that contains zero values.
423
+ #[ derive( Debug , Default ) ]
424
+ struct ZeroBufferGenerator < T >
425
+ where
426
+ T : ArrowNativeType ,
427
+ {
428
+ cache : Option < Buffer > ,
429
+ _t : PhantomData < T > ,
430
+ }
431
+
432
+ impl < T > ZeroBufferGenerator < T >
433
+ where
434
+ T : ArrowNativeType ,
435
+ {
436
+ const SIZE : usize = std:: mem:: size_of :: < T > ( ) ;
437
+
438
+ fn get_buffer ( & mut self , n_vals : usize ) -> Buffer {
439
+ match & mut self . cache {
440
+ Some ( buf) if buf. len ( ) >= n_vals * Self :: SIZE => {
441
+ buf. slice_with_length ( 0 , n_vals * Self :: SIZE )
442
+ }
443
+ _ => {
444
+ let mut key_buffer_builder = BufferBuilder :: < T > :: new ( n_vals) ;
445
+ key_buffer_builder. advance ( n_vals) ; // keys are all 0
446
+ self . cache . insert ( key_buffer_builder. finish ( ) ) . clone ( )
447
+ }
429
448
}
430
- } ;
449
+ }
450
+ }
431
451
432
- // create data type
433
- let data_type = partition_type_wrap ( val. get_datatype ( ) ) ;
452
+ fn create_dict_array < T > (
453
+ buffer_gen : & mut ZeroBufferGenerator < T > ,
454
+ dict_val : & ScalarValue ,
455
+ len : usize ,
456
+ data_type : DataType ,
457
+ ) -> ArrayRef
458
+ where
459
+ T : ArrowNativeType ,
460
+ {
461
+ let dict_vals = dict_val. to_array ( ) ;
462
+
463
+ let sliced_key_buffer = buffer_gen. get_buffer ( len) ;
434
464
435
465
// assemble pieces together
436
466
let mut builder = ArrayData :: builder ( data_type)
@@ -442,6 +472,84 @@ fn create_dict_array(
442
472
) )
443
473
}
444
474
475
+ fn create_output_array (
476
+ key_buffer_cache : & mut ZeroBufferGenerators ,
477
+ val : & ScalarValue ,
478
+ len : usize ,
479
+ ) -> ArrayRef {
480
+ if let ScalarValue :: Dictionary ( key_type, dict_val) = & val {
481
+ match key_type. as_ref ( ) {
482
+ DataType :: Int8 => {
483
+ return create_dict_array (
484
+ & mut key_buffer_cache. gen_i8 ,
485
+ dict_val,
486
+ len,
487
+ val. get_datatype ( ) ,
488
+ ) ;
489
+ }
490
+ DataType :: Int16 => {
491
+ return create_dict_array (
492
+ & mut key_buffer_cache. gen_i16 ,
493
+ dict_val,
494
+ len,
495
+ val. get_datatype ( ) ,
496
+ ) ;
497
+ }
498
+ DataType :: Int32 => {
499
+ return create_dict_array (
500
+ & mut key_buffer_cache. gen_i32 ,
501
+ dict_val,
502
+ len,
503
+ val. get_datatype ( ) ,
504
+ ) ;
505
+ }
506
+ DataType :: Int64 => {
507
+ return create_dict_array (
508
+ & mut key_buffer_cache. gen_i64 ,
509
+ dict_val,
510
+ len,
511
+ val. get_datatype ( ) ,
512
+ ) ;
513
+ }
514
+ DataType :: UInt8 => {
515
+ return create_dict_array (
516
+ & mut key_buffer_cache. gen_u8 ,
517
+ dict_val,
518
+ len,
519
+ val. get_datatype ( ) ,
520
+ ) ;
521
+ }
522
+ DataType :: UInt16 => {
523
+ return create_dict_array (
524
+ & mut key_buffer_cache. gen_u16 ,
525
+ dict_val,
526
+ len,
527
+ val. get_datatype ( ) ,
528
+ ) ;
529
+ }
530
+ DataType :: UInt32 => {
531
+ return create_dict_array (
532
+ & mut key_buffer_cache. gen_u32 ,
533
+ dict_val,
534
+ len,
535
+ val. get_datatype ( ) ,
536
+ ) ;
537
+ }
538
+ DataType :: UInt64 => {
539
+ return create_dict_array (
540
+ & mut key_buffer_cache. gen_u64 ,
541
+ dict_val,
542
+ len,
543
+ val. get_datatype ( ) ,
544
+ ) ;
545
+ }
546
+ _ => { }
547
+ }
548
+ }
549
+
550
+ val. to_array_of_size ( len)
551
+ }
552
+
445
553
/// A single file or part of a file that should be read, along with its schema, statistics
446
554
pub struct FileMeta {
447
555
/// Path for the file (e.g. URL, filesystem path, etc)
@@ -670,9 +778,9 @@ mod tests {
670
778
// file_batch is ok here because we kept all the file cols in the projection
671
779
file_batch,
672
780
& [
673
- ScalarValue :: Utf8 ( Some ( "2021" . to_owned ( ) ) ) ,
674
- ScalarValue :: Utf8 ( Some ( "10" . to_owned ( ) ) ) ,
675
- ScalarValue :: Utf8 ( Some ( "26" . to_owned ( ) ) ) ,
781
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "2021" . to_owned ( ) ) ) ) ,
782
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "10" . to_owned ( ) ) ) ) ,
783
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "26" . to_owned ( ) ) ) ) ,
676
784
] ,
677
785
)
678
786
. expect ( "Projection of partition columns into record batch failed" ) ;
@@ -698,9 +806,9 @@ mod tests {
698
806
// file_batch is ok here because we kept all the file cols in the projection
699
807
file_batch,
700
808
& [
701
- ScalarValue :: Utf8 ( Some ( "2021" . to_owned ( ) ) ) ,
702
- ScalarValue :: Utf8 ( Some ( "10" . to_owned ( ) ) ) ,
703
- ScalarValue :: Utf8 ( Some ( "27" . to_owned ( ) ) ) ,
809
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "2021" . to_owned ( ) ) ) ) ,
810
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "10" . to_owned ( ) ) ) ) ,
811
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "27" . to_owned ( ) ) ) ) ,
704
812
] ,
705
813
)
706
814
. expect ( "Projection of partition columns into record batch failed" ) ;
@@ -728,9 +836,9 @@ mod tests {
728
836
// file_batch is ok here because we kept all the file cols in the projection
729
837
file_batch,
730
838
& [
731
- ScalarValue :: Utf8 ( Some ( "2021" . to_owned ( ) ) ) ,
732
- ScalarValue :: Utf8 ( Some ( "10" . to_owned ( ) ) ) ,
733
- ScalarValue :: Utf8 ( Some ( "28" . to_owned ( ) ) ) ,
839
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "2021" . to_owned ( ) ) ) ) ,
840
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "10" . to_owned ( ) ) ) ) ,
841
+ partition_value_wrap ( ScalarValue :: Utf8 ( Some ( "28" . to_owned ( ) ) ) ) ,
734
842
] ,
735
843
)
736
844
. expect ( "Projection of partition columns into record batch failed" ) ;
@@ -862,4 +970,13 @@ mod tests {
862
970
extensions : None ,
863
971
}
864
972
}
973
+
974
+ /// Convert logical type of partition column to physical type: `Dictionary(UInt16, val_type)`
975
+ fn partition_type_wrap ( val_type : DataType ) -> DataType {
976
+ DataType :: Dictionary ( Box :: new ( DataType :: UInt16 ) , Box :: new ( val_type) )
977
+ }
978
+
979
+ fn partition_value_wrap ( val : ScalarValue ) -> ScalarValue {
980
+ ScalarValue :: Dictionary ( Box :: new ( DataType :: UInt16 ) , Box :: new ( val) )
981
+ }
865
982
}
0 commit comments