@@ -685,15 +685,13 @@ impl LogicalPlan {
685
685
} ) )
686
686
}
687
687
LogicalPlan :: Union ( Union { inputs, schema } ) => {
688
- let input_schema = inputs[ 0 ] . schema ( ) ;
689
- // If inputs are not pruned do not change schema
690
- // TODO this seems wrong (shouldn't we always use the schema of the input?)
691
- let schema = if schema. fields ( ) . len ( ) == input_schema. fields ( ) . len ( ) {
692
- Arc :: clone ( & schema)
688
+ let first_input_schema = inputs[ 0 ] . schema ( ) ;
689
+ if schema. fields ( ) . len ( ) == first_input_schema. fields ( ) . len ( ) {
690
+ // If inputs are not pruned do not change schema
691
+ Ok ( LogicalPlan :: Union ( Union { inputs, schema } ) )
693
692
} else {
694
- Arc :: clone ( input_schema)
695
- } ;
696
- Ok ( LogicalPlan :: Union ( Union { inputs, schema } ) )
693
+ Ok ( LogicalPlan :: Union ( Union :: try_new ( inputs) ?) )
694
+ }
697
695
}
698
696
LogicalPlan :: Distinct ( distinct) => {
699
697
let distinct = match distinct {
@@ -2598,6 +2596,106 @@ pub struct Union {
2598
2596
pub schema : DFSchemaRef ,
2599
2597
}
2600
2598
2599
+ impl Union {
2600
+ /// Constructs new Union instance deriving schema from inputs.
2601
+ fn try_new ( inputs : Vec < Arc < LogicalPlan > > ) -> Result < Self > {
2602
+ let schema = Self :: derive_schema_from_inputs ( & inputs, false ) ?;
2603
+ Ok ( Union { inputs, schema } )
2604
+ }
2605
+
2606
+ /// Constructs new Union instance deriving schema from inputs.
2607
+ /// Inputs do not have to have matching types and produced schema will
2608
+ /// take type from the first input.
2609
+ pub fn try_new_with_loose_types ( inputs : Vec < Arc < LogicalPlan > > ) -> Result < Self > {
2610
+ let schema = Self :: derive_schema_from_inputs ( & inputs, true ) ?;
2611
+ Ok ( Union { inputs, schema } )
2612
+ }
2613
+
2614
+ /// Constructs new Union instance deriving schema from inputs.
2615
+ ///
2616
+ /// `loose_types` if true, inputs do not have to have matching types and produced schema will
2617
+ /// take type from the first input. TODO this is not necessarily reasonable behavior.
2618
+ fn derive_schema_from_inputs (
2619
+ inputs : & [ Arc < LogicalPlan > ] ,
2620
+ loose_types : bool ,
2621
+ ) -> Result < DFSchemaRef > {
2622
+ if inputs. len ( ) < 2 {
2623
+ return plan_err ! ( "UNION requires at least two inputs" ) ;
2624
+ }
2625
+ let first_schema = inputs[ 0 ] . schema ( ) ;
2626
+ let fields_count = first_schema. fields ( ) . len ( ) ;
2627
+ for input in inputs. iter ( ) . skip ( 1 ) {
2628
+ if fields_count != input. schema ( ) . fields ( ) . len ( ) {
2629
+ return plan_err ! (
2630
+ "UNION queries have different number of columns: \
2631
+ left has {} columns whereas right has {} columns",
2632
+ fields_count,
2633
+ input. schema( ) . fields( ) . len( )
2634
+ ) ;
2635
+ }
2636
+ }
2637
+
2638
+ let union_fields = ( 0 ..fields_count)
2639
+ . map ( |i| {
2640
+ let fields = inputs
2641
+ . iter ( )
2642
+ . map ( |input| input. schema ( ) . field ( i) )
2643
+ . collect :: < Vec < _ > > ( ) ;
2644
+ let first_field = fields[ 0 ] ;
2645
+ let name = first_field. name ( ) ;
2646
+ let data_type = if loose_types {
2647
+ // TODO apply type coercion here, or document why it's better to defer
2648
+ // temporarily use the data type from the left input and later rely on the analyzer to
2649
+ // coerce the two schemas into a common one.
2650
+ first_field. data_type ( )
2651
+ } else {
2652
+ fields. iter ( ) . skip ( 1 ) . try_fold (
2653
+ first_field. data_type ( ) ,
2654
+ |acc, field| {
2655
+ if acc != field. data_type ( ) {
2656
+ return plan_err ! (
2657
+ "UNION field {i} have different type in inputs: \
2658
+ left has {} whereas right has {}",
2659
+ first_field. data_type( ) ,
2660
+ field. data_type( )
2661
+ ) ;
2662
+ }
2663
+ Ok ( acc)
2664
+ } ,
2665
+ ) ?
2666
+ } ;
2667
+ let nullable = fields. iter ( ) . any ( |field| field. is_nullable ( ) ) ;
2668
+ let mut field = Field :: new ( name, data_type. clone ( ) , nullable) ;
2669
+ let field_metadata =
2670
+ intersect_maps ( fields. iter ( ) . map ( |field| field. metadata ( ) ) ) ;
2671
+ field. set_metadata ( field_metadata) ;
2672
+ // TODO reusing table reference from the first schema is probably wrong
2673
+ let table_reference = first_schema. qualified_field ( i) . 0 . cloned ( ) ;
2674
+ Ok ( ( table_reference, Arc :: new ( field) ) )
2675
+ } )
2676
+ . collect :: < Result < _ > > ( ) ?;
2677
+ let union_schema_metadata =
2678
+ intersect_maps ( inputs. iter ( ) . map ( |input| input. schema ( ) . metadata ( ) ) ) ;
2679
+
2680
+ // Functional Dependencies doesn't preserve after UNION operation
2681
+ let schema = DFSchema :: new_with_metadata ( union_fields, union_schema_metadata) ?;
2682
+ let schema = Arc :: new ( schema) ;
2683
+
2684
+ Ok ( schema)
2685
+ }
2686
+ }
2687
+
2688
/// Returns the entries that are present, with an identical value, in every
/// map yielded by `inputs`.
///
/// The first map is cloned as the starting point and each subsequent map
/// removes the entries it does not contain with the same value. An empty
/// `inputs` yields an empty map.
fn intersect_maps<'a>(
    inputs: impl IntoIterator<Item = &'a HashMap<String, String>>,
) -> HashMap<String, String> {
    let mut maps = inputs.into_iter();
    let seed = match maps.next() {
        Some(first) => first.clone(),
        None => HashMap::new(),
    };
    maps.fold(seed, |mut common, other| {
        // Keep a key only if `other` maps it to exactly the same value.
        common.retain(|key, value| other.get(key) == Some(&*value));
        common
    })
}
2698
+
2601
2699
// Manual implementation needed because of `schema` field. Comparison excludes this field.
2602
2700
impl PartialOrd for Union {
2603
2701
fn partial_cmp ( & self , other : & Self ) -> Option < Ordering > {
0 commit comments