Skip to content

Commit 5b67d3f

Browse files
findepifelipecrv
authored andcommitted
[PR SNAPSHOT] Fix UNION field nullability tracking
This cherry pick a snapshot of a PR offered upstream. ------------------------------------------------------ This commit fixes two bugs related to UNION handling - when constructing union plan nullability of the other union branch was ignored, thus resulting field could easily have incorrect nullability - when pruning/simplifying projects, in `recompute_schema` function there was similar logic, thus loosing nullability information even for correctly constructed Union plan node As a result, other optimizer logic (e.g. `expr_simplifier.rs`) could draw incorrect conclusions and thus lead to incorrect query results, as demonstrated with the attached SLT test.
1 parent 05c0347 commit 5b67d3f

File tree

3 files changed

+126
-31
lines changed

3 files changed

+126
-31
lines changed

datafusion/expr/src/logical_plan/builder.rs

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,7 @@ use datafusion_common::file_options::file_type::FileType;
5454
use datafusion_common::{
5555
exec_err, get_target_functional_dependencies, internal_err, not_impl_err,
5656
plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError,
57-
FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema,
58-
UnnestOptions,
57+
Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions,
5958
};
6059
use datafusion_expr_common::type_coercion::binary::type_union_resolution;
6160

@@ -1518,27 +1517,10 @@ pub fn validate_unique_names<'a>(
15181517
/// [`TypeCoercionRewriter::coerce_union`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/struct.TypeCoercionRewriter.html#method.coerce_union
15191518
/// [`coerce_union_schema`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/fn.coerce_union_schema.html
15201519
pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result<LogicalPlan> {
1521-
if left_plan.schema().fields().len() != right_plan.schema().fields().len() {
1522-
return plan_err!(
1523-
"UNION queries have different number of columns: \
1524-
left has {} columns whereas right has {} columns",
1525-
left_plan.schema().fields().len(),
1526-
right_plan.schema().fields().len()
1527-
);
1528-
}
1529-
1530-
// Temporarily use the schema from the left input and later rely on the analyzer to
1531-
// coerce the two schemas into a common one.
1532-
1533-
// Functional Dependencies doesn't preserve after UNION operation
1534-
let schema = (**left_plan.schema()).clone();
1535-
let schema =
1536-
Arc::new(schema.with_functional_dependencies(FunctionalDependencies::empty())?);
1537-
1538-
Ok(LogicalPlan::Union(Union {
1539-
inputs: vec![Arc::new(left_plan), Arc::new(right_plan)],
1540-
schema,
1541-
}))
1520+
Ok(LogicalPlan::Union(Union::try_new_with_loose_types(vec![
1521+
Arc::new(left_plan),
1522+
Arc::new(right_plan),
1523+
])?))
15421524
}
15431525

15441526
/// Create Projection

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 106 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -692,15 +692,13 @@ impl LogicalPlan {
692692
}))
693693
}
694694
LogicalPlan::Union(Union { inputs, schema }) => {
695-
let input_schema = inputs[0].schema();
696-
// If inputs are not pruned do not change schema
697-
// TODO this seems wrong (shouldn't we always use the schema of the input?)
698-
let schema = if schema.fields().len() == input_schema.fields().len() {
699-
Arc::clone(&schema)
695+
let first_input_schema = inputs[0].schema();
696+
if schema.fields().len() == first_input_schema.fields().len() {
697+
// If inputs are not pruned do not change schema
698+
Ok(LogicalPlan::Union(Union { inputs, schema }))
700699
} else {
701-
Arc::clone(input_schema)
702-
};
703-
Ok(LogicalPlan::Union(Union { inputs, schema }))
700+
Ok(LogicalPlan::Union(Union::try_new(inputs)?))
701+
}
704702
}
705703
LogicalPlan::Distinct(distinct) => {
706704
let distinct = match distinct {
@@ -2625,6 +2623,106 @@ pub struct Union {
26252623
pub schema: DFSchemaRef,
26262624
}
26272625

2626+
impl Union {
2627+
/// Constructs new Union instance deriving schema from inputs.
2628+
fn try_new(inputs: Vec<Arc<LogicalPlan>>) -> Result<Self> {
2629+
let schema = Self::derive_schema_from_inputs(&inputs, false)?;
2630+
Ok(Union { inputs, schema })
2631+
}
2632+
2633+
/// Constructs new Union instance deriving schema from inputs.
2634+
/// Inputs do not have to have matching types and produced schema will
2635+
/// take type from the first input.
2636+
pub fn try_new_with_loose_types(inputs: Vec<Arc<LogicalPlan>>) -> Result<Self> {
2637+
let schema = Self::derive_schema_from_inputs(&inputs, true)?;
2638+
Ok(Union { inputs, schema })
2639+
}
2640+
2641+
/// Constructs new Union instance deriving schema from inputs.
2642+
///
2643+
/// `loose_types` if true, inputs do not have to have matching types and produced schema will
2644+
/// take type from the first input. TODO this is not necessarily reasonable behavior.
2645+
fn derive_schema_from_inputs(
2646+
inputs: &[Arc<LogicalPlan>],
2647+
loose_types: bool,
2648+
) -> Result<DFSchemaRef> {
2649+
if inputs.len() < 2 {
2650+
return plan_err!("UNION requires at least two inputs");
2651+
}
2652+
let first_schema = inputs[0].schema();
2653+
let fields_count = first_schema.fields().len();
2654+
for input in inputs.iter().skip(1) {
2655+
if fields_count != input.schema().fields().len() {
2656+
return plan_err!(
2657+
"UNION queries have different number of columns: \
2658+
left has {} columns whereas right has {} columns",
2659+
fields_count,
2660+
input.schema().fields().len()
2661+
);
2662+
}
2663+
}
2664+
2665+
let union_fields = (0..fields_count)
2666+
.map(|i| {
2667+
let fields = inputs
2668+
.iter()
2669+
.map(|input| input.schema().field(i))
2670+
.collect::<Vec<_>>();
2671+
let first_field = fields[0];
2672+
let name = first_field.name();
2673+
let data_type = if loose_types {
2674+
// TODO apply type coercion here, or document why it's better to defer
2675+
// temporarily use the data type from the left input and later rely on the analyzer to
2676+
// coerce the two schemas into a common one.
2677+
first_field.data_type()
2678+
} else {
2679+
fields.iter().skip(1).try_fold(
2680+
first_field.data_type(),
2681+
|acc, field| {
2682+
if acc != field.data_type() {
2683+
return plan_err!(
2684+
"UNION field {i} have different type in inputs: \
2685+
left has {} whereas right has {}",
2686+
first_field.data_type(),
2687+
field.data_type()
2688+
);
2689+
}
2690+
Ok(acc)
2691+
},
2692+
)?
2693+
};
2694+
let nullable = fields.iter().any(|field| field.is_nullable());
2695+
let mut field = Field::new(name, data_type.clone(), nullable);
2696+
let field_metadata =
2697+
intersect_maps(fields.iter().map(|field| field.metadata()));
2698+
field.set_metadata(field_metadata);
2699+
// TODO reusing table reference from the first schema is probably wrong
2700+
let table_reference = first_schema.qualified_field(i).0.cloned();
2701+
Ok((table_reference, Arc::new(field)))
2702+
})
2703+
.collect::<Result<_>>()?;
2704+
let union_schema_metadata =
2705+
intersect_maps(inputs.iter().map(|input| input.schema().metadata()));
2706+
2707+
// Functional Dependencies doesn't preserve after UNION operation
2708+
let schema = DFSchema::new_with_metadata(union_fields, union_schema_metadata)?;
2709+
let schema = Arc::new(schema);
2710+
2711+
Ok(schema)
2712+
}
2713+
}
2714+
2715+
fn intersect_maps<'a>(
2716+
inputs: impl IntoIterator<Item = &'a HashMap<String, String>>,
2717+
) -> HashMap<String, String> {
2718+
let mut inputs = inputs.into_iter();
2719+
let mut merged: HashMap<String, String> = inputs.next().cloned().unwrap_or_default();
2720+
for input in inputs {
2721+
merged.retain(|k, v| input.get(k) == Some(v));
2722+
}
2723+
merged
2724+
}
2725+
26282726
// Manual implementation needed because of `schema` field. Comparison excludes this field.
26292727
impl PartialOrd for Union {
26302728
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {

datafusion/sqllogictest/test_files/union.slt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,3 +777,18 @@ select make_array(make_array(1)) x UNION ALL SELECT make_array(arrow_cast(make_a
777777
----
778778
[[-1]]
779779
[[1]]
780+
781+
# test for https://github.com/apache/datafusion/issues/14352
782+
query TB rowsort
783+
SELECT
784+
a,
785+
a IS NOT NULL
786+
FROM (
787+
-- second column, even though it's not selected, was necessary to reproduce the bug linked above
788+
SELECT 'foo' AS a, 3 AS b
789+
UNION ALL
790+
SELECT NULL AS a, 4 AS b
791+
)
792+
----
793+
NULL false
794+
foo true

0 commit comments

Comments
 (0)