Skip to content

Commit cf8f2f8

Browse files
authored
Introduce LogicalPlan invariants, begin automatically checking them (#13651)
* minor(13525): perform LP validation before and after each possible mutation * minor(13525): validate unique field names on query and subquery schemas, after each optimizer pass * minor(13525): validate union after each optimizer pass * refactor: make explicit what is an invariant of the logical plan, versus assertions made after a given analyzer or optimizer pass * chore: add link to invariant docs * fix: add new invariants module * refactor: move all LP invariant checking into LP, delineate executable (valid semantic plan) vs basic LP invariants * test: update test for slight error message change * fix: push_down_filter optimization pass can push an IN(<subquery>) into a TableScan's filter clause * refactor: move collect_subquery_cols() to common utils crate * refactor: clarify the purpose of assert_valid_optimization(), runs after all optimizer passes, except in debug mode it runs after each pass. * refactor: based upon performance tests, run the maximum number of checks without impact: * assert_valid_optimization can run each optimizer pass * remove the recursive check_fields, which caused the performance regression * the full LP Invariants::Executable can only run in debug * chore: update error naming and terminology used in code comments * refactor: use proper error methods * chore: more cleanup of error messages * chore: handle option trailer to error message * test: update sqllogictests tests to not use multiline
1 parent 5045bde commit cf8f2f8

File tree

11 files changed

+209
-104
lines changed

11 files changed

+209
-104
lines changed

datafusion/optimizer/src/analyzer/subquery.rs renamed to datafusion/expr/src/logical_plan/invariants.rs

+95-10
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,98 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::analyzer::check_plan;
19-
use crate::utils::collect_subquery_cols;
18+
use datafusion_common::{
19+
internal_err, plan_err,
20+
tree_node::{TreeNode, TreeNodeRecursion},
21+
DFSchemaRef, Result,
22+
};
2023

21-
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
22-
use datafusion_common::{plan_err, Result};
23-
use datafusion_expr::expr_rewriter::strip_outer_reference;
24-
use datafusion_expr::utils::split_conjunction;
25-
use datafusion_expr::{Aggregate, Expr, Filter, Join, JoinType, LogicalPlan, Window};
24+
use crate::{
25+
expr::{Exists, InSubquery},
26+
expr_rewriter::strip_outer_reference,
27+
utils::{collect_subquery_cols, split_conjunction},
28+
Aggregate, Expr, Filter, Join, JoinType, LogicalPlan, Window,
29+
};
30+
31+
pub enum InvariantLevel {
32+
/// Invariants that are always true in DataFusion `LogicalPlan`s
33+
/// such as the number of expected children and no duplicated output fields
34+
Always,
35+
/// Invariants that must hold true for the plan to be "executable"
36+
/// such as the type and number of function arguments are correct and
37+
/// that wildcards have been expanded
38+
///
39+
/// To ensure a LogicalPlan satisfies the `Executable` invariants, run the
40+
/// `Analyzer`
41+
Executable,
42+
}
43+
44+
pub fn assert_always_invariants(plan: &LogicalPlan) -> Result<()> {
45+
// Refer to <https://datafusion.apache.org/contributor-guide/specification/invariants.html#relation-name-tuples-in-logical-fields-and-logical-columns-are-unique>
46+
assert_unique_field_names(plan)?;
47+
48+
Ok(())
49+
}
50+
51+
pub fn assert_executable_invariants(plan: &LogicalPlan) -> Result<()> {
52+
assert_always_invariants(plan)?;
53+
assert_valid_semantic_plan(plan)?;
54+
Ok(())
55+
}
56+
57+
/// Returns an error if the plan's schema does not have unique field names.
/// Only the schema of `plan` itself is checked here; subplans are not traversed.
58+
///
59+
/// This invariant is subject to change.
60+
/// refer: <https://github.com/apache/datafusion/issues/13525#issuecomment-2494046463>
61+
fn assert_unique_field_names(plan: &LogicalPlan) -> Result<()> {
62+
plan.schema().check_names()
63+
}
64+
65+
/// Returns an error if the plan is not semantically valid.
66+
fn assert_valid_semantic_plan(plan: &LogicalPlan) -> Result<()> {
67+
assert_subqueries_are_valid(plan)?;
68+
69+
Ok(())
70+
}
71+
72+
/// Returns an error if the plan does not have the expected schema.
73+
/// Ignores metadata and nullability.
74+
pub fn assert_expected_schema(schema: &DFSchemaRef, plan: &LogicalPlan) -> Result<()> {
75+
let equivalent = plan.schema().equivalent_names_and_types(schema);
76+
77+
if !equivalent {
78+
internal_err!(
79+
"Failed due to a difference in schemas, original schema: {:?}, new schema: {:?}",
80+
schema,
81+
plan.schema()
82+
)
83+
} else {
84+
Ok(())
85+
}
86+
}
87+
88+
/// Asserts that the subqueries are structured properly with valid node placement.
89+
///
90+
/// Refer to [`check_subquery_expr`] for more details.
91+
fn assert_subqueries_are_valid(plan: &LogicalPlan) -> Result<()> {
92+
plan.apply_with_subqueries(|plan: &LogicalPlan| {
93+
plan.apply_expressions(|expr| {
94+
// recursively look for subqueries
95+
expr.apply(|expr| {
96+
match expr {
97+
Expr::Exists(Exists { subquery, .. })
98+
| Expr::InSubquery(InSubquery { subquery, .. })
99+
| Expr::ScalarSubquery(subquery) => {
100+
check_subquery_expr(plan, &subquery.subquery, expr)?;
101+
}
102+
_ => {}
103+
};
104+
Ok(TreeNodeRecursion::Continue)
105+
})
106+
})
107+
})
108+
.map(|_| ())
109+
}
26110

27111
/// Do necessary check on subquery expressions and fail the invalid plan
28112
/// 1) Check whether the outer plan is in the allowed outer plans list to use subquery expressions,
@@ -36,7 +120,7 @@ pub fn check_subquery_expr(
36120
inner_plan: &LogicalPlan,
37121
expr: &Expr,
38122
) -> Result<()> {
39-
check_plan(inner_plan)?;
123+
assert_subqueries_are_valid(inner_plan)?;
40124
if let Expr::ScalarSubquery(subquery) = expr {
41125
// Scalar subquery should only return one column
42126
if subquery.subquery.schema().fields().len() > 1 {
@@ -108,12 +192,13 @@ pub fn check_subquery_expr(
108192
match outer_plan {
109193
LogicalPlan::Projection(_)
110194
| LogicalPlan::Filter(_)
195+
| LogicalPlan::TableScan(_)
111196
| LogicalPlan::Window(_)
112197
| LogicalPlan::Aggregate(_)
113198
| LogicalPlan::Join(_) => Ok(()),
114199
_ => plan_err!(
115200
"In/Exist subquery can only be used in \
116-
Projection, Filter, Window functions, Aggregate and Join plan nodes, \
201+
Projection, Filter, TableScan, Window functions, Aggregate and Join plan nodes, \
117202
but was used in [{}]",
118203
outer_plan.display()
119204
),
@@ -285,8 +370,8 @@ mod test {
285370
use std::cmp::Ordering;
286371
use std::sync::Arc;
287372

373+
use crate::{Extension, UserDefinedLogicalNodeCore};
288374
use datafusion_common::{DFSchema, DFSchemaRef};
289-
use datafusion_expr::{Extension, UserDefinedLogicalNodeCore};
290375

291376
use super::*;
292377

datafusion/expr/src/logical_plan/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ mod ddl;
2020
pub mod display;
2121
pub mod dml;
2222
mod extension;
23+
pub(crate) mod invariants;
24+
pub use invariants::{assert_expected_schema, check_subquery_expr, InvariantLevel};
2325
mod plan;
2426
mod statement;
2527
pub mod tree_node;

datafusion/expr/src/logical_plan/plan.rs

+11
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ use std::hash::{Hash, Hasher};
2424
use std::sync::{Arc, LazyLock};
2525

2626
use super::dml::CopyTo;
27+
use super::invariants::{
28+
assert_always_invariants, assert_executable_invariants, InvariantLevel,
29+
};
2730
use super::DdlStatement;
2831
use crate::builder::{change_redundant_column, unnest_with_options};
2932
use crate::expr::{Placeholder, Sort as SortExpr, WindowFunction};
@@ -1127,6 +1130,14 @@ impl LogicalPlan {
11271130
}
11281131
}
11291132

1133+
/// Checks that the plan conforms to the given invariant level, returning an error if not.
1134+
pub fn check_invariants(&self, check: InvariantLevel) -> Result<()> {
1135+
match check {
1136+
InvariantLevel::Always => assert_always_invariants(self),
1137+
InvariantLevel::Executable => assert_executable_invariants(self),
1138+
}
1139+
}
1140+
11301141
/// Helper for [Self::with_new_exprs] to use when no expressions are expected.
11311142
#[inline]
11321143
#[allow(clippy::needless_pass_by_value)] // expr is moved intentionally to ensure it's not used again

datafusion/expr/src/utils.rs

+19-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//! Expression utilities
1919
2020
use std::cmp::Ordering;
21-
use std::collections::HashSet;
21+
use std::collections::{BTreeSet, HashSet};
2222
use std::ops::Deref;
2323
use std::sync::Arc;
2424

@@ -1402,6 +1402,24 @@ pub fn format_state_name(name: &str, state_name: &str) -> String {
14021402
format!("{name}[{state_name}]")
14031403
}
14041404

1405+
/// Determine the set of [`Column`]s produced by the subquery.
1406+
pub fn collect_subquery_cols(
1407+
exprs: &[Expr],
1408+
subquery_schema: &DFSchema,
1409+
) -> Result<BTreeSet<Column>> {
1410+
exprs.iter().try_fold(BTreeSet::new(), |mut cols, expr| {
1411+
let mut using_cols: Vec<Column> = vec![];
1412+
for col in expr.column_refs().into_iter() {
1413+
if subquery_schema.has_column(col) {
1414+
using_cols.push(col.clone());
1415+
}
1416+
}
1417+
1418+
cols.extend(using_cols);
1419+
Result::<_>::Ok(cols)
1420+
})
1421+
}
1422+
14051423
#[cfg(test)]
14061424
mod tests {
14071425
use super::*;

datafusion/optimizer/src/analyzer/mod.rs

+24-36
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,14 @@ use log::debug;
2424

2525
use datafusion_common::config::ConfigOptions;
2626
use datafusion_common::instant::Instant;
27-
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
28-
use datafusion_common::{DataFusionError, Result};
29-
use datafusion_expr::expr::Exists;
30-
use datafusion_expr::expr::InSubquery;
27+
use datafusion_common::Result;
3128
use datafusion_expr::expr_rewriter::FunctionRewrite;
32-
use datafusion_expr::{Expr, LogicalPlan};
29+
use datafusion_expr::{InvariantLevel, LogicalPlan};
3330

3431
use crate::analyzer::count_wildcard_rule::CountWildcardRule;
3532
use crate::analyzer::expand_wildcard_rule::ExpandWildcardRule;
3633
use crate::analyzer::inline_table_scan::InlineTableScan;
3734
use crate::analyzer::resolve_grouping_function::ResolveGroupingFunction;
38-
use crate::analyzer::subquery::check_subquery_expr;
3935
use crate::analyzer::type_coercion::TypeCoercion;
4036
use crate::utils::log_plan;
4137

@@ -46,17 +42,24 @@ pub mod expand_wildcard_rule;
4642
pub mod function_rewrite;
4743
pub mod inline_table_scan;
4844
pub mod resolve_grouping_function;
49-
pub mod subquery;
5045
pub mod type_coercion;
5146

47+
pub mod subquery {
48+
#[deprecated(
49+
since = "44.0.0",
50+
note = "please use `datafusion_expr::check_subquery_expr` instead"
51+
)]
52+
pub use datafusion_expr::check_subquery_expr;
53+
}
54+
5255
/// [`AnalyzerRule`]s transform [`LogicalPlan`]s in some way to make
5356
/// the plan valid prior to the rest of the DataFusion optimization process.
5457
///
5558
/// `AnalyzerRule`s are different than an [`OptimizerRule`](crate::OptimizerRule)s
5659
/// which must preserve the semantics of the `LogicalPlan`, while computing
5760
/// results in a more optimal way.
5861
///
59-
/// For example, an `AnalyzerRule` may resolve [`Expr`]s into more specific
62+
/// For example, an `AnalyzerRule` may resolve [`Expr`](datafusion_expr::Expr)s into more specific
6063
/// forms such as a subquery reference, or do type coercion to ensure the types
6164
/// of operands are correct.
6265
///
@@ -140,6 +143,10 @@ impl Analyzer {
140143
where
141144
F: FnMut(&LogicalPlan, &dyn AnalyzerRule),
142145
{
146+
// verify the logical plan's required invariants at the start, before running the analyzer
147+
plan.check_invariants(InvariantLevel::Always)
148+
.map_err(|e| e.context("Invalid input plan passed to Analyzer"))?;
149+
143150
let start_time = Instant::now();
144151
let mut new_plan = plan;
145152

@@ -161,39 +168,20 @@ impl Analyzer {
161168

162169
// TODO add common rule executor for Analyzer and Optimizer
163170
for rule in rules {
164-
new_plan = rule.analyze(new_plan, config).map_err(|e| {
165-
DataFusionError::Context(rule.name().to_string(), Box::new(e))
166-
})?;
171+
new_plan = rule
172+
.analyze(new_plan, config)
173+
.map_err(|e| e.context(rule.name()))?;
167174
log_plan(rule.name(), &new_plan);
168175
observer(&new_plan, rule.as_ref());
169176
}
170-
// for easier display in explain output
171-
check_plan(&new_plan).map_err(|e| {
172-
DataFusionError::Context("check_analyzed_plan".to_string(), Box::new(e))
173-
})?;
177+
178+
// verify at the end, after the last LP analyzer pass, that the plan is executable.
179+
new_plan
180+
.check_invariants(InvariantLevel::Executable)
181+
.map_err(|e| e.context("Invalid (non-executable) plan after Analyzer"))?;
182+
174183
log_plan("Final analyzed plan", &new_plan);
175184
debug!("Analyzer took {} ms", start_time.elapsed().as_millis());
176185
Ok(new_plan)
177186
}
178187
}
179-
180-
/// Do necessary check and fail the invalid plan
181-
fn check_plan(plan: &LogicalPlan) -> Result<()> {
182-
plan.apply_with_subqueries(|plan: &LogicalPlan| {
183-
plan.apply_expressions(|expr| {
184-
// recursively look for subqueries
185-
expr.apply(|expr| {
186-
match expr {
187-
Expr::Exists(Exists { subquery, .. })
188-
| Expr::InSubquery(InSubquery { subquery, .. })
189-
| Expr::ScalarSubquery(subquery) => {
190-
check_subquery_expr(plan, &subquery.subquery, expr)?;
191-
}
192-
_ => {}
193-
};
194-
Ok(TreeNodeRecursion::Continue)
195-
})
196-
})
197-
})
198-
.map(|_| ())
199-
}

datafusion/optimizer/src/decorrelate.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,16 @@ use std::ops::Deref;
2222
use std::sync::Arc;
2323

2424
use crate::simplify_expressions::ExprSimplifier;
25-
use crate::utils::collect_subquery_cols;
2625

2726
use datafusion_common::tree_node::{
2827
Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
2928
};
3029
use datafusion_common::{plan_err, Column, DFSchemaRef, HashMap, Result, ScalarValue};
3130
use datafusion_expr::expr::Alias;
3231
use datafusion_expr::simplify::SimplifyContext;
33-
use datafusion_expr::utils::{conjunction, find_join_exprs, split_conjunction};
32+
use datafusion_expr::utils::{
33+
collect_subquery_cols, conjunction, find_join_exprs, split_conjunction,
34+
};
3435
use datafusion_expr::{
3536
expr, lit, BinaryExpr, Cast, EmptyRelation, Expr, FetchType, LogicalPlan,
3637
LogicalPlanBuilder, Operator,

datafusion/optimizer/src/decorrelate_predicate_subquery.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,7 @@ mod tests {
835835
.build()?;
836836

837837
// Maybe okay if the table only has a single column?
838-
let expected = "check_analyzed_plan\
838+
let expected = "Invalid (non-executable) plan after Analyzer\
839839
\ncaused by\
840840
\nError during planning: InSubquery should only return one column, but found 4";
841841
assert_analyzer_check_err(vec![], plan, expected);
@@ -930,7 +930,7 @@ mod tests {
930930
.project(vec![col("customer.c_custkey")])?
931931
.build()?;
932932

933-
let expected = "check_analyzed_plan\
933+
let expected = "Invalid (non-executable) plan after Analyzer\
934934
\ncaused by\
935935
\nError during planning: InSubquery should only return one column";
936936
assert_analyzer_check_err(vec![], plan, expected);

0 commit comments

Comments
 (0)