From 45ae442ce721c4e7d2d96fb9fa8b9fa5b67bdab7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 4 Sep 2023 12:19:26 -0700 Subject: [PATCH] docs --- .../simplify_expressions/expr_simplifier.rs | 53 ++++- .../src/simplify_expressions/guarantees.rs | 36 +++- .../physical-expr/src/intervals/cp_solver.rs | 202 +++++++----------- 3 files changed, 159 insertions(+), 132 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 78a8d70635b4..7f1c69223805 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -152,7 +152,58 @@ impl ExprSimplifier { expr.rewrite(&mut expr_rewrite) } - /// Add guarantees + /// Input guarantees and simplify the expression. + /// + /// The guarantees can simplify expressions. For example, if a column is + /// guaranteed to always be a certain value, it's references in the expression + /// can be replaced with that literal. + /// + /// ```rust + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, Expr}; + /// use datafusion_common::{Result, ScalarValue, ToDFSchema}; + /// use datafusion_physical_expr::execution_props::ExecutionProps; + /// use datafusion_optimizer::simplify_expressions::{ + /// ExprSimplifier, SimplifyContext, + /// guarantees::{Guarantee, GuaranteeBound, NullStatus}}; + /// + /// let schema = Schema::new(vec![ + /// Field::new("x", DataType::Int64, false), + /// Field::new("y", DataType::UInt32, false), + /// Field::new("z", DataType::Int64, false), + /// ]) + /// .to_dfschema_ref().unwrap(); + /// + /// // Create the simplifier + /// let props = ExecutionProps::new(); + /// let context = SimplifyContext::new(&props) + /// .with_schema(schema); + /// let simplifier = ExprSimplifier::new(context); + /// + /// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5) + /// let expr_x = col("x").gt_eq(lit(3_i64)); + /// let expr_y = (col("y") + lit(2_u32)).lt(lit(10_u32)); + /// let expr_z = col("z").gt(lit(5_i64)); + /// let expr = expr_x.and(expr_y).and(expr_z.clone()); + /// + /// let guarantees = vec![ + /// // x is guaranteed to be between 3 and 5 + /// ( + /// col("x"), + /// Guarantee::new( + /// Some(GuaranteeBound::new(ScalarValue::Int64(Some(3)), false)), + /// Some(GuaranteeBound::new(ScalarValue::Int64(Some(5)), false)), + /// NullStatus::NeverNull, + /// ) + /// ), + /// // y is guaranteed to be 3 + /// (col("y"), Guarantee::from(&ScalarValue::UInt32(Some(3)))), + /// ]; + /// let output = simplifier.simplify_with_guarantees(expr, &guarantees).unwrap(); + /// // Expression becomes: true AND true AND (z > 5), which simplifies to + /// // z > 5. + /// assert_eq!(output, expr_z); + /// ``` pub fn simplify_with_guarantees<'a>( &self, expr: Expr, diff --git a/datafusion/optimizer/src/simplify_expressions/guarantees.rs b/datafusion/optimizer/src/simplify_expressions/guarantees.rs index 0772eaab50f3..7fe25817781e 100644 --- a/datafusion/optimizer/src/simplify_expressions/guarantees.rs +++ b/datafusion/optimizer/src/simplify_expressions/guarantees.rs @@ -15,8 +15,26 @@ // specific language governing permissions and limitations // under the License. -//! Logic to inject guarantees with expressions. +//! Guarantees which can be used with [ExprSimplifier::simplify_with_guarantees()][crate::simplify_expressions::expr_simplifier::ExprSimplifier::simplify_with_guarantees]. //! +//! Guarantees can represent single values or possible ranges of values. +//! +//! ``` +//! use datafusion_optimizer::simplify_expressions::guarantees::{ +//! Guarantee, GuaranteeBound, NullStatus}; +//! +//! // Guarantee that value is always 1_i32 +//! Guarantee::from(&ScalarValue::Int32(Some(1))); +//! // Guarantee that value is always NULL +//! Guarantee::from(&ScalarValue::Null); +//! // Guarantee that value is always between 1_i32 and 10_i32 (inclusive) +//! // and never null. +//! Guarantee::new( +//! Some(GuaranteeBound::new(ScalarValue::Int32(Some(1)), false)), +//! Some(GuaranteeBound::new(ScalarValue::Int32(Some(10)), false)), +//! NullStatus::NeverNull, +//! ); +//! ``` use datafusion_common::{tree_node::TreeNodeRewriter, Result, ScalarValue}; use datafusion_expr::{expr::InList, lit, Between, BinaryExpr, Expr, Operator}; use std::collections::HashMap; @@ -24,7 +42,7 @@ use std::collections::HashMap; /// A bound on the value of an expression. #[derive(Debug, Clone, PartialEq)] pub struct GuaranteeBound { - /// The value of the bound. + /// The value of the bound. If the bound is null, then there is no bound. pub bound: ScalarValue, /// If true, the bound is exclusive. If false, the bound is inclusive. /// In terms of inequalities, this means the bound is `<` or `>` rather than @@ -40,6 +58,7 @@ impl GuaranteeBound { } impl Default for GuaranteeBound { + /// Default value is a closed bound at null. fn default() -> Self { Self { bound: ScalarValue::Null, @@ -70,9 +89,11 @@ pub enum NullStatus { /// nulls. #[derive(Debug, Clone, PartialEq)] pub struct Guarantee { - /// The min values that the expression can take on. If `min.bound` is + /// The min values that the expression can take on. If the min is null, then + /// there is no known min. pub min: GuaranteeBound, - /// The max values that the expression can take on. + /// The max values that the expression can take on. If the max is null, + /// then there is no known max. pub max: GuaranteeBound, /// Whether the expression is expected to be either always null or never null. pub null_status: NullStatus, @@ -97,14 +118,19 @@ impl Guarantee { self.min.bound > *value || (self.min.bound == *value && self.min.open) } + /// Whether values are guaranteed to be greater than or equal to the given + /// value. fn greater_than_or_eq(&self, value: &ScalarValue) -> bool { self.min.bound >= *value } + /// Whether values are guaranteed to be less than the given value. fn less_than(&self, value: &ScalarValue) -> bool { self.max.bound < *value || (self.max.bound == *value && self.max.open) } + /// Whether values are guaranteed to be less than or equal to the given + /// value. fn less_than_or_eq(&self, value: &ScalarValue) -> bool { self.max.bound <= *value } @@ -136,8 +162,6 @@ impl From<&ScalarValue> for Guarantee { } /// Rewrite expressions to incorporate guarantees. -/// -/// pub(crate) struct GuaranteeRewriter<'a> { guarantees: HashMap<&'a Expr, &'a Guarantee>, } diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index 71691017186e..edf1507c705a 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -16,99 +16,6 @@ // under the License. //! Constraint propagator/solver for custom PhysicalExpr graphs. -//! -//! Interval arithmetic provides a way to perform mathematical operations on -//! intervals, which represent a range of possible values rather than a single -//! point value. This allows for the propagation of ranges through mathematical -//! operations, and can be used to compute bounds for a complicated expression. -//! The key idea is that by breaking down a complicated expression into simpler -//! terms, and then combining the bounds for those simpler terms, one can -//! obtain bounds for the overall expression. -//! -//! For example, consider a mathematical expression such as x^2 + y = 4. Since -//! it would be a binary tree in [PhysicalExpr] notation, this type of an -//! hierarchical computation is well-suited for a graph based implementation. -//! In such an implementation, an equation system f(x) = 0 is represented by a -//! directed acyclic expression graph (DAEG). -//! -//! In order to use interval arithmetic to compute bounds for this expression, -//! one would first determine intervals that represent the possible values of x -//! and y. Let's say that the interval for x is [1, 2] and the interval for y -//! is [-3, 1]. In the chart below, you can see how the computation takes place. -//! -//! This way of using interval arithmetic to compute bounds for a complex -//! expression by combining the bounds for the constituent terms within the -//! original expression allows us to reason about the range of possible values -//! of the expression. This information later can be used in range pruning of -//! the provably unnecessary parts of `RecordBatch`es. -//! -//! References -//! 1 - Kabak, Mehmet Ozan. Analog Circuit Start-Up Behavior Analysis: An Interval -//! Arithmetic Based Approach, Chapter 4. Stanford University, 2015. -//! 2 - Moore, Ramon E. Interval analysis. Vol. 4. Englewood Cliffs: Prentice-Hall, 1966. -//! 3 - F. Messine, "Deterministic global optimization using interval constraint -//! propagation techniques," RAIRO-Operations Research, vol. 38, no. 04, -//! pp. 277{293, 2004. -//! -//! ``` text -//! Computing bounds for an expression using interval arithmetic. Constraint propagation through a top-down evaluation of the expression -//! graph using inverse semantics. -//! -//! [-2, 5] ∩ [4, 4] = [4, 4] [4, 4] -//! +-----+ +-----+ +-----+ +-----+ -//! +----| + |----+ +----| + |----+ +----| + |----+ +----| + |----+ -//! | | | | | | | | | | | | | | | | -//! | +-----+ | | +-----+ | | +-----+ | | +-----+ | -//! | | | | | | | | -//! +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -//! | 2 | | y | | 2 | [1, 4] | y | | 2 | [1, 4] | y | | 2 | [1, 4] | y | [0, 1]* -//! |[.] | | | |[.] | | | |[.] | | | |[.] | | | -//! +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -//! | | | [-3, 1] | -//! | | | | -//! +---+ +---+ +---+ +---+ -//! | x | [1, 2] | x | [1, 2] | x | [1, 2] | x | [1, 2] -//! +---+ +---+ +---+ +---+ -//! -//! (a) Bottom-up evaluation: Step1 (b) Bottom up evaluation: Step2 (a) Top-down propagation: Step1 (b) Top-down propagation: Step2 -//! -//! [1 - 3, 4 + 1] = [-2, 5] [1 - 3, 4 + 1] = [-2, 5] -//! +-----+ +-----+ +-----+ +-----+ -//! +----| + |----+ +----| + |----+ +----| + |----+ +----| + |----+ -//! | | | | | | | | | | | | | | | | -//! | +-----+ | | +-----+ | | +-----+ | | +-----+ | -//! | | | | | | | | -//! +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -//! | 2 |[1, 4] | y | | 2 |[1, 4] | y | | 2 |[3, 4]** | y | | 2 |[1, 4] | y | -//! |[.] | | | |[.] | | | |[.] | | | |[.] | | | -//! +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ -//! | [-3, 1] | [-3, 1] | [0, 1] | [-3, 1] -//! | | | | -//! +---+ +---+ +---+ +---+ -//! | x | [1, 2] | x | [1, 2] | x | [1, 2] | x | [sqrt(3), 2]*** -//! +---+ +---+ +---+ +---+ -//! -//! (c) Bottom-up evaluation: Step3 (d) Bottom-up evaluation: Step4 (c) Top-down propagation: Step3 (d) Top-down propagation: Step4 -//! -//! * [-3, 1] ∩ ([4, 4] - [1, 4]) = [0, 1] -//! ** [1, 4] ∩ ([4, 4] - [0, 1]) = [3, 4] -//! *** [1, 2] ∩ [sqrt(3), sqrt(4)] = [sqrt(3), 2] -//! ``` -//! -//! # Examples -//! -//! ``` -//! # Expression: (x + 4) - y -//! -//! -//! # x: [0, 4), y: (1, 3] -//! -//! # Result: -//! ``` -//! -//! # Null handling -//! -//! use std::collections::HashSet; use std::fmt::{Display, Formatter}; @@ -132,7 +39,83 @@ use crate::PhysicalExpr; use super::IntervalBound; - +// Interval arithmetic provides a way to perform mathematical operations on +// intervals, which represent a range of possible values rather than a single +// point value. This allows for the propagation of ranges through mathematical +// operations, and can be used to compute bounds for a complicated expression. +// The key idea is that by breaking down a complicated expression into simpler +// terms, and then combining the bounds for those simpler terms, one can +// obtain bounds for the overall expression. +// +// For example, consider a mathematical expression such as x^2 + y = 4. Since +// it would be a binary tree in [PhysicalExpr] notation, this type of an +// hierarchical computation is well-suited for a graph based implementation. +// In such an implementation, an equation system f(x) = 0 is represented by a +// directed acyclic expression graph (DAEG). +// +// In order to use interval arithmetic to compute bounds for this expression, +// one would first determine intervals that represent the possible values of x +// and y. Let's say that the interval for x is [1, 2] and the interval for y +// is [-3, 1]. In the chart below, you can see how the computation takes place. +// +// This way of using interval arithmetic to compute bounds for a complex +// expression by combining the bounds for the constituent terms within the +// original expression allows us to reason about the range of possible values +// of the expression. This information later can be used in range pruning of +// the provably unnecessary parts of `RecordBatch`es. +// +// References +// 1 - Kabak, Mehmet Ozan. Analog Circuit Start-Up Behavior Analysis: An Interval +// Arithmetic Based Approach, Chapter 4. Stanford University, 2015. +// 2 - Moore, Ramon E. Interval analysis. Vol. 4. Englewood Cliffs: Prentice-Hall, 1966. +// 3 - F. Messine, "Deterministic global optimization using interval constraint +// propagation techniques," RAIRO-Operations Research, vol. 38, no. 04, +// pp. 277{293, 2004. +// +// ``` text +// Computing bounds for an expression using interval arithmetic. Constraint propagation through a top-down evaluation of the expression +// graph using inverse semantics. +// +// [-2, 5] ∩ [4, 4] = [4, 4] [4, 4] +// +-----+ +-----+ +-----+ +-----+ +// +----| + |----+ +----| + |----+ +----| + |----+ +----| + |----+ +// | | | | | | | | | | | | | | | | +// | +-----+ | | +-----+ | | +-----+ | | +-----+ | +// | | | | | | | | +// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +// | 2 | | y | | 2 | [1, 4] | y | | 2 | [1, 4] | y | | 2 | [1, 4] | y | [0, 1]* +// |[.] | | | |[.] | | | |[.] | | | |[.] | | | +// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +// | | | [-3, 1] | +// | | | | +// +---+ +---+ +---+ +---+ +// | x | [1, 2] | x | [1, 2] | x | [1, 2] | x | [1, 2] +// +---+ +---+ +---+ +---+ +// +// (a) Bottom-up evaluation: Step1 (b) Bottom up evaluation: Step2 (a) Top-down propagation: Step1 (b) Top-down propagation: Step2 +// +// [1 - 3, 4 + 1] = [-2, 5] [1 - 3, 4 + 1] = [-2, 5] +// +-----+ +-----+ +-----+ +-----+ +// +----| + |----+ +----| + |----+ +----| + |----+ +----| + |----+ +// | | | | | | | | | | | | | | | | +// | +-----+ | | +-----+ | | +-----+ | | +-----+ | +// | | | | | | | | +// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +// | 2 |[1, 4] | y | | 2 |[1, 4] | y | | 2 |[3, 4]** | y | | 2 |[1, 4] | y | +// |[.] | | | |[.] | | | |[.] | | | |[.] | | | +// +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ +// | [-3, 1] | [-3, 1] | [0, 1] | [-3, 1] +// | | | | +// +---+ +---+ +---+ +---+ +// | x | [1, 2] | x | [1, 2] | x | [1, 2] | x | [sqrt(3), 2]*** +// +---+ +---+ +---+ +---+ +// +// (c) Bottom-up evaluation: Step3 (d) Bottom-up evaluation: Step4 (c) Top-down propagation: Step3 (d) Top-down propagation: Step4 +// +// * [-3, 1] ∩ ([4, 4] - [1, 4]) = [0, 1] +// ** [1, 4] ∩ ([4, 4] - [0, 1]) = [3, 4] +// *** [1, 2] ∩ [sqrt(3), sqrt(4)] = [sqrt(3), 2] +// ``` /// This object implements a directed acyclic expression graph (DAEG) that /// is used to compute ranges for expressions through interval arithmetic. @@ -578,7 +561,6 @@ pub fn check_support(expr: &Arc) -> bool { #[cfg(test)] mod tests { use super::*; - use datafusion_expr::expr; use itertools::Itertools; use crate::expressions::{BinaryExpr, Column}; @@ -1141,36 +1123,6 @@ mod tests { let final_node_count = graph.node_count(); // Assert that the final node count is equal the previous node count (i.e., no node was pruned). assert_eq!(prev_node_count, final_node_count); - Ok(()) - } - - #[test] - fn my_test() -> Result<()> { - let expr_x = Arc::new(Column::new("x", 0)); - let expr_y = Arc::new(Column::new("y", 1)); - let expression = Arc::new(BinaryExpr::new( - Arc::new(BinaryExpr::new( - expr_x.clone(), - Operator::Plus, - Arc::new(Literal::new(ScalarValue::Int32(Some(4)))), - )), - Operator::Minus, - expr_y.clone(), - )); - - let mut graph = ExprIntervalGraph::try_new(expression.clone()).unwrap(); - // Pass in expr_x and expr_y so we can input their intervals - let mapping = graph.gather_node_indices(&[expr_x, expr_y]); - - let interval_x = Interval::make(Some(0), Some(4), (false, true)); - let interval_y = Interval::make(Some(1), Some(3), (true, false)); - graph.assign_intervals(&[ - (mapping[0].1, interval_x.clone()), - (mapping[1].1, interval_y.clone()), - ]); - - - Ok(()) } }