Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add guarantees to simplification #7467

Merged
merged 16 commits into from
Sep 13, 2023
179 changes: 176 additions & 3 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,19 @@ use datafusion_expr::{
and, expr, lit, or, BinaryExpr, BuiltinScalarFunction, Case, ColumnarValue, Expr,
Like, Volatility,
};
use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps};
use datafusion_physical_expr::{
create_physical_expr, execution_props::ExecutionProps, intervals::NullableInterval,
};

use crate::simplify_expressions::SimplifyInfo;

use crate::simplify_expressions::guarantees::GuaranteeRewriter;

/// This structure handles API for expression simplification
pub struct ExprSimplifier<S> {
// Caller-supplied simplification context, stored by `new`.
info: S,
/// Guarantees about the values of expressions, stored as
/// `(expression, interval of known possible values)` pairs.
/// Empty after [`ExprSimplifier::new`]; set through
/// [`ExprSimplifier::with_guarantees`].
wjones127 marked this conversation as resolved.
Show resolved Hide resolved
guarantees: Vec<(Expr, NullableInterval)>,
}

pub const THRESHOLD_INLINE_INLIST: usize = 3;
Expand All @@ -57,7 +63,10 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
///
/// [`SimplifyContext`]: crate::simplify_expressions::context::SimplifyContext
pub fn new(info: S) -> Self {
    // A fresh simplifier carries no guarantees; callers opt in later
    // through `with_guarantees`.
    let guarantees = Vec::new();
    Self { info, guarantees }
}

/// Simplifies this [`Expr`] as much as possible, evaluating
Expand Down Expand Up @@ -121,6 +130,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
let mut simplifier = Simplifier::new(&self.info);
let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?;
let mut or_in_list_simplifier = OrInListSimplifier::new();
let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees);

// TODO iterate until no changes are made during rewrite
// (evaluating constants can enable new simplifications and
Expand All @@ -129,6 +139,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
expr.rewrite(&mut const_evaluator)?
.rewrite(&mut simplifier)?
.rewrite(&mut or_in_list_simplifier)?
.rewrite(&mut guarantee_rewriter)?
// run both passes twice to try and minimize simplifications that we missed
.rewrite(&mut const_evaluator)?
.rewrite(&mut simplifier)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think of such a loop to cover every simplification case and make it easier to accommodate future simplifications, or would it be unnecessary?

        loop {
            let original_expr = expr.clone();
            expr = expr
                .rewrite(&mut const_evaluator)?
                .rewrite(&mut simplifier)?
                .rewrite(&mut or_in_list_simplifier)?
                .rewrite(&mut guarantee_rewriter)?;

            if expr == original_expr {
                break;
            }
        }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a neat idea. I think we should try it in a follow on PR.

Also, if we did this, I would suggest adding a limit on the number of loops (to avoid a "ping-pong" infinite loop where passes rewrite an expression back and forth)

Expand All @@ -149,6 +160,65 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {

expr.rewrite(&mut expr_rewrite)
}

/// Input guarantees about the values of columns.
///
/// The guarantees can simplify expressions. For example, if a column `x` is
/// guaranteed to be `3`, then the expression `x > 1` can be replaced by the
/// literal `true`.
///
/// The guarantees are provided as a `Vec<(Expr, NullableInterval)>`,
/// where the [Expr] is a column reference and the [NullableInterval]
/// is an interval representing the known possible values of that column.
///
/// ```rust
/// use arrow::datatypes::{DataType, Field, Schema};
/// use datafusion_expr::{col, lit, Expr};
/// use datafusion_common::{Result, ScalarValue, ToDFSchema};
/// use datafusion_physical_expr::execution_props::ExecutionProps;
/// use datafusion_physical_expr::intervals::{Interval, NullableInterval};
/// use datafusion_optimizer::simplify_expressions::{
/// ExprSimplifier, SimplifyContext};
///
/// let schema = Schema::new(vec![
/// Field::new("x", DataType::Int64, false),
/// Field::new("y", DataType::UInt32, false),
/// Field::new("z", DataType::Int64, false),
/// ])
/// .to_dfschema_ref().unwrap();
///
/// // Create the simplifier
/// let props = ExecutionProps::new();
/// let context = SimplifyContext::new(&props)
/// .with_schema(schema);
///
/// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is really cool @wjones127

/// let expr_x = col("x").gt_eq(lit(3_i64));
/// let expr_y = (col("y") + lit(2_u32)).lt(lit(10_u32));
/// let expr_z = col("z").gt(lit(5_i64));
/// let expr = expr_x.and(expr_y).and(expr_z.clone());
///
/// let guarantees = vec![
/// // x ∈ [3, 5]
/// (
/// col("x"),
/// NullableInterval::NotNull {
/// values: Interval::make(Some(3_i64), Some(5_i64), (false, false)),
/// }
/// ),
/// // y = 3
/// (col("y"), NullableInterval::from(&ScalarValue::UInt32(Some(3)))),
/// ];
/// let simplifier = ExprSimplifier::new(context).with_guarantees(guarantees);
/// let output = simplifier.simplify(expr).unwrap();
/// // Expression becomes: true AND true AND (z > 5), which simplifies to
/// // z > 5.
/// assert_eq!(output, expr_z);
/// ```
pub fn with_guarantees(mut self, guarantees: Vec<(Expr, NullableInterval)>) -> Self {
// Plain assignment: any guarantees stored earlier are replaced, not merged.
self.guarantees = guarantees;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

// Return the simplifier by value so the call can be chained after `new`.
self
}
}

#[allow(rustdoc::private_intra_doc_links)]
Expand Down Expand Up @@ -1211,7 +1281,9 @@ mod tests {
use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema};
use datafusion_expr::*;
use datafusion_physical_expr::{
execution_props::ExecutionProps, functions::make_scalar_function,
execution_props::ExecutionProps,
functions::make_scalar_function,
intervals::{Interval, NullableInterval},
};

// ------------------------------
Expand Down Expand Up @@ -2675,6 +2747,19 @@ mod tests {
try_simplify(expr).unwrap()
}

/// Simplifies `expr` using the schema returned by `expr_test_schema()` and
/// the supplied column `guarantees`.
///
/// Test helper: panics (via `unwrap`) if simplification returns an error.
fn simplify_with_guarantee(
    expr: Expr,
    guarantees: Vec<(Expr, NullableInterval)>,
) -> Expr {
    let schema = expr_test_schema();
    let execution_props = ExecutionProps::new();
    // The simplifier is fully configured by the builder chain and never
    // mutated afterwards, so the binding does not need `mut`.
    let simplifier = ExprSimplifier::new(
        SimplifyContext::new(&execution_props).with_schema(schema),
    )
    .with_guarantees(guarantees);
    simplifier.simplify(expr).unwrap()
}

fn expr_test_schema() -> DFSchemaRef {
Arc::new(
DFSchema::new_with_metadata(
Expand Down Expand Up @@ -3138,4 +3223,92 @@ mod tests {
let expr = not_ilike(null, "%");
assert_eq!(simplify(expr), lit_bool_null());
}

/// Checks that guarantees supplied to the simplifier fold the parts of an
/// expression they fully determine and leave the rest untouched.
#[test]
fn test_simplify_with_guarantee() {
    // (c3 > 3) AND (c4 + 2 < 10 OR (c1 NOT IN ("a", "b")))
    let expr_x = col("c3").gt(lit(3_i64));
    let expr_y = (col("c4") + lit(2_u32)).lt(lit(10_u32));
    let expr_z = col("c1").in_list(vec![lit("a"), lit("b")], true);
    // `expr_x` is compared against later, so it is cloned; `expr_y` and
    // `expr_z` are not used again and can be moved.
    let expr = expr_x.clone().and(expr_y.or(expr_z));

    // All guaranteed null
    let guarantees = vec![
        (col("c3"), NullableInterval::from(&ScalarValue::Int64(None))),
        (
            col("c4"),
            NullableInterval::from(&ScalarValue::UInt32(None)),
        ),
        (col("c1"), NullableInterval::from(&ScalarValue::Utf8(None))),
    ];

    let output = simplify_with_guarantee(expr.clone(), guarantees);
    assert_eq!(output, lit_bool_null());

    // All guaranteed false
    let guarantees = vec![
        (
            col("c3"),
            NullableInterval::NotNull {
                values: Interval::make(Some(0_i64), Some(2_i64), (false, false)),
            },
        ),
        (
            col("c4"),
            NullableInterval::from(&ScalarValue::UInt32(Some(9))),
        ),
        (
            col("c1"),
            NullableInterval::from(&ScalarValue::Utf8(Some("a".to_string()))),
        ),
    ];
    let output = simplify_with_guarantee(expr.clone(), guarantees);
    assert_eq!(output, lit(false));

    // c3 and c4 are guaranteed "false or null", so their comparisons are
    // expected to stay unfolded. The expected output is just `c3 > 3`: the
    // c1 range ["d", "f"] excludes "a" and "b", which presumably makes the
    // OR branch true and lets it drop out of the AND.
    let guarantees = vec![
        (
            col("c3"),
            NullableInterval::MaybeNull {
                values: Interval::make(Some(0_i64), Some(2_i64), (false, false)),
            },
        ),
        (
            col("c4"),
            NullableInterval::MaybeNull {
                values: Interval::make(Some(9_u32), Some(9_u32), (false, false)),
            },
        ),
        (
            col("c1"),
            NullableInterval::NotNull {
                values: Interval::make(Some("d"), Some("f"), (false, false)),
            },
        ),
    ];
    let output = simplify_with_guarantee(expr.clone(), guarantees);
    assert_eq!(&output, &expr_x);

    // Sufficient true guarantees
    let guarantees = vec![
        (
            col("c3"),
            NullableInterval::from(&ScalarValue::Int64(Some(9))),
        ),
        (
            col("c4"),
            NullableInterval::from(&ScalarValue::UInt32(Some(3))),
        ),
    ];
    let output = simplify_with_guarantee(expr.clone(), guarantees);
    assert_eq!(output, lit(true));

    // Only partially simplify: with c4 = 3 the OR branch is true, leaving
    // only the c3 comparison.
    let guarantees = vec![(
        col("c4"),
        NullableInterval::from(&ScalarValue::UInt32(Some(3))),
    )];
    // Last use of `expr`, so it is moved rather than cloned.
    let output = simplify_with_guarantee(expr, guarantees);
    assert_eq!(&output, &expr_x);
}
}
Loading
Loading