diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index 20eea0acb..0adae6c4b 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -1,8 +1,6 @@ //! An implementation of parquet row group skipping using data skipping predicates over footer stats. -use crate::predicates::parquet_stats_skipping::{ - ParquetStatsProvider, ParquetStatsSkippingFilter as _, -}; use crate::expressions::{ColumnName, Expression, Scalar, UnaryExpression, BinaryExpression, VariadicExpression}; +use crate::predicates::parquet_stats_skipping::ParquetStatsProvider; use crate::schema::{DataType, PrimitiveType}; use chrono::{DateTime, Days}; use parquet::arrow::arrow_reader::ArrowReaderBuilder; @@ -57,6 +55,7 @@ impl<'a> RowGroupFilter<'a> { /// Applies a filtering predicate to a row group. Return value false means to skip it. fn apply(row_group: &'a RowGroupMetaData, predicate: &Expression) -> bool { + use crate::predicates::PredicateEvaluator as _; RowGroupFilter::new(row_group, predicate).eval_sql_where(predicate) != Some(false) } diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 620142679..47d35afa1 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -47,6 +47,17 @@ pub enum BinaryOperator { } impl BinaryOperator { + /// True if this is a comparison for which NULL input always produces NULL output + pub(crate) fn is_null_intolerant_comparison(&self) -> bool { + use BinaryOperator::*; + match self { + Plus | Minus | Multiply | Divide => false, // not a comparison + LessThan | LessThanOrEqual | GreaterThan | GreaterThanOrEqual => true, + Equal | NotEqual => true, + Distinct | In | NotIn => false, // tolerates NULL input + } + } + /// Returns `` (if any) such that `B A` is equivalent to `A B`. pub(crate) fn commute(&self) -> Option { use BinaryOperator::*; diff --git a/kernel/src/predicates/mod.rs b/kernel/src/predicates/mod.rs index f13ed9a3b..54004db6f 100644 --- a/kernel/src/predicates/mod.rs +++ b/kernel/src/predicates/mod.rs @@ -20,7 +20,21 @@ mod tests; /// /// Because inversion (`NOT` operator) has special semantics and can often be optimized away by /// pushing it down, most methods take an `inverted` flag. That allows operations like -/// [`UnaryOperator::Not`] to simply evaluate their operand with a flipped `inverted` flag, +/// [`UnaryOperator::Not`] to simply evaluate their operand with a flipped `inverted` flag, and +/// greatly simplifies the implementations of most operators (other than those which have to +/// directly implement NOT semantics, which are unavoidably complex in that regard). +/// +/// # Parameterized output type +/// +/// The types involved in predicate evaluation are parameterized and implementation-specific. For +/// example, [`crate::engine::parquet_stats_skipping::ParquetStatsProvider`] directly evaluates the +/// predicate over parquet footer stats and returns boolean results, while +/// [`crate::scan::data_skipping::DataSkippingPredicateCreator`] instead transforms the input +/// predicate expression to a data skipping predicate expresion that the engine can evaluated +/// directly against Delta data skipping stats during log replay. Although this approach is harder +/// to read and reason about at first, the majority of expressions can be implemented generically, +/// which greatly reduces redundancy and ensures that all flavors of predicate evaluation have the +/// same semantics. /// /// # NULL and error semantics /// @@ -44,6 +58,9 @@ mod tests; pub(crate) trait PredicateEvaluator { type Output; + /// A (possibly inverted) scalar NULL test, e.g. ` IS [NOT] NULL`. + fn eval_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option; + /// A (possibly inverted) boolean scalar value, e.g. `[NOT] `. fn eval_scalar(&self, val: &Scalar, inverted: bool) -> Option; @@ -123,14 +140,19 @@ pub(crate) trait PredicateEvaluator { fn eval_unary(&self, op: UnaryOperator, expr: &Expr, inverted: bool) -> Option { match op { UnaryOperator::Not => self.eval_expr(expr, !inverted), - UnaryOperator::IsNull => { - // Data skipping only supports IS [NOT] NULL over columns (not expressions) - let Expr::Column(col) = expr else { + UnaryOperator::IsNull => match expr { + // WARNING: Only literals and columns can be safely null-checked. Attempting to + // null-check an expressions such as `a < 10` could wrongly produce FALSE in case + // `a` is just plain missing (rather than known to be NULL. A missing-value can + // arise e.g. if data skipping encounters a column with missing stats, or if + // partition pruning encounters a non-partition column. + Expr::Literal(val) => self.eval_scalar_is_null(val, inverted), + Expr::Column(col) => self.eval_is_null(col, inverted), + _ => { debug!("Unsupported operand: IS [NOT] NULL: {expr:?}"); - return None; - }; - self.eval_is_null(col, inverted) - } + None + } + }, } } @@ -229,12 +251,137 @@ pub(crate) trait PredicateEvaluator { Variadic(VariadicExpression { op, exprs }) => self.eval_variadic(*op, exprs, inverted), } } + + /// Evaluates a predicate with SQL WHERE semantics. + /// + /// By default, [`eval_expr`] behaves badly for comparisons involving NULL columns (e.g. `a < + /// 10` when `a` is NULL), because the comparison correctly evaluates to NULL, but NULL + /// expressions are interpreted as "stats missing" (= cannot skip). This ambiguity can "poison" + /// the entire expression, causing it to return NULL instead of FALSE that would allow skipping: + /// + /// ```text + /// WHERE a < 10 -- NULL (can't skip file) + /// WHERE a < 10 AND TRUE -- NULL (can't skip file) + /// WHERE a < 10 OR FALSE -- NULL (can't skip file) + /// ``` + /// + /// Meanwhile, SQL WHERE semantics only keeps rows for which the filter evaluates to + /// TRUE (discarding rows that evaluate to FALSE or NULL): + /// + /// ```text + /// WHERE a < 10 -- NULL (discard row) + /// WHERE a < 10 AND TRUE -- NULL (discard row) + /// WHERE a < 10 OR FALSE -- NULL (discard row) + /// ``` + /// + /// Conceptually, the behavior difference between data skipping and SQL WHERE semantics can be + /// addressed by evaluating with null-safe semantics, as if by ` IS NOT NULL AND `: + /// + /// ```text + /// WHERE (a < 10) IS NOT NULL AND (a < 10) -- FALSE (skip file) + /// WHERE (a < 10 AND TRUE) IS NOT NULL AND (a < 10 AND TRUE) -- FALSE (skip file) + /// WHERE (a < 10 OR FALSE) IS NOT NULL AND (a < 10 OR FALSE) -- FALSE (skip file) + /// ``` + /// + /// HOWEVER, we cannot safely NULL-check the result of an arbitrary data skipping predicate + /// because an expression will also produce NULL if the value is just plain missing (e.g. data + /// skipping over a column that lacks stats), and if that NULL should propagate all the way to + /// top-level, it would be wrongly interpreted as FALSE (= skippable). + /// + /// To prevent wrong data skipping, the predicate evaluator always returns NULL for a NULL check + /// over anything except for literals and columns with known values. So we must push the NULL + /// check down through supported operations (AND as well as null-intolerant comparisons like + /// `<`, `!=`, etc) until it reaches columns and literals where it can do some good, e.g.: + /// + /// ```text + /// WHERE a < 10 AND (b < 20 OR c < 30) + /// ``` + /// + /// would conceptually be interpreted as + /// + /// ```text + /// WHERE + /// (a < 10 AND (b < 20 OR c < 30)) IS NOT NULL AND + /// (a < 10 AND (b < 20 OR c < 30)) + /// ``` + /// + /// We then push the NULL check down through the top-level AND: + /// + /// ```text + /// WHERE + /// (a < 10 IS NOT NULL AND a < 10) AND + /// ((b < 20 OR c < 30) IS NOT NULL AND (b < 20 OR c < 30)) + /// ``` + /// + /// and attempt to push it further into the `a < 10` and `OR` clauses: + /// + /// ```text + /// WHERE + /// (a IS NOT NULL AND 10 IS NOT NULL AND a < 10) AND + /// (b < 20 OR c < 30) + /// ``` + /// + /// Any time the push-down reaches an operator that does not support push-down (such as OR), we + /// simply drop the NULL check. This way, the top-level NULL check only applies to + /// sub-expressions that can safely implement it, while ignoring other sub-expressions. The + /// unsupported sub-expressions could produce nulls at runtime that prevent skipping, but false + /// positives are OK -- the query will still correctly filter out the unwanted rows that result. + /// + /// At expression evaluation time, a NULL value of `a` (from our example) would evaluate as: + /// + /// ```text + /// AND(..., AND(a IS NOT NULL, 10 IS NOT NULL, a < 10), ...) + /// AND(..., AND(FALSE, TRUE, NULL), ...) + /// AND(..., FALSE, ...) + /// FALSE + /// ``` + /// + /// While a non-NULL value of `a` would instead evaluate as: + /// + /// ```text + /// AND(..., AND(a IS NOT NULL, 10 IS NOT NULL, a < 10), ...) + /// AND(..., AND(TRUE, TRUE, ), ...) + /// AND(..., , ...) + /// ``` + /// + /// And a missing value for `a` would safely disable the clause: + /// + /// ```text + /// AND(..., AND(a IS NOT NULL, 10 IS NOT NULL, a < 10), ...) + /// AND(..., AND(NULL, TRUE, NULL), ...) + /// AND(..., NULL, ...) + /// ``` + fn eval_sql_where(&self, filter: &Expr) -> Option { + use Expr::{Binary, Variadic}; + match filter { + Variadic(v) => { + // Recursively invoke `eval_sql_where` instead of the usual `eval_expr` for AND/OR. + let exprs = v.exprs.iter().map(|expr| self.eval_sql_where(expr)); + self.finish_eval_variadic(v.op, exprs, false) + } + Binary(BinaryExpression { op, left, right }) if op.is_null_intolerant_comparison() => { + // Perform a nullsafe comparison instead of the usual `eval_binary` + let exprs = [ + self.eval_unary(UnaryOperator::IsNull, left, true), + self.eval_unary(UnaryOperator::IsNull, right, true), + self.eval_binary(*op, left, right, false), + ]; + self.finish_eval_variadic(VariadicOperator::And, exprs, false) + } + _ => self.eval_expr(filter, false), + } + } } /// A collection of provided methods from the [`PredicateEvaluator`] trait, factored out to allow -/// reuse by the different predicate evaluator implementations. +/// reuse by multiple bool-output predicate evaluator implementations. pub(crate) struct PredicateEvaluatorDefaults; impl PredicateEvaluatorDefaults { + /// Directly null-tests a scalar. See [`PredicateEvaluator::eval_scalar_is_null`]. + pub(crate) fn eval_scalar_is_null(val: &Scalar, inverted: bool) -> Option { + Some(val.is_null() != inverted) + } + /// Directly evaluates a boolean scalar. See [`PredicateEvaluator::eval_scalar`]. pub(crate) fn eval_scalar(val: &Scalar, inverted: bool) -> Option { match val { @@ -326,6 +473,14 @@ impl ResolveColumnAsScalar for UnimplementedColumnResolver { } } +// Used internally and by some tests +pub(crate) struct EmptyColumnResolver; +impl ResolveColumnAsScalar for EmptyColumnResolver { + fn resolve_column(&self, _col: &ColumnName) -> Option { + None + } +} + // In testing, it is convenient to just build a hashmap of scalar values. #[cfg(test)] impl ResolveColumnAsScalar for std::collections::HashMap { @@ -358,13 +513,17 @@ impl From for DefaultPredicateEvaluator PredicateEvaluator for DefaultPredicateEvaluator { type Output = bool; + fn eval_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option { + PredicateEvaluatorDefaults::eval_scalar_is_null(val, inverted) + } + fn eval_scalar(&self, val: &Scalar, inverted: bool) -> Option { PredicateEvaluatorDefaults::eval_scalar(val, inverted) } fn eval_is_null(&self, col: &ColumnName, inverted: bool) -> Option { let col = self.resolve_column(col)?; - Some(matches!(col, Scalar::Null(_)) != inverted) + self.eval_scalar_is_null(&col, inverted) } fn eval_lt(&self, col: &ColumnName, val: &Scalar) -> Option { @@ -428,12 +587,6 @@ impl PredicateEvaluator for DefaultPredicateEvaluator< /// example, comparisons involving a column are converted into comparisons over that column's /// min/max stats, and NULL checks are converted into comparisons involving the column's nullcount /// and rowcount stats. -/// -/// The types involved in these operations are parameterized and implementation-specific. For -/// example, [`crate::engine::parquet_stats_skipping::ParquetStatsProvider`] directly evaluates data -/// skipping expressions and returnss boolean results, while -/// [`crate::scan::data_skipping::DataSkippingPredicateCreator`] instead converts the input -/// predicate to a data skipping predicate that can be evaluated directly later. pub(crate) trait DataSkippingPredicateEvaluator { /// The output type produced by this expression evaluator type Output; @@ -454,6 +607,9 @@ pub(crate) trait DataSkippingPredicateEvaluator { /// Retrieves the row count of a column (parquet footers always include this stat). fn get_rowcount_stat(&self) -> Option; + /// See [`PredicateEvaluator::eval_scalar_is_null`] + fn eval_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option; + /// See [`PredicateEvaluator::eval_scalar`] fn eval_scalar(&self, val: &Scalar, inverted: bool) -> Option; @@ -589,6 +745,10 @@ pub(crate) trait DataSkippingPredicateEvaluator { impl PredicateEvaluator for T { type Output = T::Output; + fn eval_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option { + self.eval_scalar_is_null(val, inverted) + } + fn eval_scalar(&self, val: &Scalar, inverted: bool) -> Option { self.eval_scalar(val, inverted) } diff --git a/kernel/src/predicates/parquet_stats_skipping.rs b/kernel/src/predicates/parquet_stats_skipping.rs index a8c679d69..ff7536f40 100644 --- a/kernel/src/predicates/parquet_stats_skipping.rs +++ b/kernel/src/predicates/parquet_stats_skipping.rs @@ -1,11 +1,6 @@ //! An implementation of data skipping that leverages parquet stats from the file footer. -use crate::expressions::{ - BinaryExpression, BinaryOperator, ColumnName, Expression as Expr, Scalar, UnaryOperator, - VariadicExpression, VariadicOperator, -}; -use crate::predicates::{ - DataSkippingPredicateEvaluator, PredicateEvaluator, PredicateEvaluatorDefaults, -}; +use crate::expressions::{BinaryOperator, ColumnName, Scalar, VariadicOperator}; +use crate::predicates::{DataSkippingPredicateEvaluator, PredicateEvaluatorDefaults}; use crate::schema::DataType; use std::cmp::Ordering; @@ -65,6 +60,10 @@ impl DataSkippingPredicateEvaluator for T { PredicateEvaluatorDefaults::partial_cmp_scalars(ord, &col, val, inverted) } + fn eval_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option { + PredicateEvaluatorDefaults::eval_scalar_is_null(val, inverted) + } + fn eval_scalar(&self, val: &Scalar, inverted: bool) -> Option { PredicateEvaluatorDefaults::eval_scalar(val, inverted) } @@ -96,109 +95,3 @@ impl DataSkippingPredicateEvaluator for T { PredicateEvaluatorDefaults::finish_eval_variadic(op, exprs, inverted) } } - -/// Data skipping based on parquet footer stats (e.g. row group skipping). The required methods -/// fetch stats values for requested columns (if available and with compatible types), and the -/// provided methods implement the actual skipping logic. -/// -/// NOTE: We are given a row-based filter, but stats-based predicate evaluation -- which applies to -/// a SET of rows -- has different semantics than row-based predicate evaluation. The provided -/// methods of this class convert various supported expressions into data skipping predicates, and -/// then return the result of evaluating the translated filter. -pub(crate) trait ParquetStatsSkippingFilter { - /// Attempts to filter using SQL WHERE semantics. - /// - /// By default, [`apply_expr`] can produce unwelcome behavior for comparisons involving all-NULL - /// columns (e.g. `a == 10`), because the (legitimately NULL) min/max stats are interpreted as - /// stats-missing that produces a NULL data skipping result). The resulting NULL can "poison" - /// the entire expression, causing it to return NULL instead of FALSE that would allow skipping. - /// - /// Meanwhile, SQL WHERE semantics only keep rows for which the filter evaluates to TRUE -- - /// effectively turning `` into the null-safe predicate `AND( IS NOT NULL, )`. - /// - /// We cannot safely evaluate an arbitrary data skipping expression with null-safe semantics - /// (because NULL could also mean missing-stats), but we CAN safely turn a column reference in a - /// comparison into a null-safe comparison, as long as the comparison's parent expressions are - /// all AND. To see why, consider a WHERE clause filter of the form: - /// - /// ```text - /// AND(..., a {cmp} b, ...) - /// ``` - /// - /// In order allow skipping based on the all-null `a` or `b`, we want to actually evaluate: - /// ```text - /// AND(..., AND(a IS NOT NULL, b IS NOT NULL, a {cmp} b), ...) - /// ``` - /// - /// This optimization relies on the fact that we only support IS [NOT] NULL skipping for - /// columns, and we only support skipping for comparisons between columns and literals. Thus, a - /// typical case such as: `AND(..., x < 10, ...)` would in the all-null case be evaluated as: - /// ```text - /// AND(..., AND(x IS NOT NULL, 10 IS NOT NULL, x < 10), ...) - /// AND(..., AND(FALSE, NULL, NULL), ...) - /// AND(..., FALSE, ...) - /// FALSE - /// ``` - /// - /// In the not all-null case, it would instead evaluate as: - /// ```text - /// AND(..., AND(x IS NOT NULL, 10 IS NOT NULL, x < 10), ...) - /// AND(..., AND(TRUE, NULL, ), ...) - /// ``` - /// - /// If the result was FALSE, it forces both inner and outer AND to FALSE, as desired. If the - /// result was TRUE or NULL, then it does not contribute to data skipping but also does not - /// block it if other legs of the AND evaluate to FALSE. - // TODO: If these are generally useful, we may want to move them into PredicateEvaluator? - fn eval_sql_where(&self, filter: &Expr) -> Option; - fn eval_binary_nullsafe(&self, op: BinaryOperator, left: &Expr, right: &Expr) -> Option; -} - -impl> ParquetStatsSkippingFilter for T { - fn eval_sql_where(&self, filter: &Expr) -> Option { - use Expr::{Binary, Variadic}; - match filter { - Variadic(VariadicExpression { - op: VariadicOperator::And, - exprs, - }) => { - let exprs: Vec<_> = exprs - .iter() - .map(|expr| self.eval_sql_where(expr)) - .map(|result| match result { - Some(value) => Expr::literal(value), - None => Expr::null_literal(DataType::BOOLEAN), - }) - .collect(); - self.eval_variadic(VariadicOperator::And, &exprs, false) - } - Binary(BinaryExpression { op, left, right }) => { - self.eval_binary_nullsafe(*op, left, right) - } - _ => self.eval_expr(filter, false), - } - } - - /// Helper method for [`apply_sql_where`], that evaluates `{a} {cmp} {b}` as - /// ```text - /// AND({a} IS NOT NULL, {b} IS NOT NULL, {a} {cmp} {b}) - /// ``` - /// - /// The null checks only apply to column expressions, so at least one of them will always be - /// NULL (since we don't support skipping over column-column comparisons). If any NULL check - /// fails (producing FALSE), it short-circuits the entire AND without ever evaluating the - /// comparison. Otherwise, the original comparison will run and -- if FALSE -- can cause data - /// skipping as usual. - fn eval_binary_nullsafe(&self, op: BinaryOperator, left: &Expr, right: &Expr) -> Option { - use UnaryOperator::IsNull; - // Convert `a {cmp} b` to `AND(a IS NOT NULL, b IS NOT NULL, a {cmp} b)`, - // and only evaluate the comparison if the null checks don't short circuit. - if let Some(false) = self.eval_unary(IsNull, left, true) { - return Some(false); - } - if let Some(false) = self.eval_unary(IsNull, right, true) { - return Some(false); - } - self.eval_binary(op, left, right, false) - } -} diff --git a/kernel/src/predicates/parquet_stats_skipping/tests.rs b/kernel/src/predicates/parquet_stats_skipping/tests.rs index 50833a166..949a4cb68 100644 --- a/kernel/src/predicates/parquet_stats_skipping/tests.rs +++ b/kernel/src/predicates/parquet_stats_skipping/tests.rs @@ -1,6 +1,6 @@ use super::*; use crate::expressions::{column_expr, Expression as Expr}; -use crate::predicates::PredicateEvaluator; +use crate::predicates::PredicateEvaluator as _; use crate::DataType; const TRUE: Option = Some(true); @@ -257,126 +257,3 @@ fn test_eval_is_null() { // all nulls do_test(2, &[TRUE, FALSE]); } - -struct AllNullTestFilter; -impl ParquetStatsProvider for AllNullTestFilter { - fn get_parquet_min_stat(&self, _col: &ColumnName, _data_type: &DataType) -> Option { - None - } - - fn get_parquet_max_stat(&self, _col: &ColumnName, _data_type: &DataType) -> Option { - None - } - - fn get_parquet_nullcount_stat(&self, _col: &ColumnName) -> Option { - Some(self.get_parquet_rowcount_stat()) - } - - fn get_parquet_rowcount_stat(&self) -> i64 { - 10 - } -} - -#[test] -fn test_sql_where() { - let col = &column_expr!("x"); - const VAL: Expr = Expr::Literal(Scalar::Integer(1)); - const NULL: Expr = Expr::Literal(Scalar::Null(DataType::BOOLEAN)); - const FALSE: Expr = Expr::Literal(Scalar::Boolean(false)); - const TRUE: Expr = Expr::Literal(Scalar::Boolean(true)); - - // Basic sanity checks - expect_eq!(AllNullTestFilter.eval_sql_where(&VAL), None, "WHERE {VAL}"); - expect_eq!(AllNullTestFilter.eval_sql_where(col), None, "WHERE {col}"); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::is_null(col.clone())), - Some(true), // No injected NULL checks - "WHERE {col} IS NULL" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::lt(TRUE, FALSE)), - Some(false), // Injected NULL checks don't short circuit when inputs are NOT NULL - "WHERE {TRUE} < {FALSE}" - ); - - // Contrast normal vs SQL WHERE semantics - comparison - expect_eq!( - AllNullTestFilter.eval_expr(&Expr::lt(col.clone(), VAL), false), - None, - "{col} < {VAL}" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::lt(col.clone(), VAL)), - Some(false), - "WHERE {col} < {VAL}" - ); - expect_eq!( - AllNullTestFilter.eval_expr(&Expr::lt(VAL, col.clone()), false), - None, - "{VAL} < {col}" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::lt(VAL, col.clone())), - Some(false), - "WHERE {VAL} < {col}" - ); - - // Contrast normal vs SQL WHERE semantics - comparison inside AND - expect_eq!( - AllNullTestFilter.eval_expr(&Expr::and(NULL, Expr::lt(col.clone(), VAL)), false), - None, - "{NULL} AND {col} < {VAL}" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::and(NULL, Expr::lt(col.clone(), VAL),)), - Some(false), - "WHERE {NULL} AND {col} < {VAL}" - ); - - expect_eq!( - AllNullTestFilter.eval_expr(&Expr::and(TRUE, Expr::lt(col.clone(), VAL)), false), - None, // NULL (from the NULL check) is stronger than TRUE - "{TRUE} AND {col} < {VAL}" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::and(TRUE, Expr::lt(col.clone(), VAL),)), - Some(false), // FALSE (from the NULL check) is stronger than TRUE - "WHERE {TRUE} AND {col} < {VAL}" - ); - - // Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND - expect_eq!( - AllNullTestFilter.eval_expr( - &Expr::and(TRUE, Expr::and(NULL, Expr::lt(col.clone(), VAL)),), - false, - ), - None, - "{TRUE} AND ({NULL} AND {col} < {VAL})" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::and( - TRUE, - Expr::and(NULL, Expr::lt(col.clone(), VAL)), - )), - Some(false), - "WHERE {TRUE} AND ({NULL} AND {col} < {VAL})" - ); - - // Semantics are the same for comparison inside OR inside AND - expect_eq!( - AllNullTestFilter.eval_expr( - &Expr::or(FALSE, Expr::and(NULL, Expr::lt(col.clone(), VAL)),), - false, - ), - None, - "{FALSE} OR ({NULL} AND {col} < {VAL})" - ); - expect_eq!( - AllNullTestFilter.eval_sql_where(&Expr::or( - FALSE, - Expr::and(NULL, Expr::lt(col.clone(), VAL)), - )), - None, - "WHERE {FALSE} OR ({NULL} AND {col} < {VAL})" - ); -} diff --git a/kernel/src/predicates/tests.rs b/kernel/src/predicates/tests.rs index fa4aec191..fcfb08eb9 100644 --- a/kernel/src/predicates/tests.rs +++ b/kernel/src/predicates/tests.rs @@ -2,7 +2,6 @@ use super::*; use crate::expressions::{ column_expr, column_name, ArrayData, Expression, StructData, UnaryOperator, }; -use crate::predicates::PredicateEvaluator; use crate::schema::ArrayType; use crate::DataType; @@ -394,12 +393,12 @@ fn test_eval_is_null() { let expr = Expression::literal(1); expect_eq!( filter.eval_unary(UnaryOperator::IsNull, &expr, true), - None, + Some(true), "1 IS NOT NULL" ); expect_eq!( filter.eval_unary(UnaryOperator::IsNull, &expr, false), - None, + Some(false), "1 IS NULL" ); } @@ -570,3 +569,81 @@ fn eval_binary() { ); } } + +// NOTE: `None` is NOT equivalent to `Some(Scalar::Null)` +struct NullColumnResolver; +impl ResolveColumnAsScalar for NullColumnResolver { + fn resolve_column(&self, _col: &ColumnName) -> Option { + Some(Scalar::Null(DataType::INTEGER)) + } +} + +#[test] +fn test_sql_where() { + let col = &column_expr!("x"); + const VAL: Expr = Expr::Literal(Scalar::Integer(1)); + const NULL: Expr = Expr::Literal(Scalar::Null(DataType::BOOLEAN)); + const FALSE: Expr = Expr::Literal(Scalar::Boolean(false)); + const TRUE: Expr = Expr::Literal(Scalar::Boolean(true)); + let null_filter = DefaultPredicateEvaluator::from(NullColumnResolver); + let empty_filter = DefaultPredicateEvaluator::from(EmptyColumnResolver); + + // Basic sanity checks + expect_eq!(null_filter.eval_sql_where(&VAL), None, "WHERE {VAL}"); + expect_eq!(null_filter.eval_sql_where(col), None, "WHERE {col}"); + + // SQL eval does not modify behavior of IS NULL + let expr = &Expr::is_null(col.clone()); + expect_eq!(null_filter.eval_sql_where(expr), Some(true), "{expr}"); + + // Injected NULL checks only short circuit if inputs are NULL + let expr = &Expr::lt(FALSE, TRUE); + expect_eq!(null_filter.eval_sql_where(expr), Some(true), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), Some(true), "{expr}"); + + // Constrast normal vs SQL WHERE semantics - comparison + let expr = &Expr::lt(col.clone(), VAL); + expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + // NULL check produces NULL due to missing column + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + let expr = &Expr::lt(VAL, col.clone()); + expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + let expr = &Expr::distinct(VAL, col.clone()); + expect_eq!(null_filter.eval_expr(expr, false), Some(true), "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(true), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + let expr = &Expr::distinct(NULL, col.clone()); + expect_eq!(null_filter.eval_expr(expr, false), Some(false), "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + // Constrast normal vs SQL WHERE semantics - comparison inside AND + let expr = &Expr::and(NULL, Expr::lt(col.clone(), VAL)); + expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + // NULL/FALSE (from the NULL check) is stronger than TRUE + let expr = &Expr::and(TRUE, Expr::lt(col.clone(), VAL)); + expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + // Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND + let expr = &Expr::and(TRUE, Expr::and(NULL, Expr::lt(col.clone(), VAL))); + expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); + + // Ditto for comparison inside OR inside AND + let expr = &Expr::or(FALSE, Expr::and(NULL, Expr::lt(col.clone(), VAL))); + expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}"); + expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}"); + expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}"); +} diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index b30711f48..cea54c4c9 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -36,8 +36,15 @@ mod tests; /// are not eligible for data skipping. /// - `OR` is rewritten only if all operands are eligible for data skipping. Otherwise, the whole OR /// expression is dropped. -fn as_data_skipping_predicate(expr: &Expr, inverted: bool) -> Option { - DataSkippingPredicateCreator.eval_expr(expr, inverted) +#[cfg(test)] +fn as_data_skipping_predicate(expr: &Expr) -> Option { + DataSkippingPredicateCreator.eval_expr(expr, false) +} + +/// Like `as_data_skipping_predicate`, but invokes [`PredicateEvaluator::eval_sql_where`] instead +/// of [`PredicateEvaluator::eval_expr`]. +fn as_sql_data_skipping_predicate(expr: &Expr) -> Option { + DataSkippingPredicateCreator.eval_sql_where(expr) } pub(crate) struct DataSkippingFilter { @@ -108,7 +115,7 @@ impl DataSkippingFilter { let skipping_evaluator = engine.get_expression_handler().get_evaluator( stats_schema.clone(), - Expr::struct_from([as_data_skipping_predicate(&predicate, false)?]), + Expr::struct_from([as_sql_data_skipping_predicate(&predicate)?]), PREDICATE_SCHEMA.clone(), ); @@ -205,6 +212,10 @@ impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator { Some(Expr::binary(op, col, val.clone())) } + fn eval_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option { + PredicateEvaluatorDefaults::eval_scalar_is_null(val, inverted).map(Expr::literal) + } + fn eval_scalar(&self, val: &Scalar, inverted: bool) -> Option { PredicateEvaluatorDefaults::eval_scalar(val, inverted).map(Expr::literal) } diff --git a/kernel/src/scan/data_skipping/tests.rs b/kernel/src/scan/data_skipping/tests.rs index e12adb526..4f1d74f63 100644 --- a/kernel/src/scan/data_skipping/tests.rs +++ b/kernel/src/scan/data_skipping/tests.rs @@ -34,7 +34,7 @@ fn test_eval_is_null() { ]); let filter = DefaultPredicateEvaluator::from(resolver); for (expr, expect) in expressions.iter().zip(expected) { - let pred = as_data_skipping_predicate(expr, false).unwrap(); + let pred = as_data_skipping_predicate(expr).unwrap(); expect_eq!( filter.eval_expr(&pred, false), *expect, @@ -77,7 +77,7 @@ fn test_eval_binary_comparisons() { ]); let filter = DefaultPredicateEvaluator::from(resolver); for (expr, expect) in expressions.iter().zip(expected.iter()) { - let pred = as_data_skipping_predicate(expr, false).unwrap(); + let pred = as_data_skipping_predicate(expr).unwrap(); expect_eq!( filter.eval_expr(&pred, false), *expect, @@ -160,7 +160,7 @@ fn test_eval_variadic() { .collect(); let expr = Expr::and_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, false).unwrap(); + let pred = as_data_skipping_predicate(&expr).unwrap(); expect_eq!( filter.eval_expr(&pred, false), *expect_and, @@ -168,19 +168,19 @@ fn test_eval_variadic() { ); let expr = Expr::or_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, false).unwrap(); + let pred = as_data_skipping_predicate(&expr).unwrap(); expect_eq!(filter.eval_expr(&pred, false), *expect_or, "OR({inputs:?})"); - let expr = Expr::and_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, true).unwrap(); + let expr = !Expr::and_from(inputs.clone()); + let pred = as_data_skipping_predicate(&expr).unwrap(); expect_eq!( filter.eval_expr(&pred, false), expect_and.map(|val| !val), "NOT AND({inputs:?})" ); - let expr = Expr::or_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, true).unwrap(); + let expr = !Expr::or_from(inputs.clone()); + let pred = as_data_skipping_predicate(&expr).unwrap(); expect_eq!( filter.eval_expr(&pred, false), expect_or.map(|val| !val), @@ -216,7 +216,7 @@ fn test_eval_distinct() { ]); let filter = DefaultPredicateEvaluator::from(resolver); for (expr, expect) in expressions.iter().zip(expected) { - let pred = as_data_skipping_predicate(expr, false).unwrap(); + let pred = as_data_skipping_predicate(expr).unwrap(); expect_eq!( filter.eval_expr(&pred, false), *expect, @@ -252,3 +252,93 @@ fn test_eval_distinct() { // min < value < max, all nulls do_test(five, fifteen, 2, &[TRUE, FALSE, FALSE, TRUE]); } + +#[test] +fn test_sql_where() { + let col = &column_expr!("x"); + const VAL: Expr = Expr::Literal(Scalar::Integer(10)); + const NULL: Expr = Expr::Literal(Scalar::Null(DataType::BOOLEAN)); + const FALSE: Expr = Expr::Literal(Scalar::Boolean(false)); + const TRUE: Expr = Expr::Literal(Scalar::Boolean(true)); + + const ROWCOUNT: i64 = 2; + const ALL_NULL: i64 = ROWCOUNT; + const SOME_NULL: i64 = 1; + const NO_NULL: i64 = 0; + let do_test = + |nulls: i64, expr: &Expr, missing: bool, expect: Option, expect_sql: Option| { + assert!((0..=ROWCOUNT).contains(&nulls)); + let (min, max) = if nulls < ROWCOUNT { + (Scalar::Integer(5), Scalar::Integer(15)) + } else { + ( + Scalar::Null(DataType::INTEGER), + Scalar::Null(DataType::INTEGER), + ) + }; + let resolver = if missing { + HashMap::new() + } else { + HashMap::from_iter([ + (column_name!("numRecords"), Scalar::from(ROWCOUNT)), + (column_name!("nullCount.x"), Scalar::from(nulls)), + (column_name!("minValues.x"), min.clone()), + (column_name!("maxValues.x"), max.clone()), + ]) + }; + let filter = DefaultPredicateEvaluator::from(resolver); + let pred = as_data_skipping_predicate(expr).unwrap(); + expect_eq!( + filter.eval_expr(&pred, false), + expect, + "{expr:#?} became {pred:#?} ({min}..{max}, {nulls} nulls)" + ); + let sql_pred = as_sql_data_skipping_predicate(expr).unwrap(); + expect_eq!( + filter.eval_expr(&sql_pred, false), + expect_sql, + "{expr:#?} became {sql_pred:#?} ({min}..{max}, {nulls} nulls)" + ); + }; + + // Sanity tests -- only all-null columns should behave differently between normal and SQL WHERE. + const MISSING: bool = true; + const PRESENT: bool = false; + let expr = &Expr::lt(TRUE, FALSE); + do_test(ALL_NULL, expr, MISSING, Some(false), Some(false)); + + let expr = &Expr::is_not_null(col.clone()); + do_test(ALL_NULL, expr, PRESENT, Some(false), Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); + + // SQL WHERE allows a present-but-all-null column to be pruned, but not a missing column. + let expr = &Expr::lt(col.clone(), VAL); + do_test(NO_NULL, expr, PRESENT, Some(true), Some(true)); + do_test(SOME_NULL, expr, PRESENT, Some(true), Some(true)); + do_test(ALL_NULL, expr, PRESENT, None, Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); + + // Comparison inside AND works + let expr = &Expr::and(NULL, Expr::lt(col.clone(), VAL)); + do_test(ALL_NULL, expr, PRESENT, None, Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); + + let expr = &Expr::and(TRUE, Expr::lt(VAL, col.clone())); + do_test(ALL_NULL, expr, PRESENT, None, Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); + + // Comparison inside AND inside AND works + let expr = &Expr::and(TRUE, Expr::and(NULL, Expr::lt(col.clone(), VAL))); + do_test(ALL_NULL, expr, PRESENT, None, Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); + + // Comparison inside OR works + let expr = &Expr::or(FALSE, Expr::lt(col.clone(), VAL)); + do_test(ALL_NULL, expr, PRESENT, None, Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); + + // Comparison inside AND inside OR works + let expr = &Expr::or(FALSE, Expr::and(TRUE, Expr::lt(col.clone(), VAL))); + do_test(ALL_NULL, expr, PRESENT, None, Some(false)); + do_test(ALL_NULL, expr, MISSING, None, None); +} diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index cd17bca7d..49af0222c 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,9 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; -use crate::predicates::parquet_stats_skipping::{ - ParquetStatsProvider, ParquetStatsSkippingFilter as _, -}; +use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, @@ -184,27 +182,10 @@ impl PhysicalPredicate { // Evaluates a static data skipping predicate, ignoring any column references, and returns true if // the predicate allows to statically skip all files. Since this is direct evaluation (not an -// expression rewrite), we use a dummy `ParquetStatsProvider` that provides no stats. +// expression rewrite), we use a `DefaultPredicateEvaluator` with an empty column resolver. fn can_statically_skip_all_files(predicate: &Expression) -> bool { - struct NoStats; - impl ParquetStatsProvider for NoStats { - fn get_parquet_min_stat(&self, _: &ColumnName, _: &DataType) -> Option { - None - } - - fn get_parquet_max_stat(&self, _: &ColumnName, _: &DataType) -> Option { - None - } - - fn get_parquet_nullcount_stat(&self, _: &ColumnName) -> Option { - None - } - - fn get_parquet_rowcount_stat(&self) -> i64 { - 0 - } - } - NoStats.eval_sql_where(predicate) == Some(false) + use crate::predicates::PredicateEvaluator as _; + DefaultPredicateEvaluator::from(EmptyColumnResolver).eval_sql_where(predicate) == Some(false) } // Build the stats read schema filtering the table schema to keep only skipping-eligible