Skip to content

Commit cc139c9

Browse files
chore(pruning): Support IS NOT NULL predicates in PruningPredicate (#9208)
* chore: add test cases for predicate is_null and is_not_null * feat(pruning): support predicate build for is_not_null expression * doc: add example in doc for `IS NOT NULL` * chore: remove edit on cargo file * chore: add `IS NOT NULL` test for row group pruning chore: remove Debug derive * chore: update comment null --> NULL Co-authored-by: Liang-Chi Hsieh <[email protected]> * chore: update comment Co-authored-by: Liang-Chi Hsieh <[email protected]> --------- Co-authored-by: Liang-Chi Hsieh <[email protected]>
1 parent 0c46d7f commit cc139c9

File tree

2 files changed

+107
-5
lines changed

2 files changed

+107
-5
lines changed

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -620,13 +620,20 @@ mod tests {
620620
ParquetStatistics::boolean(Some(false), Some(true), None, 1, false),
621621
],
622622
);
623-
vec![rgm1, rgm2]
623+
let rgm3 = get_row_group_meta_data(
624+
&schema_descr,
625+
vec![
626+
ParquetStatistics::int32(Some(17), Some(30), None, 1, false),
627+
ParquetStatistics::boolean(Some(false), Some(true), None, 0, false),
628+
],
629+
);
630+
vec![rgm1, rgm2, rgm3]
624631
}
625632

626633
#[test]
627634
fn row_group_pruning_predicate_null_expr() {
628635
use datafusion_expr::{col, lit};
629-
// int > 1 and IsNull(bool) => c1_max > 1 and bool_null_count > 0
636+
// c1 > 15 and IsNull(c2) => c1_max > 15 and c2_null_count > 0
630637
let schema = Arc::new(Schema::new(vec![
631638
Field::new("c1", DataType::Int32, false),
632639
Field::new("c2", DataType::Boolean, false),
@@ -657,7 +664,7 @@ mod tests {
657664
use datafusion_expr::{col, lit};
658665
// test row group predicate with an unknown (Null) expr
659666
//
660-
// int > 1 and bool = NULL => c1_max > 1 and null
667+
// c1 > 15 and c2 = NULL => c1_max > 15 and NULL
661668
let schema = Arc::new(Schema::new(vec![
662669
Field::new("c1", DataType::Int32, false),
663670
Field::new("c2", DataType::Boolean, false),
@@ -672,7 +679,7 @@ mod tests {
672679

673680
let metrics = parquet_file_metrics();
674681
// bool = NULL always evaluates to NULL (and thus will not
675-
// pass predicates. Ideally these should both be false
682+
// pass predicates. Ideally these should all be false
676683
assert_eq!(
677684
prune_row_groups_by_statistics(
678685
&schema,
@@ -682,7 +689,39 @@ mod tests {
682689
Some(&pruning_predicate),
683690
&metrics
684691
),
685-
vec![1]
692+
vec![1, 2]
693+
);
694+
}
695+
696+
#[test]
697+
fn row_group_pruning_predicate_not_null_expr() {
698+
use datafusion_expr::{col, lit};
699+
// c1 > 15 and IsNotNull(c2) => c1_max > 15 and c2_null_count = 0
700+
let schema = Arc::new(Schema::new(vec![
701+
Field::new("c1", DataType::Int32, false),
702+
Field::new("c2", DataType::Boolean, false),
703+
]));
704+
let schema_descr = arrow_to_parquet_schema(&schema).unwrap();
705+
let expr = col("c1").gt(lit(15)).and(col("c2").is_not_null());
706+
let expr = logical2physical(&expr, &schema);
707+
let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
708+
let groups = gen_row_group_meta_data_for_pruning_predicate();
709+
710+
let metrics = parquet_file_metrics();
711+
assert_eq!(
712+
prune_row_groups_by_statistics(
713+
&schema,
714+
&schema_descr,
715+
&groups,
716+
None,
717+
Some(&pruning_predicate),
718+
&metrics
719+
),
720+
// The first row group was filtered out because c1_max is 10, which is smaller than 15.
721+
// The second row group was filtered out because it contains null value on "c2".
722+
// The third row group is kept because c1_max is 30, which is greater than 15 AND
723+
// it does NOT contain any null value on "c2".
724+
vec![2]
686725
);
687726
}
688727

datafusion/core/src/physical_optimizer/pruning.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ pub trait PruningStatistics {
315315
/// `x < 5` | `x_max < 5`
316316
/// `x = 5 AND y = 10` | `x_min <= 5 AND 5 <= x_max AND y_min <= 10 AND 10 <= y_max`
317317
/// `x IS NULL` | `x_null_count > 0`
318+
/// `x IS NOT NULL` | `x_null_count = 0`
318319
///
319320
/// ## Predicate Evaluation
320321
/// The PruningPredicate works in two passes
@@ -1120,6 +1121,34 @@ fn build_is_null_column_expr(
11201121
}
11211122
}
11221123

1124+
/// Given an expression reference to `expr`, if `expr` is a column expression,
1125+
/// returns a pruning expression in terms of IsNotNull that will evaluate to true
1126+
/// if the column does NOT contain null, and false if it may contain null
1127+
fn build_is_not_null_column_expr(
1128+
expr: &Arc<dyn PhysicalExpr>,
1129+
schema: &Schema,
1130+
required_columns: &mut RequiredColumns,
1131+
) -> Option<Arc<dyn PhysicalExpr>> {
1132+
if let Some(col) = expr.as_any().downcast_ref::<phys_expr::Column>() {
1133+
let field = schema.field_with_name(col.name()).ok()?;
1134+
1135+
let null_count_field = &Field::new(field.name(), DataType::UInt64, true);
1136+
required_columns
1137+
.null_count_column_expr(col, expr, null_count_field)
1138+
.map(|null_count_column_expr| {
1139+
// IsNotNull(column) => null_count = 0
1140+
Arc::new(phys_expr::BinaryExpr::new(
1141+
null_count_column_expr,
1142+
Operator::Eq,
1143+
Arc::new(phys_expr::Literal::new(ScalarValue::UInt64(Some(0)))),
1144+
)) as _
1145+
})
1146+
.ok()
1147+
} else {
1148+
None
1149+
}
1150+
}
1151+
11231152
/// The maximum number of entries in an `InList` that might be rewritten into
11241153
/// an OR chain
11251154
const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20;
@@ -1146,6 +1175,14 @@ fn build_predicate_expression(
11461175
return build_is_null_column_expr(is_null.arg(), schema, required_columns)
11471176
.unwrap_or(unhandled);
11481177
}
1178+
if let Some(is_not_null) = expr_any.downcast_ref::<phys_expr::IsNotNullExpr>() {
1179+
return build_is_not_null_column_expr(
1180+
is_not_null.arg(),
1181+
schema,
1182+
required_columns,
1183+
)
1184+
.unwrap_or(unhandled);
1185+
}
11491186
if let Some(col) = expr_any.downcast_ref::<phys_expr::Column>() {
11501187
return build_single_column_expr(col, schema, required_columns, false)
11511188
.unwrap_or(unhandled);
@@ -2052,6 +2089,32 @@ mod tests {
20522089
Ok(())
20532090
}
20542091

2092+
#[test]
2093+
fn row_group_predicate_is_null() -> Result<()> {
2094+
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
2095+
let expected_expr = "c1_null_count@0 > 0";
2096+
2097+
let expr = col("c1").is_null();
2098+
let predicate_expr =
2099+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
2100+
assert_eq!(predicate_expr.to_string(), expected_expr);
2101+
2102+
Ok(())
2103+
}
2104+
2105+
#[test]
2106+
fn row_group_predicate_is_not_null() -> Result<()> {
2107+
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
2108+
let expected_expr = "c1_null_count@0 = 0";
2109+
2110+
let expr = col("c1").is_not_null();
2111+
let predicate_expr =
2112+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
2113+
assert_eq!(predicate_expr.to_string(), expected_expr);
2114+
2115+
Ok(())
2116+
}
2117+
20552118
#[test]
20562119
fn row_group_predicate_required_columns() -> Result<()> {
20572120
let schema = Schema::new(vec![

0 commit comments

Comments
 (0)