
Commit e539663

Merge remote-tracking branch 'apache/main' into workspace-tokio

2 parents: 2f8caa7 + afb169c
6 files changed: +307 -18 lines

datafusion-examples/README.md

+1
@@ -56,6 +56,7 @@ cargo run --example csv_sql
 - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
 - [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
 - [`memtable.rs`](examples/memtable.rs): Create and query data in memory using SQL and `RecordBatch`es
+- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics
 - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
 - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
 - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
datafusion-examples/examples/pruning.rs

+186

@@ -0,0 +1,186 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, BooleanArray, Int32Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion::common::{DFSchema, ScalarValue};
use datafusion::execution::context::ExecutionProps;
use datafusion::physical_expr::create_physical_expr;
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
use datafusion::prelude::*;
use std::collections::HashSet;
use std::sync::Arc;

/// This example shows how to use DataFusion's `PruningPredicate` to prove
/// filter expressions can never be true based on statistics such as min/max
/// values of columns.
///
/// The process is called "pruning" and is commonly used in query engines to
/// quickly eliminate entire files / partitions / row groups of data from
/// consideration using statistical information from a catalog or other
/// metadata.
#[tokio::main]
async fn main() {
    // In this example, we'll use the PruningPredicate to determine if
    // the expression `x = 5 AND y = 10` can never be true based on statistics

    // Start with the expression `x = 5 AND y = 10`
    let expr = col("x").eq(lit(5)).and(col("y").eq(lit(10)));

    // We can analyze this predicate using information provided by the
    // `PruningStatistics` trait, in this case we'll use a simple catalog that
    // models three files. For all rows in each file:
    //
    // File 1: x has values between `4` and `6`
    //         y has the value 10
    //
    // File 2: x has values between `4` and `6`
    //         y has the value of `7`
    //
    // File 3: x has the value 1
    //         nothing is known about the value of y
    let my_catalog = MyCatalog::new();

    // Create a `PruningPredicate`.
    //
    // Note the predicate does not automatically coerce types or simplify
    // expressions. See expr_api.rs examples for how to do this if required
    let predicate = create_pruning_predicate(expr, &my_catalog.schema);

    // Evaluate the predicate for the three files in the catalog
    let prune_results = predicate.prune(&my_catalog).unwrap();
    println!("Pruning results: {prune_results:?}");

    // The result is a `Vec` of bool values, one for each file in the catalog
    assert_eq!(
        prune_results,
        vec![
            // File 1: `x = 5 AND y = 10` can evaluate to true if x has values
            // between `4` and `6`, y has the value `10`, so the file can not
            // be skipped
            //
            // NOTE this doesn't mean there actually are rows that evaluate to
            // true, but the pruning predicate can't prove there aren't any.
            true,
            // File 2: `x = 5 AND y = 10` can never evaluate to true because y
            // has only the value of 7. Thus this file can be skipped.
            false,
            // File 3: `x = 5 AND y = 10` can never evaluate to true because x
            // has the value `1`, and for any value of `y` the expression
            // evaluates to false (`x = 5 AND y = 10` --> `false AND null` -->
            // `false`). Thus this file can also be skipped.
            false
        ]
    );
}

/// A simple model catalog that has information about the three files that
/// store data for a table with two columns (x and y).
struct MyCatalog {
    schema: SchemaRef,
    // (min, max) for x
    x_values: Vec<(Option<i32>, Option<i32>)>,
    // (min, max) for y
    y_values: Vec<(Option<i32>, Option<i32>)>,
}

impl MyCatalog {
    fn new() -> Self {
        MyCatalog {
            schema: Arc::new(Schema::new(vec![
                Field::new("x", DataType::Int32, false),
                Field::new("y", DataType::Int32, false),
            ])),
            x_values: vec![
                // File 1: x has values between `4` and `6`
                (Some(4), Some(6)),
                // File 2: x has values between `4` and `6`
                (Some(4), Some(6)),
                // File 3: x has the value 1
                (Some(1), Some(1)),
            ],
            y_values: vec![
                // File 1: y has the value 10
                (Some(10), Some(10)),
                // File 2: y has the value of `7`
                (Some(7), Some(7)),
                // File 3: nothing is known about the value of y. This is
                // represented as (None, None).
                //
                // Note, returning null means the value isn't known, NOT
                // that we know the entire column is null.
                (None, None),
            ],
        }
    }
}

/// We communicate the statistical information to DataFusion by implementing
/// the PruningStatistics trait.
impl PruningStatistics for MyCatalog {
    fn num_containers(&self) -> usize {
        // there are 3 files in this "catalog", and thus each array returned
        // from min_values and max_values also has 3 elements
        3
    }

    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
        // The pruning predicate evaluates the bounds for multiple expressions
        // at once, so return an array with an element for the minimum value in
        // each file
        match column.name.as_str() {
            "x" => Some(i32_array(self.x_values.iter().map(|(min, _)| min))),
            "y" => Some(i32_array(self.y_values.iter().map(|(min, _)| min))),
            name => panic!("unknown column name: {name}"),
        }
    }

    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
        // similarly to min_values, return an array with an element for the
        // maximum value in each file
        match column.name.as_str() {
            "x" => Some(i32_array(self.x_values.iter().map(|(_, max)| max))),
            "y" => Some(i32_array(self.y_values.iter().map(|(_, max)| max))),
            name => panic!("unknown column name: {name}"),
        }
    }

    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
        // In this example, we know nothing about the number of nulls
        None
    }

    fn contained(
        &self,
        _column: &Column,
        _values: &HashSet<ScalarValue>,
    ) -> Option<BooleanArray> {
        // this method can be used to implement Bloom filter like filtering
        // but we do not illustrate that here
        None
    }
}

fn create_pruning_predicate(expr: Expr, schema: &SchemaRef) -> PruningPredicate {
    let df_schema = DFSchema::try_from(schema.as_ref().clone()).unwrap();
    let props = ExecutionProps::new();
    let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
    PruningPredicate::try_new(physical_expr, schema.clone()).unwrap()
}

fn i32_array<'a>(values: impl Iterator<Item = &'a Option<i32>>) -> ArrayRef {
    Arc::new(Int32Array::from_iter(values.cloned()))
}
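Running the example prints the pruning decisions before the final assertion re-checks them; assuming the same invocation convention as the other entries in the README above:

$ cargo run --example pruning
Pruning results: [true, false, false]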

datafusion/common/src/scalar.rs

+8 -12
@@ -27,7 +27,6 @@ use std::iter::repeat;
 use std::str::FromStr;
 use std::sync::Arc;

-use crate::arrow_datafusion_err;
 use crate::cast::{
     as_decimal128_array, as_decimal256_array, as_dictionary_array,
     as_fixed_size_binary_array, as_fixed_size_list_array,
@@ -1639,18 +1638,16 @@ impl ScalarValue {
     scale: i8,
     size: usize,
 ) -> Result<Decimal128Array> {
-    match value {
+    Ok(match value {
         Some(val) => Decimal128Array::from(vec![val; size])
-            .with_precision_and_scale(precision, scale)
-            .map_err(|e| arrow_datafusion_err!(e)),
+            .with_precision_and_scale(precision, scale)?,
         None => {
             let mut builder = Decimal128Array::builder(size)
-                .with_precision_and_scale(precision, scale)
-                .map_err(|e| arrow_datafusion_err!(e))?;
+                .with_precision_and_scale(precision, scale)?;
             builder.append_nulls(size);
-            Ok(builder.finish())
+            builder.finish()
         }
-    }
+    })
 }

 fn build_decimal256_array(
@@ -1659,11 +1656,10 @@ impl ScalarValue {
     scale: i8,
     size: usize,
 ) -> Result<Decimal256Array> {
-    std::iter::repeat(value)
+    Ok(std::iter::repeat(value)
         .take(size)
         .collect::<Decimal256Array>()
-        .with_precision_and_scale(precision, scale)
-        .map_err(|e| arrow_datafusion_err!(e))
+        .with_precision_and_scale(precision, scale)?)
 }

 /// Converts `Vec<ScalarValue>` where each element has type corresponding to
@@ -2053,7 +2049,7 @@ impl ScalarValue {
 fn list_to_array_of_size(arr: &dyn Array, size: usize) -> Result<ArrayRef> {
     let arrays = std::iter::repeat(arr).take(size).collect::<Vec<_>>();
-    arrow::compute::concat(arrays.as_slice()).map_err(|e| arrow_datafusion_err!(e))
+    Ok(arrow::compute::concat(arrays.as_slice())?)
 }

 /// Retrieve ScalarValue for each row in `array`
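The pattern in this diff works because `DataFusionError` has a `From<ArrowError>` conversion, which the `?` operator applies implicitly; that is what lets the explicit `map_err(|e| arrow_datafusion_err!(e))` calls be dropped. A minimal self-contained sketch of the mechanism, using a hypothetical `MyError` type in place of DataFusion's own error:

use arrow::error::ArrowError;

#[derive(Debug)]
enum MyError {
    Arrow(ArrowError),
}

// The `From` impl is what allows `?` to convert the error type
// without an explicit `map_err`.
impl From<ArrowError> for MyError {
    fn from(e: ArrowError) -> Self {
        MyError::Arrow(e)
    }
}

fn fallible() -> Result<(), MyError> {
    // Any call returning Result<_, ArrowError> can now use `?` directly.
    let arrow_result: Result<(), ArrowError> =
        Err(ArrowError::ComputeError("boom".to_string()));
    arrow_result?;
    Ok(())
}

fn main() {
    assert!(fallible().is_err());
}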

datafusion/core/src/physical_optimizer/pruning.rs

+8
@@ -136,6 +136,8 @@ pub trait PruningStatistics {
 /// possibly evaluate to `true` given information about a column provided by
 /// [`PruningStatistics`].
 ///
+/// # Introduction
+///
 /// `PruningPredicate` analyzes filter expressions using statistics such as
 /// min/max values and null counts, attempting to prove a "container" (e.g.
 /// Parquet Row Group) can be skipped without reading the actual data,
@@ -163,6 +165,12 @@ pub trait PruningStatistics {
 ///
 /// # Example
 ///
+/// See the [`pruning.rs` example in the `datafusion-examples`] for a complete
+/// example of how to use `PruningPredicate` to prune files based on min/max
+/// values.
+///
+/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/pruning.rs
+///
 /// Given an expression like `x = 5` and statistics for 3 containers (Row
 /// Groups, files, etc) `A`, `B`, and `C`:
 ///
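For intuition about what these docs describe: a predicate like `x = 5` prunes a container when its statistics prove the bound check `x_min <= 5 AND 5 <= x_max` is false. A hand-rolled sketch of that idea (not DataFusion's API; the real `PruningPredicate` builds and evaluates this rewrite as a physical expression over the statistics arrays):

// Minimal min/max pruning check for `x = 5`: a container may contain
// matching rows only if 5 lies within [min, max]. Missing statistics
// must conservatively keep the container.
fn may_contain(min: Option<i32>, max: Option<i32>, value: i32) -> bool {
    match (min, max) {
        (Some(min), Some(max)) => min <= value && value <= max,
        // Without statistics nothing can be proven, so keep it.
        _ => true,
    }
}

fn main() {
    // Containers A, B, C with (min, max) statistics for column x.
    let stats = [(Some(4), Some(6)), (Some(7), Some(9)), (None, None)];
    let keep: Vec<bool> = stats
        .iter()
        .map(|&(min, max)| may_contain(min, max, 5))
        .collect();
    // Only container B (7..=9) can be proven to contain no matches.
    assert_eq!(keep, vec![true, false, true]);
}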

datafusion/physical-plan/src/joins/sort_merge_join.rs

+22 -1
@@ -1209,7 +1209,15 @@ impl SMJStream {
 ) {
     // The reverse of the selection mask. For the rows that did not pass the
     // join filter above, we need to join them (left or right) with null rows
     // for outer joins.
-    let not_mask = compute::not(mask)?;
+    let not_mask = if mask.null_count() > 0 {
+        // If the mask contains nulls, use `prep_null_mask_filter` to treat
+        // the nulls as false, so that rows where the mask itself was null
+        // are also produced as null-joined rows.
+        compute::not(&compute::prep_null_mask_filter(mask))?
+    } else {
+        compute::not(mask)?
+    };
+
     let null_joined_batch =
         compute::filter_record_batch(&output_batch, &not_mask)?;

@@ -1254,6 +1262,19 @@ impl SMJStream {

 // For full join, we also need to output the null joined rows from the buffered side
 if matches!(self.join_type, JoinType::Full) {
+    // Refine the not-mask further for the buffered side: we only want to
+    // output the rows that are not already null-joined with the streamed
+    // side, i.e. the rows whose entries in `buffered_indices` are not null.
+    let not_mask = if let Some(nulls) = buffered_indices.nulls() {
+        let mask = not_mask.values() & nulls.inner();
+        BooleanArray::new(mask, None)
+    } else {
+        not_mask
+    };
+
+    let null_joined_batch =
+        compute::filter_record_batch(&output_batch, &not_mask)?;
+
     let mut streamed_columns = self
         .streamed_schema
         .fields()
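The motivation for the `prep_null_mask_filter` change above can be seen with a standalone sketch over a nullable mask. Both kernels used here are the public `arrow::compute` functions the diff itself calls; this illustrates the behavior, it is not code from the commit:

use arrow::array::BooleanArray;
use arrow::compute;

fn main() {
    // A join-filter mask where the third row evaluated to NULL.
    let mask = BooleanArray::from(vec![Some(true), Some(false), None]);

    // Plain `not` propagates the null, so the null row would never be
    // selected by the subsequent filter (filters treat null as false):
    let naive = compute::not(&mask).unwrap();
    assert_eq!(
        naive.iter().collect::<Vec<_>>(),
        vec![Some(false), Some(true), None]
    );

    // Treating null as "did not pass the filter" first keeps that row:
    let fixed = compute::not(&compute::prep_null_mask_filter(&mask)).unwrap();
    assert_eq!(
        fixed.iter().collect::<Vec<_>>(),
        vec![Some(false), Some(true), Some(true)]
    );
}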
