korowa · alamb · Jul 20, 2024 · Jul 29, 2024 · Jul 29, 2024 · Aug 2, 2024
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -324,6 +324,15 @@ config_namespace! {
 
         /// Should DataFusion keep the columns used for partition_by in the output RecordBatches
         pub keep_partition_by_columns: bool, default = false
+
+        /// Aggregation ratio (number of distinct groups / number of input rows)
+        /// threshold for skipping partial aggregation. If the observed ratio is greater
+        /// than this threshold, partial aggregation will skip aggregation for further input
+        pub skip_partial_aggregation_probe_ratio_threshold: f64, default = 0.8
+
+        /// Number of input rows partial aggregation partition should process, before
+        /// aggregation ratio check and trying to switch to skipping aggregation mode
+        pub skip_partial_aggregation_probe_rows_threshold: usize, default = 100_000
     }
 }
 

diff --git a/datafusion/expr/src/groups_accumulator.rs b/datafusion/expr/src/groups_accumulator.rs
@@ -18,7 +18,7 @@
 //! Vectorized [`GroupsAccumulator`]
 
 use arrow_array::{ArrayRef, BooleanArray};
-use datafusion_common::Result;
+use datafusion_common::{not_impl_err, Result};
 
 /// Describes how many rows should be emitted during grouping.
 #[derive(Debug, Clone, Copy)]
@@ -158,6 +158,24 @@ pub trait GroupsAccumulator: Send {
         total_num_groups: usize,
     ) -> Result<()>;
 
+    /// Converts an input batch directly to intermediate aggregate state,
+    /// without grouping (each input row is considered a separate group).
+    ///
+    /// # Arguments
+    /// * `_values`: the input arrays for the batch being converted
+    /// * `_opt_filter`: an optional boolean filter for the input rows
+    ///
+    /// The default implementation returns a "not implemented" error.
+    /// Accumulators that override this method should also override
+    /// `supports_convert_to_state` to return `true`.
+    fn convert_to_state(
+        &self,
+        _values: &[ArrayRef],
+        _opt_filter: Option<&BooleanArray>,
+    ) -> Result<Vec<ArrayRef>> {
+        not_impl_err!("Input batch conversion to state not implemented")
+    }
+
+    /// Returns `true` if this groups accumulator supports converting an input
+    /// batch to intermediate aggregate state (i.e. the `convert_to_state`
+    /// method is implemented). The default implementation returns `false`.
+    fn supports_convert_to_state(&self) -> bool {
+        false
+    }
+
     /// Amount of memory used to store the state of this accumulator,
     /// in bytes. This function is called once per batch, so it should
     /// be `O(n)` to compute, not `O(num_groups)`

diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml
@@ -48,3 +48,15 @@ datafusion-physical-expr-common = { workspace = true }
 log = { workspace = true }
 paste = "1.0.14"
 sqlparser = { workspace = true }
+
+[dev-dependencies]
+arrow = { workspace = true, features = ["test_utils"] }
+criterion = "0.5"
+
+[[bench]]
+name = "count"
+harness = false
+
+[[bench]]
+name = "sum"
+harness = false
diff --git a/datafusion/functions-aggregate/benches/count.rs b/datafusion/functions-aggregate/benches/count.rs
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, BooleanArray};
+use arrow::datatypes::Int32Type;
+use arrow::util::bench_util::{create_boolean_array, create_primitive_array};
+use arrow_schema::{DataType, Field, Schema};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_common::DFSchema;
+use datafusion_expr::{function::AccumulatorArgs, AggregateUDFImpl, GroupsAccumulator};
+use datafusion_functions_aggregate::count::Count;
+use std::sync::Arc;
+
+/// Builds the groups accumulator of `COUNT(f)`, where `f` is a single
+/// nullable `Int32` column.
+fn prepare_accumulator() -> Box<dyn GroupsAccumulator> {
+    let input_field = Field::new("f", DataType::Int32, true);
+    let schema = Arc::new(Schema::new(vec![input_field]));
+    let df_schema = DFSchema::try_from(Arc::clone(&schema)).unwrap();
+
+    // Plain, non-distinct, unordered `COUNT` producing `Int64`
+    let args = AccumulatorArgs {
+        data_type: &DataType::Int64,
+        schema: &schema,
+        dfschema: &df_schema,
+        ignore_nulls: false,
+        sort_exprs: &[],
+        is_reversed: false,
+        name: "COUNT(f)",
+        is_distinct: false,
+        input_types: &[DataType::Int32],
+        input_exprs: &[datafusion_expr::col("f")],
+    };
+
+    Count::new().create_groups_accumulator(args).unwrap()
+}
+
+/// Registers a benchmark called `name` that measures `convert_to_state` of the
+/// `COUNT` groups accumulator for the given `values` and optional `opt_filter`.
+fn convert_to_state_bench(
+    c: &mut Criterion,
+    name: &str,
+    values: ArrayRef,
+    opt_filter: Option<&BooleanArray>,
+) {
+    let groups_acc = prepare_accumulator();
+
+    // One measured iteration: convert the whole batch to state
+    let convert_batch = || {
+        groups_acc
+            .convert_to_state(&[values.clone()], opt_filter)
+            .unwrap()
+    };
+
+    c.bench_function(name, |bencher| bencher.iter(|| black_box(convert_batch())));
+}
+
+/// Runs the `COUNT` state-conversion benchmark for every combination of
+/// "input with/without nulls" and "with/without filter".
+fn count_benchmark(c: &mut Criterion) {
+    const BATCH_SIZE: usize = 8192;
+
+    // (benchmark name, null density of the input values, whether a filter is used)
+    let cases: [(&str, f32, bool); 4] = [
+        ("count convert state no nulls, no filter", 0.0, false),
+        ("count convert state nulls, no filter", 0.3, false),
+        ("count convert state no nulls, filter", 0.0, true),
+        ("count convert state nulls, filter", 0.3, true),
+    ];
+
+    for (name, null_density, with_filter) in cases {
+        let values: ArrayRef =
+            Arc::new(create_primitive_array::<Int32Type>(BATCH_SIZE, null_density));
+        // The filter itself has no nulls and is true for about half of the rows
+        let filter = with_filter.then(|| create_boolean_array(BATCH_SIZE, 0.0, 0.5));
+
+        convert_to_state_bench(c, name, values, filter.as_ref());
+    }
+}
+
+// Register `count_benchmark` in the `benches` group and generate the entry
+// point of this benchmark target.
+criterion_group!(benches, count_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-aggregate/benches/sum.rs b/datafusion/functions-aggregate/benches/sum.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, BooleanArray};
+use arrow::datatypes::Int64Type;
+use arrow::util::bench_util::{create_boolean_array, create_primitive_array};
+use arrow_schema::{DataType, Field, Schema};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_common::DFSchema;
+use datafusion_expr::{function::AccumulatorArgs, AggregateUDFImpl, GroupsAccumulator};
+use datafusion_functions_aggregate::sum::Sum;
+use std::sync::Arc;
+
+/// Builds the groups accumulator of `SUM(f)`, where `f` is a single nullable
+/// column of type `data_type` (also used as the accumulator's output type).
+fn prepare_accumulator(data_type: &DataType) -> Box<dyn GroupsAccumulator> {
+    let input_field = Field::new("f", data_type.clone(), true);
+    let schema = Arc::new(Schema::new(vec![input_field]));
+    let df_schema = DFSchema::try_from(Arc::clone(&schema)).unwrap();
+
+    // Plain, non-distinct, unordered `SUM`
+    let args = AccumulatorArgs {
+        data_type,
+        schema: &schema,
+        dfschema: &df_schema,
+        ignore_nulls: false,
+        sort_exprs: &[],
+        is_reversed: false,
+        name: "SUM(f)",
+        is_distinct: false,
+        input_types: &[data_type.clone()],
+        input_exprs: &[datafusion_expr::col("f")],
+    };
+
+    Sum::new().create_groups_accumulator(args).unwrap()
+}
+
+/// Registers a benchmark called `name` that measures `convert_to_state` of the
+/// `SUM` groups accumulator (created for the data type of `values`) for the
+/// given `values` and optional `opt_filter`.
+fn convert_to_state_bench(
+    c: &mut Criterion,
+    name: &str,
+    values: ArrayRef,
+    opt_filter: Option<&BooleanArray>,
+) {
+    let groups_acc = prepare_accumulator(values.data_type());
+
+    // One measured iteration: convert the whole batch to state
+    let convert_batch = || {
+        groups_acc
+            .convert_to_state(&[values.clone()], opt_filter)
+            .unwrap()
+    };
+
+    c.bench_function(name, |bencher| bencher.iter(|| black_box(convert_batch())));
+}
+
+/// Runs the `SUM` (over `Int64`) state-conversion benchmark for every
+/// combination of "input with/without nulls" and "with/without filter".
+///
+/// NOTE(review): the name looks like a leftover from the `COUNT` benchmark; it
+/// is kept because `criterion_group!` below refers to it.
+fn count_benchmark(c: &mut Criterion) {
+    const BATCH_SIZE: usize = 8192;
+
+    // (benchmark name, null density of the input values, whether a filter is used)
+    let cases: [(&str, f32, bool); 4] = [
+        ("sum i64 convert state no nulls, no filter", 0.0, false),
+        ("sum i64 convert state nulls, no filter", 0.3, false),
+        ("sum i64 convert state no nulls, filter", 0.0, true),
+        ("sum i64 convert state nulls, filter", 0.3, true),
+    ];
+
+    for (name, null_density, with_filter) in cases {
+        let values: ArrayRef =
+            Arc::new(create_primitive_array::<Int64Type>(BATCH_SIZE, null_density));
+        // The filter itself has no nulls and is true for about half of the rows
+        let filter = with_filter.then(|| create_boolean_array(BATCH_SIZE, 0.0, 0.5));
+
+        convert_to_state_bench(c, name, values, filter.as_ref());
+    }
+}
+
+// Register `count_benchmark` (the SUM benchmark defined above) in the `benches`
+// group and generate the entry point of this benchmark target.
+criterion_group!(benches, count_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs
@@ -23,6 +23,7 @@ use std::{fmt::Debug, sync::Arc};
 
 use arrow::{
     array::{ArrayRef, AsArray},
+    compute,
     datatypes::{
         DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Field,
         Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
@@ -437,6 +438,66 @@ impl GroupsAccumulator for CountGroupsAccumulator {
         Ok(vec![Arc::new(counts) as ArrayRef])
     }
 
+    /// Converts an input batch into `COUNT` intermediate state without grouping:
+    /// the result is a single `Int64` array with one count per input row.
+    ///
+    /// A row is counted only if its value is not null and, when `opt_filter` is
+    /// given, the filter entry for that row is both valid and `true`.
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<Vec<ArrayRef>> {
+        let values = &values[0];
+
+        // Rows accepted by the filter: the filter values bit-anded with the
+        // filter's own validity, so a null filter entry behaves like `false`
+        let filter_mask = opt_filter.map(|filter| match filter.nulls() {
+            Some(filter_nulls) => filter.values() & filter_nulls.inner(),
+            None => filter.values().clone(),
+        });
+
+        // Rows to be counted: validity of the input values bit-anded with the
+        // filter mask. `None` means there are neither input nulls nor a filter.
+        let counted_rows = match (values.logical_nulls(), filter_mask) {
+            (None, None) => None,
+            (Some(nulls), None) => Some(nulls.into_inner()),
+            (None, Some(mask)) => Some(mask),
+            (Some(nulls), Some(mask)) => Some(&mask & nulls.inner()),
+        };
+
+        let state_array: ArrayRef = match counted_rows {
+            // Every row is counted, so the state is an array of ones
+            None => Arc::new(Int64Array::from_value(1, values.len())),
+            // The mask is wrapped without a validity buffer (so the boolean
+            // array has no nulls) and then cast to Int64
+            Some(mask) => {
+                let counted = BooleanArray::new(mask, None);
+                compute::cast(&counted, &DataType::Int64)?
+            }
+        };
+
+        Ok(vec![state_array])
+    }
+
+    /// Always `true`: this accumulator implements `convert_to_state`.
+    fn supports_convert_to_state(&self) -> bool {
+        true
+    }
+
     fn size(&self) -> usize {
         self.counts.capacity() * std::mem::size_of::<usize>()
     }

diff --git a/datafusion/physical-expr-common/src/aggregate/groups_accumulator/prim_op.rs b/datafusion/physical-expr-common/src/aggregate/groups_accumulator/prim_op.rs
@@ -18,6 +18,8 @@
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, AsArray, BooleanArray, PrimitiveArray};
+use arrow::buffer::NullBuffer;
+use arrow::compute;
 use arrow::datatypes::ArrowPrimitiveType;
 use arrow::datatypes::DataType;
 use datafusion_common::Result;
@@ -134,6 +136,51 @@ where
         self.update_batch(values, group_indices, opt_filter, total_num_groups)
     }
 
+    /// Converts an input batch directly into intermediate state without
+    /// grouping: each input row is treated as its own group.
+    ///
+    /// Every state value is the result of applying `prim_fn` to
+    /// `starting_value` and the corresponding input value. When `opt_filter`
+    /// is given, rows for which the filter is `false` or null are marked as
+    /// null in the values handed to the element-wise kernel.
+    ///
+    /// Returns a vector with a single state array.
+    ///
+    /// # Errors
+    /// Returns an error, instead of panicking, if the element-wise kernel
+    /// cannot reuse the freshly built state buffer or rejects its inputs.
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<Vec<ArrayRef>> {
+        let values = values[0].as_primitive::<T>().clone();
+
+        // Every row starts from the accumulator's starting value
+        let initial_state =
+            PrimitiveArray::<T>::from_value(self.starting_value, values.len());
+
+        // Recalculating values in case there is a filter
+        let values = match opt_filter {
+            None => values,
+            Some(filter) => {
+                // A row passes the filter only where the filter is valid AND true
+                let filter_bool = match filter.nulls() {
+                    Some(filter_nulls) => filter_nulls.inner() & filter.values(),
+                    None => filter.values().clone(),
+                };
+                let filter_nulls = NullBuffer::from(filter_bool);
+
+                // Rebuilding input values with a new nulls mask, which is equal to
+                // the union of original nulls and filter mask
+                let (dt, values_buf, original_nulls) = values.into_parts();
+                let nulls_buf =
+                    NullBuffer::union(original_nulls.as_ref(), Some(&filter_nulls));
+                PrimitiveArray::<T>::new(values_buf, nulls_buf).with_data_type(dt)
+            }
+        };
+
+        // `binary_mut` returns a nested result: the outer error hands the input
+        // array back when its buffer cannot be mutated in place, the inner one
+        // is a regular Arrow error. Both are propagated rather than unwrapped.
+        let state_values = compute::binary_mut(initial_state, &values, |mut x, y| {
+            (self.prim_fn)(&mut x, y);
+            x
+        })
+        .map_err(|_| {
+            datafusion_common::DataFusionError::Internal(
+                "initial state underlying buffer must not be shared".to_string(),
+            )
+        })??;
+
+        // `from_value` builds the state with the default data type of `T`, which
+        // drops type parameters such as decimal precision and scale, so the
+        // accumulator's own data type is re-applied here.
+        // NOTE(review): relies on the struct's `data_type` field, which is
+        // declared outside this hunk -- confirm.
+        let state_values = state_values.with_data_type(self.data_type.clone());
+
+        Ok(vec![Arc::new(state_values)])
+    }
+
+    /// Always `true`: this accumulator implements `convert_to_state`.
+    fn supports_convert_to_state(&self) -> bool {
+        true
+    }
+
     fn size(&self) -> usize {
         self.values.capacity() * std::mem::size_of::<T::Native>() + self.null_state.size()
     }