apache · jayzhan211 · Nov 20, 2024 · Nov 16, 2024 · Nov 16, 2024 · Nov 16, 2024
diff --git a/datafusion/functions-aggregate-common/Cargo.toml b/datafusion/functions-aggregate-common/Cargo.toml
@@ -43,3 +43,10 @@ datafusion-common = { workspace = true }
 datafusion-expr-common = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 rand = { workspace = true }
+
+[dev-dependencies]
+criterion = "0.5"
+
+[[bench]]
+harness = false
+name = "accumulate"
diff --git a/datafusion/functions-aggregate-common/benches/accumulate.rs b/datafusion/functions-aggregate-common/benches/accumulate.rs
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, BooleanArray, Int64Array};
+use criterion::{criterion_group, criterion_main, Criterion};
+use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate_indices;
+
+/// Produces one group index per row: row `i` is assigned to group `i`.
+fn generate_group_indices(len: usize) -> Vec<usize> {
+    let mut indices = Vec::with_capacity(len);
+    indices.extend(0..len);
+    indices
+}
+
+/// Builds an `Int64Array` of `len` rows in which row `i` holds the value `i`.
+///
+/// When `has_null` is set, every row whose index is a multiple of 7
+/// (row 0 included) is null instead of holding a value.
+fn generate_values(len: usize, has_null: bool) -> ArrayRef {
+    let rows: Vec<Option<i64>> = (0..len)
+        .map(|i| {
+            let is_null = has_null && i % 7 == 0;
+            if is_null {
+                None
+            } else {
+                Some(i as i64)
+            }
+        })
+        .collect();
+    Arc::new(Int64Array::from(rows))
+}
+
+/// Builds a nullable boolean filter of `len` rows; the result is always `Some`.
+///
+/// Row `i` is null when `i` is a multiple of 7, `false` when it is a multiple
+/// of 5 (but not of 7), and `true` otherwise.
+fn generate_filter(len: usize) -> Option<BooleanArray> {
+    let mut rows: Vec<Option<bool>> = Vec::with_capacity(len);
+    for i in 0..len {
+        // The multiple-of-7 arm comes first, so it wins for rows such as 35.
+        rows.push(match (i % 7, i % 5) {
+            (0, _) => None,
+            (_, 0) => Some(false),
+            _ => Some(true),
+        });
+    }
+    Some(BooleanArray::from(rows))
+}
+
+/// Benchmarks `accumulate_indices` on 500,000 rows, one group per row, for
+/// three input shapes: nulls plus filter, nulls only, and filter only.
+///
+/// Null buffers are extracted once, outside the timed closures, so each
+/// measurement covers only the `accumulate_indices` call. Inputs and the
+/// resulting tallies go through `black_box` so the optimizer can neither
+/// specialise on the fixed inputs nor discard the increments.
+fn criterion_benchmark(c: &mut Criterion) {
+    use std::hint::black_box;
+
+    let len = 500_000;
+    let group_indices = generate_group_indices(len);
+    let opt_filter = generate_filter(len);
+    let mut counts: Vec<i64> = vec![0; group_indices.len()];
+
+    // Every seventh value is null in this array.
+    let with_nulls = generate_values(len, true).logical_nulls();
+    // NOTE(review): an array built without nulls presumably reports `None`
+    // here, which is what selects the filter-only path below - confirm.
+    let without_nulls = generate_values(len, false).logical_nulls();
+
+    let cases = [
+        ("Handle both nulls and filter", with_nulls.as_ref(), opt_filter.as_ref()),
+        ("Handle nulls only", with_nulls.as_ref(), None),
+        ("Handle filter only", without_nulls.as_ref(), opt_filter.as_ref()),
+    ];
+
+    for (name, nulls, filter) in cases {
+        c.bench_function(name, |b| {
+            b.iter(|| {
+                accumulate_indices(
+                    black_box(group_indices.as_slice()),
+                    black_box(nulls),
+                    black_box(filter),
+                    |group_index| {
+                        counts[group_index] += 1;
+                    },
+                );
+                // Keep the tallies observable after every run.
+                black_box(&mut counts);
+            })
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
@@ -395,19 +395,41 @@ pub fn accumulate_indices<F>(
             }
         }
         (None, Some(filter)) => {
-            assert_eq!(filter.len(), group_indices.len());
-            // The performance with a filter could be improved by
-            // iterating over the filter in chunks, rather than a single
-            // iterator. TODO file a ticket
-            let iter = group_indices.iter().zip(filter.iter());
-            for (&group_index, filter_value) in iter {
-                if let Some(true) = filter_value {
-                    index_fn(group_index)
-                }
-            }
+            debug_assert_eq!(filter.len(), group_indices.len());
+            let group_indices_chunks = group_indices.chunks_exact(64);
+            let bit_chunks = filter.values().bit_chunks();
+
+            let group_indices_remainder = group_indices_chunks.remainder();
+
+            group_indices_chunks.zip(bit_chunks.iter()).for_each(
+                |(group_index_chunk, mask)| {
+                    // index_mask has value 1 << i in the loop
+                    let mut index_mask = 1;
+                    group_index_chunk.iter().for_each(|&group_index| {
+                        // filter bit was set, so this row is selected
+                        let is_valid = (mask & index_mask) != 0;
+                        if is_valid {
+                            index_fn(group_index);
+                        }
+                        index_mask <<= 1;
+                    })
+                },
+            );
+
+            // handle any remaining bits (past the last full chunk of 64)
+            let remainder_bits = bit_chunks.remainder_bits();
+            group_indices_remainder
+                .iter()
+                .enumerate()
+                .for_each(|(i, &group_index)| {
+                    let is_valid = remainder_bits & (1 << i) != 0;
+                    if is_valid {
+                        index_fn(group_index)
+                    }
+                });
         }
         (Some(valids), None) => {
-            assert_eq!(valids.len(), group_indices.len());
+            debug_assert_eq!(valids.len(), group_indices.len());
             // This is based on (ahem, COPY/PASTA) arrow::compute::aggregate::sum
             // iterate over in chunks of 64 bits for more efficient null checking
             let group_indices_chunks = group_indices.chunks_exact(64);
@@ -444,20 +466,44 @@ pub fn accumulate_indices<F>(
         }
 
         (Some(valids), Some(filter)) => {
-            assert_eq!(filter.len(), group_indices.len());
-            assert_eq!(valids.len(), group_indices.len());
-            // The performance with a filter could likely be improved by
-            // iterating over the filter in chunks, rather than using
-            // iterators. TODO file a ticket
-            filter
+            debug_assert_eq!(filter.len(), group_indices.len());
+            debug_assert_eq!(valids.len(), group_indices.len());
+
+            let group_indices_chunks = group_indices.chunks_exact(64);
+            let valid_bit_chunks = valids.inner().bit_chunks();
+            let filter_bit_chunks = filter.values().bit_chunks();
+
+            let group_indices_remainder = group_indices_chunks.remainder();
+
+            group_indices_chunks
+                .zip(valid_bit_chunks.iter())
+                .zip(filter_bit_chunks.iter())
+                .for_each(|((group_index_chunk, valid_mask), filter_mask)| {
+                    // index_mask has value 1 << i in the loop
+                    let mut index_mask = 1;
+                    group_index_chunk.iter().for_each(|&group_index| {
+                        // both the validity bit and the filter bit were set
+                        let is_valid = (valid_mask & filter_mask & index_mask) != 0;
+                        if is_valid {
+                            index_fn(group_index);
+                        }
+                        index_mask <<= 1;
+                    })
+                });
+
+            // handle any remaining bits (past the last full chunk of 64)
+            let remainder_valid_bits = valid_bit_chunks.remainder_bits();
+            let remainder_filter_bits = filter_bit_chunks.remainder_bits();
+            group_indices_remainder
                 .iter()
-                .zip(group_indices.iter())
-                .zip(valids.iter())
-                .for_each(|((filter_value, &group_index), is_valid)| {
-                    if let (Some(true), true) = (filter_value, is_valid) {
+                .enumerate()
+                .for_each(|(i, &group_index)| {
+                    let is_valid =
+                        remainder_valid_bits & remainder_filter_bits & (1 << i) != 0;
+                    if is_valid {
                         index_fn(group_index)
                     }
-                })
+                });
         }
     }
 }