apache · alamb · Apr 4, 2025 · Mar 28, 2025 · Mar 28, 2025 · Mar 31, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
@@ -899,10 +899,11 @@ impl TableProvider for ListingTable {
             .split_file_groups_by_statistics
             .then(|| {
                 output_ordering.first().map(|output_ordering| {
-                    FileScanConfig::split_groups_by_statistics(
+                    FileScanConfig::split_groups_by_statistics_with_target_partitions(
                         &self.table_schema,
                         &partitioned_file_lists,
                         output_ordering,
+                        self.options.target_partitions,
                     )
                 })
             })

diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml
@@ -72,6 +72,7 @@ xz2 = { version = "0.1", optional = true, features = ["static"] }
 zstd = { version = "0.13", optional = true, default-features = false }
 
 [dev-dependencies]
+criterion = { workspace = true }
 tempfile = { workspace = true }
 
 [lints]
@@ -80,3 +81,7 @@ workspace = true
 [lib]
 name = "datafusion_datasource"
 path = "src/mod.rs"
+
+[[bench]]
+name = "split_groups_by_statistics"
+harness = false
diff --git a/datafusion/datasource/benches/split_groups_by_statistics.rs b/datafusion/datasource/benches/split_groups_by_statistics.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field, Schema};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use datafusion_datasource::file_scan_config::FileScanConfig;
+use datafusion_datasource::{generate_test_files, verify_sort_integrity};
+use datafusion_physical_expr::PhysicalSortExpr;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Compares `split_groups_by_statistics` with its `_with_target_partitions` variant.
+pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
+    let file_schema = Arc::new(Schema::new(vec![Field::new(
+        "value",
+        DataType::Float64,
+        false,
+    )]));
+
+    let sort_expr = PhysicalSortExpr {
+        expr: Arc::new(datafusion_physical_expr::expressions::Column::new(
+            "value", 0,
+        )),
+        options: arrow::compute::SortOptions::default(),
+    };
+    let sort_ordering = LexOrdering::from(vec![sort_expr]);
+
+    let file_counts = [10, 100, 1000]; // small, medium, large number of files
+    let overlap_factors = [0.0, 0.2, 0.5, 0.8]; // No, low, medium, high overlap
+
+    let target_partitions: [usize; 4] = [4, 8, 16, 32]; // new variant only
+
+    let mut group = c.benchmark_group("split_groups");
+    group.measurement_time(Duration::from_secs(10)); // set on the whole group
+
+    for &num_files in &file_counts {
+        for &overlap in &overlap_factors {
+            let file_groups = generate_test_files(num_files, overlap);
+            // Baseline: the original algorithm on a clone of `file_groups`
+            group.bench_with_input(
+                BenchmarkId::new(
+                    "original",
+                    format!("files={},overlap={:.1}", num_files, overlap),
+                ),
+                &(
+                    file_groups.clone(),
+                    file_schema.clone(),
+                    sort_ordering.clone(),
+                ),
+                |b, (fg, schema, order)| {
+                    let mut result = Vec::new(); // last output, checked below
+                    b.iter(|| {
+                        result =
+                            FileScanConfig::split_groups_by_statistics(schema, fg, order)
+                                .unwrap();
+                    });
+                    assert!(verify_sort_integrity(&result)); // outside `b.iter`
+                },
+            );
+
+            // New algorithm: one benchmark per target partition count, same input
+            for &tp in &target_partitions {
+                group.bench_with_input(
+                    BenchmarkId::new(
+                        format!("v2_partitions={}", tp),
+                        format!("files={},overlap={:.1}", num_files, overlap),
+                    ),
+                    &(
+                        file_groups.clone(),
+                        file_schema.clone(),
+                        sort_ordering.clone(),
+                        tp,
+                    ),
+                    |b, (fg, schema, order, target)| {
+                        let mut result = Vec::new(); // last output, checked below
+                        b.iter(|| {
+                            result = FileScanConfig::split_groups_by_statistics_with_target_partitions(
+                                schema, fg, order, *target,
+                            )
+                            .unwrap();
+                        });
+                        assert!(verify_sort_integrity(&result)); // outside `b.iter`
+                    },
+                );
+            }
+        }
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, compare_split_groups_by_statistics_algorithms);
+criterion_main!(benches); // supplies `main`; Cargo.toml sets `harness = false`