Improve split_groups_by_statistics method

xudong963 · xudong963 · commit 75df701acdd4 · 2025-03-28T16:39:11.000+08:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml
@@ -72,6 +72,7 @@ xz2 = { version = "0.1", optional = true, features = ["static"] }
 zstd = { version = "0.13", optional = true, default-features = false }
 
 [dev-dependencies]
+criterion = { workspace = true }
 tempfile = { workspace = true }
 
 [lints]
@@ -80,3 +81,7 @@ workspace = true
 [lib]
 name = "datafusion_datasource"
 path = "src/mod.rs"
+
+[[bench]]
+name = "split_groups_by_statistics"
+harness = false
diff --git a/datafusion/datasource/benches/split_groups_by_statistics.rs b/datafusion/datasource/benches/split_groups_by_statistics.rs
@@ -0,0 +1,160 @@
+use arrow::datatypes::{DataType, Field, Schema};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use datafusion_common::stats::Precision;
+use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
+use datafusion_datasource::file_groups::FileGroup;
+use datafusion_datasource::file_scan_config::FileScanConfig;
+use datafusion_datasource::PartitionedFile;
+use datafusion_physical_expr::PhysicalSortExpr;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use object_store::{path::Path, ObjectMeta};
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Builds one [`FileGroup`] of `num_files` synthetic files whose Float64 min/max
+/// ranges overlap by roughly `overlap_factor` (0.0 means adjacent ranges only touch).
+/// Ranges are at least one unit wide; `num_files == 0` yields a single empty group.
+fn generate_test_files(num_files: usize, overlap_factor: f64) -> Vec<FileGroup> {
+    let mut files = Vec::with_capacity(num_files);
+    let range_size = if overlap_factor == 0.0 {
+        // Clamp divisor and quotient: no division by zero, no zero-width ranges.
+        (100_i64 / num_files.max(1) as i64).max(1)
+    } else {
+        (100.0 / (overlap_factor * num_files as f64)).max(1.0) as i64
+    };
+
+    for i in 0..num_files {
+        let base = (i as f64 * range_size as f64 * (1.0 - overlap_factor)) as i64;
+        let (min, max) = (base as f64, (base + range_size) as f64);
+        files.push(PartitionedFile {
+            object_meta: ObjectMeta {
+                location: Path::from(format!("file_{}.parquet", i)),
+                last_modified: chrono::Utc::now(),
+                size: 1000,
+                e_tag: None,
+                version: None,
+            },
+            partition_values: vec![],
+            range: None,
+            statistics: Some(Statistics {
+                num_rows: Precision::Exact(100),
+                total_byte_size: Precision::Exact(1000),
+                column_statistics: vec![ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Float64(Some(max))),
+                    min_value: Precision::Exact(ScalarValue::Float64(Some(min))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                }],
+            }),
+            extensions: None,
+            metadata_size_hint: None,
+        });
+    }
+
+    vec![FileGroup::new(files)]
+}
+
+/// Criterion benchmark comparing `FileScanConfig::split_groups_by_statistics`
+/// (reported as "original") with `FileScanConfig::split_groups_by_statistics_v2`.
+///
+/// Every combination of file count and overlap factor is generated once and fed
+/// to both functions; v2 is additionally run for each target partition count.
+/// The sort-integrity assertion executes inside the timed closure of both variants.
+pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
+    // Single non-nullable Float64 column that the sort expression refers to.
+    let value_field = Field::new("value", DataType::Float64, false);
+    let file_schema = Arc::new(Schema::new(vec![value_field]));
+
+    let value_column = datafusion_physical_expr::expressions::Column::new("value", 0);
+    let sort_ordering = LexOrdering::from(vec![PhysicalSortExpr {
+        expr: Arc::new(value_column),
+        options: arrow::compute::SortOptions::default(),
+    }]);
+
+    // Benchmark matrix: file counts x overlap factors x (v2 only) target partitions.
+    let file_counts = [10, 100, 1000];
+    let overlap_factors = [0.0, 0.2, 0.5, 0.8];
+    let target_partitions: [usize; 4] = [4, 8, 16, 32];
+
+    let mut group = c.benchmark_group("file_distribution_algorithms");
+    group.measurement_time(Duration::from_secs(10));
+
+    for &num_files in &file_counts {
+        for &overlap in &overlap_factors {
+            let file_groups = generate_test_files(num_files, overlap);
+            // Parameter label shared by every benchmark id of this combination.
+            let label = format!("files={},overlap={:.1}", num_files, overlap);
+
+            // Baseline: `split_groups_by_statistics`.
+            let baseline_input = (
+                file_groups.clone(),
+                file_schema.clone(),
+                sort_ordering.clone(),
+            );
+            group.bench_with_input(
+                BenchmarkId::new("original", &label),
+                &baseline_input,
+                |b, (fg, schema, order)| {
+                    b.iter(|| {
+                        let result =
+                            FileScanConfig::split_groups_by_statistics(schema, fg, order)
+                                .unwrap();
+                        assert!(verify_sort_integrity(&result));
+                    });
+                },
+            );
+
+            // Candidate: `split_groups_by_statistics_v2`, per target partition count.
+            for &tp in &target_partitions {
+                let v2_input = (
+                    file_groups.clone(),
+                    file_schema.clone(),
+                    sort_ordering.clone(),
+                    tp,
+                );
+                group.bench_with_input(
+                    BenchmarkId::new(format!("v2_partitions={}", tp), &label),
+                    &v2_input,
+                    |b, (fg, schema, order, target)| {
+                        b.iter(|| {
+                            let result = FileScanConfig::split_groups_by_statistics_v2(
+                                schema, fg, order, *target,
+                            )
+                            .unwrap();
+                            assert!(verify_sort_integrity(&result));
+                        });
+                    },
+                );
+            }
+        }
+    }
+
+    group.finish();
+}
+
+/// Returns `true` when, in every group, each file's min (column 0) is not `<=` the
+/// max of the file before it; pairs where either file lacks statistics are skipped.
+fn verify_sort_integrity(file_groups: &[FileGroup]) -> bool {
+    file_groups.iter().all(|group| {
+        let files: Vec<_> = group.iter().collect();
+        // Inspect each adjacent (previous, current) pair of files in the group.
+        files.windows(2).all(|pair| {
+            let (prev_file, curr_file) = (pair[0], pair[1]);
+            match (&prev_file.statistics, &curr_file.statistics) {
+                (Some(prev_stats), Some(curr_stats)) => {
+                    let prev_max = &prev_stats.column_statistics[0].max_value;
+                    let curr_min = &curr_stats.column_statistics[0].min_value;
+                    // Negated `<=` (rather than `>`) so that incomparable values
+                    // are still accepted, exactly as a failed `<=` check would be.
+                    !(curr_min.get_value().unwrap() <= prev_max.get_value().unwrap())
+                }
+                // Nothing to compare when statistics are missing on either side.
+                _ => true,
+            }
+        })
+    })
+}
+
+criterion_group!(benches, compare_split_groups_by_statistics_algorithms);
+criterion_main!(benches);
diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
@@ -575,6 +575,95 @@ impl FileScanConfig {
         })
     }
 
+    /// Splits file groups into new groups based on statistics to enable efficient parallel processing.
+    ///
+    /// Files are redistributed across roughly `target_partitions` groups such that the files
+    /// within each group are ordered and non-overlapping according to their min/max statistics.
+    ///
+    /// The algorithm works by:
+    /// 1. Sorting all files by their minimum values
+    /// 2. Starting from `target_partitions` empty groups (at least one)
+    /// 3. Placing each file into the smallest group whose last file ends strictly before the
+    ///    file begins (ties go to the earliest group), which keeps the groups balanced
+    /// 4. Creating a new group when no existing group can take the file
+    /// 5. Dropping groups that stayed empty
+    ///
+    /// # Parameters
+    /// * `table_schema`: Schema containing information about the columns
+    /// * `file_groups`: The original file groups to split
+    /// * `sort_order`: The lexicographical ordering to maintain within each group
+    /// * `target_partitions`: The desired number of output partitions; `0` is treated as `1`.
+    ///   The result may hold more groups (overlapping files) or fewer (not enough files).
+    ///
+    /// # Returns
+    /// A new set of file groups, where files within each group are non-overlapping with respect to
+    /// their min/max statistics and maintain the specified sort order. Empty input yields no groups.
+    ///
+    /// # Errors
+    /// Propagates any error returned by `MinMaxStatistics::new_from_files`.
+    pub fn split_groups_by_statistics_v2(
+        table_schema: &SchemaRef,
+        file_groups: &[FileGroup],
+        sort_order: &LexOrdering,
+        target_partitions: usize,
+    ) -> Result<Vec<FileGroup>> {
+        let flattened_files = file_groups
+            .iter()
+            .flat_map(FileGroup::iter)
+            .collect::<Vec<_>>();
+
+        if flattened_files.is_empty() {
+            return Ok(vec![]);
+        }
+
+        let statistics = MinMaxStatistics::new_from_files(
+            sort_order,
+            table_schema,
+            None,
+            flattened_files.iter().copied(),
+        )?;
+
+        let indices_sorted_by_min = statistics.min_values_sorted();
+
+        // Initialize with target_partitions empty groups
+        let mut file_groups_indices: Vec<Vec<usize>> =
+            vec![vec![]; target_partitions.max(1)];
+
+        for (idx, min) in indices_sorted_by_min {
+            // Pick the smallest group that is empty or whose last file ends strictly
+            // before this file begins. `min_by_key` keeps the first minimum, so ties go
+            // to the lowest-index group without collecting and sorting the candidates.
+            let target_group = file_groups_indices
+                .iter_mut()
+                .filter(|group| {
+                    group.last().map_or(true, |&last| min > statistics.max(last))
+                })
+                .min_by_key(|group| group.len());
+
+            match target_group {
+                Some(group) => group.push(idx),
+                // Create a new group if no existing group fits
+                None => file_groups_indices.push(vec![idx]),
+            }
+        }
+
+        // Remove any empty groups
+        file_groups_indices.retain(|group| !group.is_empty());
+
+        // Assemble indices back into groups of PartitionedFiles
+        Ok(file_groups_indices
+            .into_iter()
+            .map(|file_group_indices| {
+                FileGroup::new(
+                    file_group_indices
+                        .into_iter()
+                        .map(|idx| flattened_files[idx].clone())
+                        .collect(),
+                )
+            })
+            .collect())
+    }
+
     /// Attempts to do a bin-packing on files into file groups, such that any two files
     /// in a file group are ordered and non-overlapping with respect to their statistics.
     /// It will produce the smallest number of file groups possible.