add tests

xudong963 · xudong963 · commit e98116babf7e · 2025-04-03T10:30:10.000+08:00
diff --git a/datafusion/datasource/benches/split_groups_by_statistics.rs b/datafusion/datasource/benches/split_groups_by_statistics.rs
@@ -17,61 +17,14 @@
 
 use arrow::datatypes::{DataType, Field, Schema};
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
-use datafusion_common::stats::Precision;
-use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
-use datafusion_datasource::file_groups::FileGroup;
 use datafusion_datasource::file_scan_config::FileScanConfig;
-use datafusion_datasource::PartitionedFile;
+use datafusion_datasource::test_util::generate_test_files;
+use datafusion_datasource::test_util::verify_sort_integrity;
 use datafusion_physical_expr::PhysicalSortExpr;
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use object_store::{path::Path, ObjectMeta};
 use std::sync::Arc;
 use std::time::Duration;
 
-/// Generates test files with min-max statistics in different overlap patterns
-fn generate_test_files(num_files: usize, overlap_factor: f64) -> Vec<FileGroup> {
-    let mut files = Vec::with_capacity(num_files);
-    let range_size = if overlap_factor == 0.0 {
-        100 / num_files as i64
-    } else {
-        (100.0 / (overlap_factor * num_files as f64)).max(1.0) as i64
-    };
-
-    for i in 0..num_files {
-        let base = (i as f64 * range_size as f64 * (1.0 - overlap_factor)) as i64;
-        let min = base as f64;
-        let max = (base + range_size) as f64;
-
-        let file = PartitionedFile {
-            object_meta: ObjectMeta {
-                location: Path::from(format!("file_{}.parquet", i)),
-                last_modified: chrono::Utc::now(),
-                size: 1000,
-                e_tag: None,
-                version: None,
-            },
-            partition_values: vec![],
-            range: None,
-            statistics: Some(Statistics {
-                num_rows: Precision::Exact(100),
-                total_byte_size: Precision::Exact(1000),
-                column_statistics: vec![ColumnStatistics {
-                    null_count: Precision::Exact(0),
-                    max_value: Precision::Exact(ScalarValue::Float64(Some(max))),
-                    min_value: Precision::Exact(ScalarValue::Float64(Some(min))),
-                    sum_value: Precision::Absent,
-                    distinct_count: Precision::Absent,
-                }],
-            }),
-            extensions: None,
-            metadata_size_hint: None,
-        };
-        files.push(file);
-    }
-
-    vec![FileGroup::new(files)]
-}
-
 pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
     let file_schema = Arc::new(Schema::new(vec![Field::new(
         "value",
@@ -180,28 +133,5 @@ pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
     group.finish();
 }
 
-// Helper function to verify that files within each group maintain sort order
-fn verify_sort_integrity(file_groups: &[FileGroup]) -> bool {
-    for group in file_groups {
-        let files = group.iter().collect::<Vec<_>>();
-        for i in 1..files.len() {
-            let prev_file = files[i - 1];
-            let curr_file = files[i];
-
-            // Check if the min value of current file is greater than max value of previous file
-            if let (Some(prev_stats), Some(curr_stats)) =
-                (&prev_file.statistics, &curr_file.statistics)
-            {
-                let prev_max = &prev_stats.column_statistics[0].max_value;
-                let curr_min = &curr_stats.column_statistics[0].min_value;
-                if curr_min.get_value().unwrap() <= prev_max.get_value().unwrap() {
-                    return false;
-                }
-            }
-        }
-    }
-    true
-}
-
 criterion_group!(benches, compare_split_groups_by_statistics_algorithms);
 criterion_main!(benches);
diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
@@ -2311,5 +2311,169 @@ mod tests {
         );
         assert_eq!(new_config.constraints, Constraints::default());
         assert!(new_config.new_lines_in_values);
+    }
+
+    /// Runs `split_groups_by_statistics_with_target_partitions` over generated file
+    /// groups and checks partition count, in-partition ordering and balance, plus
+    /// the error returned for zero target partitions.
+    #[test]
+    fn test_split_groups_by_statistics_with_target_partitions() -> Result<()> {
+        use crate::test_util::generate_test_files;
+        use crate::test_util::verify_sort_integrity;
+        use datafusion_common::DFSchema;
+        use datafusion_expr::{col, execution_props::ExecutionProps};
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "value",
+            DataType::Float64,
+            false,
+        )]));
+
+        // Setup sort expression
+        let exec_props = ExecutionProps::new();
+        let df_schema = DFSchema::try_from_qualified_schema("test", schema.as_ref())?;
+        let sort_expr = vec![col("value").sort(true, false)];
+
+        let physical_sort_exprs: Vec<_> = sort_expr
+            .iter()
+            .map(|expr| create_physical_sort_expr(expr, &df_schema, &exec_props).unwrap())
+            .collect();
+
+        let sort_ordering = LexOrdering::from(physical_sort_exprs);
+
+        // Test case parameters
+        struct TestCase {
+            name: String,
+            file_count: usize,
+            overlap_factor: f64,
+            target_partitions: usize,
+            expected_partition_count: usize,
+        }
+
+        let test_cases = vec![
+            // Basic cases
+            TestCase {
+                name: "no_overlap_10_files_4_partitions".to_string(),
+                file_count: 10,
+                overlap_factor: 0.0,
+                target_partitions: 4,
+                expected_partition_count: 4,
+            },
+            TestCase {
+                name: "medium_overlap_20_files_5_partitions".to_string(),
+                file_count: 20,
+                overlap_factor: 0.5,
+                target_partitions: 5,
+                expected_partition_count: 5,
+            },
+            TestCase {
+                name: "high_overlap_30_files_3_partitions".to_string(),
+                file_count: 30,
+                overlap_factor: 0.8,
+                target_partitions: 3,
+                expected_partition_count: 7,
+            },
+            // Edge cases
+            TestCase {
+                name: "fewer_files_than_partitions".to_string(),
+                file_count: 3,
+                overlap_factor: 0.0,
+                target_partitions: 10,
+                expected_partition_count: 3, // Should only create as many partitions as files
+            },
+            TestCase {
+                name: "single_file".to_string(),
+                file_count: 1,
+                overlap_factor: 0.0,
+                target_partitions: 5,
+                expected_partition_count: 1, // Should create only one partition
+            },
+            TestCase {
+                name: "empty_files".to_string(),
+                file_count: 0,
+                overlap_factor: 0.0,
+                target_partitions: 3,
+                expected_partition_count: 0, // Empty result for empty input
+            },
+        ];
+
+        for case in test_cases {
+            println!("Running test case: {}", case.name);
+
+            // Generate the input file groups via the shared `test_util` helper
+            let file_groups = generate_test_files(case.file_count, case.overlap_factor);
+
+            // Call the function under test
+            let result =
+                FileScanConfig::split_groups_by_statistics_with_target_partitions(
+                    &schema,
+                    &file_groups,
+                    &sort_ordering,
+                    case.target_partitions,
+                )?;
+
+            // Verify results
+            println!(
+                "Created {} partitions (target was {})",
+                result.len(),
+                case.target_partitions
+            );
+
+            // Check partition count
+            assert_eq!(
+                result.len(),
+                case.expected_partition_count,
+                "Case '{}': Unexpected partition count",
+                case.name
+            );
+
+            // Verify sort integrity
+            assert!(
+                verify_sort_integrity(&result),
+                "Case '{}': Files within partitions are not properly ordered",
+                case.name
+            );
+
+            // Distribution check for partitions
+            if case.file_count > 1 && case.expected_partition_count > 1 {
+                let group_sizes: Vec<usize> = result.iter().map(FileGroup::len).collect();
+                let max_size = *group_sizes.iter().max().unwrap();
+                let min_size = *group_sizes.iter().min().unwrap();
+
+                // Check partition balancing - difference shouldn't be extreme
+                let avg_files_per_partition =
+                    case.file_count as f64 / case.expected_partition_count as f64;
+                assert!(
+                    (max_size as f64) < 2.0 * avg_files_per_partition,
+                    "Case '{}': Unbalanced distribution. Max partition size {} exceeds twice the average {}",
+                    case.name,
+                    max_size,
+                    avg_files_per_partition
+                );
+
+                println!(
+                    "Distribution - min files: {}, max files: {}",
+                    min_size, max_size
+                );
+            }
+        }
+
+        // Test error case: zero target partitions
+        let empty_groups: Vec<FileGroup> = vec![];
+        let err = FileScanConfig::split_groups_by_statistics_with_target_partitions(
+            &schema,
+            &empty_groups,
+            &sort_ordering,
+            0,
+        )
+        .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("target_partitions must be greater than 0"),
+            "Expected error for zero target partitions"
+        );
+
+        Ok(())
     }
 }
diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs
@@ -43,8 +43,9 @@ pub mod sink;
 pub mod source;
 mod statistics;
 
-#[cfg(test)]
-mod test_util;
+// Not gated on `cfg(test)`: the crate's benches import helpers from this module,
+// and benches link against the regular (non-test) build of the library.
+pub mod test_util;
 
 pub mod url;
 pub mod write;
diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs
@@ -17,14 +17,18 @@
 
 use crate::{
     file::FileSource, file_scan_config::FileScanConfig, file_stream::FileOpener,
+    PartitionedFile,
 };
 
 use std::sync::Arc;
 
+use crate::file_groups::FileGroup;
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{Result, Statistics};
+use datafusion_common::stats::Precision;
+use datafusion_common::{ColumnStatistics, Result, ScalarValue, Statistics};
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-use object_store::ObjectStore;
+use object_store::path::Path;
+use object_store::{ObjectMeta, ObjectStore};
 
 /// Minimal [`crate::file::FileSource`] implementation for use in tests.
 #[derive(Clone, Default)]
@@ -81,3 +85,73 @@ impl FileSource for MockSource {
         "mock"
     }
 }
+
+/// Builds a single [`FileGroup`] of `num_files` synthetic files with exact Float64
+/// min/max statistics shaped by `overlap_factor`; zero files yields no groups.
+pub fn generate_test_files(num_files: usize, overlap_factor: f64) -> Vec<FileGroup> {
+    if num_files == 0 {
+        return vec![];
+    }
+    // Width of every file's [min, max] interval.
+    let width = if overlap_factor == 0.0 {
+        100 / num_files as i64
+    } else {
+        (100.0 / (overlap_factor * num_files as f64)).max(1.0) as i64
+    };
+
+    let files: Vec<PartitionedFile> = (0..num_files)
+        .map(|idx| {
+            let lower = (idx as f64 * width as f64 * (1.0 - overlap_factor)) as i64;
+            let upper = lower + width;
+            let value_stats = ColumnStatistics {
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Float64(Some(upper as f64))),
+                min_value: Precision::Exact(ScalarValue::Float64(Some(lower as f64))),
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+            };
+            PartitionedFile {
+                object_meta: ObjectMeta {
+                    location: Path::from(format!("file_{}.parquet", idx)),
+                    last_modified: chrono::Utc::now(),
+                    size: 1000,
+                    e_tag: None,
+                    version: None,
+                },
+                partition_values: vec![],
+                range: None,
+                statistics: Some(Statistics {
+                    num_rows: Precision::Exact(100),
+                    total_byte_size: Precision::Exact(1000),
+                    column_statistics: vec![value_stats],
+                }),
+                extensions: None,
+                metadata_size_hint: None,
+            }
+        })
+        .collect();
+    vec![FileGroup::new(files)]
+}
+
+/// Returns `false` as soon as a file's first-column min is `<=` the previous file's
+/// max within any group; pairs lacking statistics on either file are skipped.
+pub fn verify_sort_integrity(file_groups: &[FileGroup]) -> bool {
+    file_groups.iter().all(|group| {
+        let ordered: Vec<_> = group.iter().collect();
+        ordered.windows(2).all(|pair| {
+            match (&pair[0].statistics, &pair[1].statistics) {
+                (Some(before), Some(after)) => {
+                    let prev_max = &before.column_statistics[0].max_value;
+                    let curr_min = &after.column_statistics[0].min_value;
+                    // Touching or overlapping ranges violate the ordering.
+                    if curr_min.get_value().unwrap() <= prev_max.get_value().unwrap() {
+                        return false;
+                    }
+                    true
+                }
+                // Without statistics on both sides there is nothing to compare.
+                _ => true,
+            }
+        })
+    })
+}