Skip to content

Commit ecb60f0

Browse files
committed
move notes
1 parent 37f84a5 commit ecb60f0

File tree

2 files changed

+39
-30
lines changed

2 files changed

+39
-30
lines changed

datafusion/datasource/benches/split_groups_by_statistics.rs

-28
Original file line numberDiff line numberDiff line change
@@ -41,34 +41,6 @@ pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
4141

4242
// Small, medium, large number of files
4343
let file_counts = [10, 100, 1000];
44-
// overlap_factors controls how much the value ranges in generated test files overlap:
45-
// - 0.0: No overlap between files (completely disjoint ranges)
46-
// - 0.2: Low overlap (20% of the range size overlaps with adjacent files)
47-
// - 0.5: Medium overlap (50% of ranges overlap)
48-
// - 0.8: High overlap (80% of ranges overlap between files)
49-
//
50-
// Example with 5 files and different overlap factors:
51-
// [min, max]
52-
// overlap_factor = 0.0 (no overlap):
53-
// File 0: [0, 20]
54-
// File 1: [20, 40]
55-
// File 2: [40, 60]
56-
// File 3: [60, 80]
57-
// File 4: [80, 100]
58-
//
59-
// overlap_factor = 0.5 (50% overlap):
60-
// File 0: [0, 40]
61-
// File 1: [20, 60]
62-
// File 2: [40, 80]
63-
// File 3: [60, 100]
64-
// File 4: [80, 120]
65-
//
66-
// overlap_factor = 0.8 (80% overlap):
67-
// File 0: [0, 100]
68-
// File 1: [20, 120]
69-
// File 2: [40, 140]
70-
// File 3: [60, 160]
71-
// File 4: [80, 180]
7244
let overlap_factors = [0.0, 0.2, 0.5, 0.8]; // No, low, medium, high overlap
7345

7446
let target_partitions: [usize; 4] = [4, 8, 16, 32];

datafusion/datasource/src/mod.rs

+39-2
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,45 @@ async fn find_first_newline(
314314
Ok(index)
315315
}
316316

317-
/// Generates test files with min-max statistics in different overlap patterns
318-
/// Used by tests and benchmarks
317+
/// Generates test files with min-max statistics in different overlap patterns.
318+
///
319+
/// Used by tests and benchmarks.
320+
///
321+
/// # Overlap Factors
322+
///
323+
/// The `overlap_factor` parameter controls how much the value ranges in generated test files overlap:
324+
/// - `0.0`: No overlap between files (completely disjoint ranges)
325+
/// - `0.2`: Low overlap (20% of the range size overlaps with adjacent files)
326+
/// - `0.5`: Medium overlap (50% of the range size overlaps with adjacent files)
327+
/// - `0.8`: High overlap (80% of the range size overlaps with adjacent files)
328+
///
329+
/// # Examples
330+
///
331+
/// With 5 files and different overlap factors showing `[min, max]` ranges:
332+
///
333+
/// overlap_factor = 0.0 (no overlap):
334+
///
335+
/// File 0: [0, 20]
336+
/// File 1: [20, 40]
337+
/// File 2: [40, 60]
338+
/// File 3: [60, 80]
339+
/// File 4: [80, 100]
340+
///
341+
/// overlap_factor = 0.5 (50% overlap):
342+
///
343+
/// File 0: [0, 40]
344+
/// File 1: [20, 60]
345+
/// File 2: [40, 80]
346+
/// File 3: [60, 100]
347+
/// File 4: [80, 120]
348+
///
349+
/// overlap_factor = 0.8 (80% overlap):
350+
///
351+
/// File 0: [0, 100]
352+
/// File 1: [20, 120]
353+
/// File 2: [40, 140]
354+
/// File 3: [60, 160]
355+
/// File 4: [80, 180]
319356
pub fn generate_test_files(num_files: usize, overlap_factor: f64) -> Vec<FileGroup> {
320357
let mut files = Vec::with_capacity(num_files);
321358
if num_files == 0 {

0 commit comments

Comments
 (0)