Skip to content

Commit b86619e

Browse files
authored
Minor: refine comments for statistics computation (#15647)
1 parent 784df33 commit b86619e

File tree

2 files changed

+14
-9
lines changed

2 files changed

+14
-9
lines changed

datafusion/core/src/datasource/listing/table.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -716,9 +716,13 @@ impl ListingOptions {
716716
#[derive(Debug)]
717717
pub struct ListingTable {
718718
table_paths: Vec<ListingTableUrl>,
719-
/// File fields only
719+
/// `file_schema` contains only the columns physically stored in the data files themselves.
720+
/// - Represents the actual fields found in files like Parquet, CSV, etc.
721+
/// - Used when reading the raw data from files
720722
file_schema: SchemaRef,
721-
/// File fields + partition columns
723+
/// `table_schema` combines `file_schema` + partition columns
724+
/// - Partition columns are derived from directory paths (not stored in files)
725+
/// - These are columns like `year` and `month`, whose values are encoded in paths like `/data/year=2022/month=01/file.parquet`
722726
table_schema: SchemaRef,
723727
options: ListingOptions,
724728
definition: Option<String>,

datafusion/datasource/src/statistics.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ pub fn compute_file_group_statistics(
506506
///
507507
/// # Parameters
508508
/// * `file_groups` - Vector of file groups to process
509-
/// * `file_schema` - Schema of the files
509+
/// * `table_schema` - Schema of the table
510510
/// * `collect_stats` - Whether to collect statistics
511511
/// * `inexact_stats` - Whether to mark the resulting statistics as inexact
512512
///
@@ -516,7 +516,7 @@ pub fn compute_file_group_statistics(
516516
/// * The summary statistics across all file groups, aka all files summary statistics
517517
pub fn compute_all_files_statistics(
518518
file_groups: Vec<FileGroup>,
519-
file_schema: SchemaRef,
519+
table_schema: SchemaRef,
520520
collect_stats: bool,
521521
inexact_stats: bool,
522522
) -> Result<(Vec<FileGroup>, Statistics)> {
@@ -526,16 +526,17 @@ pub fn compute_all_files_statistics(
526526
for file_group in file_groups {
527527
file_groups_with_stats.push(compute_file_group_statistics(
528528
file_group,
529-
Arc::clone(&file_schema),
529+
Arc::clone(&table_schema),
530530
collect_stats,
531531
)?);
532532
}
533533

534534
// Then summary statistics across all file groups
535-
let mut statistics =
536-
compute_summary_statistics(&file_groups_with_stats, &file_schema, |file_group| {
537-
file_group.statistics()
538-
});
535+
let mut statistics = compute_summary_statistics(
536+
&file_groups_with_stats,
537+
&table_schema,
538+
|file_group| file_group.statistics(),
539+
);
539540

540541
if inexact_stats {
541542
statistics = statistics.to_inexact()

0 commit comments

Comments
 (0)