Skip to content

Commit 0cab805

Browse files
Xuanwowiedldalamb
authored
Provide field and schema metadata missing on distinct aggregations. (#12691) (#12975)
* test(12687): reproducer of missing metadata bug * fix(12687): minimum change needed to fix the missing metadata Co-authored-by: wiedld <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
1 parent 7fbc134 commit 0cab805

File tree

3 files changed

+53
-10
lines changed

3 files changed

+53
-10
lines changed

datafusion/physical-plan/src/aggregates/mod.rs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use crate::aggregates::{
2626
topk_stream::GroupedTopKAggregateStream,
2727
};
2828
use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
29+
use crate::projection::get_field_metadata;
2930
use crate::windows::get_ordered_partition_by_indices;
3031
use crate::{
3132
DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode,
@@ -793,14 +794,17 @@ fn create_schema(
793794
) -> Result<Schema> {
794795
let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len());
795796
for (index, (expr, name)) in group_expr.iter().enumerate() {
796-
fields.push(Field::new(
797-
name,
798-
expr.data_type(input_schema)?,
799-
// In cases where we have multiple grouping sets, we will use NULL expressions in
800-
// order to align the grouping sets. So the field must be nullable even if the underlying
801-
// schema field is not.
802-
group_expr_nullable[index] || expr.nullable(input_schema)?,
803-
))
797+
fields.push(
798+
Field::new(
799+
name,
800+
expr.data_type(input_schema)?,
801+
// In cases where we have multiple grouping sets, we will use NULL expressions in
802+
// order to align the grouping sets. So the field must be nullable even if the underlying
803+
// schema field is not.
804+
group_expr_nullable[index] || expr.nullable(input_schema)?,
805+
)
806+
.with_metadata(get_field_metadata(expr, input_schema).unwrap_or_default()),
807+
)
804808
}
805809

806810
match mode {
@@ -821,7 +825,10 @@ fn create_schema(
821825
}
822826
}
823827

824-
Ok(Schema::new(fields))
828+
Ok(Schema::new_with_metadata(
829+
fields,
830+
input_schema.metadata().clone(),
831+
))
825832
}
826833

827834
fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef {

datafusion/physical-plan/src/projection.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ impl ExecutionPlan for ProjectionExec {
237237

238238
/// If e is a direct column reference, returns the field level
239239
/// metadata for that field, if any. Otherwise returns None
240-
fn get_field_metadata(
240+
pub(crate) fn get_field_metadata(
241241
e: &Arc<dyn PhysicalExpr>,
242242
input_schema: &Schema,
243243
) -> Option<HashMap<String, String>> {

datafusion/sqllogictest/test_files/metadata.slt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,42 @@ WHERE "data"."id" = "samples"."id";
5959
3
6060

6161

62+
63+
# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
64+
query I
65+
select count(distinct name) from table_with_metadata;
66+
----
67+
2
68+
69+
# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
70+
query I
71+
select approx_median(distinct id) from table_with_metadata;
72+
----
73+
2
74+
75+
# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
76+
statement ok
77+
select array_agg(distinct id) from table_with_metadata;
78+
79+
query I
80+
select distinct id from table_with_metadata order by id;
81+
----
82+
1
83+
3
84+
NULL
85+
86+
query I
87+
select count(id) from table_with_metadata;
88+
----
89+
2
90+
91+
query I
92+
select count(id) cnt from table_with_metadata group by name order by cnt;
93+
----
94+
0
95+
1
96+
1
97+
6298
# Regression test: missing schema metadata, when aggregate on cross join
6399
query I
64100
SELECT count("data"."id")

0 commit comments

Comments
 (0)