Skip to content

Commit f9cc687

Browse files
committed
Schema ambiguity checks when dereferencing
1 parent 9ea86ca commit f9cc687

File tree

10 files changed

+98
-48
lines changed

10 files changed

+98
-48
lines changed

datafusion/common/src/dfschema.rs

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ impl DFSchema {
314314
&self,
315315
qualifier: Option<&TableReference>,
316316
name: &str,
317-
) -> Option<usize> {
317+
) -> Result<Option<usize>> {
318318
let mut matches = self
319319
.iter()
320320
.enumerate()
@@ -328,16 +328,53 @@ impl DFSchema {
328328
// field to lookup is unqualified, no need to compare qualifier
329329
(None, Some(_)) | (None, None) => f.name() == name,
330330
})
331-
.map(|(idx, _)| idx);
332-
matches.next()
331+
.map(|(idx, (q, _))| (idx, q));
332+
let first_match = matches.next();
333+
match first_match {
334+
None => Ok(None),
335+
Some((first_index, first_qualifier)) => {
336+
let next_match = matches.next();
337+
match next_match {
338+
None => Ok(Some(first_index)),
339+
Some((_, next_qualifier)) => {
340+
match (first_qualifier, next_qualifier) {
341+
(Some(q), Some(_)) => {
342+
_schema_err!(SchemaError::DuplicateQualifiedField {
343+
qualifier: Box::new(q.clone()),
344+
name: name.to_string(),
345+
})
346+
}
347+
348+
(None, None) => {
349+
_schema_err!(SchemaError::DuplicateUnqualifiedField {
350+
name: name.to_string(),
351+
})
352+
}
353+
354+
_ => _schema_err!(SchemaError::AmbiguousReference {
355+
field: Column {
356+
relation: Some(
357+
first_qualifier
358+
.or(next_qualifier)
359+
.unwrap()
360+
.clone()
361+
),
362+
name: name.to_string(),
363+
},
364+
}),
365+
}
366+
}
367+
}
368+
}
369+
}
333370
}
334371

335372
/// Find the index of the column with the given qualifier and name,
336373
/// returning `None` if not found
337374
///
338375
/// See [Self::index_of_column] for a version that returns an error if the
339376
/// column is not found
340-
pub fn maybe_index_of_column(&self, col: &Column) -> Option<usize> {
377+
pub fn maybe_index_of_column(&self, col: &Column) -> Result<Option<usize>> {
341378
self.index_of_column_by_name(col.relation.as_ref(), &col.name)
342379
}
343380

@@ -347,14 +384,15 @@ impl DFSchema {
347384
/// See [Self::maybe_index_of_column] for a version that returns `None` if
348385
/// the column is not found
349386
pub fn index_of_column(&self, col: &Column) -> Result<usize> {
350-
self.maybe_index_of_column(col)
387+
self.maybe_index_of_column(col)?
351388
.ok_or_else(|| field_not_found(col.relation.clone(), &col.name, self))
352389
}
353390

354391
/// Check if the column is in the current schema
355-
pub fn is_column_from_schema(&self, col: &Column) -> bool {
356-
self.index_of_column_by_name(col.relation.as_ref(), &col.name)
357-
.is_some()
392+
pub fn is_column_from_schema(&self, col: &Column) -> Result<bool> {
393+
Ok(self
394+
.index_of_column_by_name(col.relation.as_ref(), &col.name)?
395+
.is_some())
358396
}
359397

360398
/// Find the field with the given name
@@ -378,7 +416,7 @@ impl DFSchema {
378416
) -> Result<(Option<&TableReference>, &Field)> {
379417
if let Some(qualifier) = qualifier {
380418
let idx = self
381-
.index_of_column_by_name(Some(qualifier), name)
419+
.index_of_column_by_name(Some(qualifier), name)?
382420
.ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?;
383421
Ok((self.field_qualifiers[idx].as_ref(), self.field(idx)))
384422
} else {
@@ -490,7 +528,7 @@ impl DFSchema {
490528
name: &str,
491529
) -> Result<&Field> {
492530
let idx = self
493-
.index_of_column_by_name(Some(qualifier), name)
531+
.index_of_column_by_name(Some(qualifier), name)?
494532
.ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?;
495533

496534
Ok(self.field(idx))
@@ -629,9 +667,9 @@ impl DFSchema {
629667
let iter1 = fields1.iter();
630668
let iter2 = fields2.iter();
631669
fields1.len() == fields2.len() &&
632-
// all fields have to be the same
670+
// all fields have to be the same
633671
iter1
634-
.zip(iter2)
672+
.zip(iter2)
635673
.all(|(f1, f2)| Self::field_is_logically_equal(f1, f2))
636674
}
637675
(DataType::Union(fields1, _), DataType::Union(fields2, _)) => {
@@ -668,9 +706,9 @@ impl DFSchema {
668706
let iter1 = fields1.iter();
669707
let iter2 = fields2.iter();
670708
fields1.len() == fields2.len() &&
671-
// all fields have to be the same
709+
// all fields have to be the same
672710
iter1
673-
.zip(iter2)
711+
.zip(iter2)
674712
.all(|(f1, f2)| Self::field_is_semantically_equal(f1, f2))
675713
}
676714
(DataType::Union(fields1, _), DataType::Union(fields2, _)) => {
@@ -1178,8 +1216,8 @@ mod tests {
11781216
.to_string(),
11791217
expected_help
11801218
);
1181-
assert!(schema.index_of_column_by_name(None, "y").is_none());
1182-
assert!(schema.index_of_column_by_name(None, "t1.c0").is_none());
1219+
assert!(schema.index_of_column_by_name(None, "y")?.is_none());
1220+
assert!(schema.index_of_column_by_name(None, "t1.c0")?.is_none());
11831221

11841222
Ok(())
11851223
}
@@ -1268,28 +1306,28 @@ mod tests {
12681306
{
12691307
let col = Column::from_qualified_name("t1.c0");
12701308
let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
1271-
assert!(schema.is_column_from_schema(&col));
1309+
assert!(schema.is_column_from_schema(&col)?);
12721310
}
12731311

12741312
// qualified not exists
12751313
{
12761314
let col = Column::from_qualified_name("t1.c2");
12771315
let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
1278-
assert!(!schema.is_column_from_schema(&col));
1316+
assert!(!schema.is_column_from_schema(&col)?);
12791317
}
12801318

12811319
// unqualified exists
12821320
{
12831321
let col = Column::from_name("c0");
12841322
let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
1285-
assert!(schema.is_column_from_schema(&col));
1323+
assert!(schema.is_column_from_schema(&col)?);
12861324
}
12871325

12881326
// unqualified not exists
12891327
{
12901328
let col = Column::from_name("c2");
12911329
let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
1292-
assert!(!schema.is_column_from_schema(&col));
1330+
assert!(!schema.is_column_from_schema(&col)?);
12931331
}
12941332

12951333
Ok(())

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use std::sync::{Arc, OnceLock};
2525

2626
use super::dml::CopyTo;
2727
use super::DdlStatement;
28-
use crate::builder::{change_redundant_column, unnest_with_options};
28+
use crate::builder::unnest_with_options;
2929
use crate::expr::{Placeholder, Sort as SortExpr, WindowFunction};
3030
use crate::expr_rewriter::{
3131
create_col_from_scalar_expr, normalize_cols, normalize_sorts, NamePreserver,
@@ -2193,7 +2193,7 @@ impl SubqueryAlias {
21932193
alias: impl Into<TableReference>,
21942194
) -> Result<Self> {
21952195
let alias = alias.into();
2196-
let fields = change_redundant_column(plan.schema().fields());
2196+
let fields = plan.schema().fields().clone();
21972197
let meta_data = plan.schema().as_ref().metadata().clone();
21982198
let schema: Schema =
21992199
DFSchema::from_unqualified_fields(fields.into(), meta_data)?.into();

datafusion/expr/src/utils.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1012,7 +1012,7 @@ pub fn check_all_columns_from_schema(
10121012
schema: &DFSchema,
10131013
) -> Result<bool> {
10141014
for col in columns.iter() {
1015-
let exist = schema.is_column_from_schema(col);
1015+
let exist = schema.is_column_from_schema(col)?;
10161016
if !exist {
10171017
return Ok(false);
10181018
}

datafusion/optimizer/src/optimize_projections/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ fn optimize_projections(
177177
let all_exprs_iter = new_group_bys.iter().chain(new_aggr_expr.iter());
178178
let schema = aggregate.input.schema();
179179
let necessary_indices =
180-
RequiredIndicies::new().with_exprs(schema, all_exprs_iter);
180+
RequiredIndicies::new().with_exprs(schema, all_exprs_iter)?;
181181
let necessary_exprs = necessary_indices.get_required_exprs(schema);
182182

183183
return optimize_projections(
@@ -217,7 +217,8 @@ fn optimize_projections(
217217

218218
// Get all the required column indices at the input, either by the
219219
// parent or window expression requirements.
220-
let required_indices = child_reqs.with_exprs(&input_schema, &new_window_expr);
220+
let required_indices =
221+
child_reqs.with_exprs(&input_schema, &new_window_expr)?;
221222

222223
return optimize_projections(
223224
Arc::unwrap_or_clone(window.input),
@@ -753,7 +754,7 @@ fn rewrite_projection_given_requirements(
753754
let exprs_used = indices.get_at_indices(&expr);
754755

755756
let required_indices =
756-
RequiredIndicies::new().with_exprs(input.schema(), exprs_used.iter());
757+
RequiredIndicies::new().with_exprs(input.schema(), exprs_used.iter())?;
757758

758759
// rewrite the children projection, and if they are changed rewrite the
759760
// projection down

datafusion/optimizer/src/optimize_projections/required_indices.rs

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ impl RequiredIndicies {
9696
// Add indices of the child fields referred to by the expressions in the
9797
// parent
9898
plan.apply_expressions(|e| {
99-
self.add_expr(schema, e);
99+
self.add_expr(schema, e)?;
100100
Ok(TreeNodeRecursion::Continue)
101101
})?;
102102
Ok(self.compact())
@@ -111,17 +111,18 @@ impl RequiredIndicies {
111111
///
112112
/// * `input_schema`: The input schema to analyze for index requirements.
113113
/// * `expr`: An expression for which we want to find necessary field indices.
114-
fn add_expr(&mut self, input_schema: &DFSchemaRef, expr: &Expr) {
114+
fn add_expr(&mut self, input_schema: &DFSchemaRef, expr: &Expr) -> Result<()> {
115115
// TODO could remove these clones (and visit the expression directly)
116116
let mut cols = expr.column_refs();
117117
// Get outer-referenced (subquery) columns:
118118
outer_columns(expr, &mut cols);
119119
self.indices.reserve(cols.len());
120120
for col in cols {
121-
if let Some(idx) = input_schema.maybe_index_of_column(col) {
121+
if let Some(idx) = input_schema.maybe_index_of_column(col)? {
122122
self.indices.push(idx);
123123
}
124124
}
125+
Ok(())
125126
}
126127

127128
/// Adds the indices of the fields referred to by the given expressions
@@ -132,17 +133,14 @@ impl RequiredIndicies {
132133
/// * `input_schema`: The input schema to analyze for index requirements.
133134
/// * `exprs`: the expressions for which we want to find field indices.
134135
pub fn with_exprs<'a>(
135-
self,
136+
mut self,
136137
schema: &DFSchemaRef,
137138
exprs: impl IntoIterator<Item = &'a Expr>,
138-
) -> Self {
139-
exprs
140-
.into_iter()
141-
.fold(self, |mut acc, expr| {
142-
acc.add_expr(schema, expr);
143-
acc
144-
})
145-
.compact()
139+
) -> Result<Self> {
140+
for expr in exprs {
141+
self.add_expr(schema, expr)?;
142+
}
143+
Ok(self.compact())
146144
}
147145

148146
/// Adds all `indices` into this instance.

datafusion/sql/src/statement.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1750,7 +1750,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
17501750
.enumerate()
17511751
.map(|(i, c)| {
17521752
let column_index = table_schema
1753-
.index_of_column_by_name(None, &c)
1753+
.index_of_column_by_name(None, &c)?
17541754
.ok_or_else(|| unqualified_field_not_found(&c, &table_schema))?;
17551755
if value_indices[column_index].is_some() {
17561756
return schema_err!(SchemaError::DuplicateUnqualifiedField {

datafusion/sql/src/unparser/utils.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ pub(crate) fn unproject_sort_expr(
269269

270270
// In case of aggregation there could be columns containing aggregation functions we need to unproject
271271
if let Some(agg) = agg {
272-
if agg.schema.is_column_from_schema(col_ref) {
272+
if agg.schema.is_column_from_schema(col_ref)? {
273273
let new_expr = unproject_agg_exprs(sort_expr.expr, agg, None)?;
274274
sort_expr.expr = new_expr;
275275
return Ok(sort_expr);

datafusion/sqllogictest/test_files/join.slt

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,18 +1215,19 @@ statement ok
12151215
create table t1(v1 int) as values(100);
12161216

12171217
## Query with Ambiguous column reference
1218-
query I
1218+
query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1
12191219
select count(*)
12201220
from t1
12211221
right outer join t1
12221222
on t1.v1 > 0;
1223-
----
1224-
1
12251223

1226-
query I
1224+
query error
12271225
select t1.v1 from t1 join t1 using(v1) cross join (select struct('foo' as v1) as t1);
12281226
----
1229-
100
1227+
DataFusion error: Optimizer rule 'eliminate_cross_join' failed
1228+
caused by
1229+
Schema error: Schema contains duplicate qualified field name t1.v1
1230+
12301231

12311232
statement ok
12321233
drop table t1;

datafusion/sqllogictest/test_files/select.slt

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1807,11 +1807,23 @@ SELECT id AS col, id+1 AS col FROM users
18071807
----
18081808
1 2
18091809

1810+
# a reference is ambiguous
1811+
query error DataFusion error: Schema error: Ambiguous reference to unqualified field a
1812+
select a from (select 1 as a, 2 as a) t;
1813+
1814+
# t.a reference is ambiguous
1815+
query error DataFusion error: Schema error: Schema contains duplicate qualified field name t\.a
1816+
select t.a from (select 1 as a, 2 as a) t;
1817+
1818+
# TODO PostgreSQL disallows self-join without giving tables distinct aliases, but some other databases, e.g. Trino, do allow this, so this could work
18101819
# TODO When joining using USING, the condition columns should appear once in the output, and should be selectable using unqualified name only
1811-
query ITIT
1820+
query error
18121821
SELECT * FROM users JOIN users USING (id);
18131822
----
1814-
1 Tom 1 Tom
1823+
DataFusion error: expand_wildcard_rule
1824+
caused by
1825+
Schema error: Schema contains duplicate qualified field name users.id
1826+
18151827

18161828
statement ok
18171829
create view v as select count(id) from users;

datafusion/substrait/src/logical_plan/consumer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1270,7 +1270,7 @@ fn apply_projection(table: DataFrame, substrait_schema: DFSchema) -> Result<Logi
12701270
.iter()
12711271
.map(|substrait_field| {
12721272
Ok(df_schema
1273-
.index_of_column_by_name(None, substrait_field.name().as_str())
1273+
.index_of_column_by_name(None, substrait_field.name().as_str())?
12741274
.unwrap())
12751275
})
12761276
.collect::<Result<_>>()?;

0 commit comments

Comments
 (0)