Skip to content

Commit 57aba78

Browse files
committed
Reject CREATE TABLE/VIEW with duplicate column names
DFSchema checks column for uniqueness, but allows duplicate column names when they are qualified differently. This is because DFSchema plays central role during query planning as a identifier resolution scope. Those checks in their current form should not be there, since they prevent execution of queries with duplicate column aliases, which is legal in SQL. But even with these checks present, they are not sufficient to ensure CREATE TABLE/VIEW is well structured. Table or view columns need to have unique names and there is no qualification involved. This commit adds necessary checks in CREATE TABLE/VIEW DDL structs, ensuring that CREATE TABLE/VIEW logical plans are valid in that regard.
1 parent 168c696 commit 57aba78

File tree

3 files changed

+164
-2
lines changed

3 files changed

+164
-2
lines changed

datafusion/common/src/error.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ pub enum SchemaError {
149149
DuplicateQualifiedField {
150150
qualifier: Box<TableReference>,
151151
name: String,
152+
},
153+
/// Schema duplicate qualified fields with duplicate unqualified names
154+
QualifiedFieldWithDuplicateName {
155+
qualifier: Box<TableReference>,
156+
name: String,
152157
},
153158
/// Schema contains duplicate unqualified field name
154159
DuplicateUnqualifiedField { name: String },
@@ -188,6 +193,14 @@ impl Display for SchemaError {
188193
quote_identifier(name)
189194
)
190195
}
196+
Self::QualifiedFieldWithDuplicateName { qualifier, name } => {
197+
write!(
198+
f,
199+
"Schema contains qualified fields with duplicate unqualified names {}.{}",
200+
qualifier.to_quoted_string(),
201+
quote_identifier(name)
202+
)
203+
}
191204
Self::DuplicateUnqualifiedField { name } => {
192205
write!(
193206
f,

datafusion/expr/src/logical_plan/ddl.rs

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
use crate::{Expr, LogicalPlan, SortExpr, Volatility};
1919
use std::cmp::Ordering;
20-
use std::collections::HashMap;
20+
use std::collections::{BTreeSet, HashMap, HashSet};
2121
use std::sync::Arc;
2222
use std::{
2323
fmt::{self, Display},
@@ -28,7 +28,8 @@ use crate::expr::Sort;
2828
use arrow::datatypes::DataType;
2929
use datafusion_common::tree_node::{Transformed, TreeNodeContainer, TreeNodeRecursion};
3030
use datafusion_common::{
31-
Constraints, DFSchemaRef, Result, SchemaReference, TableReference,
31+
schema_err, Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, Result,
32+
SchemaError, SchemaReference, TableReference,
3233
};
3334
use sqlparser::ast::Ident;
3435

@@ -306,6 +307,7 @@ impl CreateExternalTable {
306307
constraints,
307308
column_defaults,
308309
} = fields;
310+
check_fields_unique(&schema)?;
309311
Ok(Self {
310312
name,
311313
schema,
@@ -544,6 +546,7 @@ impl CreateMemoryTable {
544546
column_defaults,
545547
temporary,
546548
} = fields;
549+
check_fields_unique(input.schema())?;
547550
Ok(Self {
548551
name,
549552
constraints,
@@ -698,6 +701,7 @@ impl CreateView {
698701
definition,
699702
temporary,
700703
} = fields;
704+
check_fields_unique(input.schema())?;
701705
Ok(Self {
702706
name,
703707
input,
@@ -800,6 +804,48 @@ impl CreateViewBuilder {
800804
})
801805
}
802806
}
807+
fn check_fields_unique(schema: &DFSchema) -> Result<()> {
808+
// Use tree set for deterministic error messages
809+
let mut qualified_names = BTreeSet::new();
810+
let mut unqualified_names = HashSet::new();
811+
let mut name_occurrences: HashMap<&String, usize> = HashMap::new();
812+
813+
for (qualifier, field) in schema.iter() {
814+
if let Some(qualifier) = qualifier {
815+
// Check for duplicate qualified field names
816+
if !qualified_names.insert((qualifier, field.name())) {
817+
return schema_err!(SchemaError::DuplicateQualifiedField {
818+
qualifier: Box::new(qualifier.clone()),
819+
name: field.name().to_string(),
820+
});
821+
}
822+
// Check for duplicate unqualified field names
823+
} else if !unqualified_names.insert(field.name()) {
824+
return schema_err!(SchemaError::DuplicateUnqualifiedField {
825+
name: field.name().to_string()
826+
});
827+
}
828+
*name_occurrences.entry(field.name()).or_default() += 1;
829+
}
830+
831+
for (qualifier, name) in qualified_names {
832+
// Check for duplicate between qualified and unqualified field names
833+
if unqualified_names.contains(name) {
834+
return schema_err!(SchemaError::AmbiguousReference {
835+
field: Column::new(Some(qualifier.clone()), name)
836+
});
837+
}
838+
// Check for duplicates between qualified names as the qualification will be stripped off
839+
if name_occurrences[name] > 1 {
840+
return schema_err!(SchemaError::QualifiedFieldWithDuplicateName {
841+
qualifier: Box::new(qualifier.clone()),
842+
name: name.to_owned(),
843+
});
844+
}
845+
}
846+
847+
Ok(())
848+
}
803849

804850
/// Creates a catalog (aka "Database").
805851
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
@@ -1085,7 +1131,9 @@ impl PartialOrd for CreateIndex {
10851131

10861132
#[cfg(test)]
10871133
mod test {
1134+
use super::*;
10881135
use crate::{CreateCatalog, DdlStatement, DropView};
1136+
use arrow::datatypes::{DataType, Field, Schema};
10891137
use datafusion_common::{DFSchema, DFSchemaRef, TableReference};
10901138
use std::cmp::Ordering;
10911139

@@ -1112,4 +1160,85 @@ mod test {
11121160

11131161
assert_eq!(drop_view.partial_cmp(&catalog), Some(Ordering::Greater));
11141162
}
1163+
1164+
#[test]
1165+
fn test_check_fields_unique() -> Result<()> {
1166+
// no duplicate fields, unqualified schema
1167+
check_fields_unique(&DFSchema::try_from(Schema::new(vec![
1168+
Field::new("c100", DataType::Boolean, true),
1169+
Field::new("c101", DataType::Boolean, true),
1170+
]))?)?;
1171+
1172+
// no duplicate fields, qualified schema
1173+
check_fields_unique(&DFSchema::try_from_qualified_schema(
1174+
"t1",
1175+
&Schema::new(vec![
1176+
Field::new("c100", DataType::Boolean, true),
1177+
Field::new("c101", DataType::Boolean, true),
1178+
]),
1179+
)?)?;
1180+
1181+
// duplicate unqualified field with same qualifier
1182+
assert_eq!(
1183+
check_fields_unique(&DFSchema::try_from(Schema::new(vec![
1184+
Field::new("c0", DataType::Boolean, true),
1185+
Field::new("c1", DataType::Boolean, true),
1186+
Field::new("c1", DataType::Boolean, true),
1187+
Field::new("c2", DataType::Boolean, true),
1188+
]))?)
1189+
.unwrap_err()
1190+
.strip_backtrace()
1191+
.to_string(),
1192+
"Schema error: Schema contains duplicate unqualified field name c1"
1193+
);
1194+
1195+
// duplicate qualified field with same qualifier
1196+
assert_eq!(
1197+
check_fields_unique(&DFSchema::try_from_qualified_schema(
1198+
"t1",
1199+
&Schema::new(vec![
1200+
Field::new("c1", DataType::Boolean, true),
1201+
Field::new("c1", DataType::Boolean, true),
1202+
]),
1203+
)?)
1204+
.unwrap_err()
1205+
.strip_backtrace()
1206+
.to_string(),
1207+
"Schema error: Schema contains duplicate qualified field name t1.c1"
1208+
);
1209+
1210+
// duplicate qualified and unqualified field
1211+
assert_eq!(
1212+
check_fields_unique(&DFSchema::from_field_specific_qualified_schema(
1213+
vec![
1214+
None,
1215+
Some(TableReference::from("t1")),
1216+
],
1217+
&Arc::new(Schema::new(vec![
1218+
Field::new("c1", DataType::Boolean, true),
1219+
Field::new("c1", DataType::Boolean, true),
1220+
]))
1221+
)?)
1222+
.unwrap_err().strip_backtrace().to_string(),
1223+
"Schema error: Schema contains qualified field name t1.c1 and unqualified field name c1 which would be ambiguous"
1224+
);
1225+
1226+
// qualified fields with duplicate unqualified names
1227+
assert_eq!(
1228+
check_fields_unique(&DFSchema::from_field_specific_qualified_schema(
1229+
vec![
1230+
Some(TableReference::from("t1")),
1231+
Some(TableReference::from("t2")),
1232+
],
1233+
&Arc::new(Schema::new(vec![
1234+
Field::new("c1", DataType::Boolean, true),
1235+
Field::new("c1", DataType::Boolean, true),
1236+
]))
1237+
)?)
1238+
.unwrap_err().strip_backtrace().to_string(),
1239+
"Schema error: Schema contains qualified fields with duplicate unqualified names t1.c1"
1240+
);
1241+
1242+
Ok(())
1243+
}
11151244
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Issue https://github.com/apache/datafusion/issues/13487
19+
statement error DataFusion error: Schema error: Schema contains qualified fields with duplicate unqualified names l\.id
20+
CREATE TABLE t AS SELECT * FROM (SELECT 1 AS id, 'Foo' AS name) l JOIN (SELECT 1 AS id, 'Bar' as name) r ON l.id = r.id;

0 commit comments

Comments
 (0)