Skip to content

Commit d6f7c16

Browse files
Speed up DFSchema::merge using HashSet indices (#9020)
* Speed up `DFSchema::merge` using HashSet indices
* Remove map
* Use name()
  Co-authored-by: comphead <[email protected]>
* fmt

---------

Co-authored-by: comphead <[email protected]>
1 parent 78447d6 commit d6f7c16

File tree

1 file changed

+12
-4
lines changed

1 file changed

+12
-4
lines changed

datafusion/common/src/dfschema.rs

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
//! DFSchema is an extended schema struct that DataFusion uses to provide support for
1919
//! fields with optional relation names.
2020
21-
use std::collections::{BTreeSet, HashMap};
21+
use std::collections::{BTreeSet, HashMap, HashSet};
2222
use std::convert::TryFrom;
2323
use std::fmt::{Display, Formatter};
2424
use std::hash::Hash;
@@ -218,17 +218,25 @@ impl DFSchema {
218218
if other_schema.fields.is_empty() {
219219
return;
220220
}
221+
222+
let self_fields: HashSet<&DFField> = self.fields.iter().collect();
223+
let self_unqualified_names: HashSet<&str> =
224+
self.fields.iter().map(|x| x.name().as_str()).collect();
225+
226+
let mut fields_to_add = vec![];
227+
221228
for field in other_schema.fields() {
222229
// skip duplicate columns
223230
let duplicated_field = match field.qualifier() {
224-
Some(q) => self.has_column_with_qualified_name(q, field.name()),
231+
Some(_) => self_fields.contains(field),
225232
// for unqualified columns, check as unqualified name
226-
None => self.has_column_with_unqualified_name(field.name()),
233+
None => self_unqualified_names.contains(field.name().as_str()),
227234
};
228235
if !duplicated_field {
229-
self.fields.push(field.clone());
236+
fields_to_add.push(field.clone());
230237
}
231238
}
239+
self.fields.extend(fields_to_add);
232240
self.metadata.extend(other_schema.metadata.clone())
233241
}
234242

0 commit comments

Comments
 (0)