Skip to content

Commit ede5598

Browse files
authored
Improve CommonSubexprEliminate identifier management (10% faster planning) (#10473)
* implement hash based CSE identifier * move `is_volatile()` check out of visitor * fix comments * add transformed asserts when `rewrite_expr` is called * update MSRV to 1.76 * cleanup * better comments regarding volatile and short circuiting expressions * address review comments
1 parent f0ef0e6 commit ede5598

File tree

11 files changed

+723
-358
lines changed

11 files changed

+723
-358
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ homepage = "https://datafusion.apache.org"
5252
license = "Apache-2.0"
5353
readme = "README.md"
5454
repository = "https://github.com/apache/datafusion"
55-
rust-version = "1.75"
55+
rust-version = "1.76"
5656
version = "39.0.0"
5757

5858
[workspace.dependencies]

datafusion-cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ license = "Apache-2.0"
2626
homepage = "https://datafusion.apache.org"
2727
repository = "https://github.com/apache/datafusion"
2828
# Specify MSRV here as `cargo msrv` doesn't support workspace version
29-
rust-version = "1.75"
29+
rust-version = "1.76"
3030
readme = "README.md"
3131

3232
[dependencies]

datafusion/common/src/hash_utils.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use crate::error::{Result, _internal_err};
3636

3737
// Combines two hashes into one hash
3838
#[inline]
39-
fn combine_hashes(l: u64, r: u64) -> u64 {
39+
pub fn combine_hashes(l: u64, r: u64) -> u64 {
4040
let hash = (17 * 37u64).wrapping_add(l);
4141
hash.wrapping_mul(37).wrapping_add(r)
4242
}

datafusion/core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ authors = { workspace = true }
3030
# Specify MSRV here as `cargo msrv` doesn't support workspace version and fails with
3131
# "Unable to find key 'package.rust-version' (or 'package.metadata.msrv') in 'arrow-datafusion/Cargo.toml'"
3232
# https://github.com/foresterre/cargo-msrv/issues/590
33-
rust-version = "1.75"
33+
rust-version = "1.76"
3434

3535
[lints]
3636
workspace = true

datafusion/expr/src/expr.rs

Lines changed: 172 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
2020
use std::collections::HashSet;
2121
use std::fmt::{self, Display, Formatter, Write};
22-
use std::hash::Hash;
22+
use std::hash::{Hash, Hasher};
23+
use std::mem;
2324
use std::str::FromStr;
2425
use std::sync::Arc;
2526

@@ -1461,6 +1462,176 @@ impl Expr {
14611462
| Expr::Placeholder(..) => false,
14621463
}
14631464
}
1465+
1466+
/// Hashes the direct content of an `Expr` without recursing into its children.
1467+
///
1468+
/// This method is useful to incrementally compute hashes, such as in
1469+
/// `CommonSubexprEliminate` which builds a deep hash of a node and its descendants
1470+
/// during the bottom-up phase of the first traversal and so avoid computing the hash
1471+
/// of the node and then the hash of its descendants separately.
1472+
///
1473+
/// If a node doesn't have any children then this method is similar to `.hash()`, but
1474+
/// not necessarily returns the same value.
1475+
///
1476+
/// As it is pretty easy to forget changing this method when `Expr` changes the
1477+
/// implementation doesn't use wildcard patterns (`..`, `_`) to catch changes
1478+
/// compile time.
1479+
pub fn hash_node<H: Hasher>(&self, hasher: &mut H) {
1480+
mem::discriminant(self).hash(hasher);
1481+
match self {
1482+
Expr::Alias(Alias {
1483+
expr: _expr,
1484+
relation,
1485+
name,
1486+
}) => {
1487+
relation.hash(hasher);
1488+
name.hash(hasher);
1489+
}
1490+
Expr::Column(column) => {
1491+
column.hash(hasher);
1492+
}
1493+
Expr::ScalarVariable(data_type, name) => {
1494+
data_type.hash(hasher);
1495+
name.hash(hasher);
1496+
}
1497+
Expr::Literal(scalar_value) => {
1498+
scalar_value.hash(hasher);
1499+
}
1500+
Expr::BinaryExpr(BinaryExpr {
1501+
left: _left,
1502+
op,
1503+
right: _right,
1504+
}) => {
1505+
op.hash(hasher);
1506+
}
1507+
Expr::Like(Like {
1508+
negated,
1509+
expr: _expr,
1510+
pattern: _pattern,
1511+
escape_char,
1512+
case_insensitive,
1513+
})
1514+
| Expr::SimilarTo(Like {
1515+
negated,
1516+
expr: _expr,
1517+
pattern: _pattern,
1518+
escape_char,
1519+
case_insensitive,
1520+
}) => {
1521+
negated.hash(hasher);
1522+
escape_char.hash(hasher);
1523+
case_insensitive.hash(hasher);
1524+
}
1525+
Expr::Not(_expr)
1526+
| Expr::IsNotNull(_expr)
1527+
| Expr::IsNull(_expr)
1528+
| Expr::IsTrue(_expr)
1529+
| Expr::IsFalse(_expr)
1530+
| Expr::IsUnknown(_expr)
1531+
| Expr::IsNotTrue(_expr)
1532+
| Expr::IsNotFalse(_expr)
1533+
| Expr::IsNotUnknown(_expr)
1534+
| Expr::Negative(_expr) => {}
1535+
Expr::Between(Between {
1536+
expr: _expr,
1537+
negated,
1538+
low: _low,
1539+
high: _high,
1540+
}) => {
1541+
negated.hash(hasher);
1542+
}
1543+
Expr::Case(Case {
1544+
expr: _expr,
1545+
when_then_expr: _when_then_expr,
1546+
else_expr: _else_expr,
1547+
}) => {}
1548+
Expr::Cast(Cast {
1549+
expr: _expr,
1550+
data_type,
1551+
})
1552+
| Expr::TryCast(TryCast {
1553+
expr: _expr,
1554+
data_type,
1555+
}) => {
1556+
data_type.hash(hasher);
1557+
}
1558+
Expr::Sort(Sort {
1559+
expr: _expr,
1560+
asc,
1561+
nulls_first,
1562+
}) => {
1563+
asc.hash(hasher);
1564+
nulls_first.hash(hasher);
1565+
}
1566+
Expr::ScalarFunction(ScalarFunction { func, args: _args }) => {
1567+
func.hash(hasher);
1568+
}
1569+
Expr::AggregateFunction(AggregateFunction {
1570+
func_def,
1571+
args: _args,
1572+
distinct,
1573+
filter: _filter,
1574+
order_by: _order_by,
1575+
null_treatment,
1576+
}) => {
1577+
func_def.hash(hasher);
1578+
distinct.hash(hasher);
1579+
null_treatment.hash(hasher);
1580+
}
1581+
Expr::WindowFunction(WindowFunction {
1582+
fun,
1583+
args: _args,
1584+
partition_by: _partition_by,
1585+
order_by: _order_by,
1586+
window_frame,
1587+
null_treatment,
1588+
}) => {
1589+
fun.hash(hasher);
1590+
window_frame.hash(hasher);
1591+
null_treatment.hash(hasher);
1592+
}
1593+
Expr::InList(InList {
1594+
expr: _expr,
1595+
list: _list,
1596+
negated,
1597+
}) => {
1598+
negated.hash(hasher);
1599+
}
1600+
Expr::Exists(Exists { subquery, negated }) => {
1601+
subquery.hash(hasher);
1602+
negated.hash(hasher);
1603+
}
1604+
Expr::InSubquery(InSubquery {
1605+
expr: _expr,
1606+
subquery,
1607+
negated,
1608+
}) => {
1609+
subquery.hash(hasher);
1610+
negated.hash(hasher);
1611+
}
1612+
Expr::ScalarSubquery(subquery) => {
1613+
subquery.hash(hasher);
1614+
}
1615+
Expr::Wildcard { qualifier } => {
1616+
qualifier.hash(hasher);
1617+
}
1618+
Expr::GroupingSet(grouping_set) => {
1619+
mem::discriminant(grouping_set).hash(hasher);
1620+
match grouping_set {
1621+
GroupingSet::Rollup(_exprs) | GroupingSet::Cube(_exprs) => {}
1622+
GroupingSet::GroupingSets(_exprs) => {}
1623+
}
1624+
}
1625+
Expr::Placeholder(place_holder) => {
1626+
place_holder.hash(hasher);
1627+
}
1628+
Expr::OuterReferenceColumn(data_type, column) => {
1629+
data_type.hash(hasher);
1630+
column.hash(hasher);
1631+
}
1632+
Expr::Unnest(Unnest { expr: _expr }) => {}
1633+
};
1634+
}
14641635
}
14651636

14661637
// modifies expr if it is a placeholder with datatype of right

0 commit comments

Comments
 (0)