Skip to content

Commit 52d2e23

Browse files
committed
implement hash based CSE identifier
1 parent 0f80b92 commit 52d2e23

File tree

3 files changed

+698
-317
lines changed

3 files changed

+698
-317
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use crate::error::{Result, _internal_err};
3636

3737
/// Folds two 64-bit hashes into a single 64-bit hash.
///
/// Starting from the constant `17 * 37`, the left hash is added, the sum is
/// multiplied by 37 and the right hash is added. Every step wraps on
/// overflow, so the function never panics. Swapping `l` and `r` generally
/// yields a different result.
#[inline]
pub fn combine_hashes(l: u64, r: u64) -> u64 {
    const SEED: u64 = 17 * 37;
    SEED.wrapping_add(l).wrapping_mul(37).wrapping_add(r)
}

datafusion/expr/src/expr.rs

Lines changed: 172 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
2020
use std::collections::HashSet;
2121
use std::fmt::{self, Display, Formatter, Write};
22-
use std::hash::Hash;
22+
use std::hash::{Hash, Hasher};
23+
use std::mem;
2324
use std::str::FromStr;
2425
use std::sync::Arc;
2526

@@ -1419,6 +1420,176 @@ impl Expr {
14191420
| Expr::Placeholder(..) => false,
14201421
}
14211422
}
1423+
1424+
/// This method hashes the direct content of an expression node without recursing into
1425+
/// its children. This is useful because in `CommonSubexprEliminate` we can build up
1426+
/// the deep hash of a node and its descendants during the bottom-up phase of the
1427+
/// first traversal and so avoid computing the hash of the node and then the hash of
1428+
/// its descendants separately.
1429+
///
1430+
/// As it is pretty easy to forget changing this method when `Expr` changes the
1431+
/// implementation doesn't use wildcard patterns (`..`, `_`) to catch changes
1432+
/// compile time.
1433+
pub fn hash_node<H: Hasher>(&self, hasher: &mut H) {
1434+
mem::discriminant(self).hash(hasher);
1435+
match self {
1436+
Expr::Alias(Alias {
1437+
expr: _expr,
1438+
relation,
1439+
name,
1440+
}) => {
1441+
relation.hash(hasher);
1442+
name.hash(hasher);
1443+
}
1444+
Expr::Column(column) => {
1445+
column.hash(hasher);
1446+
}
1447+
Expr::ScalarVariable(data_type, name) => {
1448+
data_type.hash(hasher);
1449+
name.hash(hasher);
1450+
}
1451+
Expr::Literal(scalar_value) => {
1452+
scalar_value.hash(hasher);
1453+
}
1454+
Expr::BinaryExpr(BinaryExpr {
1455+
left: _left,
1456+
op,
1457+
right: _right,
1458+
}) => {
1459+
op.hash(hasher);
1460+
}
1461+
Expr::Like(Like {
1462+
negated,
1463+
expr: _expr,
1464+
pattern: _pattern,
1465+
escape_char,
1466+
case_insensitive,
1467+
})
1468+
| Expr::SimilarTo(Like {
1469+
negated,
1470+
expr: _expr,
1471+
pattern: _pattern,
1472+
escape_char,
1473+
case_insensitive,
1474+
}) => {
1475+
negated.hash(hasher);
1476+
escape_char.hash(hasher);
1477+
case_insensitive.hash(hasher);
1478+
}
1479+
Expr::Not(_expr)
1480+
| Expr::IsNotNull(_expr)
1481+
| Expr::IsNull(_expr)
1482+
| Expr::IsTrue(_expr)
1483+
| Expr::IsFalse(_expr)
1484+
| Expr::IsUnknown(_expr)
1485+
| Expr::IsNotTrue(_expr)
1486+
| Expr::IsNotFalse(_expr)
1487+
| Expr::IsNotUnknown(_expr)
1488+
| Expr::Negative(_expr) => {}
1489+
Expr::Between(Between {
1490+
expr: _expr,
1491+
negated,
1492+
low: _low,
1493+
high: _high,
1494+
}) => {
1495+
negated.hash(hasher);
1496+
}
1497+
Expr::Case(Case {
1498+
expr: _expr,
1499+
when_then_expr: _when_then_expr,
1500+
else_expr: _else_expr,
1501+
}) => {}
1502+
Expr::Cast(Cast {
1503+
expr: _expr,
1504+
data_type,
1505+
})
1506+
| Expr::TryCast(TryCast {
1507+
expr: _expr,
1508+
data_type,
1509+
}) => {
1510+
data_type.hash(hasher);
1511+
}
1512+
Expr::Sort(Sort {
1513+
expr: _expr,
1514+
asc,
1515+
nulls_first,
1516+
}) => {
1517+
asc.hash(hasher);
1518+
nulls_first.hash(hasher);
1519+
}
1520+
Expr::ScalarFunction(ScalarFunction { func, args: _args }) => {
1521+
func.hash(hasher);
1522+
}
1523+
Expr::AggregateFunction(AggregateFunction {
1524+
func_def,
1525+
args: _args,
1526+
distinct,
1527+
filter: _filter,
1528+
order_by: _order_by,
1529+
null_treatment,
1530+
}) => {
1531+
func_def.hash(hasher);
1532+
distinct.hash(hasher);
1533+
null_treatment.hash(hasher);
1534+
}
1535+
Expr::WindowFunction(WindowFunction {
1536+
fun,
1537+
args: _args,
1538+
partition_by: _partition_by,
1539+
order_by: _order_by,
1540+
window_frame,
1541+
null_treatment,
1542+
}) => {
1543+
fun.hash(hasher);
1544+
window_frame.hash(hasher);
1545+
null_treatment.hash(hasher);
1546+
}
1547+
Expr::InList(InList {
1548+
expr: _expr,
1549+
list: _list,
1550+
negated,
1551+
}) => {
1552+
negated.hash(hasher);
1553+
}
1554+
Expr::Exists(Exists { subquery, negated }) => {
1555+
subquery.hash(hasher);
1556+
negated.hash(hasher);
1557+
}
1558+
Expr::InSubquery(InSubquery {
1559+
expr: _expr,
1560+
subquery,
1561+
negated,
1562+
}) => {
1563+
subquery.hash(hasher);
1564+
negated.hash(hasher);
1565+
}
1566+
Expr::ScalarSubquery(subquery) => {
1567+
subquery.hash(hasher);
1568+
}
1569+
Expr::Wildcard { qualifier } => {
1570+
qualifier.hash(hasher);
1571+
}
1572+
Expr::GroupingSet(grouping_set) => match grouping_set {
1573+
GroupingSet::Rollup(_exprs) => {
1574+
hasher.write_u8(0);
1575+
}
1576+
GroupingSet::Cube(_exprs) => {
1577+
hasher.write_u8(1);
1578+
}
1579+
GroupingSet::GroupingSets(_exprs) => {
1580+
hasher.write_u8(2);
1581+
}
1582+
},
1583+
Expr::Placeholder(place_holder) => {
1584+
place_holder.hash(hasher);
1585+
}
1586+
Expr::OuterReferenceColumn(data_type, column) => {
1587+
data_type.hash(hasher);
1588+
column.hash(hasher);
1589+
}
1590+
Expr::Unnest(Unnest { expr: _expr }) => {}
1591+
};
1592+
}
14221593
}
14231594

14241595
// modifies expr if it is a placeholder with datatype of right

0 commit comments

Comments
 (0)