Skip to content

Commit 4e278ca

Browse files
korowaalamb
andauthored
fix: hash join tests with forced collisions (#11806)
* tests: hash join tests with hash collisions * replace div_ceil expression with function call * Apply suggestions from code review Co-authored-by: Andrew Lamb <[email protected]> * remove redundant comments --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 3d76aa2 commit 4e278ca

File tree

1 file changed

+51
-32
lines changed

1 file changed

+51
-32
lines changed

datafusion/physical-plan/src/joins/hash_join.rs

+51-32
Original file line numberDiff line numberDiff line change
@@ -1583,7 +1583,6 @@ mod tests {
15831583
use rstest::*;
15841584
use rstest_reuse::*;
15851585

1586-
#[cfg(not(feature = "force_hash_collisions"))]
15871586
fn div_ceil(a: usize, b: usize) -> usize {
15881587
(a + b - 1) / b
15891588
}
@@ -1931,9 +1930,6 @@ mod tests {
19311930
Ok(())
19321931
}
19331932

1934-
// FIXME(#TODO) test fails with feature `force_hash_collisions`
1935-
// https://github.com/apache/datafusion/issues/11658
1936-
#[cfg(not(feature = "force_hash_collisions"))]
19371933
#[apply(batch_sizes)]
19381934
#[tokio::test]
19391935
async fn join_inner_two(batch_size: usize) -> Result<()> {
@@ -1964,12 +1960,20 @@ mod tests {
19641960

19651961
assert_eq!(columns, vec!["a1", "b2", "c1", "a1", "b2", "c2"]);
19661962

1967-
// expected joined records = 3
1968-
// in case batch_size is 1 - additional empty batch for remaining 3-2 row
1969-
let mut expected_batch_count = div_ceil(3, batch_size);
1970-
if batch_size == 1 {
1971-
expected_batch_count += 1;
1972-
}
1963+
let expected_batch_count = if cfg!(not(feature = "force_hash_collisions")) {
1964+
// Expected number of hash table matches = 3
1965+
// in case batch_size is 1 - additional empty batch for remaining 3-2 row
1966+
let mut expected_batch_count = div_ceil(3, batch_size);
1967+
if batch_size == 1 {
1968+
expected_batch_count += 1;
1969+
}
1970+
expected_batch_count
1971+
} else {
1972+
// With hash collisions enabled, all records will match each other
1973+
// and filtered later.
1974+
div_ceil(9, batch_size)
1975+
};
1976+
19731977
assert_eq!(batches.len(), expected_batch_count);
19741978

19751979
let expected = [
@@ -1989,9 +1993,6 @@ mod tests {
19891993
}
19901994

19911995
/// Test where the left has 2 parts, the right with 1 part => 1 part
1992-
// FIXME(#TODO) test fails with feature `force_hash_collisions`
1993-
// https://github.com/apache/datafusion/issues/11658
1994-
#[cfg(not(feature = "force_hash_collisions"))]
19951996
#[apply(batch_sizes)]
19961997
#[tokio::test]
19971998
async fn join_inner_one_two_parts_left(batch_size: usize) -> Result<()> {
@@ -2029,12 +2030,20 @@ mod tests {
20292030

20302031
assert_eq!(columns, vec!["a1", "b2", "c1", "a1", "b2", "c2"]);
20312032

2032-
// expected joined records = 3
2033-
// in case batch_size is 1 - additional empty batch for remaining 3-2 row
2034-
let mut expected_batch_count = div_ceil(3, batch_size);
2035-
if batch_size == 1 {
2036-
expected_batch_count += 1;
2037-
}
2033+
let expected_batch_count = if cfg!(not(feature = "force_hash_collisions")) {
2034+
// Expected number of hash table matches = 3
2035+
// in case batch_size is 1 - additional empty batch for remaining 3-2 row
2036+
let mut expected_batch_count = div_ceil(3, batch_size);
2037+
if batch_size == 1 {
2038+
expected_batch_count += 1;
2039+
}
2040+
expected_batch_count
2041+
} else {
2042+
// With hash collisions enabled, all records will match each other
2043+
// and filtered later.
2044+
div_ceil(9, batch_size)
2045+
};
2046+
20382047
assert_eq!(batches.len(), expected_batch_count);
20392048

20402049
let expected = [
@@ -2104,9 +2113,6 @@ mod tests {
21042113
}
21052114

21062115
/// Test where the left has 1 part, the right has 2 parts => 2 parts
2107-
// FIXME(#TODO) test fails with feature `force_hash_collisions`
2108-
// https://github.com/apache/datafusion/issues/11658
2109-
#[cfg(not(feature = "force_hash_collisions"))]
21102116
#[apply(batch_sizes)]
21112117
#[tokio::test]
21122118
async fn join_inner_one_two_parts_right(batch_size: usize) -> Result<()> {
@@ -2143,12 +2149,19 @@ mod tests {
21432149
let stream = join.execute(0, Arc::clone(&task_ctx))?;
21442150
let batches = common::collect(stream).await?;
21452151

2146-
// expected joined records = 1 (first right batch)
2147-
// and additional empty batch for non-joined 20-6-80
2148-
let mut expected_batch_count = div_ceil(1, batch_size);
2149-
if batch_size == 1 {
2150-
expected_batch_count += 1;
2151-
}
2152+
let expected_batch_count = if cfg!(not(feature = "force_hash_collisions")) {
2153+
// Expected number of hash table matches for first right batch = 1
2154+
// and additional empty batch for non-joined 20-6-80
2155+
let mut expected_batch_count = div_ceil(1, batch_size);
2156+
if batch_size == 1 {
2157+
expected_batch_count += 1;
2158+
}
2159+
expected_batch_count
2160+
} else {
2161+
// With hash collisions enabled, all records will match each other
2162+
// and filtered later.
2163+
div_ceil(6, batch_size)
2164+
};
21522165
assert_eq!(batches.len(), expected_batch_count);
21532166

21542167
let expected = [
@@ -2166,8 +2179,14 @@ mod tests {
21662179
let stream = join.execute(1, Arc::clone(&task_ctx))?;
21672180
let batches = common::collect(stream).await?;
21682181

2169-
// expected joined records = 2 (second right batch)
2170-
let expected_batch_count = div_ceil(2, batch_size);
2182+
let expected_batch_count = if cfg!(not(feature = "force_hash_collisions")) {
2183+
// Expected number of hash table matches for second right batch = 2
2184+
div_ceil(2, batch_size)
2185+
} else {
2186+
// With hash collisions enabled, all records will match each other
2187+
// and filtered later.
2188+
div_ceil(3, batch_size)
2189+
};
21712190
assert_eq!(batches.len(), expected_batch_count);
21722191

21732192
let expected = [
@@ -3732,9 +3751,9 @@ mod tests {
37323751
| JoinType::Right
37333752
| JoinType::RightSemi
37343753
| JoinType::RightAnti => {
3735-
(expected_resultset_records + batch_size - 1) / batch_size
3754+
div_ceil(expected_resultset_records, batch_size)
37363755
}
3737-
_ => (expected_resultset_records + batch_size - 1) / batch_size + 1,
3756+
_ => div_ceil(expected_resultset_records, batch_size) + 1,
37383757
};
37393758
assert_eq!(
37403759
batches.len(),

0 commit comments

Comments
 (0)