Skip to content

Commit 6fe00ce

Browse files
andygrove and Dandandan authored
Fix join order for TPCH Q17 & Q18 by improving FilterExec statistics (#8126)
* Assume filters are highly selective if we cannot truly estimate cardinality
* fix regression
* cargo fmt
* simplify code
* Update datafusion/physical-plan/src/filter.rs

  Co-authored-by: Daniël Heres <[email protected]>

* add comment with link to follow on issue
* Use default of 20% selectivity
* trigger CI
* remove files
* trigger CI
* address feedback

---------

Co-authored-by: Daniël Heres <[email protected]>
1 parent e642cc2 commit 6fe00ce

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

datafusion/physical-plan/src/filter.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,11 +194,23 @@ impl ExecutionPlan for FilterExec {
194194
fn statistics(&self) -> Result<Statistics> {
195195
let predicate = self.predicate();
196196

197+
let input_stats = self.input.statistics()?;
197198
let schema = self.schema();
198199
if !check_support(predicate, &schema) {
199-
return Ok(Statistics::new_unknown(&schema));
200+
// assume filter selects 20% of rows if we cannot do anything smarter
201+
// tracking issue for making this configurable:
202+
// https://github.com/apache/arrow-datafusion/issues/8133
203+
let selectivity = 0.2_f32;
204+
let mut stats = input_stats.clone().into_inexact();
205+
if let Precision::Inexact(n) = stats.num_rows {
206+
stats.num_rows = Precision::Inexact((selectivity * n as f32) as usize);
207+
}
208+
if let Precision::Inexact(n) = stats.total_byte_size {
209+
stats.total_byte_size =
210+
Precision::Inexact((selectivity * n as f32) as usize);
211+
}
212+
return Ok(stats);
200213
}
201-
let input_stats = self.input.statistics()?;
202214

203215
let num_rows = input_stats.num_rows;
204216
let total_byte_size = input_stats.total_byte_size;

0 commit comments

Comments
 (0)