From a01e36a7b163d1c87f0ea9a863465771f48a3488 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 30 Mar 2025 15:46:14 -0500 Subject: [PATCH 01/27] Add mechanism for pushing down filters though ExecutionPlan & implement dynamic filter pushdown for TopK operator --- datafusion/common/src/config.rs | 7 + .../src/datasource/physical_plan/parquet.rs | 365 +++++++++++++++-- datafusion/core/tests/fuzz_cases/mod.rs | 2 + .../tests/fuzz_cases/topk_filter_pushdown.rs | 324 +++++++++++++++ datafusion/datasource-parquet/src/opener.rs | 69 +++- datafusion/datasource-parquet/src/source.rs | 54 +-- datafusion/datasource/src/file.rs | 9 +- datafusion/datasource/src/file_scan_config.rs | 16 +- datafusion/datasource/src/source.rs | 23 +- .../physical-expr-common/src/physical_expr.rs | 69 ++++ datafusion/physical-expr/src/lib.rs | 6 +- datafusion/physical-expr/src/utils/mod.rs | 16 + datafusion/physical-optimizer/src/pruning.rs | 5 +- .../physical-plan/src/coalesce_batches.rs | 15 + .../physical-plan/src/dynamic_filters.rs | 320 +++++++++++++++ .../physical-plan/src/execution_plan.rs | 7 + datafusion/physical-plan/src/filter.rs | 20 +- datafusion/physical-plan/src/lib.rs | 3 +- datafusion/physical-plan/src/projection.rs | 14 + .../physical-plan/src/repartition/mod.rs | 14 + datafusion/physical-plan/src/sorts/sort.rs | 73 +++- datafusion/physical-plan/src/topk/mod.rs | 372 +++++++++++++++++- .../proto/src/physical_plan/to_proto.rs | 7 +- .../test_files/information_schema.slt | 2 + docs/source/user-guide/configs.md | 1 + 25 files changed, 1712 insertions(+), 101 deletions(-) create mode 100644 datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs create mode 100644 datafusion/physical-plan/src/dynamic_filters.rs diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b0f17630c910..6c0a329f6bf3 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -590,6 +590,13 @@ config_namespace! { /// during aggregations, if possible pub enable_topk_aggregation: bool, default = true + /// When set to true attempts to push down dynamic filters generated by operators into the file scan phase. + /// For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer + /// will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. + /// This means that if we already have 10 timestamps in the year 2025 + /// any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. + pub enable_dynamic_filter_pushdown: bool, default = true + /// When set to true, the optimizer will insert filters before a join between /// a nullable and non-nullable column to filter out nulls on the nullable side. 
This /// filter can add additional overhead when the file format does not fully support diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 9e1b2822e854..2ad00637e8bf 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -43,11 +43,12 @@ mod tests { }; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; + use arrow::util::pretty::pretty_format_batches; use arrow_schema::SchemaRef; use bytes::{BufMut, BytesMut}; use datafusion_common::config::TableParquetOptions; use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; - use datafusion_common::{assert_contains, Result, ScalarValue}; + use datafusion_common::{assert_batches_eq, assert_contains, Result, ScalarValue}; use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_meta::FileMeta; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -1455,6 +1456,7 @@ mod tests { .await; // should have a pruning predicate + #[expect(deprecated)] let pruning_predicate = rt.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); @@ -1496,6 +1498,7 @@ mod tests { .round_trip(vec![batches.clone()]) .await; + #[expect(deprecated)] let pruning_predicate = rt0.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); @@ -1538,6 +1541,7 @@ mod tests { .await; // should have a pruning predicate + #[expect(deprecated)] let pruning_predicate = rt1.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); let pruning_predicate = rt2.parquet_source.predicate(); @@ -1581,6 +1585,7 @@ mod tests { .await; // Should not contain a pruning predicate (since nothing can be pruned) + #[expect(deprecated)] let pruning_predicate = rt.parquet_source.pruning_predicate(); assert!( pruning_predicate.is_none(), @@ -1616,6 +1621,7 @@ mod tests { .await; // Should have a pruning predicate + #[expect(deprecated)] let pruning_predicate = rt.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); } @@ -1769,13 +1775,13 @@ mod tests { let sql = "select * from base_table where name='test02'"; let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap(); assert_eq!(batch.len(), 1); - insta::assert_snapshot!(batches_to_string(&batch),@r###" - +---------------------+----+--------+ - | struct | id | name | - +---------------------+----+--------+ - | {id: 4, name: aaa2} | 2 | test02 | - +---------------------+----+--------+ - "###); + insta::assert_snapshot!(batches_to_string(&batch),@r" + +--------------------+----+--------+ + | struct | id | name | + +--------------------+----+--------+ + | {id: 3, name: zzz} | 2 | test02 | + +--------------------+----+--------+ + "); Ok(()) } @@ -1798,13 +1804,13 @@ mod tests { let sql = "select * from base_table where name='test02'"; let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap(); assert_eq!(batch.len(), 1); - insta::assert_snapshot!(batches_to_string(&batch),@r###" - +---------------------+----+--------+ - | struct | id | name | - +---------------------+----+--------+ - | {id: 4, name: aaa2} | 2 | test02 | - +---------------------+----+--------+ - "###); + insta::assert_snapshot!(batches_to_string(&batch),@r" + +--------------------+----+--------+ + | struct | id | name | + +--------------------+----+--------+ + | {id: 3, name: zzz} | 2 | test02 | + 
+--------------------+----+--------+ + "); Ok(()) } @@ -1818,14 +1824,14 @@ mod tests { Field::new("id", DataType::Int64, true), Field::new("name", DataType::Utf8, false), ]); - let id_array = Int64Array::from(vec![Some(1), Some(2)]); + let id_array = Int64Array::from(vec![Some(2), Some(1)]); let columns = vec![ Arc::new(Int64Array::from(vec![3, 4])) as _, - Arc::new(StringArray::from(vec!["aaa1", "aaa2"])) as _, + Arc::new(StringArray::from(vec!["zzz", "aaa"])) as _, ]; let struct_array = StructArray::new(struct_fields, columns, None); - let name_array = StringArray::from(vec![Some("test01"), Some("test02")]); + let name_array = StringArray::from(vec![Some("test02"), Some("test01")]); let schema = Arc::new(schema); let batch = RecordBatch::try_new( @@ -1837,12 +1843,53 @@ mod tests { ], ) .unwrap(); - let file = File::create(file).unwrap(); - let w_opt = WriterProperties::builder().build(); - let mut writer = ArrowWriter::try_new(file, schema, Some(w_opt)).unwrap(); - writer.write(&batch).unwrap(); - writer.flush().unwrap(); - writer.close().unwrap(); + write_record_batch(file, batch).unwrap(); + } + + fn write_file_with_non_null_ids(file: &String, value: i64) { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![Some(value)]); + let name_array = StringArray::from(vec![Some("test")]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(id_array), Arc::new(name_array)], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); + } + + fn write_file_with_null_ids(file: &String) { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![None]); + let name_array = StringArray::from(vec![Some(format!("test{:02}", "null"))]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(id_array), Arc::new(name_array)], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); + } + + fn write_record_batch(file: &String, batch: RecordBatch) -> Result<()> { + let file = File::create(file)?; + let w_opt = WriterProperties::builder() + .set_max_row_group_size(1) + .build(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(w_opt))?; + writer.write(&batch)?; + writer.flush()?; + writer.close()?; + Ok(()) } /// Write out a batch to a parquet file and return the total size of the file @@ -1904,6 +1951,49 @@ mod tests { } } + struct DynamicFilterTestCase { + query: String, + path: String, + } + + impl DynamicFilterTestCase { + fn new(query: String, path: String) -> Self { + Self { query, path } + } + + async fn _run_query(&self, query: &str) -> Vec { + // Force 1 partition and 1 rg per partition because if we widen the plan + // and read all batches at once we won't get any dynamic pushdown. 
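+        // With `max_row_group_size = 1` every row group holds a single row, so the
+        // `row_groups_pruned_statistics` counters asserted in these tests effectively
+        // count individual rows that the dynamic TopK filter let the scan skip.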
+ let mut cfg = SessionConfig::new(); + cfg = cfg.set_u64("datafusion.execution.parquet.max_row_group_size", 1); + let ctx = SessionContext::new_with_config(cfg); + + let mut pq_options = TableParquetOptions::default(); + pq_options.global.max_row_group_size = 1; + pq_options.global.pushdown_filters = true; + let fmt = ParquetFormat::default().with_options(pq_options); + let opt = ListingOptions::new(Arc::new(fmt)).with_target_partitions(1); + ctx.register_listing_table("base_table", &self.path, opt, None, None) + .await + .unwrap(); + + ctx.sql(query).await.unwrap().collect().await.unwrap() + } + + async fn results(&self) -> Vec { + self._run_query(&self.query).await + } + + async fn explain_plan(&self) -> String { + let query = format!("EXPLAIN ANALYZE {}", self.query); + let batches = self._run_query(&query).await; + + pretty_format_batches(&batches) + .map(|s| format!("{}", s)) + .unwrap_or_else(|_| "No explain plan generated".to_string()) + } + } + /// Test passing `metadata_size_hint` to either a single file or the whole exec #[tokio::test] async fn test_metadata_size_hint() { @@ -1976,4 +2066,231 @@ mod tests { assert_eq!(calls.len(), 2); assert_eq!(calls, vec![Some(123), Some(456)]); } + + #[tokio::test] + async fn test_topk_predicate_pushdown() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + // write 2 files so that one is processed before the other + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + + let query = "select name from base_table order by id desc limit 3"; + + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=3"); + } + + #[tokio::test] + async fn test_topk_predicate_pushdown_nulls_first() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + // write multiple files to ensure we get pushdown of dynamic filters from one file to another + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + + let name = format!("test{:02}.parquet", 100); + write_file_with_null_ids(&format!("{path}/{name}")); + + // nulls first by default + let query = "select name from base_table order by id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----------+", + "| name |", + "+----------+", + "| testnull |", + "| test02 |", + "| test02 |", + "+----------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=3"); + } + + #[tokio::test] + async fn test_topk_predicate_pushdown_multi_key() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + for file in 0..5 { + // write multiple files to ensure we get pushdown of dynamic filters from one file to another + // Ensure files are read in order + let name = format!("test{:02}.parquet", file); + write_file_with_non_null_ids(&format!("{path}/{name}"), file); + } + + let query = "select id from 
base_table order by name desc, id limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path.clone()); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 0 |", + "| 1 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=1"); + + let query1 = "select id from base_table order by name desc, id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query1.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 4 |", + "| 3 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=0"); + } + + #[tokio::test] + async fn test_topk_predicate_pushdown_nulls_last() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + let name = format!("test{:02}.parquet", 100); + write_file_with_null_ids(&format!("{path}/{name}")); + + let query = "select name from base_table order by id desc nulls last limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=4"); + } + + #[tokio::test] + async fn test_topk_predicate_pushdown_single_file() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + write_file(&format!("{path}/test.parquet")); + + let query = "select name from base_table order by id desc nulls last limit 1"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "pushdown_rows_pruned=1"); + } + + #[tokio::test] + async fn test_topk_predicate_pushdown_ignores_partition_columns() { + // The TopK operator will try to push down predicates on `file_id`. + // But since `file_id` is a partition column and not part of the file itself + // we cannot actually do any filtering on it at the file level. + // Thus it has to be ignored by `ParquetSource`. + // This test only shows that this does not result in any errors or panics, + // it is expected that "nothing exciting" happens here. + // I do think in the future it would be interesting to re-design how partition columns + // get handled, in particular by pushing them into SchemaAdapter so that the table schema == file schema + // and we can do predicate pushdown on them as well without relying on each TableProvider to + // do special handling of partition columns. 
+ + let ctx = SessionContext::new(); + let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) + .with_table_partition_cols(vec![("file_id".to_string(), DataType::UInt32)]) + // We need to force 1 partition because TopK predicate pushdown happens on a per-partition basis + // If we had 1 file per partition (as an example) no pushdown would happen + .with_target_partitions(1); + + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + for file in 0..5 { + // crete a directory for the partition + fs::create_dir_all(format!("{path}/file_id={file}")).unwrap(); + let name = format!("file_id={file}/test.parquet"); + write_file(&format!("{path}/{name}")); + } + ctx.register_listing_table("base_table", path, opt, None, None) + .await + .unwrap(); + + let query = "select file_id from base_table order by file_id asc limit 3"; + + let batches = ctx.sql(query).await.unwrap().collect().await.unwrap(); + #[rustfmt::skip] + let expected = [ + "+---------+", + "| file_id |", + "+---------+", + "| 0 |", + "| 0 |", + "| 1 |", + "+---------+", + ]; + assert_batches_eq!(expected, &batches); + + let sql = format!("explain analyze {query}"); + let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); + let explain_plan = format!("{}", pretty_format_batches(&batches).unwrap()); + assert_contains!(explain_plan, "row_groups_pruned_statistics=0"); // just documenting current behavior + } } diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs index d5511e2970f4..11bf29431e90 100644 --- a/datafusion/core/tests/fuzz_cases/mod.rs +++ b/datafusion/core/tests/fuzz_cases/mod.rs @@ -29,3 +29,5 @@ mod pruning; mod limit_fuzz; mod sort_preserving_repartition_fuzz; mod window_fuzz; + +mod topk_filter_pushdown; diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs new file mode 100644 index 000000000000..aafb38a5d542 --- /dev/null +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -0,0 +1,324 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
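+
+//! Fuzz test for the dynamic TopK filter pushdown introduced in this patch.
+//!
+//! Every generated `ORDER BY ... LIMIT` query is executed twice over the same
+//! in-memory Parquet data: once with
+//! `datafusion.optimizer.enable_dynamic_filter_pushdown` enabled and once with it
+//! disabled. The two result sets are then compared, so the dynamic filter is only
+//! allowed to prune work, never to change query results.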
+ +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use arrow::array::{Int32Array, StringArray, StringDictionaryBuilder}; +use arrow::datatypes::Int32Type; +use arrow::record_batch::RecordBatch; +use arrow_schema::{DataType, Field, Schema}; +use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig}; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_datasource::ListingTableUrl; +use datafusion_datasource_parquet::ParquetFormat; +use datafusion_execution::object_store::ObjectStoreUrl; +use itertools::Itertools; +use object_store::memory::InMemory; +use object_store::path::Path; +use object_store::{ObjectStore, PutPayload}; +use parquet::arrow::ArrowWriter; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tokio::sync::Mutex; +use tokio::task::JoinSet; + +#[derive(Clone)] +struct TestDataSet { + store: Arc, + schema: Arc, +} + +/// List of in memory parquet files with UTF8 data +// Use a mutex rather than LazyLock to allow for async initialization +static TESTFILES: LazyLock>> = + LazyLock::new(|| Mutex::new(vec![])); + +async fn test_files() -> Vec { + let files_mutex = &TESTFILES; + let mut files = files_mutex.lock().await; + if !files.is_empty() { + return (*files).clone(); + } + + let mut rng = StdRng::seed_from_u64(0); + + for nulls_in_ids in [false, true] { + for nulls_in_names in [false, true] { + for nulls_in_departments in [false, true] { + let store = Arc::new(InMemory::new()); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, nulls_in_ids), + Field::new("name", DataType::Utf8, nulls_in_names), + Field::new( + "department", + DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ), + nulls_in_departments, + ), + ])); + + let name_choices = if nulls_in_names { + [Some("Alice"), Some("Bob"), None, Some("David"), None] + } else { + [ + Some("Alice"), + Some("Bob"), + Some("Charlie"), + Some("David"), + Some("Eve"), + ] + }; + + let department_choices = if nulls_in_departments { + [ + Some("Theater"), + Some("Engineering"), + None, + Some("Arts"), + None, + ] + } else { + [ + Some("Theater"), + Some("Engineering"), + Some("Healthcare"), + Some("Arts"), + Some("Music"), + ] + }; + + // Generate 5 files, some with overlapping or repeated ids some without + for i in 0..5 { + let num_batches = rng.gen_range(1..3); + let mut batches = Vec::with_capacity(num_batches); + for _ in 0..num_batches { + let num_rows = 25; + let ids = Int32Array::from_iter((0..num_rows).map(|file| { + if nulls_in_ids { + if rng.gen_bool(1.0 / 10.0) { + None + } else { + Some(rng.gen_range(file..file + 5)) + } + } else { + Some(rng.gen_range(file..file + 5)) + } + })); + let names = StringArray::from_iter((0..num_rows).map(|_| { + // randomly select a name + let idx = rng.gen_range(0..name_choices.len()); + name_choices[idx].map(|s| s.to_string()) + })); + let mut departments = StringDictionaryBuilder::::new(); + for _ in 0..num_rows { + // randomly select a department + let idx = rng.gen_range(0..department_choices.len()); + departments.append_option(department_choices[idx].as_ref()); + } + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(ids), + Arc::new(names), + Arc::new(departments.finish()), + ], + ) + .unwrap(); + batches.push(batch); + } + let mut buf = vec![]; + { + let mut writer = + ArrowWriter::try_new(&mut buf, schema.clone(), None).unwrap(); + for batch in batches { + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + } + 
writer.flush().unwrap(); + writer.finish().unwrap(); + } + let payload = PutPayload::from(buf); + let path = Path::from(format!("file_{i}.parquet")); + store.put(&path, payload).await.unwrap(); + } + files.push(TestDataSet { store, schema }); + } + } + } + (*files).clone() +} + +async fn run_query_with_config( + query: &str, + config: SessionConfig, + dataset: TestDataSet, +) -> Vec { + let store = dataset.store; + let schema = dataset.schema; + let ctx = SessionContext::new_with_config(config); + let url = ObjectStoreUrl::parse("memory://").unwrap(); + ctx.register_object_store(url.as_ref(), store.clone()); + + let format = Arc::new( + ParquetFormat::default() + .with_options(ctx.state().table_options().parquet.clone()), + ); + let options = ListingOptions::new(format); + let table_path = ListingTableUrl::parse("memory:///").unwrap(); + let config = ListingTableConfig::new(table_path) + .with_listing_options(options) + .with_schema(schema); + let table = Arc::new(ListingTable::try_new(config).unwrap()); + + ctx.register_table("test_table", table).unwrap(); + + ctx.sql(query).await.unwrap().collect().await.unwrap() +} + +#[derive(Debug)] +struct RunQueryResult { + query: String, + result: Vec, + expected: Vec, +} + +async fn run_query( + query: String, + cfg: SessionConfig, + dataset: TestDataSet, +) -> RunQueryResult { + let cfg_with_dynamic_filters = cfg + .clone() + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true); + let cfg_without_dynamic_filters = cfg + .clone() + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", false); + + let expected_result = + run_query_with_config(&query, cfg_without_dynamic_filters, dataset.clone()).await; + let result = + run_query_with_config(&query, cfg_with_dynamic_filters, dataset.clone()).await; + + RunQueryResult { + query: query.to_string(), + result, + expected: expected_result, + } +} + +struct TestCase { + query: String, + cfg: SessionConfig, + dataset: TestDataSet, +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_fuzz_topk_filter_pushdown() { + let order_columns = ["id", "name", "department"]; + let order_directions = ["ASC", "DESC"]; + let null_orders = ["NULLS FIRST", "NULLS LAST"]; + + let start = datafusion_common::instant::Instant::now(); + let mut orders: HashMap> = HashMap::new(); + for order_column in &order_columns { + for order_direction in &order_directions { + for null_order in &null_orders { + // if there is a vec for this column insert the order, otherwise create a new vec + let ordering = + format!("{} {} {}", order_column, order_direction, null_order); + match orders.get_mut(*order_column) { + Some(order_vec) => { + order_vec.push(ordering); + } + None => { + orders.insert(order_column.to_string(), vec![ordering]); + } + } + } + } + } + + let mut queries = vec![]; + + for limit in [1, 10] { + for num_order_by_columns in [1, 2, 3] { + for order_columns in ["id", "name", "department"] + .iter() + .combinations(num_order_by_columns) + { + for orderings in order_columns + .iter() + .map(|col| orders.get(**col).unwrap()) + .multi_cartesian_product() + { + let query = format!( + "SELECT * FROM test_table ORDER BY {} LIMIT {}", + orderings.into_iter().join(", "), + limit + ); + queries.push(query); + } + } + } + } + + queries.sort_unstable(); + println!( + "Generated {} queries in {:?}", + queries.len(), + start.elapsed() + ); + + let start = datafusion_common::instant::Instant::now(); + let datasets = test_files().await; + println!("Generated test files in {:?}", start.elapsed()); + + let 
mut test_cases = vec![]; + for enable_filter_pushdown in [true, false] { + for query in &queries { + for dataset in &datasets { + let mut cfg = SessionConfig::new(); + cfg = cfg.set_bool( + "datafusion.optimizer.enable_dynamic_filter_pushdown", + enable_filter_pushdown, + ); + test_cases.push(TestCase { + query: query.to_string(), + cfg, + dataset: dataset.clone(), + }); + } + } + } + + let start = datafusion_common::instant::Instant::now(); + let mut join_set = JoinSet::new(); + for tc in test_cases { + join_set.spawn(run_query(tc.query, tc.cfg, tc.dataset)); + } + let mut results = join_set.join_all().await; + results.sort_unstable_by(|a, b| a.query.cmp(&b.query)); + println!("Ran {} test cases in {:?}", results.len(), start.elapsed()); + + for result in results { + assert_eq!(result.result, result.expected, "Query: {}", result.query); + } +} diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 732fef47d5a7..4752aaadee1d 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -34,7 +34,7 @@ use arrow::error::ArrowError; use datafusion_common::{exec_err, Result}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder}; use futures::{StreamExt, TryStreamExt}; use log::debug; @@ -54,10 +54,6 @@ pub(super) struct ParquetOpener { pub limit: Option, /// Optional predicate to apply during the scan pub predicate: Option>, - /// Optional pruning predicate applied to row group statistics - pub pruning_predicate: Option>, - /// Optional pruning predicate applied to data page statistics - pub page_pruning_predicate: Option>, /// Schema of the output table pub table_schema: SchemaRef, /// Optional hint for how large the initial request to read parquet metadata @@ -109,18 +105,32 @@ impl FileOpener for ParquetOpener { .schema_adapter_factory .create(projected_schema, Arc::clone(&self.table_schema)); let predicate = self.predicate.clone(); - let pruning_predicate = self.pruning_predicate.clone(); - let page_pruning_predicate = self.page_pruning_predicate.clone(); let table_schema = Arc::clone(&self.table_schema); let reorder_predicates = self.reorder_filters; let pushdown_filters = self.pushdown_filters; - let enable_page_index = should_enable_page_index( - self.enable_page_index, - &self.page_pruning_predicate, - ); let enable_bloom_filter = self.enable_bloom_filter; let limit = self.limit; + let predicate_creation_errors = MetricBuilder::new(&self.metrics) + .global_counter("num_predicate_creation_errors"); + + let (pruning_predicate, page_pruning_predicate) = + if let Some(predicate) = &predicate { + let pruning_predicate = build_pruning_predicate( + Arc::clone(predicate), + &table_schema, + &predicate_creation_errors, + ); + let page_pruning_predicate = + build_page_pruning_predicate(predicate, &table_schema); + (pruning_predicate, Some(page_pruning_predicate)) + } else { + (None, None) + }; + + let enable_page_index = + should_enable_page_index(self.enable_page_index, &page_pruning_predicate); + Ok(Box::pin(async move { let options = ArrowReaderOptions::new().with_page_index(enable_page_index); @@ -295,3 +305,40 @@ fn create_initial_plan( // default to scanning all row groups Ok(ParquetAccessPlan::new_all(row_group_count)) } + +/// Build a pruning predicate from an 
optional predicate expression. +/// If the predicate is None or the predicate cannot be converted to a pruning +/// predicate, return None. +/// If there is an error creating the pruning predicate it is recorded by incrementing +/// the `predicate_creation_errors` counter. +pub(crate) fn build_pruning_predicate( + predicate: Arc, + file_schema: &SchemaRef, + predicate_creation_errors: &Count, +) -> Option> { + match PruningPredicate::try_new(predicate, Arc::clone(file_schema)) { + Ok(pruning_predicate) => { + if !pruning_predicate.always_true() { + return Some(Arc::new(pruning_predicate)); + } + } + Err(e) => { + debug!("Could not create pruning predicate for: {e}"); + predicate_creation_errors.add(1); + } + } + None +} + +/// Build a page pruning predicate from an optional predicate expression. +/// If the predicate is None or the predicate cannot be converted to a page pruning +/// predicate, return None. +pub(crate) fn build_page_pruning_predicate( + predicate: &Arc, + file_schema: &SchemaRef, +) -> Arc { + Arc::new(PagePruningAccessPlanFilter::new( + predicate, + Arc::clone(file_schema), + )) +} diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 66d4d313d5a6..8331c0074e44 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -20,6 +20,8 @@ use std::any::Any; use std::fmt::Formatter; use std::sync::Arc; +use crate::opener::build_page_pruning_predicate; +use crate::opener::build_pruning_predicate; use crate::opener::ParquetOpener; use crate::page_filter::PagePruningAccessPlanFilter; use crate::DefaultParquetFileReaderFactory; @@ -34,6 +36,7 @@ use datafusion_common::config::TableParquetOptions; use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_physical_expr::conjunction; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; @@ -41,7 +44,6 @@ use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; use datafusion_physical_plan::DisplayFormatType; use itertools::Itertools; -use log::debug; use object_store::ObjectStore; /// Execution plan for reading one or more Parquet files. 
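Moving the pruning-predicate construction into `FileOpener::open` (above) means these structures are rebuilt for every file that is opened, so a predicate containing a `DynamicFilterPhysicalExpr` is re-resolved against the TopK operator's latest threshold instead of being fixed at plan time. A rough sketch of that resolution step, assuming a `dynamic_predicate` binding and a `resolve_for_file` helper that are illustrative rather than part of the patch:

```rust
use std::sync::Arc;

use datafusion_common::Result;
use datafusion_physical_expr::{snasphot_physical_expr, PhysicalExpr};

// `dynamic_predicate` is assumed to be the expression installed by
// `ParquetSource::push_down_filter`, e.g. a DynamicFilterPhysicalExpr fed by TopK.
fn resolve_for_file(
    dynamic_predicate: Arc<dyn PhysicalExpr>,
) -> Result<Arc<dyn PhysicalExpr>> {
    // Taking the snapshot at file-open time yields the filter's current concrete
    // form (e.g. `id > 42` once the TopK heap is full). `PruningPredicate::try_new`
    // performs the same snapshot internally (see the pruning.rs hunk further down).
    snasphot_physical_expr(dynamic_predicate)
}
```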
@@ -316,24 +318,10 @@ impl ParquetSource { conf = conf.with_metrics(metrics); conf.predicate = Some(Arc::clone(&predicate)); - match PruningPredicate::try_new(Arc::clone(&predicate), Arc::clone(&file_schema)) - { - Ok(pruning_predicate) => { - if !pruning_predicate.always_true() { - conf.pruning_predicate = Some(Arc::new(pruning_predicate)); - } - } - Err(e) => { - debug!("Could not create pruning predicate for: {e}"); - predicate_creation_errors.add(1); - } - }; - - let page_pruning_predicate = Arc::new(PagePruningAccessPlanFilter::new( - &predicate, - Arc::clone(&file_schema), - )); - conf.page_pruning_predicate = Some(page_pruning_predicate); + conf.page_pruning_predicate = + Some(build_page_pruning_predicate(&predicate, &file_schema)); + conf.pruning_predicate = + build_pruning_predicate(predicate, &file_schema, &predicate_creation_errors); conf } @@ -349,11 +337,13 @@ impl ParquetSource { } /// Optional reference to this parquet scan's pruning predicate + #[deprecated(note = "ParquetDataSource no longer constructs a PruningPredicate.")] pub fn pruning_predicate(&self) -> Option<&Arc> { self.pruning_predicate.as_ref() } /// Optional reference to this parquet scan's page pruning predicate + #[deprecated(note = "ParquetDataSource no longer constructs a PruningPredicate.")] pub fn page_pruning_predicate(&self) -> Option<&Arc> { self.page_pruning_predicate.as_ref() } @@ -488,8 +478,6 @@ impl FileSource for ParquetSource { .expect("Batch size must set before creating ParquetOpener"), limit: base_config.limit, predicate: self.predicate.clone(), - pruning_predicate: self.pruning_predicate.clone(), - page_pruning_predicate: self.page_pruning_predicate.clone(), table_schema: Arc::clone(&base_config.file_schema), metadata_size_hint: self.metadata_size_hint, metrics: self.metrics().clone(), @@ -537,11 +525,10 @@ impl FileSource for ParquetSource { .expect("projected_statistics must be set"); // When filters are pushed down, we have no way of knowing the exact statistics. // Note that pruning predicate is also a kind of filter pushdown. - // (bloom filters use `pruning_predicate` too) - if self.pruning_predicate().is_some() - || self.page_pruning_predicate().is_some() - || (self.predicate().is_some() && self.pushdown_filters()) - { + // (bloom filters use `pruning_predicate` too). + // Because filter pushdown may happen dynamically as long as there is a predicate + // if we have *any* predicate applied, we can't guarantee the statistics are exact. 
+ if self.predicate().is_some() { Ok(statistics.to_inexact()) } else { Ok(statistics) @@ -559,6 +546,7 @@ impl FileSource for ParquetSource { .predicate() .map(|p| format!(", predicate={p}")) .unwrap_or_default(); + #[expect(deprecated)] let pruning_predicate_string = self .pruning_predicate() .map(|pre| { @@ -586,4 +574,18 @@ impl FileSource for ParquetSource { } } } + + fn push_down_filter( + &self, + expr: Arc, + ) -> datafusion_common::Result>> { + let mut conf = self.clone(); + conf.predicate = match self.predicate.as_ref() { + Some(existing_predicate) => { + Some(conjunction([Arc::clone(existing_predicate), expr])) + } + None => Some(expr), + }; + Ok(Some(Arc::new(conf))) + } } diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 0066f39801a1..b0b9fe4ce905 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -27,7 +27,7 @@ use crate::file_scan_config::FileScanConfig; use crate::file_stream::FileOpener; use arrow::datatypes::SchemaRef; use datafusion_common::Statistics; -use datafusion_physical_expr::LexOrdering; +use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; @@ -93,4 +93,11 @@ pub trait FileSource: Send + Sync { } Ok(None) } + + fn push_down_filter( + &self, + _expr: Arc, + ) -> datafusion_common::Result>> { + Ok(None) + } } diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 5172dafb1f91..aab89c46bec0 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -37,7 +37,7 @@ use datafusion_execution::{ object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, }; use datafusion_physical_expr::{ - expressions::Column, EquivalenceProperties, LexOrdering, Partitioning, + expressions::Column, EquivalenceProperties, LexOrdering, Partitioning, PhysicalExpr, PhysicalSortExpr, }; use datafusion_physical_plan::{ @@ -584,6 +584,20 @@ impl DataSource for FileScanConfig { ) as _ })) } + + fn push_down_filter( + &self, + expr: Arc, + ) -> Result>> { + // Try to push down to the file source + if let Some(file_source) = self.file_source.push_down_filter(expr)? { + return Ok(Some(Arc::new(Self { + file_source, + ..self.clone() + }))); + } + Ok(None) + } } impl FileScanConfig { diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 6c9122ce1ac1..a3bbc153c535 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -33,7 +33,7 @@ use crate::file_scan_config::FileScanConfig; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; +use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; /// Common behaviors in Data Sources for both from Files and Memory. 
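Taken together, the hunks above and the `DataSourceExec` changes in the next file wire the new `push_down_filter` hook through the datasource layer: the execution plan delegates to its `DataSource`, `FileScanConfig` forwards to its `FileSource`, and `ParquetSource` accepts the expression by conjoining it with any predicate it already carries. A minimal standalone sketch of that merge rule, where the `merge_predicate` name is illustrative and not taken from the patch:

```rust
use std::sync::Arc;

use datafusion_physical_expr::{conjunction, PhysicalExpr};

// A pushed-down expression is AND-ed onto whatever predicate is already
// configured; if there is none, it simply becomes the predicate.
fn merge_predicate(
    existing: Option<Arc<dyn PhysicalExpr>>,
    pushed: Arc<dyn PhysicalExpr>,
) -> Arc<dyn PhysicalExpr> {
    match existing {
        Some(existing) => conjunction([existing, pushed]),
        None => pushed,
    }
}
```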
@@ -79,6 +79,13 @@ pub trait DataSource: Send + Sync + Debug { &self, _projection: &ProjectionExec, ) -> datafusion_common::Result>>; + + fn push_down_filter( + &self, + _expr: Arc, + ) -> datafusion_common::Result>> { + Ok(None) + } } /// [`ExecutionPlan`] handles different file formats like JSON, CSV, AVRO, ARROW, PARQUET @@ -192,6 +199,20 @@ impl ExecutionPlan for DataSourceExec { ) -> datafusion_common::Result>> { self.data_source.try_swapping_with_projection(projection) } + + fn push_down_filter( + &self, + expr: Arc, + ) -> datafusion_common::Result>> { + // Try to push down to the data source + if let Some(data_source) = self.data_source.push_down_filter(expr)? { + return Ok(Some(Arc::new(Self { + data_source, + ..self.clone() + }))); + } + Ok(None) + } } impl DataSourceExec { diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 43f214607f9f..22f671d349e2 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -27,6 +27,7 @@ use arrow::array::BooleanArray; use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue}; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; @@ -283,6 +284,47 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { /// See the [`fmt_sql`] function for an example of printing `PhysicalExpr`s as SQL. /// fn fmt_sql(&self, f: &mut Formatter<'_>) -> fmt::Result; + + /// Take a snapshot of this `PhysicalExpr` if it is dynamic. + /// This is used to capture the current state of `PhysicalExpr`s that may contain + /// dynamic references to other operators in order to serialize it over the wire + /// or treat it via downcast matching. + /// + /// You should not call this method directly as it does not handle recursion. + /// Instead use `shapshot_physical_expr` to handle recursion and capture the + /// full state of the `PhysicalExpr`. + /// + /// This is expected to return "simple" expressions that do not have mutable state + /// and are composed of DataFusion's built-in `PhysicalExpr` implementations. + /// Callers however should *not* assume anything about the returned expressions + /// since callers and implementers may not agree on what "simple" or "built-in" + /// means. + /// In other words, if you need to searlize a `PhysicalExpr` across the wire + /// you should call this method and then try to serialize the result, + /// but you should handle unknown or unexpected `PhysicalExpr` implementations gracefully + /// just as if you had not called this method at all. + /// + /// In particular, consider: + /// * A `PhysicalExpr` that references the current state of a `datafusion::physical_plan::TopK` + /// that is involved in a query with `SELECT * FROM t1 ORDER BY a LIMIT 10`. + /// This function may return something like `a >= 12`. + /// * A `PhysicalExpr` that references the current state of a `datafusion::physical_plan::joins::HashJoinExec` + /// from a query such as `SELECT * FROM t1 JOIN t2 ON t1.a = t2.b`. + /// This function may return something like `t2.b IN (1, 5, 7)`. 
+ /// + /// A system or function that can only deal with a hardcoded set of `PhysicalExpr` implementations + /// or needs to serialize this state to bytes may not be able to handle these dynamic references. + /// In such cases, we should return a simplified version of the `PhysicalExpr` that does not + /// contain these dynamic references. + /// + /// Note for implementers: this method should *not* handle recursion. + /// Recursion is handled in `shapshot_physical_expr`. + fn snapshot(&self) -> Result>> { + // By default, we return None to indicate that this PhysicalExpr does not + // have any dynamic references or state. + // This is a safe default behavior. + Ok(None) + } } /// [`PhysicalExpr`] can't be constrained by [`Eq`] directly because it must remain object @@ -446,3 +488,30 @@ pub fn fmt_sql(expr: &dyn PhysicalExpr) -> impl Display + '_ { Wrapper { expr } } + +/// Take a snapshot of the given `PhysicalExpr` if it is dynamic. +/// +/// Take a snapshot of this `PhysicalExpr` if it is dynamic. +/// This is used to capture the current state of `PhysicalExpr`s that may contain +/// dynamic references to other operators in order to serialize it over the wire +/// or treat it via downcast matching. +/// +/// See the documentation of [`PhysicalExpr::snapshot`] for more details. +/// +/// # Returns +/// +/// Returns an `Option>` which is the snapshot of the +/// `PhysicalExpr` if it is dynamic. If the `PhysicalExpr` does not have +/// any dynamic references or state, it returns `None`. +pub fn snasphot_physical_expr( + expr: Arc, +) -> Result> { + expr.transform_up(|e| { + if let Some(snapshot) = e.snapshot()? { + Ok(Transformed::yes(snapshot)) + } else { + Ok(Transformed::no(Arc::clone(&e))) + } + }) + .data() +} diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 93ced2eb628d..551ee97a8783 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -59,7 +59,9 @@ pub use physical_expr::{ PhysicalExprRef, }; -pub use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +pub use datafusion_physical_expr_common::physical_expr::{ + snasphot_physical_expr, PhysicalExpr, +}; pub use datafusion_physical_expr_common::sort_expr::{ LexOrdering, LexRequirement, PhysicalSortExpr, PhysicalSortRequirement, }; @@ -68,7 +70,7 @@ pub use planner::{create_physical_expr, create_physical_exprs}; pub use scalar_function::ScalarFunctionExpr; pub use datafusion_physical_expr_common::utils::reverse_order_bys; -pub use utils::split_conjunction; +pub use utils::{conjunction, split_conjunction}; // For backwards compatibility pub mod tree_node { diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs index 7e4c7f0e10ba..21496c5edef5 100644 --- a/datafusion/physical-expr/src/utils/mod.rs +++ b/datafusion/physical-expr/src/utils/mod.rs @@ -47,6 +47,22 @@ pub fn split_conjunction( split_impl(Operator::And, predicate, vec![]) } +/// Create a conjunction of the given predicates. +/// If the input is empty, return a literal true. +/// If the input contains a single predicate, return the predicate. +/// Otherwise, return a conjunction of the predicates (e.g. `a AND b AND c`). 
+pub fn conjunction( + predicates: impl IntoIterator>, +) -> Arc { + predicates + .into_iter() + .fold(None, |acc, predicate| match acc { + None => Some(predicate), + Some(acc) => Some(Arc::new(BinaryExpr::new(acc, Operator::And, predicate))), + }) + .unwrap_or_else(|| crate::expressions::lit(true)) +} + /// Assume the predicate is in the form of DNF, split the predicate to a Vec of PhysicalExprs. /// /// For example, split "a1 = a2 OR b1 <= b2 OR c1 != c2" into ["a1 = a2", "b1 <= b2", "c1 != c2"] diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index b5287f3d33f3..8084772b90de 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -40,7 +40,9 @@ use datafusion_common::{ use datafusion_common::{Column, DFSchema}; use datafusion_expr_common::operator::Operator; use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarantee}; -use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef}; +use datafusion_physical_expr::{ + expressions as phys_expr, snasphot_physical_expr, PhysicalExprRef, +}; use datafusion_physical_plan::{ColumnarValue, PhysicalExpr}; /// A source of runtime statistical information to [`PruningPredicate`]s. @@ -527,6 +529,7 @@ impl PruningPredicate { /// See the struct level documentation on [`PruningPredicate`] for more /// details. pub fn try_new(expr: Arc, schema: SchemaRef) -> Result { + let expr = snasphot_physical_expr(expr)?; let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _; // build predicate expression once diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 5244038b9ae2..f2e91dbbbe4e 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -32,6 +32,7 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_common::Result; use datafusion_execution::TaskContext; +use datafusion_physical_expr::PhysicalExpr; use crate::coalesce::{BatchCoalescer, CoalescerState}; use crate::execution_plan::CardinalityEffect; @@ -212,6 +213,20 @@ impl ExecutionPlan for CoalesceBatchesExec { fn cardinality_effect(&self) -> CardinalityEffect { CardinalityEffect::Equal } + + fn push_down_filter( + &self, + expr: Arc, + ) -> Result>> { + // Try to push down to the input + if let Some(input) = self.input.push_down_filter(expr)? { + return Ok(Some(Arc::new(Self { + input, + ..self.clone() + }))); + } + Ok(None) + } } /// Stream for [`CoalesceBatchesExec`]. See [`CoalesceBatchesExec`] for more details. diff --git a/datafusion/physical-plan/src/dynamic_filters.rs b/datafusion/physical-plan/src/dynamic_filters.rs new file mode 100644 index 000000000000..4bfad498788e --- /dev/null +++ b/datafusion/physical-plan/src/dynamic_filters.rs @@ -0,0 +1,320 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + any::Any, + hash::Hash, + sync::{Arc, RwLock}, +}; + +use datafusion_common::{ + tree_node::{Transformed, TransformedResult, TreeNode}, + Result, +}; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr::{expressions::lit, utils::conjunction, PhysicalExpr}; + +/// A source of dynamic runtime filters. +/// +/// During query execution, operators implementing this trait can provide +/// filter expressions that other operators can use to dynamically prune data. +/// +/// See `TopKDynamicFilterSource` in datafusion/physical-plan/src/topk/mod.rs for examples. +pub trait DynamicFilterSource: Send + Sync + std::fmt::Debug + 'static { + /// Take a snapshot of the current state of filtering, returning a non-dynamic PhysicalExpr. + /// This is used to e.g. serialize dynamic filters across the wire or to pass them into systems + /// that won't use the `PhysicalExpr` API (e.g. matching on the concrete types of the expressions like `PruningPredicate` does). + /// For example, it is expected that this returns a relatively simple expression such as `col1 > 5` for a TopK operator or + /// `col2 IN (1, 2, ... N)` for a HashJoin operator. + fn snapshot_current_filters(&self) -> Result>>; +} + +#[derive(Debug)] +pub struct DynamicFilterPhysicalExpr { + /// The children of this expression. + /// In particular, it is important that if the dynamic expression will reference any columns + /// those columns be marked as children of this expression so that the expression can be properly + /// bound to the schema. + children: Vec>, + /// Remapped children, if `PhysicalExpr::with_new_children` was called. + /// This is used to ensure that the children of the expression are always the same + /// as the children of the dynamic filter source. + remapped_children: Option>>, + /// The source of dynamic filters. + inner: Arc, + /// For testing purposes track the data type and nullability to make sure they don't change. + /// If they do, there's a bug in the implementation. + /// But this can have overhead in production, so it's only included in tests. 
+ data_type: Arc>>, + nullable: Arc>>, +} + +impl std::fmt::Display for DynamicFilterPhysicalExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DynamicFilterPhysicalExpr") + } +} + +// Manually derive PartialEq and Hash to work around https://github.com/rust-lang/rust/issues/78808 +impl PartialEq for DynamicFilterPhysicalExpr { + fn eq(&self, other: &Self) -> bool { + self.current().eq(&other.current()) + } +} + +impl Eq for DynamicFilterPhysicalExpr {} + +impl Hash for DynamicFilterPhysicalExpr { + fn hash(&self, state: &mut H) { + self.current().hash(state) + } +} + +impl DynamicFilterPhysicalExpr { + pub fn new( + children: Vec>, + inner: Arc, + ) -> Self { + Self { + children, + remapped_children: None, + inner, + data_type: Arc::new(RwLock::new(None)), + nullable: Arc::new(RwLock::new(None)), + } + } + + fn current(&self) -> Arc { + let current = if let Ok(current) = self.inner.snapshot_current_filters() { + conjunction(current) + } else { + lit(false) + }; + if let Some(remapped_children) = &self.remapped_children { + // Remap children to the current children + // of the expression. + current + .transform_up(|expr| { + // Check if this is any of our original children + if let Some(pos) = self + .children + .iter() + .position(|c| c.as_ref() == expr.as_ref()) + { + // If so, remap it to the current children + // of the expression. + let new_child = Arc::clone(&remapped_children[pos]); + Ok(Transformed::yes(new_child)) + } else { + // Otherwise, just return the expression + Ok(Transformed::no(expr)) + } + }) + .data() + .expect("transformation is infallible") + } else { + current + } + } +} + +impl PhysicalExpr for DynamicFilterPhysicalExpr { + fn as_any(&self) -> &dyn Any { + self + } + + fn children(&self) -> Vec<&Arc> { + self.remapped_children + .as_ref() + .unwrap_or(&self.children) + .iter() + .collect() + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(Self { + children: self.children.clone(), + remapped_children: Some(children), + inner: Arc::clone(&self.inner), + data_type: Arc::clone(&self.data_type), + nullable: Arc::clone(&self.nullable), + })) + } + + fn data_type( + &self, + input_schema: &arrow::datatypes::Schema, + ) -> Result { + let res = self.current().data_type(input_schema)?; + #[cfg(test)] + { + use datafusion_common::internal_err; + // Check if the data type has changed. + let mut data_type_lock = self + .data_type + .write() + .expect("Failed to acquire write lock for data_type"); + if let Some(existing) = &*data_type_lock { + if existing != &res { + // If the data type has changed, we have a bug. + return internal_err!( + "DynamicFilterPhysicalExpr data type has changed unexpectedly. \ + Expected: {existing:?}, Actual: {res:?}" + ); + } + } else { + *data_type_lock = Some(res.clone()); + } + } + Ok(res) + } + + fn nullable(&self, input_schema: &arrow::datatypes::Schema) -> Result { + let res = self.current().nullable(input_schema)?; + #[cfg(test)] + { + use datafusion_common::internal_err; + // Check if the nullability has changed. + let mut nullable_lock = self + .nullable + .write() + .expect("Failed to acquire write lock for nullable"); + if let Some(existing) = *nullable_lock { + if existing != res { + // If the nullability has changed, we have a bug. + return internal_err!( + "DynamicFilterPhysicalExpr nullability has changed unexpectedly. 
\ + Expected: {existing}, Actual: {res}" + ); + } + } else { + *nullable_lock = Some(res); + } + } + Ok(res) + } + + fn evaluate( + &self, + batch: &arrow::record_batch::RecordBatch, + ) -> Result { + let current = self.current(); + #[cfg(test)] + { + // Ensure that we are not evaluating after the expression has changed. + let schema = batch.schema(); + self.nullable(&schema)?; + self.data_type(&schema)?; + }; + current.evaluate(batch) + } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Ok(inner) = self.inner.snapshot_current_filters() { + conjunction(inner).fmt_sql(f) + } else { + write!(f, "dynamic_filter_expr()") // What do we want to do here? + } + } + + fn snapshot(&self) -> Result>> { + // Return the current expression as a snapshot. + Ok(Some(self.current())) + } +} + +#[cfg(test)] +mod test { + use arrow::array::RecordBatch; + use datafusion_common::ScalarValue; + + use super::*; + + #[test] + fn test_dynamic_filter_physical_expr_misbehaves_data_type_nullable() { + #[derive(Debug)] + struct MockDynamicFilterSource { + current_expr: Arc>>, + } + + impl DynamicFilterSource for MockDynamicFilterSource { + fn snapshot_current_filters(&self) -> Result>> { + let expr = self.current_expr.read().unwrap().clone(); + Ok(vec![expr]) + } + } + + let source = Arc::new(MockDynamicFilterSource { + current_expr: Arc::new(RwLock::new(lit(42) as Arc)), + }); + let dynamic_filter = DynamicFilterPhysicalExpr::new( + vec![], + Arc::clone(&source) as Arc, + ); + + // First call to data_type and nullable should set the initial values. + let initial_data_type = dynamic_filter + .data_type(&arrow::datatypes::Schema::empty()) + .unwrap(); + let initial_nullable = dynamic_filter + .nullable(&arrow::datatypes::Schema::empty()) + .unwrap(); + + // Call again and expect no change. + let second_data_type = dynamic_filter + .data_type(&arrow::datatypes::Schema::empty()) + .unwrap(); + let second_nullable = dynamic_filter + .nullable(&arrow::datatypes::Schema::empty()) + .unwrap(); + assert_eq!( + initial_data_type, second_data_type, + "Data type should not change on second call." + ); + assert_eq!( + initial_nullable, second_nullable, + "Nullability should not change on second call." + ); + + // Now change the current expression to something else. + { + let mut current = source.current_expr.write().unwrap(); + *current = lit(ScalarValue::Utf8(None)) as Arc; + } + // Check that we error if we call data_type, nullable or evaluate after changing the expression. + assert!( + dynamic_filter + .data_type(&arrow::datatypes::Schema::empty()) + .is_err(), + "Expected err when data_type is called after changing the expression." + ); + assert!( + dynamic_filter + .nullable(&arrow::datatypes::Schema::empty()) + .is_err(), + "Expected err when nullable is called after changing the expression." + ); + let batch = RecordBatch::new_empty(Arc::new(arrow::datatypes::Schema::empty())); + assert!( + dynamic_filter.evaluate(&batch).is_err(), + "Expected err when evaluate is called after changing the expression." 
+ ); + } +} diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 2bc5706ee0e1..20becb330737 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -467,6 +467,13 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { ) -> Result>> { Ok(None) } + + fn push_down_filter( + &self, + _expr: Arc, + ) -> Result>> { + Ok(None) + } } /// [`ExecutionPlan`] Invariant Level diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index a8a9973ea043..e60c70e3d3d3 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -50,8 +50,8 @@ use datafusion_physical_expr::expressions::BinaryExpr; use datafusion_physical_expr::intervals::utils::check_support; use datafusion_physical_expr::utils::collect_columns; use datafusion_physical_expr::{ - analyze, split_conjunction, AcrossPartitions, AnalysisContext, ConstExpr, - ExprBoundaries, PhysicalExpr, + analyze, conjunction, split_conjunction, AcrossPartitions, AnalysisContext, + ConstExpr, ExprBoundaries, PhysicalExpr, }; use datafusion_physical_expr_common::physical_expr::fmt_sql; @@ -433,6 +433,22 @@ impl ExecutionPlan for FilterExec { } try_embed_projection(projection, self) } + + fn push_down_filter( + &self, + expr: Arc, + ) -> Result>> { + let mut input = Arc::clone(&self.input); + if let Some(new_input) = input.push_down_filter(Arc::clone(&expr))? { + input = new_input; + } + let new_predicate = conjunction([Arc::clone(&self.predicate), expr]); + Ok(Some(Arc::new(Self { + input, + predicate: Arc::clone(&new_predicate), + ..self.clone() + }))) + } } impl EmbeddedProjection for FilterExec { diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 04fbd06fabcd..2ccfe0b73ed6 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -36,7 +36,7 @@ pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; use datafusion_physical_expr::PhysicalSortExpr; pub use datafusion_physical_expr::{ - expressions, Distribution, Partitioning, PhysicalExpr, + expressions, snasphot_physical_expr, Distribution, Partitioning, PhysicalExpr, }; pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; @@ -51,6 +51,7 @@ pub use crate::stream::EmptyRecordBatchStream; pub use crate::topk::TopK; pub use crate::visitor::{accept, visit_execution_plan, ExecutionPlanVisitor}; +mod dynamic_filters; mod ordering; mod render_tree; mod topk; diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 1d3e23ea9097..9a9e0db9a3fc 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -271,6 +271,20 @@ impl ExecutionPlan for ProjectionExec { Ok(Some(Arc::new(projection.clone()))) } } + + fn push_down_filter( + &self, + expr: Arc, + ) -> Result>> { + // Try to push down to the input + if let Some(input) = self.input.push_down_filter(expr)? 
{ + return Ok(Some(Arc::new(Self { + input, + ..self.clone() + }))); + } + Ok(None) + } } /// If 'e' is a direct column reference, returns the field level diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index ebc751201378..6e7d885f58e1 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -723,6 +723,20 @@ impl ExecutionPlan for RepartitionExec { new_partitioning, )?))) } + + fn push_down_filter( + &self, + expr: Arc, + ) -> Result>> { + // Try to push down to the input + if let Some(input) = self.input.push_down_filter(expr)? { + return Ok(Some(Arc::new(Self { + input, + ..self.clone() + }))); + } + Ok(None) + } } impl RepartitionExec { diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 1072e9abf437..56262aed6e0f 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -57,7 +57,7 @@ use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; -use datafusion_physical_expr::LexOrdering; +use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::{StreamExt, TryStreamExt}; @@ -1197,35 +1197,55 @@ impl ExecutionPlan for SortExec { ) -> Result { trace!("Start SortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); - let mut input = self.input.execute(partition, Arc::clone(&context))?; - - let execution_options = &context.session_config().options().execution; - - trace!("End SortExec's input.execute for partition: {}", partition); - let sort_satisfied = self .input .equivalence_properties() .ordering_satisfy_requirement(&LexRequirement::from(self.expr.clone())); + let input_exec = Arc::clone(&self.input); + + let execution_options = &context.session_config().options().execution; + + trace!("End SortExec's input.execute for partition: {}", partition); + match (sort_satisfied, self.fetch.as_ref()) { - (true, Some(fetch)) => Ok(Box::pin(LimitStream::new( - input, - 0, - Some(*fetch), - BaselineMetrics::new(&self.metrics_set, partition), - ))), - (true, None) => Ok(input), + (true, Some(fetch)) => { + let input = input_exec.execute(partition, Arc::clone(&context))?; + Ok(Box::pin(LimitStream::new( + input, + 0, + Some(*fetch), + BaselineMetrics::new(&self.metrics_set, partition), + ))) + } + (true, None) => self.input.execute(partition, Arc::clone(&context)), (false, Some(fetch)) => { + let schema = input_exec.schema(); let mut topk = TopK::try_new( partition, - input.schema(), + schema, self.expr.clone(), *fetch, context.session_config().batch_size(), context.runtime_env(), &self.metrics_set, )?; + let input_exec = if context + .session_config() + .options() + .optimizer + .enable_dynamic_filter_pushdown + { + // Try to push down the dynamic filter. If the execution plan doesn't + // support it, push_down_filter will return None and we'll + // keep the original input_exec. + input_exec + .push_down_filter(topk.dynamic_filter_source())? 
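+                        // The expression handed to the scan starts out as a placeholder
+                        // `lit(true)` and is tightened by the TopK as its heap fills, so
+                        // files opened later in the scan see progressively stronger filters.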
+ .unwrap_or(input_exec) + } else { + input_exec + }; + let mut input = input_exec.execute(partition, Arc::clone(&context))?; Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), futures::stream::once(async move { @@ -1239,6 +1259,7 @@ impl ExecutionPlan for SortExec { ))) } (false, None) => { + let mut input = input_exec.execute(partition, Arc::clone(&context))?; let mut sorter = ExternalSorter::new( partition, input.schema(), @@ -1319,6 +1340,28 @@ impl ExecutionPlan for SortExec { .with_preserve_partitioning(self.preserve_partitioning()), ))) } + + // Pass though filter pushdown. + // This often happens in partitioned plans with a TopK because we end up with 1 TopK per partition + a final TopK at the end. + // Implementing this pass-through allows global/top/final TopK to push down filters to the partitions. + fn push_down_filter( + &self, + expr: Arc, + ) -> Result>> { + let new_input = self.input.push_down_filter(expr)?; + if let Some(new_input) = new_input { + Ok(Some(Arc::new(SortExec { + input: new_input, + expr: self.expr.clone(), + metrics_set: self.metrics_set.clone(), + preserve_partitioning: self.preserve_partitioning, + fetch: self.fetch, + cache: self.cache.clone(), + }))) + } else { + Ok(None) + } + } } #[cfg(test)] diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 85de1eefce2e..3d45b5277ecf 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -17,27 +17,34 @@ //! TopK: Combination of Sort / LIMIT -use arrow::{ - compute::interleave, - row::{RowConverter, Rows, SortField}, -}; use std::mem::size_of; -use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; +use std::sync::{Arc, RwLock}; +use std::{cmp::Ordering, collections::BinaryHeap}; -use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder}; -use crate::spill::get_record_batch_memory_size; -use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; use arrow::array::{Array, ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; -use datafusion_common::HashMap; +use arrow::{ + compute::interleave, + row::{RowConverter, Rows, SortField}, +}; +use arrow_schema::SortOptions; use datafusion_common::Result; +use datafusion_common::{internal_err, DataFusionError, HashMap}; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, runtime_env::RuntimeEnv, }; -use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_expr::ColumnarValue; +use datafusion_expr::Operator; +use datafusion_physical_expr::expressions::{is_not_null, is_null, lit, BinaryExpr}; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; +use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder}; +use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; +use crate::spill::get_record_batch_memory_size; +use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; + /// Global TopK /// /// # Background @@ -90,6 +97,18 @@ pub struct TopK { scratch_rows: Rows, /// stores the top k values and their sort key values, in order heap: TopKHeap, + /// stores the current filters derived from this TopK that can be pushed down + filters: Option>, +} + +impl std::fmt::Debug for TopK { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TopK") + .field("schema", &self.schema) + .field("batch_size", &self.batch_size) + 
.field("expr", &self.expr) + .finish() + } } impl TopK { @@ -137,9 +156,26 @@ impl TopK { row_converter, scratch_rows, heap: TopKHeap::new(k, batch_size, schema), + filters: None, }) } + pub(crate) fn dynamic_filter_source(&mut self) -> Arc { + match self.filters { + Some(ref filters) => filters.as_dynamic_physical_expr(), + None => { + let children = self + .expr + .iter() + .map(|e| Arc::clone(&e.expr)) + .collect::>(); + let filters = Arc::new(TopKDynamicFilterSource::new(children)); + self.filters = Some(Arc::clone(&filters)); + filters.as_dynamic_physical_expr() + } + } + } + /// Insert `batch`, remembering if any of its values are among /// the top k seen so far. pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { @@ -164,6 +200,7 @@ impl TopK { // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`) // this avoids some work and also might be better vectorizable. let mut batch_entry = self.heap.register_batch(batch); + let mut need_to_update_dynamic_filters = false; for (index, row) in rows.iter().enumerate() { match self.heap.max() { // heap has k items, and the new row is greater than the @@ -173,11 +210,23 @@ impl TopK { None | Some(_) => { self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); + need_to_update_dynamic_filters = true; } } } self.heap.insert_batch_entry(batch_entry); + if need_to_update_dynamic_filters { + if let Some(filters) = self.filters.as_ref() { + if let Some(threasholds) = self.heap.get_threshold_values(&self.expr)? { + if let Some(predicate) = Self::calculate_dynamic_filters(threasholds)? + { + filters.update_filters(predicate)?; + } + } + } + } + // conserve memory self.heap.maybe_compact()?; @@ -186,6 +235,90 @@ impl TopK { Ok(()) } + fn calculate_dynamic_filters( + thresholds: Vec, + ) -> Result>> { + // Create filter expressions for each threshold + let mut filters: Vec> = + Vec::with_capacity(thresholds.len()); + + let mut prev_sort_expr: Option> = None; + for threshold in thresholds { + // Create the appropriate operator based on sort order + let op = if threshold.sort_options.descending { + // For descending sort, we want col > threshold (exclude smaller values) + Operator::Gt + } else { + // For ascending sort, we want col < threshold (exclude larger values) + Operator::Lt + }; + + let value_null = threshold.value.is_null(); + + let comparison = Arc::new(BinaryExpr::new( + Arc::clone(&threshold.expr), + op, + lit(threshold.value.clone()), + )); + + let comparison_with_null = + match (threshold.sort_options.nulls_first, value_null) { + // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) + (true, true) => lit(false), + (true, false) => Arc::new(BinaryExpr::new( + is_null(Arc::clone(&threshold.expr))?, + Operator::Or, + comparison, + )), + // For nulls last, transform to (threshold.value is null and threshold.expr is not null) + // or (threshold.value is not null and comparison) + (false, true) => is_not_null(Arc::clone(&threshold.expr))?, + (false, false) => comparison, + }; + + let mut eq_expr = Arc::new(BinaryExpr::new( + Arc::clone(&threshold.expr), + Operator::Eq, + lit(threshold.value.clone()), + )); + + if value_null { + eq_expr = Arc::new(BinaryExpr::new( + is_null(Arc::clone(&threshold.expr))?, + Operator::Or, + eq_expr, + )); + } + + // For a query like order by a, b, the filter for column `b` is only applied if + // the condition a = threshold.value (considering null equality) is met. 
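+        // For example, for `ORDER BY a ASC, b ASC LIMIT k` with current heap maximum `(a_max, b_max)`
+        // the predicate assembled below is `a < a_max OR (a = a_max AND b < b_max)`
+        // (ignoring the NULL handling above).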
+ // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field, + // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields. + match prev_sort_expr.take() { + None => { + prev_sort_expr = Some(eq_expr); + filters.push(comparison_with_null); + } + Some(p) => { + filters.push(Arc::new(BinaryExpr::new( + Arc::clone(&p), + Operator::And, + comparison_with_null, + ))); + + prev_sort_expr = + Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr))); + } + } + } + + let dynamic_predicate = filters + .into_iter() + .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b))); + + Ok(dynamic_predicate) + } + /// Returns the top k results broken into `batch_size` [`RecordBatch`]es, consuming the heap pub fn emit(self) -> Result { let Self { @@ -197,6 +330,7 @@ impl TopK { row_converter: _, scratch_rows: _, mut heap, + filters: _, } = self; let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop @@ -270,8 +404,18 @@ struct TopKHeap { owned_bytes: usize, } +/// Holds threshold value and sort order information for a column +struct ColumnThreshold { + /// The column expression + pub expr: Arc, + /// The threshold value + pub value: datafusion_common::ScalarValue, + /// Sort options + pub sort_options: SortOptions, +} + impl TopKHeap { - fn new(k: usize, batch_size: usize, schema: SchemaRef) -> Self { + pub fn new(k: usize, batch_size: usize, schema: SchemaRef) -> Self { assert!(k > 0); Self { k, @@ -282,6 +426,54 @@ impl TopKHeap { } } + /// Get threshold values for all columns in the given sort expressions. + /// If the heap does not yet have k items, returns None. + /// Otherwise, returns the threshold values from the max row in the heap. + pub fn get_threshold_values( + &self, + sort_exprs: &[PhysicalSortExpr], + ) -> Result>> { + // If the heap doesn't have k elements yet, we can't create thresholds + let max_row = match self.max() { + Some(row) => row, + None => return Ok(None), + }; + + // Get the batch that contains the max row + let batch_entry = match self.store.get(max_row.batch_id) { + Some(entry) => entry, + None => return internal_err!("Invalid batch ID in TopKRow"), + }; + + // Extract threshold values for each sort expression + let mut thresholds = Vec::with_capacity(sort_exprs.len()); + for sort_expr in sort_exprs { + // Extract the value for this column from the max row + let expr = Arc::clone(&sort_expr.expr); + let value = expr.evaluate(&batch_entry.batch.slice(max_row.index, 1))?; + + // Convert to scalar value - should be a single value since we're evaluating on a single row batch + let scalar = match value { + ColumnarValue::Scalar(scalar) => scalar, + ColumnarValue::Array(array) if array.len() == 1 => { + // Extract the first (and only) value from the array + datafusion_common::ScalarValue::try_from_array(&array, 0)? + } + array => { + return internal_err!("Expected a scalar value, got {:?}", array) + } + }; + + thresholds.push(ColumnThreshold { + expr, + value: scalar, + sort_options: sort_expr.options, + }); + } + + Ok(Some(thresholds)) + } + /// Register a [`RecordBatch`] with the heap, returning the /// appropriate entry pub fn register_batch(&mut self, batch: RecordBatch) -> RecordBatchEntry { @@ -297,7 +489,7 @@ impl TopKHeap { /// Returns the largest value stored by the heap if there are k /// items, otherwise returns None. 
Remember this structure is /// keeping the "smallest" k values - fn max(&self) -> Option<&TopKRow> { + pub fn max(&self) -> Option<&TopKRow> { if self.inner.len() < self.k { None } else { @@ -509,7 +701,7 @@ impl TopKRow { } /// Returns a slice to the owned row value - fn row(&self) -> &[u8] { + pub fn row(&self) -> &[u8] { self.row.as_slice() } } @@ -529,7 +721,7 @@ impl Ord for TopKRow { } #[derive(Debug)] -struct RecordBatchEntry { +pub struct RecordBatchEntry { id: u32, batch: RecordBatch, // for this batch, how many times has it been used @@ -644,10 +836,101 @@ impl RecordBatchStore { } } +/// Pushdown of dynamic fitlers from TopK operators is used to speed up queries +/// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the +/// threshold values for the sort columns to the data source. +/// That is, the TopK operator will keep track of the top 10 values for the sort +/// and before a new file is opened it's statitics will be checked against the +/// threshold values to determine if the file can be skipped and predicate pushdown +/// will use these to skip rows during the scan. +/// +/// For example, imagine this data gets created if multiple sources with clock skews, +/// network delays, etc. are writing data and you don't do anything fancy to guarantee +/// perfect sorting by `timestamp` (i.e. you naively write out the data to Parquet, maybe do some compaction, etc.). +/// The point is that 99% of yesterday's files have a `timestamp` smaller than 99% of today's files +/// but there may be a couple seconds of overlap between files. +/// To be concrete, let's say this is our data: +// +// | file | min | max | +// |------|-----|-----| +// | 1 | 1 | 10 | +// | 2 | 9 | 19 | +// | 3 | 20 | 31 | +// | 4 | 30 | 35 | +// +// Ideally a [`TableProvider`] is able to use file level stats or other methods to roughly order the files +// within each partition / file group such that we start with the newest / largest `timestamp`s. +// If this is not possible the optimization still works but is less efficient and harder to visualize, +// so for this example let's assume that we process 1 file at a time and we started with file 4. +// After processing file 4 let's say we have 10 values in our TopK heap, the smallest of which is 30. +// The TopK operator will then push down the filter `timestamp < 30` down the tree of [`ExecutionPlan`]s +// and if the data source supports dynamic filter pushdown it will accept a reference to this [`DynamicPhysicalExprSource`] +// and when it goes to open file 3 it will ask the [`DynamicPhysicalExprSource`] for the current filters. +// Since file 3 may contain values larger than 30 we cannot skip it entirely, +// but scanning it may still be more efficient due to page pruning and other optimizations. +// Once we get to file 2 however we can skip it entirely because we know that all values in file 2 are smaller than 30. +// The same goes for file 1. +// So this optimization just saved us 50% of the work of scanning the data. +#[derive(Debug, Clone)] +struct TopKDynamicFilterSource { + /// The children of the dynamic filters produced by this TopK. + /// In particular, this is the columns that are being sorted, derived from the sorting expressions. 
+ children: Vec>, + /// The current filters derived from this TopK + predicate: Arc>>, +} + +impl TopKDynamicFilterSource { + fn new(children: Vec>) -> Self { + Self { + children, + predicate: Arc::new(RwLock::new(lit(true))), + } + } + + fn update_filters(&self, predicate: Arc) -> Result<()> { + let mut current_predicate = self.predicate.write().map_err(|_| { + DataFusionError::Internal( + "Failed to acquire write lock on TopKDynamicPhysicalExprSource" + .to_string(), + ) + })?; + *current_predicate = predicate; + Ok(()) + } +} + +impl TopKDynamicFilterSource { + fn as_dynamic_physical_expr(&self) -> Arc { + let new = self.clone(); + // Transform the sort expresions into referenced columns + let children = self.children.clone(); + Arc::new(DynamicFilterPhysicalExpr::new(children, Arc::new(new))) + } +} + +impl DynamicFilterSource for TopKDynamicFilterSource { + fn snapshot_current_filters(&self) -> Result>> { + let predicate = self + .predicate + .read() + .map_err(|_| { + DataFusionError::Internal( + "Failed to acquire read lock on TopKDynamicPhysicalExprSource" + .to_string(), + ) + })? + .clone(); + Ok(vec![predicate]) + } +} + #[cfg(test)] mod tests { use super::*; + use crate::expressions::col; use arrow::array::{Float64Array, Int32Array, RecordBatch}; + use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; /// This test ensures the size calculation is correct for RecordBatches with multiple columns. @@ -681,4 +964,65 @@ mod tests { record_batch_store.unuse(0); assert_eq!(record_batch_store.batches_size, 0); } + + #[test] + fn test_topk_as_dynamic_filter_source() { + let schema = Arc::new(Schema::new(vec![ + Field::new("col1", DataType::Int32, true), + Field::new("col2", DataType::Float64, false), + ])); + + let runtime = Arc::new(RuntimeEnv::default()); + let metrics = ExecutionPlanMetricsSet::new(); + + // Create a TopK with descending sort on col2 + let sort_expr = vec![PhysicalSortExpr { + expr: Arc::new(datafusion_physical_expr::expressions::Column::new( + "col2", 1, + )), + options: SortOptions { + descending: true, + nulls_first: false, + }, + }]; + + let mut topk = TopK::try_new( + 0, + Arc::clone(&schema), + sort_expr.into(), + 5, // k=5 + 100, // batch_size + runtime, + &metrics, + ) + .unwrap(); + + // Initially there should be no filters (empty heap) + let filter = topk.dynamic_filter_source().snapshot().unwrap().unwrap(); + assert!(filter.eq(&lit(true)), "{filter:?}"); + + // Insert some data to fill the heap + let col1 = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + let col2 = + Float64Array::from(vec![10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(col1), Arc::new(col2)], + ) + .unwrap(); + + // Insert the data into TopK + topk.insert_batch(batch).unwrap(); + + // Now there should be a filter + let filter = topk.dynamic_filter_source().snapshot().unwrap().unwrap(); + + // We expect a filter for col2 > 6.0 (since we're doing descending sort and have 5 values) + let expected = Arc::new(BinaryExpr::new( + col("col2", &schema).unwrap(), + Operator::Gt, + lit(6.0), + )) as Arc; + assert!(filter.eq(&expected), "{filter:?}"); + } } diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index c196595eeed4..8eccf32fa3c7 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -28,7 +28,9 @@ use datafusion::physical_plan::expressions::{ }; use 
datafusion::physical_plan::udaf::AggregateFunctionExpr; use datafusion::physical_plan::windows::{PlainAggregateWindowExpr, WindowUDFExpr}; -use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr}; +use datafusion::physical_plan::{ + snasphot_physical_expr, Partitioning, PhysicalExpr, WindowExpr, +}; use datafusion::{ datasource::{ file_format::{csv::CsvSink, json::JsonSink}, @@ -210,6 +212,7 @@ pub fn serialize_physical_expr( value: &Arc, codec: &dyn PhysicalExtensionCodec, ) -> Result { + let value = snasphot_physical_expr(value.clone())?; let expr = value.as_any(); if let Some(expr) = expr.downcast_ref::() { @@ -368,7 +371,7 @@ pub fn serialize_physical_expr( }) } else { let mut buf: Vec = vec![]; - match codec.try_encode_expr(value, &mut buf) { + match codec.try_encode_expr(&value, &mut buf) { Ok(_) => { let inputs: Vec = value .children() diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 496f24abf6ed..f709c3875a9a 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -241,6 +241,7 @@ datafusion.explain.show_statistics false datafusion.optimizer.allow_symmetric_joins_without_pruning true datafusion.optimizer.default_filter_selectivity 20 datafusion.optimizer.enable_distinct_aggregation_soft_limit true +datafusion.optimizer.enable_dynamic_filter_pushdown true datafusion.optimizer.enable_round_robin_repartition true datafusion.optimizer.enable_topk_aggregation true datafusion.optimizer.expand_views_at_output false @@ -340,6 +341,7 @@ datafusion.explain.show_statistics false When set to true, the explain statement datafusion.optimizer.allow_symmetric_joins_without_pruning true Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. datafusion.optimizer.default_filter_selectivity 20 The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. +datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. 
datafusion.optimizer.enable_round_robin_repartition true When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores datafusion.optimizer.enable_topk_aggregation true When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible datafusion.optimizer.expand_views_at_output false When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 68e21183938b..ea18318dd699 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -99,6 +99,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | +| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. | | datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | | datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. 
| From 74813118d229922b948672c876fbbb94861649a7 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 30 Mar 2025 16:13:21 -0500 Subject: [PATCH 02/27] lint --- datafusion/proto/src/physical_plan/to_proto.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 8eccf32fa3c7..1e5a27ec4eb6 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -212,7 +212,7 @@ pub fn serialize_physical_expr( value: &Arc, codec: &dyn PhysicalExtensionCodec, ) -> Result { - let value = snasphot_physical_expr(value.clone())?; + let value = snasphot_physical_expr(Arc::clone(value))?; let expr = value.as_any(); if let Some(expr) = expr.downcast_ref::() { From 3fc78e83e4f20e1bd00136ecb49623b044eb4764 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 31 Mar 2025 22:56:43 -0500 Subject: [PATCH 03/27] use an enum --- datafusion/datasource-parquet/src/source.rs | 28 ++++++++-- datafusion/datasource/src/file.rs | 9 ++- datafusion/datasource/src/file_scan_config.rs | 24 ++++---- datafusion/datasource/src/source.rs | 33 ++++++----- .../physical-plan/src/coalesce_batches.rs | 29 ++++++---- .../physical-plan/src/execution_plan.rs | 36 ++++++++++-- datafusion/physical-plan/src/filter.rs | 56 ++++++++++++++----- datafusion/physical-plan/src/projection.rs | 14 ----- .../physical-plan/src/repartition/mod.rs | 28 ++++++---- datafusion/physical-plan/src/sorts/sort.rs | 37 +++++++----- 10 files changed, 191 insertions(+), 103 deletions(-) diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 8331c0074e44..9a8dd8f03b7f 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -26,6 +26,7 @@ use crate::opener::ParquetOpener; use crate::page_filter::PagePruningAccessPlanFilter; use crate::DefaultParquetFileReaderFactory; use crate::ParquetFileReaderFactory; +use datafusion_datasource::file::FileSourceFilterPushdownResult; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, @@ -40,6 +41,7 @@ use datafusion_physical_expr::conjunction; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; +use datafusion_physical_plan::execution_plan::FilterPushdownSupport; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; use datafusion_physical_plan::DisplayFormatType; @@ -575,17 +577,31 @@ impl FileSource for ParquetSource { } } - fn push_down_filter( + fn push_down_filters( &self, - expr: Arc, - ) -> datafusion_common::Result>> { + filters: &[&Arc], + ) -> datafusion_common::Result> { let mut conf = self.clone(); conf.predicate = match self.predicate.as_ref() { Some(existing_predicate) => { - Some(conjunction([Arc::clone(existing_predicate), expr])) + // Combine existing predicate with new filters + Some(conjunction( + std::iter::once(Arc::clone(existing_predicate)) + .chain(filters.iter().cloned().cloned()), + )) + } + None => { + if filters.is_empty() { + None + } else { + // If no existing predicate, just use the new filters + Some(conjunction(filters.iter().cloned().cloned())) + } } - None => 
Some(expr), }; - Ok(Some(Arc::new(conf))) + Ok(Some(FileSourceFilterPushdownResult::new( + Arc::new(conf), + vec![FilterPushdownSupport::Exact; filters.len()], + ))) } } diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index b0b9fe4ce905..8dbc43a59a6d 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -28,6 +28,7 @@ use crate::file_stream::FileOpener; use arrow::datatypes::SchemaRef; use datafusion_common::Statistics; use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; +use datafusion_physical_plan::execution_plan::FilterPushdownResult; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; @@ -94,10 +95,12 @@ pub trait FileSource: Send + Sync { Ok(None) } - fn push_down_filter( + fn push_down_filters( &self, - _expr: Arc, - ) -> datafusion_common::Result>> { + _filters: &[&Arc], + ) -> datafusion_common::Result> { Ok(None) } } + +pub type FileSourceFilterPushdownResult = FilterPushdownResult>; diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index aab89c46bec0..1f50a8e998df 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -48,7 +48,6 @@ use datafusion_physical_plan::{ }; use log::{debug, warn}; -use crate::file_groups::FileGroup; use crate::{ display::FileGroupsDisplay, file::FileSource, @@ -58,6 +57,7 @@ use crate::{ statistics::MinMaxStatistics, PartitionedFile, }; +use crate::{file_groups::FileGroup, source::DataSourceFilterPushdownResult}; /// The base configurations for a [`DataSourceExec`], the a physical plan for /// any given file format. @@ -585,18 +585,20 @@ impl DataSource for FileScanConfig { })) } - fn push_down_filter( + fn push_down_filters( &self, - expr: Arc, - ) -> Result>> { - // Try to push down to the file source - if let Some(file_source) = self.file_source.push_down_filter(expr)? { - return Ok(Some(Arc::new(Self { - file_source, - ..self.clone() - }))); + filters: &[&Arc], + ) -> Result> { + if let Some(file_source_result) = self.file_source.push_down_filters(filters)? 
{ + let mut new_self = self.clone(); + new_self.file_source = file_source_result.inner; + Ok(Some(DataSourceFilterPushdownResult { + inner: Arc::new(new_self) as Arc, + support: file_source_result.support, + })) + } else { + Ok(None) } - Ok(None) } } diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index a3bbc153c535..4e8e6e361b7a 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -22,7 +22,9 @@ use std::fmt; use std::fmt::{Debug, Formatter}; use std::sync::Arc; -use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion_physical_plan::execution_plan::{ + Boundedness, EmissionType, ExecutionPlanFilterPushdownResult, FilterPushdownResult, +}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use datafusion_physical_plan::projection::ProjectionExec; use datafusion_physical_plan::{ @@ -79,15 +81,16 @@ pub trait DataSource: Send + Sync + Debug { &self, _projection: &ProjectionExec, ) -> datafusion_common::Result>>; - - fn push_down_filter( + fn push_down_filters( &self, - _expr: Arc, - ) -> datafusion_common::Result>> { + _filters: &[&Arc], + ) -> datafusion_common::Result> { Ok(None) } } +pub type DataSourceFilterPushdownResult = FilterPushdownResult>; + /// [`ExecutionPlan`] handles different file formats like JSON, CSV, AVRO, ARROW, PARQUET /// /// `DataSourceExec` implements common functionality such as applying projections, @@ -200,16 +203,16 @@ impl ExecutionPlan for DataSourceExec { self.data_source.try_swapping_with_projection(projection) } - fn push_down_filter( - &self, - expr: Arc, - ) -> datafusion_common::Result>> { - // Try to push down to the data source - if let Some(data_source) = self.data_source.push_down_filter(expr)? { - return Ok(Some(Arc::new(Self { - data_source, - ..self.clone() - }))); + fn push_down_filters( + self: Arc, + filters: &[&Arc], + ) -> datafusion_common::Result> { + if let Some(pushdown_result) = self.data_source.push_down_filters(filters)? { + let new_self = Arc::new(DataSourceExec::new(pushdown_result.inner)); + return Ok(Some(ExecutionPlanFilterPushdownResult::new( + new_self, + pushdown_result.support, + ))); } Ok(None) } diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index f2e91dbbbe4e..e1161352bfbe 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -35,7 +35,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::PhysicalExpr; use crate::coalesce::{BatchCoalescer, CoalescerState}; -use crate::execution_plan::CardinalityEffect; +use crate::execution_plan::{CardinalityEffect, ExecutionPlanFilterPushdownResult}; use futures::ready; use futures::stream::{Stream, StreamExt}; @@ -214,18 +214,23 @@ impl ExecutionPlan for CoalesceBatchesExec { CardinalityEffect::Equal } - fn push_down_filter( - &self, - expr: Arc, - ) -> Result>> { - // Try to push down to the input - if let Some(input) = self.input.push_down_filter(expr)? { - return Ok(Some(Arc::new(Self { - input, - ..self.clone() - }))); + fn push_down_filters( + self: Arc, + filters: &[&Arc], + ) -> Result> { + let input = Arc::clone(&self.input); + if let Some(result) = input.push_down_filters(filters)? 
{ + let new_self = Arc::new( + CoalesceBatchesExec::new(result.inner, self.target_batch_size) + .with_fetch(self.fetch), + ); + Ok(Some(ExecutionPlanFilterPushdownResult::new( + new_self, + result.support, + ))) + } else { + Ok(None) } - Ok(None) } } diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 20becb330737..e7be57b42125 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -468,14 +468,42 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { Ok(None) } - fn push_down_filter( - &self, - _expr: Arc, - ) -> Result>> { + fn push_down_filters( + self: Arc, + _filters: &[&Arc], + ) -> Result> { Ok(None) } } +#[derive(Debug, Clone, Copy)] +pub enum FilterPushdownSupport { + Inexact, + Exact, +} + +pub struct FilterPushdownResult { + pub inner: T, + pub support: Vec, +} + +impl FilterPushdownResult { + pub fn new(plan: T, support: Vec) -> Self { + Self { + inner: plan, + support, + } + } + + pub fn is_exact(&self) -> bool { + self.support + .iter() + .all(|s| matches!(s, FilterPushdownSupport::Exact)) + } +} + +pub type ExecutionPlanFilterPushdownResult = FilterPushdownResult>; + /// [`ExecutionPlan`] Invariant Level /// /// What set of assertions ([Invariant]s) holds for a particular `ExecutionPlan` diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index e60c70e3d3d3..d00ff5a4d9c9 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -25,7 +25,9 @@ use super::{ RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::common::can_project; -use crate::execution_plan::CardinalityEffect; +use crate::execution_plan::{ + CardinalityEffect, ExecutionPlanFilterPushdownResult, FilterPushdownSupport, +}; use crate::projection::{ make_with_child, try_embed_projection, update_expr, EmbeddedProjection, ProjectionExec, @@ -434,20 +436,48 @@ impl ExecutionPlan for FilterExec { try_embed_projection(projection, self) } - fn push_down_filter( - &self, - expr: Arc, - ) -> Result>> { + fn push_down_filters( + self: Arc, + filters: &[&Arc], + ) -> Result> { let mut input = Arc::clone(&self.input); - if let Some(new_input) = input.push_down_filter(Arc::clone(&expr))? { - input = new_input; + let all_filters = filters + .iter() + .map(|f| *f) + .chain(split_conjunction(&self.predicate)) + .collect::>(); + let mut new_predicate = None; + if let Some(result) = Arc::clone(&input).push_down_filters(&all_filters)? 
{ + // Any filters that our input didn't accept as Exact we apply ourselves + if !result.is_exact() { + new_predicate = Some(conjunction( + (0..all_filters.len()) + .zip(result.support) + .filter_map(|(i, s)| { + matches!(s, FilterPushdownSupport::Exact) + .then_some(Arc::clone(all_filters[i])) + }), + )); + } + input = result.inner; + }; + if let Some(new_predicate) = new_predicate { + // If we have a new predicate, create a new FilterExec + return FilterExec::try_new(new_predicate, input) + .and_then(|e| { + let selectivity = self.default_selectivity(); + e.with_default_selectivity(selectivity) + }) + .map(|e| { + Some(ExecutionPlanFilterPushdownResult { + inner: Arc::new(e) as Arc, + support: vec![FilterPushdownSupport::Exact; all_filters.len()], + }) + }); + } else { + // No new predicate was created, return None + Ok(None) } - let new_predicate = conjunction([Arc::clone(&self.predicate), expr]); - Ok(Some(Arc::new(Self { - input, - predicate: Arc::clone(&new_predicate), - ..self.clone() - }))) } } diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 9a9e0db9a3fc..1d3e23ea9097 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -271,20 +271,6 @@ impl ExecutionPlan for ProjectionExec { Ok(Some(Arc::new(projection.clone()))) } } - - fn push_down_filter( - &self, - expr: Arc, - ) -> Result>> { - // Try to push down to the input - if let Some(input) = self.input.push_down_filter(expr)? { - return Ok(Some(Arc::new(Self { - input, - ..self.clone() - }))); - } - Ok(None) - } } /// If 'e' is a direct column reference, returns the field level diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 6e7d885f58e1..84c0d504d810 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -29,7 +29,9 @@ use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use super::{ DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream, }; -use crate::execution_plan::CardinalityEffect; +use crate::execution_plan::{ + CardinalityEffect, ExecutionPlanFilterPushdownResult, FilterPushdownResult, +}; use crate::hash_utils::create_hashes; use crate::metrics::BaselineMetrics; use crate::projection::{all_columns, make_with_child, update_expr, ProjectionExec}; @@ -724,18 +726,20 @@ impl ExecutionPlan for RepartitionExec { )?))) } - fn push_down_filter( - &self, - expr: Arc, - ) -> Result>> { - // Try to push down to the input - if let Some(input) = self.input.push_down_filter(expr)? { - return Ok(Some(Arc::new(Self { - input, - ..self.clone() - }))); + fn push_down_filters( + self: Arc, + filters: &[&Arc], + ) -> Result> { + let input = Arc::clone(&self.input); + if let Some(result) = input.push_down_filters(filters)? 
{ + let new_self = Arc::new(RepartitionExec::try_new( + result.inner, + self.partitioning().clone(), + )?); + Ok(Some(FilterPushdownResult::new(new_self, result.support))) + } else { + Ok(None) } - Ok(None) } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 56262aed6e0f..47cb95e314e8 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -25,7 +25,9 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use crate::common::spawn_buffered; -use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType}; +use crate::execution_plan::{ + Boundedness, CardinalityEffect, EmissionType, ExecutionPlanFilterPushdownResult, +}; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; use crate::metrics::{ @@ -1239,9 +1241,14 @@ impl ExecutionPlan for SortExec { // Try to push down the dynamic filter. If the execution plan doesn't // support it, push_down_filter will return None and we'll // keep the original input_exec. - input_exec - .push_down_filter(topk.dynamic_filter_source())? - .unwrap_or(input_exec) + let filter = topk.dynamic_filter_source(); + if let Some(pushdown_result) = + Arc::clone(&input_exec).push_down_filters(&[&filter])? + { + pushdown_result.inner + } else { + input_exec + } } else { input_exec }; @@ -1344,20 +1351,24 @@ impl ExecutionPlan for SortExec { // Pass though filter pushdown. // This often happens in partitioned plans with a TopK because we end up with 1 TopK per partition + a final TopK at the end. // Implementing this pass-through allows global/top/final TopK to push down filters to the partitions. - fn push_down_filter( - &self, - expr: Arc, - ) -> Result>> { - let new_input = self.input.push_down_filter(expr)?; - if let Some(new_input) = new_input { - Ok(Some(Arc::new(SortExec { - input: new_input, + fn push_down_filters( + self: Arc, + filters: &[&Arc], + ) -> Result> { + let input = Arc::clone(&self.input); + if let Some(result) = input.push_down_filters(filters)? { + let new_self = Arc::new(SortExec { + input: result.inner, expr: self.expr.clone(), metrics_set: self.metrics_set.clone(), preserve_partitioning: self.preserve_partitioning, fetch: self.fetch, cache: self.cache.clone(), - }))) + }); + Ok(Some(ExecutionPlanFilterPushdownResult::new( + new_self, + result.support, + ))) } else { Ok(None) } From f048db954813a43bc2d016427d74ca0f27d8cb95 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 31 Mar 2025 23:08:06 -0500 Subject: [PATCH 04/27] move to it's own module --- datafusion/physical-plan/src/topk/filter.rs | 114 ++++++++++++++++++++ datafusion/physical-plan/src/topk/mod.rs | 101 ++--------------- 2 files changed, 121 insertions(+), 94 deletions(-) create mode 100644 datafusion/physical-plan/src/topk/filter.rs diff --git a/datafusion/physical-plan/src/topk/filter.rs b/datafusion/physical-plan/src/topk/filter.rs new file mode 100644 index 000000000000..4b32549f40f1 --- /dev/null +++ b/datafusion/physical-plan/src/topk/filter.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::{Arc, RwLock}; + +use datafusion_common::DataFusionError; +use datafusion_common::Result; +use datafusion_physical_expr::expressions::lit; +use datafusion_physical_expr::PhysicalExpr; + +use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; + +/// Pushdown of dynamic fitlers from TopK operators is used to speed up queries +/// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the +/// threshold values for the sort columns to the data source. +/// That is, the TopK operator will keep track of the top 10 values for the sort +/// and before a new file is opened it's statitics will be checked against the +/// threshold values to determine if the file can be skipped and predicate pushdown +/// will use these to skip rows during the scan. +/// +/// For example, imagine this data gets created if multiple sources with clock skews, +/// network delays, etc. are writing data and you don't do anything fancy to guarantee +/// perfect sorting by `timestamp` (i.e. you naively write out the data to Parquet, maybe do some compaction, etc.). +/// The point is that 99% of yesterday's files have a `timestamp` smaller than 99% of today's files +/// but there may be a couple seconds of overlap between files. +/// To be concrete, let's say this is our data: +// +// | file | min | max | +// |------|-----|-----| +// | 1 | 1 | 10 | +// | 2 | 9 | 19 | +// | 3 | 20 | 31 | +// | 4 | 30 | 35 | +// +// Ideally a [`TableProvider`] is able to use file level stats or other methods to roughly order the files +// within each partition / file group such that we start with the newest / largest `timestamp`s. +// If this is not possible the optimization still works but is less efficient and harder to visualize, +// so for this example let's assume that we process 1 file at a time and we started with file 4. +// After processing file 4 let's say we have 10 values in our TopK heap, the smallest of which is 30. +// The TopK operator will then push down the filter `timestamp < 30` down the tree of [`ExecutionPlan`]s +// and if the data source supports dynamic filter pushdown it will accept a reference to this [`DynamicPhysicalExprSource`] +// and when it goes to open file 3 it will ask the [`DynamicPhysicalExprSource`] for the current filters. +// Since file 3 may contain values larger than 30 we cannot skip it entirely, +// but scanning it may still be more efficient due to page pruning and other optimizations. +// Once we get to file 2 however we can skip it entirely because we know that all values in file 2 are smaller than 30. +// The same goes for file 1. +// So this optimization just saved us 50% of the work of scanning the data. +#[derive(Debug, Clone)] +pub struct TopKDynamicFilterSource { + /// The children of the dynamic filters produced by this TopK. + /// In particular, this is the columns that are being sorted, derived from the sorting expressions. 
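+    /// (Populated from the TopK's sort expressions in `TopK::dynamic_filter_source`,
+    /// e.g. just the `timestamp` column for `ORDER BY timestamp DESC LIMIT 10`.)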
+ children: Vec>, + /// The current filters derived from this TopK + predicate: Arc>>, +} + +impl TopKDynamicFilterSource { + pub fn new(children: Vec>) -> Self { + Self { + children, + predicate: Arc::new(RwLock::new(lit(true))), + } + } + + pub fn update_filters(&self, predicate: Arc) -> Result<()> { + let mut current_predicate = self.predicate.write().map_err(|_| { + DataFusionError::Internal( + "Failed to acquire write lock on TopKDynamicPhysicalExprSource" + .to_string(), + ) + })?; + *current_predicate = predicate; + Ok(()) + } +} + +impl TopKDynamicFilterSource { + pub fn as_dynamic_physical_expr(&self) -> Arc { + let new = self.clone(); + // Transform the sort expresions into referenced columns + let children = self.children.clone(); + Arc::new(DynamicFilterPhysicalExpr::new(children, Arc::new(new))) + } +} + +impl DynamicFilterSource for TopKDynamicFilterSource { + fn snapshot_current_filters(&self) -> Result>> { + let predicate = self + .predicate + .read() + .map_err(|_| { + DataFusionError::Internal( + "Failed to acquire read lock on TopKDynamicPhysicalExprSource" + .to_string(), + ) + })? + .clone(); + Ok(vec![predicate]) + } +} diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 3d45b5277ecf..ce9f00e2fa9b 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -17,8 +17,10 @@ //! TopK: Combination of Sort / LIMIT +mod filter; + use std::mem::size_of; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::{cmp::Ordering, collections::BinaryHeap}; use arrow::array::{Array, ArrayRef, RecordBatch}; @@ -29,7 +31,7 @@ use arrow::{ }; use arrow_schema::SortOptions; use datafusion_common::Result; -use datafusion_common::{internal_err, DataFusionError, HashMap}; +use datafusion_common::{internal_err, HashMap}; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, runtime_env::RuntimeEnv, @@ -39,9 +41,9 @@ use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{is_not_null, is_null, lit, BinaryExpr}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; +use filter::TopKDynamicFilterSource; use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder}; -use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; use crate::spill::get_record_batch_memory_size; use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; @@ -218,8 +220,8 @@ impl TopK { if need_to_update_dynamic_filters { if let Some(filters) = self.filters.as_ref() { - if let Some(threasholds) = self.heap.get_threshold_values(&self.expr)? { - if let Some(predicate) = Self::calculate_dynamic_filters(threasholds)? + if let Some(thresholds) = self.heap.get_threshold_values(&self.expr)? { + if let Some(predicate) = Self::calculate_dynamic_filters(thresholds)? { filters.update_filters(predicate)?; } @@ -836,95 +838,6 @@ impl RecordBatchStore { } } -/// Pushdown of dynamic fitlers from TopK operators is used to speed up queries -/// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the -/// threshold values for the sort columns to the data source. 
-/// That is, the TopK operator will keep track of the top 10 values for the sort -/// and before a new file is opened it's statitics will be checked against the -/// threshold values to determine if the file can be skipped and predicate pushdown -/// will use these to skip rows during the scan. -/// -/// For example, imagine this data gets created if multiple sources with clock skews, -/// network delays, etc. are writing data and you don't do anything fancy to guarantee -/// perfect sorting by `timestamp` (i.e. you naively write out the data to Parquet, maybe do some compaction, etc.). -/// The point is that 99% of yesterday's files have a `timestamp` smaller than 99% of today's files -/// but there may be a couple seconds of overlap between files. -/// To be concrete, let's say this is our data: -// -// | file | min | max | -// |------|-----|-----| -// | 1 | 1 | 10 | -// | 2 | 9 | 19 | -// | 3 | 20 | 31 | -// | 4 | 30 | 35 | -// -// Ideally a [`TableProvider`] is able to use file level stats or other methods to roughly order the files -// within each partition / file group such that we start with the newest / largest `timestamp`s. -// If this is not possible the optimization still works but is less efficient and harder to visualize, -// so for this example let's assume that we process 1 file at a time and we started with file 4. -// After processing file 4 let's say we have 10 values in our TopK heap, the smallest of which is 30. -// The TopK operator will then push down the filter `timestamp < 30` down the tree of [`ExecutionPlan`]s -// and if the data source supports dynamic filter pushdown it will accept a reference to this [`DynamicPhysicalExprSource`] -// and when it goes to open file 3 it will ask the [`DynamicPhysicalExprSource`] for the current filters. -// Since file 3 may contain values larger than 30 we cannot skip it entirely, -// but scanning it may still be more efficient due to page pruning and other optimizations. -// Once we get to file 2 however we can skip it entirely because we know that all values in file 2 are smaller than 30. -// The same goes for file 1. -// So this optimization just saved us 50% of the work of scanning the data. -#[derive(Debug, Clone)] -struct TopKDynamicFilterSource { - /// The children of the dynamic filters produced by this TopK. - /// In particular, this is the columns that are being sorted, derived from the sorting expressions. - children: Vec>, - /// The current filters derived from this TopK - predicate: Arc>>, -} - -impl TopKDynamicFilterSource { - fn new(children: Vec>) -> Self { - Self { - children, - predicate: Arc::new(RwLock::new(lit(true))), - } - } - - fn update_filters(&self, predicate: Arc) -> Result<()> { - let mut current_predicate = self.predicate.write().map_err(|_| { - DataFusionError::Internal( - "Failed to acquire write lock on TopKDynamicPhysicalExprSource" - .to_string(), - ) - })?; - *current_predicate = predicate; - Ok(()) - } -} - -impl TopKDynamicFilterSource { - fn as_dynamic_physical_expr(&self) -> Arc { - let new = self.clone(); - // Transform the sort expresions into referenced columns - let children = self.children.clone(); - Arc::new(DynamicFilterPhysicalExpr::new(children, Arc::new(new))) - } -} - -impl DynamicFilterSource for TopKDynamicFilterSource { - fn snapshot_current_filters(&self) -> Result>> { - let predicate = self - .predicate - .read() - .map_err(|_| { - DataFusionError::Internal( - "Failed to acquire read lock on TopKDynamicPhysicalExprSource" - .to_string(), - ) - })? 
- .clone(); - Ok(vec![predicate]) - } -} - #[cfg(test)] mod tests { use super::*; From 6889f0c91f656c6aa263fbc46e30c17ba3366d6a Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 31 Mar 2025 23:18:51 -0500 Subject: [PATCH 05/27] get reference to topk that is collectible at optimization time --- datafusion/physical-plan/src/sorts/sort.rs | 95 ++++++++++++---------- 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 47cb95e314e8..4e80f48684fc 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -22,9 +22,10 @@ use std::any::Any; use std::fmt; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use crate::common::spawn_buffered; +use crate::dynamic_filters::DynamicFilterSource; use crate::execution_plan::{ Boundedness, CardinalityEffect, EmissionType, ExecutionPlanFilterPushdownResult, }; @@ -53,7 +54,7 @@ use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn use arrow::datatypes::{DataType, SchemaRef}; use arrow::row::{RowConverter, Rows, SortField}; use datafusion_common::{ - exec_datafusion_err, internal_datafusion_err, internal_err, Result, + exec_datafusion_err, internal_datafusion_err, internal_err, DataFusionError, Result }; use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -970,6 +971,8 @@ pub struct SortExec { fetch: Option, /// Cache holding plan properties like equivalences, output partitioning etc. cache: PlanProperties, + /// Dynamic filter sources + dynamic_filter_source: SortExecDynamicFilterSource, } impl SortExec { @@ -985,6 +988,7 @@ impl SortExec { preserve_partitioning, fetch: None, cache, + dynamic_filter_source: SortExecDynamicFilterSource::new(), } } @@ -1037,6 +1041,7 @@ impl SortExec { preserve_partitioning: self.preserve_partitioning, fetch, cache, + dynamic_filter_source: self.dynamic_filter_source.clone(), } } @@ -1199,60 +1204,36 @@ impl ExecutionPlan for SortExec { ) -> Result { trace!("Start SortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); - let sort_satisfied = self - .input - .equivalence_properties() - .ordering_satisfy_requirement(&LexRequirement::from(self.expr.clone())); - - let input_exec = Arc::clone(&self.input); + let mut input = self.input.execute(partition, Arc::clone(&context))?; let execution_options = &context.session_config().options().execution; trace!("End SortExec's input.execute for partition: {}", partition); + let sort_satisfied = self + .input + .equivalence_properties() + .ordering_satisfy_requirement(&LexRequirement::from(self.expr.clone())); + match (sort_satisfied, self.fetch.as_ref()) { - (true, Some(fetch)) => { - let input = input_exec.execute(partition, Arc::clone(&context))?; - Ok(Box::pin(LimitStream::new( - input, - 0, - Some(*fetch), - BaselineMetrics::new(&self.metrics_set, partition), - ))) - } - (true, None) => self.input.execute(partition, Arc::clone(&context)), + (true, Some(fetch)) => Ok(Box::pin(LimitStream::new( + input, + 0, + Some(*fetch), + BaselineMetrics::new(&self.metrics_set, partition), + ))), + (true, None) => Ok(input), (false, Some(fetch)) => { - let schema = input_exec.schema(); let mut topk = TopK::try_new( partition, - schema, + input.schema(), 
self.expr.clone(), *fetch, context.session_config().batch_size(), context.runtime_env(), &self.metrics_set, )?; - let input_exec = if context - .session_config() - .options() - .optimizer - .enable_dynamic_filter_pushdown - { - // Try to push down the dynamic filter. If the execution plan doesn't - // support it, push_down_filter will return None and we'll - // keep the original input_exec. - let filter = topk.dynamic_filter_source(); - if let Some(pushdown_result) = - Arc::clone(&input_exec).push_down_filters(&[&filter])? - { - pushdown_result.inner - } else { - input_exec - } - } else { - input_exec - }; - let mut input = input_exec.execute(partition, Arc::clone(&context))?; + self.dynamic_filter_source.add_filter(topk.dynamic_filter_source())?; Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), futures::stream::once(async move { @@ -1266,7 +1247,6 @@ impl ExecutionPlan for SortExec { ))) } (false, None) => { - let mut input = input_exec.execute(partition, Arc::clone(&context))?; let mut sorter = ExternalSorter::new( partition, input.schema(), @@ -1364,6 +1344,7 @@ impl ExecutionPlan for SortExec { preserve_partitioning: self.preserve_partitioning, fetch: self.fetch, cache: self.cache.clone(), + dynamic_filter_source: self.dynamic_filter_source.clone(), }); Ok(Some(ExecutionPlanFilterPushdownResult::new( new_self, @@ -1375,6 +1356,36 @@ impl ExecutionPlan for SortExec { } } +#[derive(Debug, Clone)] +struct SortExecDynamicFilterSource { + filters: Arc>>>, +} + +impl SortExecDynamicFilterSource { + pub fn new() -> Self { + Self { + filters: Arc::new(RwLock::new(Vec::new())), + } + } + + pub fn add_filter(&self, filter: Arc) -> Result<()> { + let mut filters = self.filters.write().map_err(|_| DataFusionError::Internal( + format!("Failed to acquire write lock on topk filters for adding a new filter.", + )))?; + filters.push(filter); + Ok(()) + } +} + +impl DynamicFilterSource for SortExecDynamicFilterSource { + fn snapshot_current_filters(&self) -> Result>> { + let Ok(filters) = self.filters.read() else { + return internal_err!("Failed to acquire read lock on topk filters"); + }; + Ok(filters.clone()) + } +} + #[cfg(test)] mod tests { use std::collections::HashMap; From c8a133e3630b23fa28583fe439de52162a7cb6b6 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 00:55:44 -0500 Subject: [PATCH 06/27] wip on optimizer pass --- datafusion/datasource/src/source.rs | 4 +- .../physical-optimizer/src/filter_pushdown.rs | 137 ++++++++++ datafusion/physical-optimizer/src/lib.rs | 1 + .../physical-optimizer/src/optimizer.rs | 5 + .../physical-plan/src/coalesce_batches.rs | 22 +- .../physical-plan/src/execution_plan.rs | 33 ++- datafusion/physical-plan/src/filter.rs | 95 ++++--- .../physical-plan/src/repartition/mod.rs | 20 +- datafusion/physical-plan/src/sorts/mod.rs | 1 + datafusion/physical-plan/src/sorts/sort.rs | 78 ++---- .../physical-plan/src/sorts/sort_filters.rs | 239 ++++++++++++++++++ datafusion/physical-plan/src/topk/filter.rs | 114 --------- datafusion/physical-plan/src/topk/mod.rs | 172 ++----------- 13 files changed, 514 insertions(+), 407 deletions(-) create mode 100644 datafusion/physical-optimizer/src/filter_pushdown.rs create mode 100644 datafusion/physical-plan/src/sorts/sort_filters.rs delete mode 100644 datafusion/physical-plan/src/topk/filter.rs diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 4e8e6e361b7a..f5832d6f13e5 100644 --- 
a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -203,8 +203,8 @@ impl ExecutionPlan for DataSourceExec { self.data_source.try_swapping_with_projection(projection) } - fn push_down_filters( - self: Arc, + fn push_down_filters_from_parents( + &self, filters: &[&Arc], ) -> datafusion_common::Result> { if let Some(pushdown_result) = self.data_source.push_down_filters(filters)? { diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs new file mode 100644 index 000000000000..65c782248085 --- /dev/null +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion_common::{config::ConfigOptions, Result}; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::{ + execution_plan::{ExecutionPlanFilterPushdownResult, FilterPushdownSupport}, + ExecutionPlan, +}; + +use crate::PhysicalOptimizerRule; + +fn pushdown_filters( + node: &Arc, + parent_filters: &[Arc], +) -> Result> { + let node_filters = node.filters_for_pushdown()?; + let children = node.children(); + let mut new_children = Vec::with_capacity(children.len()); + let all_filters = parent_filters + .iter() + .chain(node_filters.iter()) + .cloned() + .collect::>(); + let mut filter_pushdown_result = + vec![FilterPushdownSupport::Exact; all_filters.len()]; + for child in children { + if child.supports_filter_pushdown() { + if let Some(result) = pushdown_filters(child, &all_filters)? { + new_children.push(result.inner); + for (all_filters_idx, support) in result.support.iter().enumerate() { + if !matches!(support, FilterPushdownSupport::Exact) { + filter_pushdown_result[all_filters_idx] = + FilterPushdownSupport::Inexact; + } + } + } else { + new_children.push(Arc::clone(child)); + // If the child does not support filter pushdown, mark all filters as inexact + for support in filter_pushdown_result.iter_mut() { + *support = FilterPushdownSupport::Inexact; + } + } + } else { + // Reset the filters we are pushing down. + if let Some(result) = pushdown_filters(child, &Vec::new())? { + new_children.push(result.inner); + } else { + new_children.push(Arc::clone(child)); + } + }; + } + + let mut result_node = Arc::clone(node); + + // Now update the node with the result of the pushdown of it's filters + let pushdown_result = filter_pushdown_result[parent_filters.len()..].to_vec(); + if let Some(new_node) = + Arc::clone(node).with_filter_pushdown_result(&pushdown_result)? 
+ { + result_node = new_node; + }; + + // And check if it can absorb the remaining filters + let remaining_filter_indexes = (0..parent_filters.len()) + .filter(|&i| !matches!(filter_pushdown_result[i], FilterPushdownSupport::Exact)) + .collect::>(); + if !remaining_filter_indexes.is_empty() { + let remaining_filters = remaining_filter_indexes + .iter() + .map(|&i| &parent_filters[i]) + .collect::>(); + if let Some(result) = node.push_down_filters_from_parents(&remaining_filters)? { + result_node = result.inner; + for (parent_filter_index, support) in + remaining_filter_indexes.iter().zip(result.support) + { + // If any of the remaining filters are not exact, mark them as inexact + if !matches!(support, FilterPushdownSupport::Exact) { + filter_pushdown_result[*parent_filter_index] = + FilterPushdownSupport::Inexact; + } + } + } + } + Ok(Some(ExecutionPlanFilterPushdownResult::new( + result_node, + filter_pushdown_result, + ))) +} + +#[derive(Debug)] +pub struct FilterPushdown {} + +impl FilterPushdown { + pub fn new() -> Self { + Self {} + } +} + +impl PhysicalOptimizerRule for FilterPushdown { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + if let Some(result) = pushdown_filters(&plan, &vec![])? { + Ok(result.inner) + } else { + Ok(plan) + } + } + + fn name(&self) -> &str { + "FilterPushdown" + } + + fn schema_check(&self) -> bool { + true // Filter pushdown does not change the schema of the plan + } +} diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 35503f3b0b5f..5a43d7118d63 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -29,6 +29,7 @@ pub mod coalesce_batches; pub mod combine_partial_final_agg; pub mod enforce_distribution; pub mod enforce_sorting; +pub mod filter_pushdown; pub mod join_selection; pub mod limit_pushdown; pub mod limited_distinct_aggregation; diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index bab31150e250..78d3e2ad8873 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -25,6 +25,7 @@ use crate::coalesce_batches::CoalesceBatches; use crate::combine_partial_final_agg::CombinePartialFinalAggregate; use crate::enforce_distribution::EnforceDistribution; use crate::enforce_sorting::EnforceSorting; +use crate::filter_pushdown::FilterPushdown; use crate::join_selection::JoinSelection; use crate::limit_pushdown::LimitPushdown; use crate::limited_distinct_aggregation::LimitedDistinctAggregation; @@ -121,6 +122,10 @@ impl PhysicalOptimizer { // into an `order by max(x) limit y`. In this case it will copy the limit value down // to the aggregation, allowing it to use only y number of accumulators. Arc::new(TopKAggregation::new()), + // The FilterPushdown rule tries to push down filters as far as it can. + // For example, it will push down filtering from a `FilterExec` to + // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. + Arc::new(FilterPushdown::new()), // The LimitPushdown rule tries to push limits down as far as possible, // replacing operators with fetching variants, or adding limits // past operators that support limit pushdown. 
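Because the new pass is registered as an ordinary `PhysicalOptimizerRule`, it can also be run in isolation against a physical plan, which is handy for exercising the behaviour described in the comment above. A rough sketch of such standalone use, assuming an already-built `plan: Arc<dyn ExecutionPlan>` and default options; the helper name `apply_filter_pushdown` is illustrative and not part of this patch:

    use std::sync::Arc;

    use datafusion_common::config::ConfigOptions;
    use datafusion_common::Result;
    use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
    use datafusion_physical_optimizer::PhysicalOptimizerRule;
    use datafusion_physical_plan::ExecutionPlan;

    fn apply_filter_pushdown(
        plan: Arc<dyn ExecutionPlan>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        // Recursively collects each node's `filters_for_pushdown()`, pushes the
        // combined set toward the leaves, and lets nodes rewrite themselves based
        // on whether their children reported Exact or Inexact support.
        FilterPushdown::new().optimize(plan, &ConfigOptions::default())
    }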
diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs
index e1161352bfbe..d21cdbf3a230 100644
--- a/datafusion/physical-plan/src/coalesce_batches.rs
+++ b/datafusion/physical-plan/src/coalesce_batches.rs
@@ -32,10 +32,9 @@ use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::Result;
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::PhysicalExpr;
 
 use crate::coalesce::{BatchCoalescer, CoalescerState};
-use crate::execution_plan::{CardinalityEffect, ExecutionPlanFilterPushdownResult};
+use crate::execution_plan::CardinalityEffect;
 use futures::ready;
 use futures::stream::{Stream, StreamExt};
 
@@ -214,23 +213,8 @@ impl ExecutionPlan for CoalesceBatchesExec {
         CardinalityEffect::Equal
     }
 
-    fn push_down_filters(
-        self: Arc<Self>,
-        filters: &[&Arc<dyn PhysicalExpr>],
-    ) -> Result<Option<ExecutionPlanFilterPushdownResult>> {
-        let input = Arc::clone(&self.input);
-        if let Some(result) = input.push_down_filters(filters)? {
-            let new_self = Arc::new(
-                CoalesceBatchesExec::new(result.inner, self.target_batch_size)
-                    .with_fetch(self.fetch),
-            );
-            Ok(Some(ExecutionPlanFilterPushdownResult::new(
-                new_self,
-                result.support,
-            )))
-        } else {
-            Ok(None)
-        }
+    fn supports_filter_pushdown(&self) -> bool {
+        true
     }
 }
 
diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs
index e7be57b42125..f093b8bce817 100644
--- a/datafusion/physical-plan/src/execution_plan.rs
+++ b/datafusion/physical-plan/src/execution_plan.rs
@@ -468,12 +468,43 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
         Ok(None)
     }
 
-    fn push_down_filters(
+    /// Returns a set of filters that this operator owns but would like to be pushed down.
+    /// For example, a `TopK` operator may produce dynamic filters that reference its current state,
+    /// while a `FilterExec` will just hand off the filters it has as is.
+    /// The default implementation returns an empty vector.
+    fn filters_for_pushdown(&self) -> Result<Vec<Arc<dyn PhysicalExpr>>> {
+        Ok(Vec::new())
+    }
+
+    /// After we've attempted to push down filters into this node's children
+    /// this will be called with the result for each filter that this node gave in `filters_for_pushdown`.
+    /// The node should update itself to possibly drop filters that were pushed down as `Exact`.
+    fn with_filter_pushdown_result(
         self: Arc<Self>,
+        _pushdown: &[FilterPushdownSupport],
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        Ok(None)
+    }
+
+    /// Push down the given filters into this `ExecutionPlan`.
+    /// This is called after `with_filter_pushdown_result`.
+    /// Operators can accept filters from their parents, either as Exact or Unsupported.
+    /// If the operator accepts a filter as Exact, it should return a new `ExecutionPlan` with the filter applied
+    /// and the parent that generated the filter might not apply it anymore.
+    fn push_down_filters_from_parents(
+        &self,
         _filters: &[&Arc<dyn PhysicalExpr>],
     ) -> Result<Option<ExecutionPlanFilterPushdownResult>> {
         Ok(None)
     }
+
+    /// Returns `true` if this `ExecutionPlan` allows filter pushdown to flow through it and `false` otherwise.
+    /// Nodes such as aggregations cannot have filters pushed down through them, so they return `false`.
+    /// On the other hand nodes such as repartitions can have filters pushed down through them, so they return `true`.
+    /// The default implementation returns `false`.
+ fn supports_filter_pushdown(&self) -> bool { + false + } } #[derive(Debug, Clone, Copy)] diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index d00ff5a4d9c9..a70289d82772 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -436,48 +436,67 @@ impl ExecutionPlan for FilterExec { try_embed_projection(projection, self) } - fn push_down_filters( + fn supports_filter_pushdown(&self) -> bool { + true + } + + fn filters_for_pushdown(&self) -> Result>> { + Ok(split_conjunction(self.predicate()) + .iter() + .map(|f| Arc::clone(f)) + .collect()) + } + + fn with_filter_pushdown_result( self: Arc, - filters: &[&Arc], - ) -> Result> { - let mut input = Arc::clone(&self.input); - let all_filters = filters + pushdown: &[FilterPushdownSupport], + ) -> Result>> { + // Only keep filters who's index maps to the pushdown result Unsupported + let new_filters = self + .filters_for_pushdown()? .iter() - .map(|f| *f) - .chain(split_conjunction(&self.predicate)) + .zip(pushdown.iter()) + .filter_map(|(f, p)| { + if matches!(p, FilterPushdownSupport::Exact) { + // Exact pushdown support means we keep this filter + Some(Arc::clone(f)) + } else { + None + } + }) .collect::>(); - let mut new_predicate = None; - if let Some(result) = Arc::clone(&input).push_down_filters(&all_filters)? { - // Any filters that our input didn't accept as Exact we apply ourselves - if !result.is_exact() { - new_predicate = Some(conjunction( - (0..all_filters.len()) - .zip(result.support) - .filter_map(|(i, s)| { - matches!(s, FilterPushdownSupport::Exact) - .then_some(Arc::clone(all_filters[i])) - }), - )); - } - input = result.inner; - }; - if let Some(new_predicate) = new_predicate { - // If we have a new predicate, create a new FilterExec - return FilterExec::try_new(new_predicate, input) - .and_then(|e| { - let selectivity = self.default_selectivity(); - e.with_default_selectivity(selectivity) - }) - .map(|e| { - Some(ExecutionPlanFilterPushdownResult { - inner: Arc::new(e) as Arc, - support: vec![FilterPushdownSupport::Exact; all_filters.len()], - }) - }); - } else { - // No new predicate was created, return None - Ok(None) + + if new_filters.is_empty() { + return Ok(Some(Arc::clone(self.input()))); } + + let predicate = conjunction(new_filters.into_iter()); + + let new = FilterExec::try_new(predicate, Arc::clone(self.input())) + .and_then(|e| { + let selectivity = e.default_selectivity(); + e.with_default_selectivity(selectivity) + }) + .and_then(|e| e.with_projection(self.projection().cloned())) + .map(|e| Arc::new(e) as _)?; + Ok(Some(new)) + } + + fn push_down_filters_from_parents( + &self, + filters: &[&Arc], + ) -> Result> { + let new_predicates = conjunction( + std::iter::once(Arc::clone(&self.predicate)) + .chain(filters.iter().map(|f| Arc::clone(f))), + ); + Ok(Some(ExecutionPlanFilterPushdownResult { + inner: Arc::new(Self { + predicate: new_predicates, + ..self.clone() + }), + support: vec![FilterPushdownSupport::Exact; filters.len()], + })) } } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 84c0d504d810..54cd2bd538e4 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -29,9 +29,7 @@ use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use super::{ DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream, }; -use crate::execution_plan::{ - 
CardinalityEffect, ExecutionPlanFilterPushdownResult, FilterPushdownResult, -}; +use crate::execution_plan::CardinalityEffect; use crate::hash_utils::create_hashes; use crate::metrics::BaselineMetrics; use crate::projection::{all_columns, make_with_child, update_expr, ProjectionExec}; @@ -726,20 +724,8 @@ impl ExecutionPlan for RepartitionExec { )?))) } - fn push_down_filters( - self: Arc, - filters: &[&Arc], - ) -> Result> { - let input = Arc::clone(&self.input); - if let Some(result) = input.push_down_filters(filters)? { - let new_self = Arc::new(RepartitionExec::try_new( - result.inner, - self.partitioning().clone(), - )?); - Ok(Some(FilterPushdownResult::new(new_self, result.support))) - } else { - Ok(None) - } + fn supports_filter_pushdown(&self) -> bool { + true } } diff --git a/datafusion/physical-plan/src/sorts/mod.rs b/datafusion/physical-plan/src/sorts/mod.rs index c7ffae4061c0..29eb3a2647f4 100644 --- a/datafusion/physical-plan/src/sorts/mod.rs +++ b/datafusion/physical-plan/src/sorts/mod.rs @@ -22,6 +22,7 @@ mod cursor; mod merge; pub mod partial_sort; pub mod sort; +pub mod sort_filters; pub mod sort_preserving_merge; mod stream; pub mod streaming_merge; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 4e80f48684fc..e559a479e8d3 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -22,13 +22,10 @@ use std::any::Any; use std::fmt; use std::fmt::{Debug, Formatter}; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use crate::common::spawn_buffered; -use crate::dynamic_filters::DynamicFilterSource; -use crate::execution_plan::{ - Boundedness, CardinalityEffect, EmissionType, ExecutionPlanFilterPushdownResult, -}; +use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType}; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; use crate::metrics::{ @@ -54,7 +51,7 @@ use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn use arrow::datatypes::{DataType, SchemaRef}; use arrow::row::{RowConverter, Rows, SortField}; use datafusion_common::{ - exec_datafusion_err, internal_datafusion_err, internal_err, DataFusionError, Result + exec_datafusion_err, internal_datafusion_err, internal_err, Result, }; use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -66,6 +63,8 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::{StreamExt, TryStreamExt}; use log::{debug, trace}; +use super::sort_filters::SortDynamicFilterSource; + struct ExternalSorterMetrics { /// metrics baseline: BaselineMetrics, @@ -972,7 +971,7 @@ pub struct SortExec { /// Cache holding plan properties like equivalences, output partitioning etc. 
cache: PlanProperties, /// Dynamic filter sources - dynamic_filter_source: SortExecDynamicFilterSource, + dynamic_filter_source: Arc, } impl SortExec { @@ -981,6 +980,7 @@ impl SortExec { pub fn new(expr: LexOrdering, input: Arc) -> Self { let preserve_partitioning = false; let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning); + let dynamic_filter_source = Arc::new(SortDynamicFilterSource::new(&expr)); Self { expr, input, @@ -988,7 +988,7 @@ impl SortExec { preserve_partitioning, fetch: None, cache, - dynamic_filter_source: SortExecDynamicFilterSource::new(), + dynamic_filter_source, } } @@ -1233,13 +1233,16 @@ impl ExecutionPlan for SortExec { context.runtime_env(), &self.metrics_set, )?; - self.dynamic_filter_source.add_filter(topk.dynamic_filter_source())?; + let dynamic_filter_source = Arc::clone(&self.dynamic_filter_source); Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), futures::stream::once(async move { while let Some(batch) = input.next().await { let batch = batch?; topk.insert_batch(batch)?; + if let Some(values) = topk.get_threshold_values()? { + dynamic_filter_source.update_values(&values); + } } topk.emit() }) @@ -1328,61 +1331,12 @@ impl ExecutionPlan for SortExec { ))) } - // Pass though filter pushdown. - // This often happens in partitioned plans with a TopK because we end up with 1 TopK per partition + a final TopK at the end. - // Implementing this pass-through allows global/top/final TopK to push down filters to the partitions. - fn push_down_filters( - self: Arc, - filters: &[&Arc], - ) -> Result> { - let input = Arc::clone(&self.input); - if let Some(result) = input.push_down_filters(filters)? { - let new_self = Arc::new(SortExec { - input: result.inner, - expr: self.expr.clone(), - metrics_set: self.metrics_set.clone(), - preserve_partitioning: self.preserve_partitioning, - fetch: self.fetch, - cache: self.cache.clone(), - dynamic_filter_source: self.dynamic_filter_source.clone(), - }); - Ok(Some(ExecutionPlanFilterPushdownResult::new( - new_self, - result.support, - ))) - } else { - Ok(None) - } + fn filters_for_pushdown(&self) -> Result>> { + Ok(vec![self.dynamic_filter_source.as_physical_expr()]) } -} - -#[derive(Debug, Clone)] -struct SortExecDynamicFilterSource { - filters: Arc>>>, -} -impl SortExecDynamicFilterSource { - pub fn new() -> Self { - Self { - filters: Arc::new(RwLock::new(Vec::new())), - } - } - - pub fn add_filter(&self, filter: Arc) -> Result<()> { - let mut filters = self.filters.write().map_err(|_| DataFusionError::Internal( - format!("Failed to acquire write lock on topk filters for adding a new filter.", - )))?; - filters.push(filter); - Ok(()) - } -} - -impl DynamicFilterSource for SortExecDynamicFilterSource { - fn snapshot_current_filters(&self) -> Result>> { - let Ok(filters) = self.filters.read() else { - return internal_err!("Failed to acquire read lock on topk filters"); - }; - Ok(filters.clone()) + fn supports_filter_pushdown(&self) -> bool { + true } } diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs new file mode 100644 index 000000000000..7b7b968cb746 --- /dev/null +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::{Arc, RwLock};
+
+use arrow_schema::SortOptions;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::Operator;
+use datafusion_physical_expr::{
+    expressions::{is_not_null, is_null, lit, BinaryExpr},
+    LexOrdering, PhysicalExpr,
+};
+
+use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource};
+
+/// Holds threshold value and sort order information for a column
+#[derive(Debug, Clone)]
+struct ColumnThreshold {
+    /// The current threshold value
+    pub value: Arc<RwLock<Option<ScalarValue>>>,
+    /// The column expression
+    pub expr: Arc<dyn PhysicalExpr>,
+    /// Sort options
+    pub sort_options: SortOptions,
+}
+
+/// Pushdown of dynamic filters from sort + limit operators (aka `TopK`) is used to speed up queries
+/// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the
+/// threshold values for the sort columns to the data source.
+/// That is, the TopK operator will keep track of the top 10 values for the sort
+/// and before a new file is opened its statistics will be checked against the
+/// threshold values to determine if the file can be skipped and predicate pushdown
+/// will use these to skip rows during the scan.
+///
+/// For example, imagine this data gets created if multiple sources with clock skews,
+/// network delays, etc. are writing data and you don't do anything fancy to guarantee
+/// perfect sorting by `timestamp` (i.e. you naively write out the data to Parquet, maybe do some compaction, etc.).
+/// The point is that 99% of yesterday's files have a `timestamp` smaller than 99% of today's files
+/// but there may be a couple seconds of overlap between files.
+/// To be concrete, let's say this is our data:
+//
+// | file | min | max |
+// |------|-----|-----|
+// | 1    | 1   | 10  |
+// | 2    | 9   | 19  |
+// | 3    | 20  | 31  |
+// | 4    | 30  | 35  |
+//
+// Ideally a [`TableProvider`] is able to use file level stats or other methods to roughly order the files
+// within each partition / file group such that we start with the newest / largest `timestamp`s.
+// If this is not possible the optimization still works but is less efficient and harder to visualize,
+// so for this example let's assume that we process 1 file at a time and we started with file 4.
+// After processing file 4 let's say we have 10 values in our TopK heap, the smallest of which is 30.
+// The TopK operator will then push down the filter `timestamp > 30` down the tree of [`ExecutionPlan`]s
+// and if the data source supports dynamic filter pushdown it will accept a reference to this [`DynamicPhysicalExprSource`]
+// and when it goes to open file 3 it will ask the [`DynamicPhysicalExprSource`] for the current filters.
+// Since file 3 may contain values larger than 30 we cannot skip it entirely,
+// but scanning it may still be more efficient due to page pruning and other optimizations.
+// Once we get to file 2 however we can skip it entirely because we know that all values in file 2 are smaller than 30.
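In code, the state just described could be set up roughly as follows. This is only a sketch: it assumes a schema with a non-null Int64 `timestamp` column, a single descending sort key with nulls last, and that `SortDynamicFilterSource` is reachable from outside the crate; the helper name `topk_threshold_sketch` is illustrative.

    use arrow_schema::{Schema, SortOptions};
    use datafusion_common::{Result, ScalarValue};
    use datafusion_physical_expr::expressions::col;
    use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr};
    use datafusion_physical_plan::sorts::sort_filters::SortDynamicFilterSource;

    fn topk_threshold_sketch(schema: &Schema) -> Result<()> {
        // ORDER BY timestamp DESC LIMIT 10: one sort key, descending, nulls last.
        let ordering = LexOrdering::new(vec![PhysicalSortExpr {
            expr: col("timestamp", schema)?,
            options: SortOptions { descending: true, nulls_first: false },
        }]);
        let source = SortDynamicFilterSource::new(&ordering);

        // After file 4, the smallest of the top 10 timestamps is 30.
        source.update_values(&[ScalarValue::Int64(Some(30))]);

        // A snapshot of the dynamic filter is now roughly `timestamp > 30`.
        // A scan can compare that against each file's max statistic: files 1 and 2
        // (max 10 and 19) can be pruned, while file 3 (max 31) must still be read.
        let _dynamic_expr = source.as_physical_expr();
        Ok(())
    }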
+// The same goes for file 1. +// So this optimization just saved us 50% of the work of scanning the data. +#[derive(Debug, Clone)] +pub struct SortDynamicFilterSource { + thresholds: Vec, +} + +impl SortDynamicFilterSource { + pub fn new(ordering: &LexOrdering) -> Self { + let thresholds = ordering + .iter() + .map(|sort_expr| ColumnThreshold { + value: Arc::new(RwLock::new(None)), + expr: Arc::clone(&sort_expr.expr), + sort_options: sort_expr.options, + }) + .collect(); + + Self { thresholds } + } + + pub fn update_values(&self, new_values: &[ScalarValue]) { + if new_values.len() != self.thresholds.len() { + panic!("New values length does not match the number of thresholds"); + } + for (i, new_value) in new_values.iter().enumerate() { + let threshold = &self.thresholds[i]; + let descending = threshold.sort_options.descending; + let nulls_first = threshold.sort_options.nulls_first; + let mut current_value = threshold + .value + .write() + .expect("Failed to acquire read lock on threshold"); + // Check if the new value is more or less selective than the current value given the sorting + if let Some(ref mut current_value) = *current_value { + let new_value_is_greater = new_value > ¤t_value; + let new_value_is_null = new_value.is_null(); + let current_value_is_null = current_value.is_null(); + if nulls_first && new_value_is_null && !current_value_is_null { + *current_value = new_value.clone(); + } else { + if (descending && new_value_is_greater) + || (!descending && !new_value_is_greater) + { + *current_value = new_value.clone(); + } + } + } else { + *current_value = Some(new_value.clone()); + } + } + } + + pub fn as_physical_expr(&self) -> Arc { + let children = self + .thresholds + .iter() + .map(|threshold| Arc::clone(&threshold.expr)) + .collect::>(); + Arc::new(DynamicFilterPhysicalExpr::new( + children, + Arc::new(self.clone()) as Arc, + )) + } +} + +impl DynamicFilterSource for SortDynamicFilterSource { + fn snapshot_current_filters(&self) -> Result>> { + // Create filter expressions for each threshold + let mut filters: Vec> = + Vec::with_capacity(self.thresholds.len()); + + let mut prev_sort_expr: Option> = None; + for threshold in &self.thresholds { + let value = threshold + .value + .read() + .expect("Failed to acquire read lock on threshold") + .clone(); + + let Some(value) = value else { + // If the value is None, we cannot create a filter for this threshold + // This means we skip this column for filtering + continue; + }; + + // Create the appropriate operator based on sort order + let op = if threshold.sort_options.descending { + // For descending sort, we want col > threshold (exclude smaller values) + Operator::Gt + } else { + // For ascending sort, we want col < threshold (exclude larger values) + Operator::Lt + }; + + let value_null = value.is_null(); + + let comparison = Arc::new(BinaryExpr::new( + Arc::clone(&threshold.expr), + op, + lit(value.clone()), + )); + + let comparison_with_null = + match (threshold.sort_options.nulls_first, value_null) { + // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) + (true, true) => lit(false), + (true, false) => Arc::new(BinaryExpr::new( + is_null(Arc::clone(&threshold.expr))?, + Operator::Or, + comparison, + )), + // For nulls last, transform to (threshold.value is null and threshold.expr is not null) + // or (threshold.value is not null and comparison) + (false, true) => is_not_null(Arc::clone(&threshold.expr))?, + (false, false) => comparison, + }; + + let mut eq_expr = 
Arc::new(BinaryExpr::new( + Arc::clone(&threshold.expr), + Operator::Eq, + lit(value.clone()), + )); + + if value_null { + eq_expr = Arc::new(BinaryExpr::new( + is_null(Arc::clone(&threshold.expr))?, + Operator::Or, + eq_expr, + )); + } + + // For a query like order by a, b, the filter for column `b` is only applied if + // the condition a = threshold.value (considering null equality) is met. + // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field, + // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields. + match prev_sort_expr.take() { + None => { + prev_sort_expr = Some(eq_expr); + filters.push(comparison_with_null); + } + Some(p) => { + filters.push(Arc::new(BinaryExpr::new( + Arc::clone(&p), + Operator::And, + comparison_with_null, + ))); + + prev_sort_expr = + Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr))); + } + } + } + + let dynamic_predicate = filters + .into_iter() + .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b))); + + if let Some(predicate) = dynamic_predicate { + if !predicate.eq(&lit(true)) { + return Ok(vec![predicate]); + } + } + Ok(vec![]) + } +} diff --git a/datafusion/physical-plan/src/topk/filter.rs b/datafusion/physical-plan/src/topk/filter.rs deleted file mode 100644 index 4b32549f40f1..000000000000 --- a/datafusion/physical-plan/src/topk/filter.rs +++ /dev/null @@ -1,114 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::{Arc, RwLock}; - -use datafusion_common::DataFusionError; -use datafusion_common::Result; -use datafusion_physical_expr::expressions::lit; -use datafusion_physical_expr::PhysicalExpr; - -use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; - -/// Pushdown of dynamic fitlers from TopK operators is used to speed up queries -/// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the -/// threshold values for the sort columns to the data source. -/// That is, the TopK operator will keep track of the top 10 values for the sort -/// and before a new file is opened it's statitics will be checked against the -/// threshold values to determine if the file can be skipped and predicate pushdown -/// will use these to skip rows during the scan. -/// -/// For example, imagine this data gets created if multiple sources with clock skews, -/// network delays, etc. are writing data and you don't do anything fancy to guarantee -/// perfect sorting by `timestamp` (i.e. you naively write out the data to Parquet, maybe do some compaction, etc.). -/// The point is that 99% of yesterday's files have a `timestamp` smaller than 99% of today's files -/// but there may be a couple seconds of overlap between files. 
-/// To be concrete, let's say this is our data: -// -// | file | min | max | -// |------|-----|-----| -// | 1 | 1 | 10 | -// | 2 | 9 | 19 | -// | 3 | 20 | 31 | -// | 4 | 30 | 35 | -// -// Ideally a [`TableProvider`] is able to use file level stats or other methods to roughly order the files -// within each partition / file group such that we start with the newest / largest `timestamp`s. -// If this is not possible the optimization still works but is less efficient and harder to visualize, -// so for this example let's assume that we process 1 file at a time and we started with file 4. -// After processing file 4 let's say we have 10 values in our TopK heap, the smallest of which is 30. -// The TopK operator will then push down the filter `timestamp < 30` down the tree of [`ExecutionPlan`]s -// and if the data source supports dynamic filter pushdown it will accept a reference to this [`DynamicPhysicalExprSource`] -// and when it goes to open file 3 it will ask the [`DynamicPhysicalExprSource`] for the current filters. -// Since file 3 may contain values larger than 30 we cannot skip it entirely, -// but scanning it may still be more efficient due to page pruning and other optimizations. -// Once we get to file 2 however we can skip it entirely because we know that all values in file 2 are smaller than 30. -// The same goes for file 1. -// So this optimization just saved us 50% of the work of scanning the data. -#[derive(Debug, Clone)] -pub struct TopKDynamicFilterSource { - /// The children of the dynamic filters produced by this TopK. - /// In particular, this is the columns that are being sorted, derived from the sorting expressions. - children: Vec>, - /// The current filters derived from this TopK - predicate: Arc>>, -} - -impl TopKDynamicFilterSource { - pub fn new(children: Vec>) -> Self { - Self { - children, - predicate: Arc::new(RwLock::new(lit(true))), - } - } - - pub fn update_filters(&self, predicate: Arc) -> Result<()> { - let mut current_predicate = self.predicate.write().map_err(|_| { - DataFusionError::Internal( - "Failed to acquire write lock on TopKDynamicPhysicalExprSource" - .to_string(), - ) - })?; - *current_predicate = predicate; - Ok(()) - } -} - -impl TopKDynamicFilterSource { - pub fn as_dynamic_physical_expr(&self) -> Arc { - let new = self.clone(); - // Transform the sort expresions into referenced columns - let children = self.children.clone(); - Arc::new(DynamicFilterPhysicalExpr::new(children, Arc::new(new))) - } -} - -impl DynamicFilterSource for TopKDynamicFilterSource { - fn snapshot_current_filters(&self) -> Result>> { - let predicate = self - .predicate - .read() - .map_err(|_| { - DataFusionError::Internal( - "Failed to acquire read lock on TopKDynamicPhysicalExprSource" - .to_string(), - ) - })? - .clone(); - Ok(vec![predicate]) - } -} diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index ce9f00e2fa9b..89bbfd2250e1 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -17,8 +17,6 @@ //! 
TopK: Combination of Sort / LIMIT -mod filter; - use std::mem::size_of; use std::sync::Arc; use std::{cmp::Ordering, collections::BinaryHeap}; @@ -29,19 +27,15 @@ use arrow::{ compute::interleave, row::{RowConverter, Rows, SortField}, }; -use arrow_schema::SortOptions; -use datafusion_common::Result; use datafusion_common::{internal_err, HashMap}; +use datafusion_common::{Result, ScalarValue}; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, runtime_env::RuntimeEnv, }; use datafusion_expr::ColumnarValue; -use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::{is_not_null, is_null, lit, BinaryExpr}; -use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; +use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use filter::TopKDynamicFilterSource; use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder}; use crate::spill::get_record_batch_memory_size; @@ -99,8 +93,6 @@ pub struct TopK { scratch_rows: Rows, /// stores the top k values and their sort key values, in order heap: TopKHeap, - /// stores the current filters derived from this TopK that can be pushed down - filters: Option>, } impl std::fmt::Debug for TopK { @@ -158,26 +150,9 @@ impl TopK { row_converter, scratch_rows, heap: TopKHeap::new(k, batch_size, schema), - filters: None, }) } - pub(crate) fn dynamic_filter_source(&mut self) -> Arc { - match self.filters { - Some(ref filters) => filters.as_dynamic_physical_expr(), - None => { - let children = self - .expr - .iter() - .map(|e| Arc::clone(&e.expr)) - .collect::>(); - let filters = Arc::new(TopKDynamicFilterSource::new(children)); - self.filters = Some(Arc::clone(&filters)); - filters.as_dynamic_physical_expr() - } - } - } - /// Insert `batch`, remembering if any of its values are among /// the top k seen so far. pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { @@ -202,7 +177,6 @@ impl TopK { // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`) // this avoids some work and also might be better vectorizable. let mut batch_entry = self.heap.register_batch(batch); - let mut need_to_update_dynamic_filters = false; for (index, row) in rows.iter().enumerate() { match self.heap.max() { // heap has k items, and the new row is greater than the @@ -212,23 +186,11 @@ impl TopK { None | Some(_) => { self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); - need_to_update_dynamic_filters = true; } } } self.heap.insert_batch_entry(batch_entry); - if need_to_update_dynamic_filters { - if let Some(filters) = self.filters.as_ref() { - if let Some(thresholds) = self.heap.get_threshold_values(&self.expr)? { - if let Some(predicate) = Self::calculate_dynamic_filters(thresholds)? 
- { - filters.update_filters(predicate)?; - } - } - } - } - // conserve memory self.heap.maybe_compact()?; @@ -237,88 +199,8 @@ impl TopK { Ok(()) } - fn calculate_dynamic_filters( - thresholds: Vec, - ) -> Result>> { - // Create filter expressions for each threshold - let mut filters: Vec> = - Vec::with_capacity(thresholds.len()); - - let mut prev_sort_expr: Option> = None; - for threshold in thresholds { - // Create the appropriate operator based on sort order - let op = if threshold.sort_options.descending { - // For descending sort, we want col > threshold (exclude smaller values) - Operator::Gt - } else { - // For ascending sort, we want col < threshold (exclude larger values) - Operator::Lt - }; - - let value_null = threshold.value.is_null(); - - let comparison = Arc::new(BinaryExpr::new( - Arc::clone(&threshold.expr), - op, - lit(threshold.value.clone()), - )); - - let comparison_with_null = - match (threshold.sort_options.nulls_first, value_null) { - // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) - (true, true) => lit(false), - (true, false) => Arc::new(BinaryExpr::new( - is_null(Arc::clone(&threshold.expr))?, - Operator::Or, - comparison, - )), - // For nulls last, transform to (threshold.value is null and threshold.expr is not null) - // or (threshold.value is not null and comparison) - (false, true) => is_not_null(Arc::clone(&threshold.expr))?, - (false, false) => comparison, - }; - - let mut eq_expr = Arc::new(BinaryExpr::new( - Arc::clone(&threshold.expr), - Operator::Eq, - lit(threshold.value.clone()), - )); - - if value_null { - eq_expr = Arc::new(BinaryExpr::new( - is_null(Arc::clone(&threshold.expr))?, - Operator::Or, - eq_expr, - )); - } - - // For a query like order by a, b, the filter for column `b` is only applied if - // the condition a = threshold.value (considering null equality) is met. - // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field, - // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields. 
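Concretely, for a query like `ORDER BY a DESC, b DESC LIMIT k` with current non-null, nulls-last thresholds `a0` and `b0`, this threshold loop emits one disjunct per sort key and the disjuncts are later OR-ed together, so the resulting predicate has roughly this shape (an illustrative rendering, not the exact expression `Display` output):

    a > a0 OR (a = a0 AND b > b0)

That is, a row can only displace an entry in the heap if it beats the current threshold on `a`, or ties on `a` and beats it on `b`; for an ascending sort key the comparison flips from `>` to `<`.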
- match prev_sort_expr.take() { - None => { - prev_sort_expr = Some(eq_expr); - filters.push(comparison_with_null); - } - Some(p) => { - filters.push(Arc::new(BinaryExpr::new( - Arc::clone(&p), - Operator::And, - comparison_with_null, - ))); - - prev_sort_expr = - Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr))); - } - } - } - - let dynamic_predicate = filters - .into_iter() - .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b))); - - Ok(dynamic_predicate) + pub fn get_threshold_values(&self) -> Result>> { + self.heap.get_threshold_values(&self.expr) } /// Returns the top k results broken into `batch_size` [`RecordBatch`]es, consuming the heap @@ -332,7 +214,6 @@ impl TopK { row_converter: _, scratch_rows: _, mut heap, - filters: _, } = self; let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop @@ -406,16 +287,6 @@ struct TopKHeap { owned_bytes: usize, } -/// Holds threshold value and sort order information for a column -struct ColumnThreshold { - /// The column expression - pub expr: Arc, - /// The threshold value - pub value: datafusion_common::ScalarValue, - /// Sort options - pub sort_options: SortOptions, -} - impl TopKHeap { pub fn new(k: usize, batch_size: usize, schema: SchemaRef) -> Self { assert!(k > 0); @@ -434,7 +305,7 @@ impl TopKHeap { pub fn get_threshold_values( &self, sort_exprs: &[PhysicalSortExpr], - ) -> Result>> { + ) -> Result>> { // If the heap doesn't have k elements yet, we can't create thresholds let max_row = match self.max() { Some(row) => row, @@ -448,7 +319,7 @@ impl TopKHeap { }; // Extract threshold values for each sort expression - let mut thresholds = Vec::with_capacity(sort_exprs.len()); + let mut scalar_values = Vec::with_capacity(sort_exprs.len()); for sort_expr in sort_exprs { // Extract the value for this column from the max row let expr = Arc::clone(&sort_expr.expr); @@ -459,21 +330,17 @@ impl TopKHeap { ColumnarValue::Scalar(scalar) => scalar, ColumnarValue::Array(array) if array.len() == 1 => { // Extract the first (and only) value from the array - datafusion_common::ScalarValue::try_from_array(&array, 0)? + ScalarValue::try_from_array(&array, 0)? 
} array => { return internal_err!("Expected a scalar value, got {:?}", array) } }; - thresholds.push(ColumnThreshold { - expr, - value: scalar, - sort_options: sort_expr.options, - }); + scalar_values.push(scalar); } - Ok(Some(thresholds)) + Ok(Some(scalar_values)) } /// Register a [`RecordBatch`] with the heap, returning the @@ -841,7 +708,6 @@ impl RecordBatchStore { #[cfg(test)] mod tests { use super::*; - use crate::expressions::col; use arrow::array::{Float64Array, Int32Array, RecordBatch}; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; @@ -911,8 +777,8 @@ mod tests { .unwrap(); // Initially there should be no filters (empty heap) - let filter = topk.dynamic_filter_source().snapshot().unwrap().unwrap(); - assert!(filter.eq(&lit(true)), "{filter:?}"); + let filter = topk.get_threshold_values().unwrap(); + assert_eq!(filter, None, "Expected no filters when heap is empty"); // Insert some data to fill the heap let col1 = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); @@ -928,14 +794,12 @@ mod tests { topk.insert_batch(batch).unwrap(); // Now there should be a filter - let filter = topk.dynamic_filter_source().snapshot().unwrap().unwrap(); - - // We expect a filter for col2 > 6.0 (since we're doing descending sort and have 5 values) - let expected = Arc::new(BinaryExpr::new( - col("col2", &schema).unwrap(), - Operator::Gt, - lit(6.0), - )) as Arc; - assert!(filter.eq(&expected), "{filter:?}"); + let filter = topk.get_threshold_values().unwrap().unwrap(); + assert_eq!(filter.len(), 1, "Expected one filter after inserting data"); + assert_eq!( + filter[0], + ScalarValue::Float64(Some(6.0)), + "Expected filter value to be 6.0" + ); } } From c4a95683c352a71b9f095296b143383dd210e14e Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 01:26:11 -0500 Subject: [PATCH 07/27] wip on optimizer pass --- datafusion/datasource/src/source.rs | 4 +++ .../physical-optimizer/src/filter_pushdown.rs | 28 ++++++++++++++----- datafusion/physical-plan/src/filter.rs | 2 +- .../physical-plan/src/sorts/sort_filters.rs | 13 ++++----- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index f5832d6f13e5..8ddda5169e18 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -203,6 +203,10 @@ impl ExecutionPlan for DataSourceExec { self.data_source.try_swapping_with_projection(projection) } + fn supports_filter_pushdown(&self) -> bool { + true + } + fn push_down_filters_from_parents( &self, filters: &[&Arc], diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 65c782248085..860de6859a18 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -20,8 +20,7 @@ use std::sync::Arc; use datafusion_common::{config::ConfigOptions, Result}; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::{ - execution_plan::{ExecutionPlanFilterPushdownResult, FilterPushdownSupport}, - ExecutionPlan, + execution_plan::{ExecutionPlanFilterPushdownResult, FilterPushdownSupport}, with_new_children_if_necessary, ExecutionPlan }; use crate::PhysicalOptimizerRule; @@ -38,8 +37,11 @@ fn pushdown_filters( .chain(node_filters.iter()) .cloned() .collect::>(); - let mut filter_pushdown_result = - vec![FilterPushdownSupport::Exact; all_filters.len()]; + let 
mut filter_pushdown_result = if children.is_empty() { + vec![FilterPushdownSupport::Inexact; all_filters.len()] + } else { + vec![FilterPushdownSupport::Exact; all_filters.len()] + }; for child in children { if child.supports_filter_pushdown() { if let Some(result) = pushdown_filters(child, &all_filters)? { @@ -67,7 +69,7 @@ fn pushdown_filters( }; } - let mut result_node = Arc::clone(node); + let mut result_node = with_new_children_if_necessary(Arc::clone(node), new_children)?; // Now update the node with the result of the pushdown of it's filters let pushdown_result = filter_pushdown_result[parent_filters.len()..].to_vec(); @@ -81,11 +83,17 @@ fn pushdown_filters( let remaining_filter_indexes = (0..parent_filters.len()) .filter(|&i| !matches!(filter_pushdown_result[i], FilterPushdownSupport::Exact)) .collect::>(); + println!("Remaining filter indexes: {:?}", remaining_filter_indexes); if !remaining_filter_indexes.is_empty() { let remaining_filters = remaining_filter_indexes .iter() .map(|&i| &parent_filters[i]) .collect::>(); + let remaining_filters_dbg = format!( + "Remaining filters being pushed down into {:?} {:?}", + remaining_filters, node + ); + println!("{}", remaining_filters_dbg); if let Some(result) = node.push_down_filters_from_parents(&remaining_filters)? { result_node = result.inner; for (parent_filter_index, support) in @@ -101,13 +109,19 @@ fn pushdown_filters( } Ok(Some(ExecutionPlanFilterPushdownResult::new( result_node, - filter_pushdown_result, + filter_pushdown_result[..parent_filters.len()].to_vec(), // only return the support for the original parent filters ))) } #[derive(Debug)] pub struct FilterPushdown {} +impl Default for FilterPushdown { + fn default() -> Self { + Self::new() + } +} + impl FilterPushdown { pub fn new() -> Self { Self {} @@ -120,7 +134,7 @@ impl PhysicalOptimizerRule for FilterPushdown { plan: Arc, _config: &ConfigOptions, ) -> Result> { - if let Some(result) = pushdown_filters(&plan, &vec![])? { + if let Some(result) = pushdown_filters(&plan, &[])? 
{ Ok(result.inner) } else { Ok(plan) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index a70289d82772..42b75a41ebcb 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -470,7 +470,7 @@ impl ExecutionPlan for FilterExec { return Ok(Some(Arc::clone(self.input()))); } - let predicate = conjunction(new_filters.into_iter()); + let predicate = conjunction(new_filters); let new = FilterExec::try_new(predicate, Arc::clone(self.input())) .and_then(|e| { diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index 7b7b968cb746..f09ddfd6980f 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -106,17 +106,14 @@ impl SortDynamicFilterSource { .expect("Failed to acquire read lock on threshold"); // Check if the new value is more or less selective than the current value given the sorting if let Some(ref mut current_value) = *current_value { - let new_value_is_greater = new_value > ¤t_value; + let new_value_is_greater = new_value > current_value; let new_value_is_null = new_value.is_null(); let current_value_is_null = current_value.is_null(); - if nulls_first && new_value_is_null && !current_value_is_null { + if (nulls_first && new_value_is_null && !current_value_is_null) + || (descending && new_value_is_greater) + || (!descending && !new_value_is_greater) + { *current_value = new_value.clone(); - } else { - if (descending && new_value_is_greater) - || (!descending && !new_value_is_greater) - { - *current_value = new_value.clone(); - } } } else { *current_value = Some(new_value.clone()); From c15b48f00e06233d817455d5b6b09f67cba085be Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 11:38:30 -0500 Subject: [PATCH 08/27] working implementation --- datafusion/datasource-parquet/src/source.rs | 4 ++ .../physical-optimizer/src/filter_pushdown.rs | 11 ++- .../physical-plan/src/dynamic_filters.rs | 2 +- datafusion/physical-plan/src/filter.rs | 24 ++++--- datafusion/physical-plan/src/sorts/sort.rs | 18 +++-- .../physical-plan/src/sorts/sort_filters.rs | 68 ++++++++++++------- 6 files changed, 78 insertions(+), 49 deletions(-) diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 9a8dd8f03b7f..48b88a91876f 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -582,6 +582,10 @@ impl FileSource for ParquetSource { filters: &[&Arc], ) -> datafusion_common::Result> { let mut conf = self.clone(); + if !self.pushdown_filters() { + // If pushdown filters is not enabled, return early + return Ok(None); + } conf.predicate = match self.predicate.as_ref() { Some(existing_predicate) => { // Combine existing predicate with new filters diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 860de6859a18..7b33b82721bf 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -20,7 +20,8 @@ use std::sync::Arc; use datafusion_common::{config::ConfigOptions, Result}; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::{ - execution_plan::{ExecutionPlanFilterPushdownResult, FilterPushdownSupport}, with_new_children_if_necessary, ExecutionPlan + 
execution_plan::{ExecutionPlanFilterPushdownResult, FilterPushdownSupport}, + with_new_children_if_necessary, ExecutionPlan, }; use crate::PhysicalOptimizerRule; @@ -83,17 +84,11 @@ fn pushdown_filters( let remaining_filter_indexes = (0..parent_filters.len()) .filter(|&i| !matches!(filter_pushdown_result[i], FilterPushdownSupport::Exact)) .collect::>(); - println!("Remaining filter indexes: {:?}", remaining_filter_indexes); if !remaining_filter_indexes.is_empty() { let remaining_filters = remaining_filter_indexes .iter() .map(|&i| &parent_filters[i]) .collect::>(); - let remaining_filters_dbg = format!( - "Remaining filters being pushed down into {:?} {:?}", - remaining_filters, node - ); - println!("{}", remaining_filters_dbg); if let Some(result) = node.push_down_filters_from_parents(&remaining_filters)? { result_node = result.inner; for (parent_filter_index, support) in @@ -134,7 +129,9 @@ impl PhysicalOptimizerRule for FilterPushdown { plan: Arc, _config: &ConfigOptions, ) -> Result> { + println!("plan before: {:?}", plan); if let Some(result) = pushdown_filters(&plan, &[])? { + println!("plan after filter pushdown: {:?}", result.inner); Ok(result.inner) } else { Ok(plan) diff --git a/datafusion/physical-plan/src/dynamic_filters.rs b/datafusion/physical-plan/src/dynamic_filters.rs index 4bfad498788e..3bfaf4a8d34c 100644 --- a/datafusion/physical-plan/src/dynamic_filters.rs +++ b/datafusion/physical-plan/src/dynamic_filters.rs @@ -55,7 +55,7 @@ pub struct DynamicFilterPhysicalExpr { /// as the children of the dynamic filter source. remapped_children: Option>>, /// The source of dynamic filters. - inner: Arc, + pub inner: Arc, // TODO: remove pub /// For testing purposes track the data type and nullability to make sure they don't change. /// If they do, there's a bug in the implementation. /// But this can have overhead in production, so it's only included in tests. 
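For reference, the contract behind `inner` is small: a `DynamicFilterSource` only needs to produce a point-in-time snapshot of its current predicate(s), and `DynamicFilterPhysicalExpr` turns that into an ordinary `PhysicalExpr`. A minimal sketch of a custom source, assuming the `dynamic_filters` module is publicly reachable and that the trait requires only `Send + Sync + Debug` plus `snapshot_current_filters` (any further methods of the real trait are omitted here):

    use std::sync::{Arc, RwLock};

    use datafusion_common::Result;
    use datafusion_physical_expr::expressions::lit;
    use datafusion_physical_expr::PhysicalExpr;
    use datafusion_physical_plan::dynamic_filters::{
        DynamicFilterPhysicalExpr, DynamicFilterSource,
    };

    /// Toy source whose predicate can be swapped out at runtime.
    #[derive(Debug)]
    struct ConstantFilterSource {
        current: RwLock<Arc<dyn PhysicalExpr>>,
    }

    impl DynamicFilterSource for ConstantFilterSource {
        fn snapshot_current_filters(&self) -> Result<Vec<Arc<dyn PhysicalExpr>>> {
            // Hand out a point-in-time copy of whatever the predicate is right now.
            let current = self.current.read().unwrap();
            Ok(vec![Arc::clone(&*current)])
        }
    }

    /// Wrap the source so consumers see a regular `PhysicalExpr` whose
    /// evaluation always reflects the latest snapshot.
    fn wrap_example() -> Arc<dyn PhysicalExpr> {
        let source = Arc::new(ConstantFilterSource {
            current: RwLock::new(lit(true)),
        });
        // `children` would normally be the column expressions the predicate references.
        Arc::new(DynamicFilterPhysicalExpr::new(Vec::new(), source))
    }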
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 42b75a41ebcb..f3b6489fd2b7 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -36,6 +36,7 @@ use crate::{ metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, DisplayFormatType, ExecutionPlan, }; +use datafusion_physical_expr::expressions::lit; use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, SchemaRef}; @@ -466,20 +467,21 @@ impl ExecutionPlan for FilterExec { }) .collect::>(); - if new_filters.is_empty() { + let predicate = conjunction(new_filters); + + if predicate.eq(&lit(true)) && self.projection.is_none() { return Ok(Some(Arc::clone(self.input()))); } - let predicate = conjunction(new_filters); - - let new = FilterExec::try_new(predicate, Arc::clone(self.input())) - .and_then(|e| { - let selectivity = e.default_selectivity(); - e.with_default_selectivity(selectivity) - }) - .and_then(|e| e.with_projection(self.projection().cloned())) - .map(|e| Arc::new(e) as _)?; - Ok(Some(new)) + let new = FilterExec { + predicate, + input: Arc::clone(self.input()), + metrics: self.metrics.clone(), + default_selectivity: self.default_selectivity, + cache: self.cache.clone(), + projection: self.projection.clone(), + }; + Ok(Some(Arc::new(new))) } fn push_down_filters_from_parents( diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index e559a479e8d3..008d1bd0a4c0 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1128,6 +1128,14 @@ impl SortExec { boundedness, ) } + + fn with_dynamic_filter_source( + mut self, + dynamic_filter_source: Arc, + ) -> Self { + self.dynamic_filter_source = dynamic_filter_source; + self + } } impl DisplayAs for SortExec { @@ -1192,7 +1200,8 @@ impl ExecutionPlan for SortExec { ) -> Result> { let new_sort = SortExec::new(self.expr.clone(), Arc::clone(&children[0])) .with_fetch(self.fetch) - .with_preserve_partitioning(self.preserve_partitioning); + .with_preserve_partitioning(self.preserve_partitioning) + .with_dynamic_filter_source(Arc::clone(&self.dynamic_filter_source)); Ok(Arc::new(new_sort)) } @@ -1241,7 +1250,7 @@ impl ExecutionPlan for SortExec { let batch = batch?; topk.insert_batch(batch)?; if let Some(values) = topk.get_threshold_values()? { - dynamic_filter_source.update_values(&values); + dynamic_filter_source.update_values(&values)?; } } topk.emit() @@ -1327,12 +1336,13 @@ impl ExecutionPlan for SortExec { Ok(Some(Arc::new( SortExec::new(updated_exprs, make_with_child(projection, self.input())?) 
.with_fetch(self.fetch()) - .with_preserve_partitioning(self.preserve_partitioning()), + .with_preserve_partitioning(self.preserve_partitioning()) + .with_dynamic_filter_source(Arc::clone(&self.dynamic_filter_source)), ))) } fn filters_for_pushdown(&self) -> Result>> { - Ok(vec![self.dynamic_filter_source.as_physical_expr()]) + Ok(vec![self.dynamic_filter_source.as_physical_expr()?]) } fn supports_filter_pushdown(&self) -> bool { diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index f09ddfd6980f..605be4a4610e 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -31,7 +31,7 @@ use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; #[derive(Debug, Clone)] struct ColumnThreshold { /// The current threshold value - pub value: Arc>>, + pub value: Option, /// The column expression pub expr: Arc, /// Sort options @@ -73,9 +73,9 @@ struct ColumnThreshold { // Once we get to file 2 however we can skip it entirely because we know that all values in file 2 are smaller than 30. // The same goes for file 1. // So this optimization just saved us 50% of the work of scanning the data. -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct SortDynamicFilterSource { - thresholds: Vec, + thresholds: Arc>>, } impl SortDynamicFilterSource { @@ -83,29 +83,36 @@ impl SortDynamicFilterSource { let thresholds = ordering .iter() .map(|sort_expr| ColumnThreshold { - value: Arc::new(RwLock::new(None)), + value: None, expr: Arc::clone(&sort_expr.expr), sort_options: sort_expr.options, }) .collect(); + let thresholds = Arc::new(RwLock::new(thresholds)); + Self { thresholds } } - pub fn update_values(&self, new_values: &[ScalarValue]) { - if new_values.len() != self.thresholds.len() { - panic!("New values length does not match the number of thresholds"); + pub fn update_values(&self, new_values: &[ScalarValue]) -> Result<()> { + let mut thresholds = self.thresholds.write().map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire write lock on thresholds".to_string(), + ) + })?; + if new_values.len() != thresholds.len() { + return Err(datafusion_common::DataFusionError::Execution( + "The number of new values does not match the number of thresholds" + .to_string(), + )); } for (i, new_value) in new_values.iter().enumerate() { - let threshold = &self.thresholds[i]; + let threshold = &mut thresholds[i]; let descending = threshold.sort_options.descending; let nulls_first = threshold.sort_options.nulls_first; - let mut current_value = threshold - .value - .write() - .expect("Failed to acquire read lock on threshold"); + let current_value = &threshold.value; // Check if the new value is more or less selective than the current value given the sorting - if let Some(ref mut current_value) = *current_value { + if let Some(current_value) = current_value { let new_value_is_greater = new_value > current_value; let new_value_is_null = new_value.is_null(); let current_value_is_null = current_value.is_null(); @@ -113,40 +120,49 @@ impl SortDynamicFilterSource { || (descending && new_value_is_greater) || (!descending && !new_value_is_greater) { - *current_value = new_value.clone(); + // *current_value = new_value.clone(); + threshold.value = Some(new_value.clone()); } } else { - *current_value = Some(new_value.clone()); + threshold.value = Some(new_value.clone()); } } + Ok(()) } - pub fn as_physical_expr(&self) -> Arc { + pub fn as_physical_expr(self: 
&Arc) -> Result> { let children = self .thresholds + .read() + .map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire read lock on thresholds".to_string(), + ) + })? .iter() .map(|threshold| Arc::clone(&threshold.expr)) .collect::>(); - Arc::new(DynamicFilterPhysicalExpr::new( + Ok(Arc::new(DynamicFilterPhysicalExpr::new( children, - Arc::new(self.clone()) as Arc, - )) + Arc::clone(self) as Arc, + ))) } } impl DynamicFilterSource for SortDynamicFilterSource { fn snapshot_current_filters(&self) -> Result>> { + let thresholds = self.thresholds.read().map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire read lock on thresholds".to_string(), + ) + })?; // Create filter expressions for each threshold let mut filters: Vec> = - Vec::with_capacity(self.thresholds.len()); + Vec::with_capacity(thresholds.len()); let mut prev_sort_expr: Option> = None; - for threshold in &self.thresholds { - let value = threshold - .value - .read() - .expect("Failed to acquire read lock on threshold") - .clone(); + for threshold in thresholds.iter() { + let value = &threshold.value; let Some(value) = value else { // If the value is None, we cannot create a filter for this threshold From b4bc34cdda76cb0d6a9a503d911b4ddc117e41e7 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 12:15:39 -0500 Subject: [PATCH 09/27] docs, some tweaks --- datafusion/datasource-parquet/src/source.rs | 31 ++-- datafusion/datasource/src/source.rs | 4 + .../physical-expr-common/src/physical_expr.rs | 6 +- datafusion/physical-expr/src/lib.rs | 2 +- datafusion/physical-optimizer/src/pruning.rs | 4 +- .../physical-plan/src/dynamic_filters.rs | 151 ++++++++++-------- datafusion/physical-plan/src/lib.rs | 2 +- .../physical-plan/src/sorts/sort_filters.rs | 25 ++- .../proto/src/physical_plan/to_proto.rs | 4 +- 9 files changed, 139 insertions(+), 90 deletions(-) diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 48b88a91876f..86155a9bd38e 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -37,7 +37,7 @@ use datafusion_common::config::TableParquetOptions; use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; -use datafusion_physical_expr::conjunction; +use datafusion_physical_expr::{conjunction, expressions::lit}; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; @@ -339,13 +339,13 @@ impl ParquetSource { } /// Optional reference to this parquet scan's pruning predicate - #[deprecated(note = "ParquetDataSource no longer constructs a PruningPredicate.")] + #[deprecated(note = "ParquetSource no longer constructs a PruningPredicate.")] pub fn pruning_predicate(&self) -> Option<&Arc> { self.pruning_predicate.as_ref() } /// Optional reference to this parquet scan's page pruning predicate - #[deprecated(note = "ParquetDataSource no longer constructs a PruningPredicate.")] + #[deprecated(note = "ParquetSource no longer constructs a PruningPredicate.")] pub fn page_pruning_predicate(&self) -> Option<&Arc> { self.page_pruning_predicate.as_ref() } @@ -586,7 +586,7 @@ impl FileSource for ParquetSource { // If pushdown filters is not enabled, return early return Ok(None); } - conf.predicate = 
match self.predicate.as_ref() { + let predicate = match self.predicate.as_ref() { Some(existing_predicate) => { // Combine existing predicate with new filters Some(conjunction( @@ -594,18 +594,17 @@ impl FileSource for ParquetSource { .chain(filters.iter().cloned().cloned()), )) } - None => { - if filters.is_empty() { - None - } else { - // If no existing predicate, just use the new filters - Some(conjunction(filters.iter().cloned().cloned())) - } - } + None => Some(conjunction(filters.iter().cloned().cloned())), }; - Ok(Some(FileSourceFilterPushdownResult::new( - Arc::new(conf), - vec![FilterPushdownSupport::Exact; filters.len()], - ))) + match predicate { + Some(new_predicate) if !new_predicate.eq(&lit(true)) => { + conf.predicate = Some(new_predicate); + Ok(Some(FileSourceFilterPushdownResult::new( + Arc::new(conf), + vec![FilterPushdownSupport::Exact; filters.len()], + ))) + } + _no_op_predicate => Ok(None), + } } } diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 8ddda5169e18..74700d25da08 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -81,6 +81,10 @@ pub trait DataSource: Send + Sync + Debug { &self, _projection: &ProjectionExec, ) -> datafusion_common::Result>>; + /// Push down filters from parent execution plans to this data source. + /// This is expected to return Ok(None) if the filters cannot be pushed down. + /// If they can be pushed down it should return a [`FilterPushdownResult`] containing the new + /// data source and the support level for each filter (exact or inexact). fn push_down_filters( &self, _filters: &[&Arc], diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 22f671d349e2..2f5719a061dd 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -291,7 +291,7 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { /// or treat it via downcast matching. /// /// You should not call this method directly as it does not handle recursion. - /// Instead use `shapshot_physical_expr` to handle recursion and capture the + /// Instead use [`snapshot_physical_expr`] to handle recursion and capture the /// full state of the `PhysicalExpr`. /// /// This is expected to return "simple" expressions that do not have mutable state @@ -318,7 +318,7 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { /// contain these dynamic references. /// /// Note for implementers: this method should *not* handle recursion. - /// Recursion is handled in `shapshot_physical_expr`. + /// Recursion is handled in [`snapshot_physical_expr`]. fn snapshot(&self) -> Result>> { // By default, we return None to indicate that this PhysicalExpr does not // have any dynamic references or state. @@ -503,7 +503,7 @@ pub fn fmt_sql(expr: &dyn PhysicalExpr) -> impl Display + '_ { /// Returns an `Option>` which is the snapshot of the /// `PhysicalExpr` if it is dynamic. If the `PhysicalExpr` does not have /// any dynamic references or state, it returns `None`. 
-pub fn snasphot_physical_expr( +pub fn snapshot_physical_expr( expr: Arc, ) -> Result> { expr.transform_up(|e| { diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 551ee97a8783..34e20a690522 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -60,7 +60,7 @@ pub use physical_expr::{ }; pub use datafusion_physical_expr_common::physical_expr::{ - snasphot_physical_expr, PhysicalExpr, + snapshot_physical_expr, PhysicalExpr, }; pub use datafusion_physical_expr_common::sort_expr::{ LexOrdering, LexRequirement, PhysicalSortExpr, PhysicalSortRequirement, diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 8084772b90de..f0eb5f290094 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -41,7 +41,7 @@ use datafusion_common::{Column, DFSchema}; use datafusion_expr_common::operator::Operator; use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarantee}; use datafusion_physical_expr::{ - expressions as phys_expr, snasphot_physical_expr, PhysicalExprRef, + expressions as phys_expr, snapshot_physical_expr, PhysicalExprRef, }; use datafusion_physical_plan::{ColumnarValue, PhysicalExpr}; @@ -529,7 +529,7 @@ impl PruningPredicate { /// See the struct level documentation on [`PruningPredicate`] for more /// details. pub fn try_new(expr: Arc, schema: SchemaRef) -> Result { - let expr = snasphot_physical_expr(expr)?; + let expr = snapshot_physical_expr(expr)?; let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _; // build predicate expression once diff --git a/datafusion/physical-plan/src/dynamic_filters.rs b/datafusion/physical-plan/src/dynamic_filters.rs index 3bfaf4a8d34c..f954bf0a311c 100644 --- a/datafusion/physical-plan/src/dynamic_filters.rs +++ b/datafusion/physical-plan/src/dynamic_filters.rs @@ -21,12 +21,12 @@ use std::{ sync::{Arc, RwLock}, }; -use datafusion_common::{ - tree_node::{Transformed, TransformedResult, TreeNode}, - Result, -}; +use datafusion_common::Result; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::{expressions::lit, utils::conjunction, PhysicalExpr}; +use datafusion_physical_expr::{utils::conjunction, PhysicalExpr}; +use datafusion_physical_expr_common::physical_expr::{ + with_new_children_if_necessary, DynEq, DynHash, +}; /// A source of dynamic runtime filters. /// @@ -34,53 +34,71 @@ use datafusion_physical_expr::{expressions::lit, utils::conjunction, PhysicalExp /// filter expressions that other operators can use to dynamically prune data. /// /// See `TopKDynamicFilterSource` in datafusion/physical-plan/src/topk/mod.rs for examples. -pub trait DynamicFilterSource: Send + Sync + std::fmt::Debug + 'static { +pub trait DynamicFilterSource: + Send + Sync + std::fmt::Debug + DynEq + DynHash + 'static +{ /// Take a snapshot of the current state of filtering, returning a non-dynamic PhysicalExpr. /// This is used to e.g. serialize dynamic filters across the wire or to pass them into systems /// that won't use the `PhysicalExpr` API (e.g. matching on the concrete types of the expressions like `PruningPredicate` does). /// For example, it is expected that this returns a relatively simple expression such as `col1 > 5` for a TopK operator or /// `col2 IN (1, 2, ... N)` for a HashJoin operator. 
fn snapshot_current_filters(&self) -> Result>>; + + fn as_any(&self) -> &dyn Any; +} + +impl PartialEq for dyn DynamicFilterSource { + fn eq(&self, other: &Self) -> bool { + self.dyn_eq(other.as_any()) + } +} + +impl Eq for dyn DynamicFilterSource {} + +#[derive(Debug, Eq, PartialEq, Hash)] +enum Children { + Remapped(Vec>), + Original(Vec>), } +/// A wrapper around a [`DynamicFilterSource`] that allows it to be used as a physical expression. +/// This will call [`DynamicFilterSource::snapshot_current_filters`] to get the current filters for each call to +/// [`PhysicalExpr::evaluate`], [`PhysicalExpr::data_type`], and [`PhysicalExpr::nullable`]. +/// It also implements [`PhysicalExpr::snapshot`] by forwarding the call to [`DynamicFilterSource::snapshot_current_filters`]. #[derive(Debug)] pub struct DynamicFilterPhysicalExpr { /// The children of this expression. /// In particular, it is important that if the dynamic expression will reference any columns /// those columns be marked as children of this expression so that the expression can be properly /// bound to the schema. - children: Vec>, - /// Remapped children, if `PhysicalExpr::with_new_children` was called. - /// This is used to ensure that the children of the expression are always the same - /// as the children of the dynamic filter source. - remapped_children: Option>>, + children: Children, /// The source of dynamic filters. - pub inner: Arc, // TODO: remove pub + inner: Arc, /// For testing purposes track the data type and nullability to make sure they don't change. /// If they do, there's a bug in the implementation. - /// But this can have overhead in production, so it's only included in tests. + /// But this can have overhead in production, so it's only included in our tests. data_type: Arc>>, nullable: Arc>>, } -impl std::fmt::Display for DynamicFilterPhysicalExpr { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "DynamicFilterPhysicalExpr") +impl Hash for DynamicFilterPhysicalExpr { + fn hash(&self, state: &mut H) { + self.inner.dyn_hash(state); + self.children.dyn_hash(state); } } -// Manually derive PartialEq and Hash to work around https://github.com/rust-lang/rust/issues/78808 impl PartialEq for DynamicFilterPhysicalExpr { fn eq(&self, other: &Self) -> bool { - self.current().eq(&other.current()) + self.inner.dyn_eq(&*other.inner.as_any()) && self.children == other.children } } impl Eq for DynamicFilterPhysicalExpr {} -impl Hash for DynamicFilterPhysicalExpr { - fn hash(&self, state: &mut H) { - self.current().hash(state) +impl std::fmt::Display for DynamicFilterPhysicalExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DynamicFilterPhysicalExpr") } } @@ -90,44 +108,26 @@ impl DynamicFilterPhysicalExpr { inner: Arc, ) -> Self { Self { - children, - remapped_children: None, + children: Children::Original(children), inner, data_type: Arc::new(RwLock::new(None)), nullable: Arc::new(RwLock::new(None)), } } - fn current(&self) -> Arc { - let current = if let Ok(current) = self.inner.snapshot_current_filters() { - conjunction(current) - } else { - lit(false) - }; - if let Some(remapped_children) = &self.remapped_children { - // Remap children to the current children - // of the expression. - current - .transform_up(|expr| { - // Check if this is any of our original children - if let Some(pos) = self - .children - .iter() - .position(|c| c.as_ref() == expr.as_ref()) - { - // If so, remap it to the current children - // of the expression. 
- let new_child = Arc::clone(&remapped_children[pos]); - Ok(Transformed::yes(new_child)) - } else { - // Otherwise, just return the expression - Ok(Transformed::no(expr)) - } - }) - .data() - .expect("transformation is infallible") - } else { - current + fn current(&self) -> Result> { + let current = conjunction(self.inner.snapshot_current_filters()?); + match self.children { + Children::Original(_) => { + // If the children are the original ones, we can just return the current expression + Ok(current) + } + Children::Remapped(ref remapped_children) => { + // If we have remapped children, we need to replace them in the current expression + let new_current = + with_new_children_if_necessary(current, remapped_children.clone())?; + Ok(new_current) + } } } } @@ -138,11 +138,10 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { } fn children(&self) -> Vec<&Arc> { - self.remapped_children - .as_ref() - .unwrap_or(&self.children) - .iter() - .collect() + match &self.children { + Children::Original(children) => children.iter().collect(), + Children::Remapped(children) => children.iter().collect(), + } } fn with_new_children( @@ -150,8 +149,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { children: Vec>, ) -> Result> { Ok(Arc::new(Self { - children: self.children.clone(), - remapped_children: Some(children), + children: Children::Remapped(children), inner: Arc::clone(&self.inner), data_type: Arc::clone(&self.data_type), nullable: Arc::clone(&self.nullable), @@ -162,7 +160,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { &self, input_schema: &arrow::datatypes::Schema, ) -> Result { - let res = self.current().data_type(input_schema)?; + let res = self.current()?.data_type(input_schema)?; #[cfg(test)] { use datafusion_common::internal_err; @@ -187,7 +185,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { } fn nullable(&self, input_schema: &arrow::datatypes::Schema) -> Result { - let res = self.current().nullable(input_schema)?; + let res = self.current()?.nullable(input_schema)?; #[cfg(test)] { use datafusion_common::internal_err; @@ -215,7 +213,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { &self, batch: &arrow::record_batch::RecordBatch, ) -> Result { - let current = self.current(); + let current = self.current()?; #[cfg(test)] { // Ensure that we are not evaluating after the expression has changed. @@ -236,7 +234,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { fn snapshot(&self) -> Result>> { // Return the current expression as a snapshot. 
- Ok(Some(self.current())) + Ok(Some(self.current()?)) } } @@ -244,6 +242,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { mod test { use arrow::array::RecordBatch; use datafusion_common::ScalarValue; + use datafusion_physical_expr::expressions::lit; use super::*; @@ -254,11 +253,35 @@ mod test { current_expr: Arc>>, } + impl Hash for MockDynamicFilterSource { + fn hash(&self, state: &mut H) { + // Hash the current expression to ensure uniqueness + self.current_expr.read().unwrap().dyn_hash(state); + } + } + + impl DynEq for MockDynamicFilterSource { + fn dyn_eq(&self, other: &dyn Any) -> bool { + if let Some(other) = other.downcast_ref::() { + self.current_expr + .read() + .unwrap() + .eq(&other.current_expr.read().unwrap()) + } else { + false + } + } + } + impl DynamicFilterSource for MockDynamicFilterSource { fn snapshot_current_filters(&self) -> Result>> { let expr = self.current_expr.read().unwrap().clone(); Ok(vec![expr]) } + + fn as_any(&self) -> &dyn Any { + self + } } let source = Arc::new(MockDynamicFilterSource { diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 2ccfe0b73ed6..44c78c49f4ac 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -36,7 +36,7 @@ pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; use datafusion_physical_expr::PhysicalSortExpr; pub use datafusion_physical_expr::{ - expressions, snasphot_physical_expr, Distribution, Partitioning, PhysicalExpr, + expressions, snapshot_physical_expr, Distribution, Partitioning, PhysicalExpr, }; pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index 605be4a4610e..4020769d30c2 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use std::sync::{Arc, RwLock}; +use std::{ + hash::{Hash, Hasher}, + sync::{Arc, RwLock}, +}; use arrow_schema::SortOptions; use datafusion_common::{Result, ScalarValue}; @@ -78,6 +81,22 @@ pub struct SortDynamicFilterSource { thresholds: Arc>>, } +impl Hash for SortDynamicFilterSource { + fn hash(&self, state: &mut H) { + // Hash the pointers to the thresholds + let thresholds = Arc::as_ptr(&self.thresholds) as usize; + thresholds.hash(state); + } +} + +impl PartialEq for SortDynamicFilterSource { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.thresholds, &other.thresholds) + } +} + +impl Eq for SortDynamicFilterSource {} + impl SortDynamicFilterSource { pub fn new(ordering: &LexOrdering) -> Self { let thresholds = ordering @@ -150,6 +169,10 @@ impl SortDynamicFilterSource { } impl DynamicFilterSource for SortDynamicFilterSource { + fn as_any(&self) -> &dyn std::any::Any { + self + } + fn snapshot_current_filters(&self) -> Result>> { let thresholds = self.thresholds.read().map_err(|_| { datafusion_common::DataFusionError::Execution( diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 1e5a27ec4eb6..a2945c4abe32 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -29,7 +29,7 @@ use datafusion::physical_plan::expressions::{ use datafusion::physical_plan::udaf::AggregateFunctionExpr; use datafusion::physical_plan::windows::{PlainAggregateWindowExpr, WindowUDFExpr}; use datafusion::physical_plan::{ - snasphot_physical_expr, Partitioning, PhysicalExpr, WindowExpr, + snapshot_physical_expr, Partitioning, PhysicalExpr, WindowExpr, }; use datafusion::{ datasource::{ @@ -212,7 +212,7 @@ pub fn serialize_physical_expr( value: &Arc, codec: &dyn PhysicalExtensionCodec, ) -> Result { - let value = snasphot_physical_expr(Arc::clone(value))?; + let value = snapshot_physical_expr(Arc::clone(value))?; let expr = value.as_any(); if let Some(expr) = expr.downcast_ref::() { From 14b30057580aaa1a9264cdebd847aad1d8d808f2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 13:40:04 -0500 Subject: [PATCH 10/27] fix --- .../src/datasource/physical_plan/parquet.rs | 370 ++---------------- datafusion/core/tests/parquet/mod.rs | 364 ++++++++++++++++- datafusion/core/tests/parquet/schema.rs | 2 +- .../physical-optimizer/src/filter_pushdown.rs | 2 - .../physical-plan/src/dynamic_filters.rs | 2 +- datafusion/physical-plan/src/filter.rs | 9 +- datafusion/physical-plan/src/sorts/sort.rs | 2 +- 7 files changed, 398 insertions(+), 353 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 2ad00637e8bf..94f2804587b5 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -43,12 +43,11 @@ mod tests { }; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; - use arrow::util::pretty::pretty_format_batches; use arrow_schema::SchemaRef; use bytes::{BufMut, BytesMut}; use datafusion_common::config::TableParquetOptions; use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; - use datafusion_common::{assert_batches_eq, assert_contains, Result, ScalarValue}; + use datafusion_common::{assert_contains, Result, ScalarValue}; use datafusion_datasource::file_format::FileFormat; use 
datafusion_datasource::file_meta::FileMeta; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -1456,7 +1455,7 @@ mod tests { .await; // should have a pruning predicate - #[expect(deprecated)] + #[allow(deprecated)] let pruning_predicate = rt.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); @@ -1498,7 +1497,7 @@ mod tests { .round_trip(vec![batches.clone()]) .await; - #[expect(deprecated)] + #[allow(deprecated)] let pruning_predicate = rt0.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); @@ -1541,7 +1540,7 @@ mod tests { .await; // should have a pruning predicate - #[expect(deprecated)] + #[allow(deprecated)] let pruning_predicate = rt1.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); let pruning_predicate = rt2.parquet_source.predicate(); @@ -1585,7 +1584,7 @@ mod tests { .await; // Should not contain a pruning predicate (since nothing can be pruned) - #[expect(deprecated)] + #[allow(deprecated)] let pruning_predicate = rt.parquet_source.pruning_predicate(); assert!( pruning_predicate.is_none(), @@ -1621,7 +1620,7 @@ mod tests { .await; // Should have a pruning predicate - #[expect(deprecated)] + #[allow(deprecated)] let pruning_predicate = rt.parquet_source.pruning_predicate(); assert!(pruning_predicate.is_some()); } @@ -1775,13 +1774,13 @@ mod tests { let sql = "select * from base_table where name='test02'"; let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap(); assert_eq!(batch.len(), 1); - insta::assert_snapshot!(batches_to_string(&batch),@r" - +--------------------+----+--------+ - | struct | id | name | - +--------------------+----+--------+ - | {id: 3, name: zzz} | 2 | test02 | - +--------------------+----+--------+ - "); + insta::assert_snapshot!(batches_to_string(&batch),@r###" + +---------------------+----+--------+ + | struct | id | name | + +---------------------+----+--------+ + | {id: 4, name: aaa2} | 2 | test02 | + +---------------------+----+--------+ + "###); Ok(()) } @@ -1804,13 +1803,13 @@ mod tests { let sql = "select * from base_table where name='test02'"; let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap(); assert_eq!(batch.len(), 1); - insta::assert_snapshot!(batches_to_string(&batch),@r" - +--------------------+----+--------+ - | struct | id | name | - +--------------------+----+--------+ - | {id: 3, name: zzz} | 2 | test02 | - +--------------------+----+--------+ - "); + insta::assert_snapshot!(batches_to_string(&batch),@r###" + +---------------------+----+--------+ + | struct | id | name | + +---------------------+----+--------+ + | {id: 4, name: aaa2} | 2 | test02 | + +---------------------+----+--------+ + "###); Ok(()) } @@ -1824,14 +1823,14 @@ mod tests { Field::new("id", DataType::Int64, true), Field::new("name", DataType::Utf8, false), ]); - let id_array = Int64Array::from(vec![Some(2), Some(1)]); + let id_array = Int64Array::from(vec![Some(1), Some(2)]); let columns = vec![ Arc::new(Int64Array::from(vec![3, 4])) as _, - Arc::new(StringArray::from(vec!["zzz", "aaa"])) as _, + Arc::new(StringArray::from(vec!["aaa1", "aaa2"])) as _, ]; let struct_array = StructArray::new(struct_fields, columns, None); - let name_array = StringArray::from(vec![Some("test02"), Some("test01")]); + let name_array = StringArray::from(vec![Some("test01"), Some("test02")]); let schema = Arc::new(schema); let batch = RecordBatch::try_new( @@ -1843,53 +1842,12 @@ mod tests { ], ) .unwrap(); - write_record_batch(file, batch).unwrap(); - } - - fn 
write_file_with_non_null_ids(file: &String, value: i64) { - let schema = Schema::new(vec![ - Field::new("id", DataType::Int64, true), - Field::new("name", DataType::Utf8, false), - ]); - let id_array = Int64Array::from(vec![Some(value)]); - let name_array = StringArray::from(vec![Some("test")]); - let schema = Arc::new(schema); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(id_array), Arc::new(name_array)], - ) - .unwrap(); - write_record_batch(file, batch).unwrap(); - } - - fn write_file_with_null_ids(file: &String) { - let schema = Schema::new(vec![ - Field::new("id", DataType::Int64, true), - Field::new("name", DataType::Utf8, false), - ]); - let id_array = Int64Array::from(vec![None]); - let name_array = StringArray::from(vec![Some(format!("test{:02}", "null"))]); - let schema = Arc::new(schema); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(id_array), Arc::new(name_array)], - ) - .unwrap(); - write_record_batch(file, batch).unwrap(); - } - - fn write_record_batch(file: &String, batch: RecordBatch) -> Result<()> { - let file = File::create(file)?; - let w_opt = WriterProperties::builder() - .set_max_row_group_size(1) - .build(); - let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(w_opt))?; - writer.write(&batch)?; - writer.flush()?; - writer.close()?; - Ok(()) + let file = File::create(file).unwrap(); + let w_opt = WriterProperties::builder().build(); + let mut writer = ArrowWriter::try_new(file, schema, Some(w_opt)).unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.close().unwrap(); } /// Write out a batch to a parquet file and return the total size of the file @@ -1951,49 +1909,6 @@ mod tests { } } - struct DynamicFilterTestCase { - query: String, - path: String, - } - - impl DynamicFilterTestCase { - fn new(query: String, path: String) -> Self { - Self { query, path } - } - - async fn _run_query(&self, query: &str) -> Vec { - // Force 1 partition and 1 rg per partition because if we widen the plan - // and read all batches at once we won't get any dynamic pushdown. 
- let mut cfg = SessionConfig::new(); - cfg = cfg.set_u64("datafusion.execution.parquet.max_row_group_size", 1); - let ctx = SessionContext::new_with_config(cfg); - - let mut pq_options = TableParquetOptions::default(); - pq_options.global.max_row_group_size = 1; - pq_options.global.pushdown_filters = true; - let fmt = ParquetFormat::default().with_options(pq_options); - let opt = ListingOptions::new(Arc::new(fmt)).with_target_partitions(1); - ctx.register_listing_table("base_table", &self.path, opt, None, None) - .await - .unwrap(); - - ctx.sql(query).await.unwrap().collect().await.unwrap() - } - - async fn results(&self) -> Vec { - self._run_query(&self.query).await - } - - async fn explain_plan(&self) -> String { - let query = format!("EXPLAIN ANALYZE {}", self.query); - let batches = self._run_query(&query).await; - - pretty_format_batches(&batches) - .map(|s| format!("{}", s)) - .unwrap_or_else(|_| "No explain plan generated".to_string()) - } - } - /// Test passing `metadata_size_hint` to either a single file or the whole exec #[tokio::test] async fn test_metadata_size_hint() { @@ -2066,231 +1981,4 @@ mod tests { assert_eq!(calls.len(), 2); assert_eq!(calls, vec![Some(123), Some(456)]); } - - #[tokio::test] - async fn test_topk_predicate_pushdown() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - for file in 0..5 { - // write 2 files so that one is processed before the other - let name = format!("test{:02}.parquet", file); - write_file(&format!("{path}/{name}")); - } - - let query = "select name from base_table order by id desc limit 3"; - - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+--------+", - "| name |", - "+--------+", - "| test02 |", - "| test02 |", - "| test02 |", - "+--------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=3"); - } - - #[tokio::test] - async fn test_topk_predicate_pushdown_nulls_first() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - for file in 0..5 { - // write multiple files to ensure we get pushdown of dynamic filters from one file to another - let name = format!("test{:02}.parquet", file); - write_file(&format!("{path}/{name}")); - } - - let name = format!("test{:02}.parquet", 100); - write_file_with_null_ids(&format!("{path}/{name}")); - - // nulls first by default - let query = "select name from base_table order by id desc limit 3"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----------+", - "| name |", - "+----------+", - "| testnull |", - "| test02 |", - "| test02 |", - "+----------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=3"); - } - - #[tokio::test] - async fn test_topk_predicate_pushdown_multi_key() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - for file in 0..5 { - // write multiple files to ensure we get pushdown of dynamic filters from one file to another - // Ensure files are read in order - let name = format!("test{:02}.parquet", file); - write_file_with_non_null_ids(&format!("{path}/{name}"), file); - } - - let query = "select id from 
base_table order by name desc, id limit 3"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path.clone()); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----+", - "| id |", - "+----+", - "| 0 |", - "| 1 |", - "| 2 |", - "+----+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=1"); - - let query1 = "select id from base_table order by name desc, id desc limit 3"; - let test_case = DynamicFilterTestCase::new(query1.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----+", - "| id |", - "+----+", - "| 4 |", - "| 3 |", - "| 2 |", - "+----+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=0"); - } - - #[tokio::test] - async fn test_topk_predicate_pushdown_nulls_last() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - for file in 0..5 { - let name = format!("test{:02}.parquet", file); - write_file(&format!("{path}/{name}")); - } - let name = format!("test{:02}.parquet", 100); - write_file_with_null_ids(&format!("{path}/{name}")); - - let query = "select name from base_table order by id desc nulls last limit 3"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+--------+", - "| name |", - "+--------+", - "| test02 |", - "| test02 |", - "| test02 |", - "+--------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=4"); - } - - #[tokio::test] - async fn test_topk_predicate_pushdown_single_file() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - write_file(&format!("{path}/test.parquet")); - - let query = "select name from base_table order by id desc nulls last limit 1"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+--------+", - "| name |", - "+--------+", - "| test02 |", - "+--------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "pushdown_rows_pruned=1"); - } - - #[tokio::test] - async fn test_topk_predicate_pushdown_ignores_partition_columns() { - // The TopK operator will try to push down predicates on `file_id`. - // But since `file_id` is a partition column and not part of the file itself - // we cannot actually do any filtering on it at the file level. - // Thus it has to be ignored by `ParquetSource`. - // This test only shows that this does not result in any errors or panics, - // it is expected that "nothing exciting" happens here. - // I do think in the future it would be interesting to re-design how partition columns - // get handled, in particular by pushing them into SchemaAdapter so that the table schema == file schema - // and we can do predicate pushdown on them as well without relying on each TableProvider to - // do special handling of partition columns. 
- - let ctx = SessionContext::new(); - let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) - .with_table_partition_cols(vec![("file_id".to_string(), DataType::UInt32)]) - // We need to force 1 partition because TopK predicate pushdown happens on a per-partition basis - // If we had 1 file per partition (as an example) no pushdown would happen - .with_target_partitions(1); - - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - for file in 0..5 { - // crete a directory for the partition - fs::create_dir_all(format!("{path}/file_id={file}")).unwrap(); - let name = format!("file_id={file}/test.parquet"); - write_file(&format!("{path}/{name}")); - } - ctx.register_listing_table("base_table", path, opt, None, None) - .await - .unwrap(); - - let query = "select file_id from base_table order by file_id asc limit 3"; - - let batches = ctx.sql(query).await.unwrap().collect().await.unwrap(); - #[rustfmt::skip] - let expected = [ - "+---------+", - "| file_id |", - "+---------+", - "| 0 |", - "| 0 |", - "| 1 |", - "+---------+", - ]; - assert_batches_eq!(expected, &batches); - - let sql = format!("explain analyze {query}"); - let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); - let explain_plan = format!("{}", pretty_format_batches(&batches).unwrap()); - assert_contains!(explain_plan, "row_groups_pruned_statistics=0"); // just documenting current behavior - } } diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index f45eacce18df..a9ec0d630aa4 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -22,7 +22,7 @@ use arrow::{ make_array, Array, ArrayRef, BinaryArray, Date32Array, Date64Array, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, - StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + StringArray, StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }, @@ -30,17 +30,25 @@ use arrow::{ record_batch::RecordBatch, util::pretty::pretty_format_batches, }; +use arrow_schema::Fields; use chrono::{Datelike, Duration, TimeDelta}; use datafusion::{ - datasource::{provider_as_source, TableProvider}, + assert_batches_eq, + config::TableParquetOptions, + datasource::{listing::ListingOptions, provider_as_source, TableProvider}, physical_plan::metrics::MetricsSet, prelude::{ParquetReadOptions, SessionConfig, SessionContext}, }; +use datafusion_common::{assert_contains, Result}; +use datafusion_datasource_parquet::ParquetFormat; use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder}; use parquet::arrow::ArrowWriter; use parquet::file::properties::{EnabledStatistics, WriterProperties}; -use std::sync::Arc; -use tempfile::NamedTempFile; +use std::{ + fs::{self, File}, + sync::Arc, +}; +use tempfile::{NamedTempFile, TempDir}; mod custom_reader; mod external_access_plan; @@ -1072,3 +1080,351 @@ async fn make_test_file_page(scenario: Scenario, row_per_page: usize) -> NamedTe writer.close().unwrap(); output_file } + +struct DynamicFilterTestCase { + query: String, + path: String, +} + +impl DynamicFilterTestCase { + fn new(query: String, path: String) -> Self { + Self { query, path } + } + + async fn run_query(&self, query: &str) -> Vec { + // Force 1 partition and 1 rg per partition because if we widen the plan + // and 
read all batches at once we won't get any dynamic pushdown. + let mut cfg = SessionConfig::new(); + cfg = cfg.set_u64("datafusion.execution.parquet.max_row_group_size", 1); + let ctx = SessionContext::new_with_config(cfg); + + let mut pq_options = TableParquetOptions::default(); + pq_options.global.max_row_group_size = 1; + pq_options.global.pushdown_filters = true; + let fmt = ParquetFormat::default().with_options(pq_options); + let opt = ListingOptions::new(Arc::new(fmt)).with_target_partitions(1); + ctx.register_listing_table("base_table", &self.path, opt, None, None) + .await + .unwrap(); + + ctx.sql(query).await.unwrap().collect().await.unwrap() + } + + async fn results(&self) -> Vec { + self.run_query(&self.query).await + } + + async fn explain_plan(&self) -> String { + let query = format!("EXPLAIN ANALYZE {}", self.query); + let batches = self.run_query(&query).await; + + pretty_format_batches(&batches) + .map(|s| format!("{}", s)) + .unwrap_or_else(|_| "No explain plan generated".to_string()) + } +} + +fn write_file_with_non_null_ids(file: &String, value: i64) { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![Some(value)]); + let name_array = StringArray::from(vec![Some("test")]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(id_array), Arc::new(name_array)], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); +} + +fn write_file_with_null_ids(file: &String) { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![None]); + let name_array = StringArray::from(vec![Some(format!("test{:02}", "null"))]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(id_array), Arc::new(name_array)], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); +} + +fn write_record_batch(file: &String, batch: RecordBatch) -> Result<()> { + let file = File::create(file)?; + let w_opt = WriterProperties::builder() + .set_max_row_group_size(1) + .build(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(w_opt))?; + writer.write(&batch)?; + writer.flush()?; + writer.close()?; + Ok(()) +} + +fn write_file(file: &String) { + let struct_fields = Fields::from(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ]); + let schema = Schema::new(vec![ + Field::new("struct", DataType::Struct(struct_fields.clone()), false), + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![Some(2), Some(1)]); + let columns = vec![ + Arc::new(Int64Array::from(vec![3, 4])) as _, + Arc::new(StringArray::from(vec!["zzz", "aaa"])) as _, + ]; + let struct_array = StructArray::new(struct_fields, columns, None); + + let name_array = StringArray::from(vec![Some("test02"), Some("test01")]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(struct_array), + Arc::new(id_array), + Arc::new(name_array), + ], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + // write 2 files so that one is processed 
before the other + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + + let query = "select name from base_table order by id desc limit 3"; + + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=3"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_nulls_first() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + // write multiple files to ensure we get pushdown of dynamic filters from one file to another + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + + let name = format!("test{:02}.parquet", 100); + write_file_with_null_ids(&format!("{path}/{name}")); + + // nulls first by default + let query = "select name from base_table order by id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----------+", + "| name |", + "+----------+", + "| testnull |", + "| test02 |", + "| test02 |", + "+----------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=3"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_multi_key() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + for file in 0..5 { + // write multiple files to ensure we get pushdown of dynamic filters from one file to another + // Ensure files are read in order + let name = format!("test{:02}.parquet", file); + write_file_with_non_null_ids(&format!("{path}/{name}"), file); + } + + let query = "select id from base_table order by name desc, id limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path.clone()); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 0 |", + "| 1 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=1"); + + let query1 = "select id from base_table order by name desc, id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query1.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 4 |", + "| 3 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=0"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_nulls_last() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + let name = format!("test{:02}.parquet", 100); + write_file_with_null_ids(&format!("{path}/{name}")); + + let query = "select name from base_table order by id desc nulls last limit 3"; + let test_case = 
DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=4"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_single_file() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + write_file(&format!("{path}/test.parquet")); + + let query = "select name from base_table order by id desc nulls last limit 1"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "pushdown_rows_pruned=1"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_ignores_partition_columns() { + // The TopK operator will try to push down predicates on `file_id`. + // But since `file_id` is a partition column and not part of the file itself + // we cannot actually do any filtering on it at the file level. + // Thus it has to be ignored by `ParquetSource`. + // This test only shows that this does not result in any errors or panics, + // it is expected that "nothing exciting" happens here. + // I do think in the future it would be interesting to re-design how partition columns + // get handled, in particular by pushing them into SchemaAdapter so that the table schema == file schema + // and we can do predicate pushdown on them as well without relying on each TableProvider to + // do special handling of partition columns. 
+
+    let ctx = SessionContext::new();
+    let opt = ListingOptions::new(Arc::new(ParquetFormat::default()))
+        .with_table_partition_cols(vec![("file_id".to_string(), DataType::UInt32)])
+        // We need to force 1 partition because TopK predicate pushdown happens on a per-partition basis
+        // If we had 1 file per partition (as an example) no pushdown would happen
+        .with_target_partitions(1);
+
+    let tmp_dir = TempDir::new().unwrap();
+    let path = tmp_dir.path().to_str().unwrap().to_string();
+    for file in 0..5 {
+        // create a directory for the partition
+        fs::create_dir_all(format!("{path}/file_id={file}")).unwrap();
+        let name = format!("file_id={file}/test.parquet");
+        write_file(&format!("{path}/{name}"));
+    }
+    ctx.register_listing_table("base_table", path, opt, None, None)
+        .await
+        .unwrap();
+
+    let query = "select file_id from base_table order by file_id asc limit 3";
+
+    let batches = ctx.sql(query).await.unwrap().collect().await.unwrap();
+    #[rustfmt::skip]
+    let expected = [
+        "+---------+",
+        "| file_id |",
+        "+---------+",
+        "| 0       |",
+        "| 0       |",
+        "| 1       |",
+        "+---------+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    let sql = format!("explain analyze {query}");
+    let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap();
+    let explain_plan = format!("{}", pretty_format_batches(&batches).unwrap());
+    assert_contains!(explain_plan, "row_groups_pruned_statistics=0"); // just documenting current behavior
+}
diff --git a/datafusion/core/tests/parquet/schema.rs b/datafusion/core/tests/parquet/schema.rs
index 29afd3970432..baf0b77e808f 100644
--- a/datafusion/core/tests/parquet/schema.rs
+++ b/datafusion/core/tests/parquet/schema.rs
@@ -201,7 +201,7 @@ fn write_files(table_path: &Path, schemas: Vec) {
         let schema = Arc::new(schema);
         let filename = format!("part-{i}.parquet");
         let path = table_path.join(filename);
-        let file = fs::File::create(path).unwrap();
+        let file = File::create(path).unwrap();
         let mut writer = ArrowWriter::try_new(file, schema.clone(), None).unwrap();
         // create mock record batch
diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs
index 7b33b82721bf..f607b5035261 100644
--- a/datafusion/physical-optimizer/src/filter_pushdown.rs
+++ b/datafusion/physical-optimizer/src/filter_pushdown.rs
@@ -129,9 +129,7 @@ impl PhysicalOptimizerRule for FilterPushdown {
         plan: Arc,
         _config: &ConfigOptions,
     ) -> Result> {
-        println!("plan before: {:?}", plan);
         if let Some(result) = pushdown_filters(&plan, &[])? {
-            println!("plan after filter pushdown: {:?}", result.inner);
             Ok(result.inner)
         } else {
             Ok(plan)
diff --git a/datafusion/physical-plan/src/dynamic_filters.rs b/datafusion/physical-plan/src/dynamic_filters.rs
index f954bf0a311c..d9aae419019b 100644
--- a/datafusion/physical-plan/src/dynamic_filters.rs
+++ b/datafusion/physical-plan/src/dynamic_filters.rs
@@ -90,7 +90,7 @@ impl Hash for DynamicFilterPhysicalExpr {
 
 impl PartialEq for DynamicFilterPhysicalExpr {
     fn eq(&self, other: &Self) -> bool {
-        self.inner.dyn_eq(&*other.inner.as_any()) && self.children == other.children
+        self.inner.dyn_eq(other.inner.as_any()) && self.children == other.children
     }
 }
 
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index f3b6489fd2b7..1d55b4e78921 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -459,16 +459,19 @@ impl ExecutionPlan for FilterExec {
             .zip(pushdown.iter())
             .filter_map(|(f, p)| {
                 if matches!(p, FilterPushdownSupport::Exact) {
-                    // Exact pushdown support means we keep this filter
-                    Some(Arc::clone(f))
-                } else {
+                    // Exact pushdown support means we can discard this filter
                     None
+                } else {
+                    // Otherwise we still have to apply it
+                    Some(Arc::clone(f))
                 }
             })
             .collect::>();
 
         let predicate = conjunction(new_filters);
 
+        println!("predicate: {:?}", predicate);
+
         if predicate.eq(&lit(true)) && self.projection.is_none() {
             return Ok(Some(Arc::clone(self.input())));
         }
diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs
index 008d1bd0a4c0..f4f51deb76e7 100644
--- a/datafusion/physical-plan/src/sorts/sort.rs
+++ b/datafusion/physical-plan/src/sorts/sort.rs
@@ -1041,7 +1041,7 @@ impl SortExec {
             preserve_partitioning: self.preserve_partitioning,
             fetch,
             cache,
-            dynamic_filter_source: self.dynamic_filter_source.clone(),
+            dynamic_filter_source: Arc::clone(&self.dynamic_filter_source),
         }
     }
 

From ff1aaa511356df8904ef10da2e2d8df2c21ef3bb Mon Sep 17 00:00:00 2001
From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Date: Tue, 1 Apr 2025 13:43:28 -0500
Subject: [PATCH 11/27] re-enable feature flag

---
 datafusion/physical-plan/src/sorts/sort.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs
index f4f51deb76e7..f5c2d6c7f34d 100644
--- a/datafusion/physical-plan/src/sorts/sort.rs
+++ b/datafusion/physical-plan/src/sorts/sort.rs
@@ -1249,8 +1249,10 @@ impl ExecutionPlan for SortExec {
                 while let Some(batch) = input.next().await {
                     let batch = batch?;
                     topk.insert_batch(batch)?;
-                    if let Some(values) = topk.get_threshold_values()? {
-                        dynamic_filter_source.update_values(&values)?;
+                    if context.session_config().options().optimizer.enable_dynamic_filter_pushdown {
+                        if let Some(values) = topk.get_threshold_values()? {
+                            dynamic_filter_source.update_values(&values)?;
+                        }
                     }
                 }
                 topk.emit()

From b0cc41f6ca99c3a4c7deeaee7a9ef889646c560c Mon Sep 17 00:00:00 2001
From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Date: Tue, 1 Apr 2025 14:37:10 -0500
Subject: [PATCH 12/27] arbitrarily sort in listingtable

---
 datafusion/datasource-parquet/src/source.rs   |  2 +-
 datafusion/datasource/src/file_groups.rs      | 12 ++-
 datafusion/datasource/src/source.rs           |  3 +-
 .../physical-plan/src/coalesce_batches.rs     |  2 +-
 .../physical-plan/src/dynamic_filters.rs      | 82 +++++++++++--------
 datafusion/physical-plan/src/filter.rs        |  4 +-
 .../physical-plan/src/repartition/mod.rs      |  2 +-
 datafusion/physical-plan/src/sorts/sort.rs    |  9 +-
 8 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs
index 86155a9bd38e..8690ac22e66a 100644
--- a/datafusion/datasource-parquet/src/source.rs
+++ b/datafusion/datasource-parquet/src/source.rs
@@ -581,11 +581,11 @@ impl FileSource for ParquetSource {
         &self,
         filters: &[&Arc],
     ) -> datafusion_common::Result> {
-        let mut conf = self.clone();
         if !self.pushdown_filters() {
             // If pushdown filters is not enabled, return early
             return Ok(None);
         }
+        let mut conf = self.clone();
         let predicate = match self.predicate.as_ref() {
diff --git a/datafusion/datasource/src/file_groups.rs b/datafusion/datasource/src/file_groups.rs
index 5fe3e25eaa1f..4068c048e67c 100644
--- a/datafusion/datasource/src/file_groups.rs
+++ b/datafusion/datasource/src/file_groups.rs
@@ -426,8 +426,16 @@ impl FileGroup {
 
         // ObjectStore::list does not guarantee any consistent order and for some
         // implementations such as LocalFileSystem, it may be inconsistent. Thus
-        // Sort files by path to ensure consistent plans when run more than once.
-        self.files.sort_by(|a, b| a.path().cmp(b.path()));
+        // Sort files by last_modified desc, path asc to ensure consistent plans when run more than once.
+        // The choice of sorting by last_modified desc is somewhat arbitrary, but the idea is that it will
+        // help speed up queries such as `select * from t1 order by timestamp_column desc limit 100`
+        // and that wanting "the latest" data is generally more common and latency sensitive than wanting "the oldest" data.
+        self.files.sort_by(|a, b| {
+            b.object_meta
+                .last_modified
+                .cmp(&a.object_meta.last_modified)
+                .then_with(|| a.path().cmp(b.path()))
+        });
 
         // effectively this is div with rounding up instead of truncating
         let chunk_size = self.len().div_ceil(n);
diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs
index 74700d25da08..285dab25b8ae 100644
--- a/datafusion/datasource/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -208,13 +208,14 @@ impl ExecutionPlan for DataSourceExec {
     }
 
     fn supports_filter_pushdown(&self) -> bool {
-        true
+        true // DataSourceExec can receive filter pushdowns from upstream operators
     }
 
     fn push_down_filters_from_parents(
         &self,
         filters: &[&Arc],
     ) -> datafusion_common::Result> {
+        // we forward filter pushdown to our data source
         if let Some(pushdown_result) = self.data_source.push_down_filters(filters)? {
{ let new_self = Arc::new(DataSourceExec::new(pushdown_result.inner)); return Ok(Some(ExecutionPlanFilterPushdownResult::new( diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index d21cdbf3a230..341ad347a836 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -214,7 +214,7 @@ impl ExecutionPlan for CoalesceBatchesExec { } fn supports_filter_pushdown(&self) -> bool { - true + true // CoalesceBatchesExec does not itself accept any filters but it is happy to have them pushed to it's children } } diff --git a/datafusion/physical-plan/src/dynamic_filters.rs b/datafusion/physical-plan/src/dynamic_filters.rs index d9aae419019b..48e2575b323a 100644 --- a/datafusion/physical-plan/src/dynamic_filters.rs +++ b/datafusion/physical-plan/src/dynamic_filters.rs @@ -21,12 +21,13 @@ use std::{ sync::{Arc, RwLock}, }; -use datafusion_common::Result; +use datafusion_common::{ + tree_node::{Transformed, TransformedResult, TreeNode}, + Result, +}; use datafusion_expr::ColumnarValue; use datafusion_physical_expr::{utils::conjunction, PhysicalExpr}; -use datafusion_physical_expr_common::physical_expr::{ - with_new_children_if_necessary, DynEq, DynHash, -}; +use datafusion_physical_expr_common::physical_expr::{DynEq, DynHash}; /// A source of dynamic runtime filters. /// @@ -55,23 +56,20 @@ impl PartialEq for dyn DynamicFilterSource { impl Eq for dyn DynamicFilterSource {} -#[derive(Debug, Eq, PartialEq, Hash)] -enum Children { - Remapped(Vec>), - Original(Vec>), -} - /// A wrapper around a [`DynamicFilterSource`] that allows it to be used as a physical expression. /// This will call [`DynamicFilterSource::snapshot_current_filters`] to get the current filters for each call to /// [`PhysicalExpr::evaluate`], [`PhysicalExpr::data_type`], and [`PhysicalExpr::nullable`]. /// It also implements [`PhysicalExpr::snapshot`] by forwarding the call to [`DynamicFilterSource::snapshot_current_filters`]. #[derive(Debug)] pub struct DynamicFilterPhysicalExpr { - /// The children of this expression. - /// In particular, it is important that if the dynamic expression will reference any columns - /// those columns be marked as children of this expression so that the expression can be properly - /// bound to the schema. - children: Children, + /// The original children of this PhysicalExpr, if any. + /// This is necessary because the dynamic filter may be initialized with a placeholder (e.g. `lit(true)`) + /// and later remapped to the actual expressions that are being filtered. + /// But we need to know the children (e.g. columns referenced in the expression) ahead of time to evaluate the expression correctly. + children: Vec>, + /// If any of the children were remapped / modified (e.g. to adjust for projections) we need to keep track of the new children + /// so that when we update `current()` in subsequent iterations we can re-apply the replacements. + remapped_children: Option>>, /// The source of dynamic filters. inner: Arc, /// For testing purposes track the data type and nullability to make sure they don't change. 
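Later in this diff `current()` re-applies those recorded replacements to every snapshot taken from the source. Read in isolation, that remapping step amounts to the following sketch; it assumes only the public `TreeNode` / `PhysicalExpr` APIs this file already imports, and the free-standing helper name is illustrative rather than part of the patch:

    use std::sync::Arc;

    use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
    use datafusion_common::Result;
    use datafusion_physical_expr::PhysicalExpr;

    /// Rewrite `expr` so that every occurrence of an original child is replaced by its
    /// remapped counterpart; `original` and `remapped` are parallel slices.
    fn remap_children(
        expr: Arc<dyn PhysicalExpr>,
        original: &[Arc<dyn PhysicalExpr>],
        remapped: &[Arc<dyn PhysicalExpr>],
    ) -> Result<Arc<dyn PhysicalExpr>> {
        expr.transform_up(|e| {
            // A child such as `ts@0` may have been re-bound (e.g. to `ts@2`) by a projection;
            // swap it for the remapped expression, otherwise leave the node untouched.
            if let Some(pos) = original.iter().position(|c| c.as_ref() == e.as_ref()) {
                Ok(Transformed::yes(Arc::clone(&remapped[pos])))
            } else {
                Ok(Transformed::no(e))
            }
        })
        .data()
    }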
@@ -85,12 +83,15 @@ impl Hash for DynamicFilterPhysicalExpr { fn hash(&self, state: &mut H) { self.inner.dyn_hash(state); self.children.dyn_hash(state); + self.remapped_children.dyn_hash(state); } } impl PartialEq for DynamicFilterPhysicalExpr { fn eq(&self, other: &Self) -> bool { - self.inner.dyn_eq(other.inner.as_any()) && self.children == other.children + self.inner.dyn_eq(other.inner.as_any()) + && self.children == other.children + && self.remapped_children == other.remapped_children } } @@ -108,7 +109,8 @@ impl DynamicFilterPhysicalExpr { inner: Arc, ) -> Self { Self { - children: Children::Original(children), + children, + remapped_children: None, // Initially no remapped children inner, data_type: Arc::new(RwLock::new(None)), nullable: Arc::new(RwLock::new(None)), @@ -117,17 +119,29 @@ impl DynamicFilterPhysicalExpr { fn current(&self) -> Result> { let current = conjunction(self.inner.snapshot_current_filters()?); - match self.children { - Children::Original(_) => { - // If the children are the original ones, we can just return the current expression - Ok(current) - } - Children::Remapped(ref remapped_children) => { - // If we have remapped children, we need to replace them in the current expression - let new_current = - with_new_children_if_necessary(current, remapped_children.clone())?; - Ok(new_current) - } + if let Some(remapped_children) = &self.remapped_children { + // Remap children to the current children + // of the expression. + current + .transform_up(|expr| { + // Check if this is any of our original children + if let Some(pos) = self + .children + .iter() + .position(|c| c.as_ref() == expr.as_ref()) + { + // If so, remap it to the current children + // of the expression. + let new_child = Arc::clone(&remapped_children[pos]); + Ok(Transformed::yes(new_child)) + } else { + // Otherwise, just return the expression + Ok(Transformed::no(expr)) + } + }) + .data() + } else { + Ok(current) } } } @@ -138,10 +152,11 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { } fn children(&self) -> Vec<&Arc> { - match &self.children { - Children::Original(children) => children.iter().collect(), - Children::Remapped(children) => children.iter().collect(), - } + self.remapped_children + .as_ref() + .unwrap_or(&self.children) + .iter() + .collect() } fn with_new_children( @@ -149,7 +164,8 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { children: Vec>, ) -> Result> { Ok(Arc::new(Self { - children: Children::Remapped(children), + children: self.children.clone(), + remapped_children: Some(children), inner: Arc::clone(&self.inner), data_type: Arc::clone(&self.data_type), nullable: Arc::clone(&self.nullable), diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 1d55b4e78921..0e63acfbd841 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -438,7 +438,7 @@ impl ExecutionPlan for FilterExec { } fn supports_filter_pushdown(&self) -> bool { - true + true // FilterExec both accepts filters and is happy for them to be pushed onto its children } fn filters_for_pushdown(&self) -> Result>> { @@ -470,8 +470,6 @@ impl ExecutionPlan for FilterExec { let predicate = conjunction(new_filters); - println!("predicate: {:?}", predicate); - if predicate.eq(&lit(true)) && self.projection.is_none() { return Ok(Some(Arc::clone(self.input()))); } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 54cd2bd538e4..f3306e49b04b 100644 --- 
a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -725,7 +725,7 @@ impl ExecutionPlan for RepartitionExec { } fn supports_filter_pushdown(&self) -> bool { - true + true // RepartitionExec does not accept filters itself but is happy for them to be pushed down to its children } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index f5c2d6c7f34d..9bb7db325da1 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1249,7 +1249,12 @@ impl ExecutionPlan for SortExec { while let Some(batch) = input.next().await { let batch = batch?; topk.insert_batch(batch)?; - if context.session_config().options().optimizer.enable_dynamic_filter_pushdown { + if context + .session_config() + .options() + .optimizer + .enable_dynamic_filter_pushdown + { if let Some(values) = topk.get_threshold_values()? { dynamic_filter_source.update_values(&values)?; } @@ -1348,7 +1353,7 @@ impl ExecutionPlan for SortExec { } fn supports_filter_pushdown(&self) -> bool { - true + true // SortExec doesn't accept filters itself but it's happy for them to be forwarded down to it's children } } From 3a185cc9adeff40e5cb09a833a146859de24ef6c Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 14:49:55 -0500 Subject: [PATCH 13/27] fix tests --- datafusion/physical-optimizer/src/filter_pushdown.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index f607b5035261..46a081bf5310 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -67,6 +67,8 @@ fn pushdown_filters( } else { new_children.push(Arc::clone(child)); } + filter_pushdown_result = + vec![FilterPushdownSupport::Inexact; all_filters.len()]; }; } From b664183580e5f884b0c3e814ce61061989499b89 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 15:04:50 -0500 Subject: [PATCH 14/27] wip --- datafusion/physical-plan/src/dynamic_filters.rs | 17 ++++++++++++++--- .../physical-plan/src/sorts/sort_filters.rs | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/dynamic_filters.rs b/datafusion/physical-plan/src/dynamic_filters.rs index 48e2575b323a..e76502779416 100644 --- a/datafusion/physical-plan/src/dynamic_filters.rs +++ b/datafusion/physical-plan/src/dynamic_filters.rs @@ -17,6 +17,7 @@ use std::{ any::Any, + fmt::Display, hash::Hash, sync::{Arc, RwLock}, }; @@ -36,7 +37,7 @@ use datafusion_physical_expr_common::physical_expr::{DynEq, DynHash}; /// /// See `TopKDynamicFilterSource` in datafusion/physical-plan/src/topk/mod.rs for examples. pub trait DynamicFilterSource: - Send + Sync + std::fmt::Debug + DynEq + DynHash + 'static + Send + Sync + std::fmt::Debug + DynEq + DynHash + Display + 'static { /// Take a snapshot of the current state of filtering, returning a non-dynamic PhysicalExpr. /// This is used to e.g. 
serialize dynamic filters across the wire or to pass them into systems @@ -97,9 +98,9 @@ impl PartialEq for DynamicFilterPhysicalExpr { impl Eq for DynamicFilterPhysicalExpr {} -impl std::fmt::Display for DynamicFilterPhysicalExpr { +impl Display for DynamicFilterPhysicalExpr { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "DynamicFilterPhysicalExpr") + write!(f, "DynamicFilterPhysicalExpr [ {} ]", self.inner) } } @@ -300,6 +301,16 @@ mod test { } } + impl Display for MockDynamicFilterSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MockDynamicFilterSource [ current_expr: {:?} ]", + self.current_expr.read().unwrap() + ) + } + } + let source = Arc::new(MockDynamicFilterSource { current_expr: Arc::new(RwLock::new(lit(42) as Arc)), }); diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index 4020769d30c2..a79b079d3ed9 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -16,6 +16,7 @@ // under the License. use std::{ + fmt::Display, hash::{Hash, Hasher}, sync::{Arc, RwLock}, }; @@ -168,6 +169,19 @@ impl SortDynamicFilterSource { } } +impl Display for SortDynamicFilterSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let thresholds = self + .snapshot_current_filters() + .map_err(|_| std::fmt::Error)? + .iter() + .map(|p| format!("{p}")) + .collect::>(); + let inner = thresholds.join(","); + write!(f, "SortDynamicFilterSource[ {} ]", inner,) + } +} + impl DynamicFilterSource for SortDynamicFilterSource { fn as_any(&self) -> &dyn std::any::Any { self From 67ed488151d872075a3ad57756b4838ca2d20a6f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 15:12:24 -0500 Subject: [PATCH 15/27] update tests --- datafusion/core/tests/parquet/mod.rs | 27 +++++++++---- .../physical-plan/src/sorts/sort_filters.rs | 38 ++++++++++++++++--- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index a9ec0d630aa4..8c657d7c0c1b 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -1266,7 +1266,7 @@ async fn test_topk_predicate_pushdown_nulls_first() { assert_batches_eq!(expected, &batches); let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=3"); + assert_contains!(&plan, "row_groups_pruned_statistics=5"); } #[tokio::test] @@ -1282,7 +1282,6 @@ async fn test_topk_predicate_pushdown_multi_key() { let query = "select id from base_table order by name desc, id limit 3"; let test_case = DynamicFilterTestCase::new(query.to_string(), path.clone()); - let batches = test_case.results().await; #[rustfmt::skip] let expected = [ @@ -1295,13 +1294,28 @@ async fn test_topk_predicate_pushdown_multi_key() { "+----+", ]; assert_batches_eq!(expected, &batches); + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=0"); + let query1 = "select id from base_table order by name desc, id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query1.to_string(), path.clone()); + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 4 |", + "| 3 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); let plan = 
test_case.explain_plan().await; assert_contains!(&plan, "row_groups_pruned_statistics=1"); - let query1 = "select id from base_table order by name desc, id desc limit 3"; + let query1 = "select id from base_table order by name asc, id desc limit 3"; let test_case = DynamicFilterTestCase::new(query1.to_string(), path); - let batches = test_case.results().await; #[rustfmt::skip] let expected = [ @@ -1314,9 +1328,8 @@ async fn test_topk_predicate_pushdown_multi_key() { "+----+", ]; assert_batches_eq!(expected, &batches); - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=0"); + assert_contains!(&plan, "row_groups_pruned_statistics=1"); } #[tokio::test] @@ -1348,7 +1361,7 @@ async fn test_topk_predicate_pushdown_nulls_last() { assert_batches_eq!(expected, &batches); let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=4"); + assert_contains!(&plan, "row_groups_pruned_statistics=3"); } #[tokio::test] diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index a79b079d3ed9..8ddd024ef0f0 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -136,11 +136,39 @@ impl SortDynamicFilterSource { let new_value_is_greater = new_value > current_value; let new_value_is_null = new_value.is_null(); let current_value_is_null = current_value.is_null(); - if (nulls_first && new_value_is_null && !current_value_is_null) - || (descending && new_value_is_greater) - || (!descending && !new_value_is_greater) - { - // *current_value = new_value.clone(); + + let update_needed = match (nulls_first, descending) { + // For nulls_first + descending: update if new value is null (and current is not) or if new value is greater + (true, true) => { + (new_value_is_null && !current_value_is_null) + || (!new_value_is_null + && !current_value_is_null + && new_value_is_greater) + } + // For nulls_first + ascending: update if new value is null (and current is not) or if new value is smaller + (true, false) => { + (new_value_is_null && !current_value_is_null) + || (!new_value_is_null + && !current_value_is_null + && !new_value_is_greater) + } + // For nulls_last + descending: update if new value is not null (and current is null) or if new value is greater + (false, true) => { + (!new_value_is_null && current_value_is_null) + || (!new_value_is_null + && !current_value_is_null + && new_value_is_greater) + } + // For nulls_last + ascending: update if new value is not null (and current is null) or if new value is smaller + (false, false) => { + (!new_value_is_null && current_value_is_null) + || (!new_value_is_null + && !current_value_is_null + && !new_value_is_greater) + } + }; + + if update_needed { threshold.value = Some(new_value.clone()); } } else { From 48910b21a3b82a796f044089e8975fc0d2637f28 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 17:14:38 -0500 Subject: [PATCH 16/27] fix --- .../datasource/physical_plan/arrow_file.rs | 2 +- datafusion/datasource-parquet/src/source.rs | 5 +- .../physical-optimizer/src/filter_pushdown.rs | 100 ++++++++++++------ datafusion/physical-optimizer/src/pruning.rs | 2 + 4 files changed, 74 insertions(+), 35 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 5dcf4df73f57..692fc1df28bb 100644 --- 
a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -201,7 +201,7 @@ impl ExecutionPlan for ArrowExec { /// Arrow configuration struct that is given to DataSourceExec /// Does not hold anything special, since [`FileScanConfig`] is sufficient for arrow -#[derive(Clone, Default)] +#[derive(Clone, Default, Debug)] pub struct ArrowSource { metrics: ExecutionPlanMetricsSet, projected_statistics: Option, diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 8690ac22e66a..bc8cb48f24a0 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -17,6 +17,7 @@ //! ParquetSource implementation for reading parquet files use std::any::Any; +use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; @@ -581,10 +582,6 @@ impl FileSource for ParquetSource { &self, filters: &[&Arc], ) -> datafusion_common::Result> { - if !self.pushdown_filters() { - // If pushdown filters is not enabled, return early - return Ok(None); - } let mut conf = self.clone(); let predicate = match self.predicate.as_ref() { Some(existing_predicate) => { diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 46a081bf5310..fbd8b43b1472 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -26,6 +26,43 @@ use datafusion_physical_plan::{ use crate::PhysicalOptimizerRule; +#[derive(Clone, Copy, Debug)] +enum FilterPushdownSupportState { + ChildExact, + ChildInexact, + NoChild, +} + +impl FilterPushdownSupportState { + fn combine_with_other( + &self, + other: &FilterPushdownSupport, + ) -> FilterPushdownSupportState { + match (other, self) { + (FilterPushdownSupport::Exact, FilterPushdownSupportState::NoChild) => { + FilterPushdownSupportState::ChildExact + } + (FilterPushdownSupport::Exact, FilterPushdownSupportState::ChildInexact) => { + FilterPushdownSupportState::ChildInexact + } + (FilterPushdownSupport::Inexact, FilterPushdownSupportState::NoChild) => { + FilterPushdownSupportState::ChildInexact + } + (FilterPushdownSupport::Inexact, FilterPushdownSupportState::ChildExact) => { + FilterPushdownSupportState::ChildInexact + } + ( + FilterPushdownSupport::Inexact, + FilterPushdownSupportState::ChildInexact, + ) => FilterPushdownSupportState::ChildInexact, + (FilterPushdownSupport::Exact, FilterPushdownSupportState::ChildExact) => { + // If both are exact, keep it as exact + FilterPushdownSupportState::ChildExact + } + } + } +} + fn pushdown_filters( node: &Arc, parent_filters: &[Arc], @@ -38,27 +75,19 @@ fn pushdown_filters( .chain(node_filters.iter()) .cloned() .collect::>(); - let mut filter_pushdown_result = if children.is_empty() { - vec![FilterPushdownSupport::Inexact; all_filters.len()] - } else { - vec![FilterPushdownSupport::Exact; all_filters.len()] - }; + let mut filter_pushdown_result = + vec![FilterPushdownSupportState::NoChild; all_filters.len()]; for child in children { if child.supports_filter_pushdown() { if let Some(result) = pushdown_filters(child, &all_filters)? 
{ new_children.push(result.inner); for (all_filters_idx, support) in result.support.iter().enumerate() { - if !matches!(support, FilterPushdownSupport::Exact) { - filter_pushdown_result[all_filters_idx] = - FilterPushdownSupport::Inexact; - } + filter_pushdown_result[all_filters_idx] = filter_pushdown_result + [all_filters_idx] + .combine_with_other(support) } } else { new_children.push(Arc::clone(child)); - // If the child does not support filter pushdown, mark all filters as inexact - for support in filter_pushdown_result.iter_mut() { - *support = FilterPushdownSupport::Inexact; - } } } else { // Reset the filters we are pushing down. @@ -67,24 +96,32 @@ fn pushdown_filters( } else { new_children.push(Arc::clone(child)); } - filter_pushdown_result = - vec![FilterPushdownSupport::Inexact; all_filters.len()]; }; } - let mut result_node = with_new_children_if_necessary(Arc::clone(node), new_children)?; + let mut node = with_new_children_if_necessary(Arc::clone(node), new_children)?; // Now update the node with the result of the pushdown of it's filters - let pushdown_result = filter_pushdown_result[parent_filters.len()..].to_vec(); + let pushdown_result = filter_pushdown_result[parent_filters.len()..] + .iter() + .map(|s| match s { + FilterPushdownSupportState::ChildExact => FilterPushdownSupport::Exact, + FilterPushdownSupportState::ChildInexact => FilterPushdownSupport::Inexact, + FilterPushdownSupportState::NoChild => FilterPushdownSupport::Inexact, + }) + .collect::>(); if let Some(new_node) = - Arc::clone(node).with_filter_pushdown_result(&pushdown_result)? + Arc::clone(&node).with_filter_pushdown_result(&pushdown_result)? { - result_node = new_node; + node = new_node; }; // And check if it can absorb the remaining filters let remaining_filter_indexes = (0..parent_filters.len()) - .filter(|&i| !matches!(filter_pushdown_result[i], FilterPushdownSupport::Exact)) + .filter(|&i| match filter_pushdown_result[i] { + FilterPushdownSupportState::ChildExact => false, + _ => true, + }) .collect::>(); if !remaining_filter_indexes.is_empty() { let remaining_filters = remaining_filter_indexes @@ -92,22 +129,25 @@ fn pushdown_filters( .map(|&i| &parent_filters[i]) .collect::>(); if let Some(result) = node.push_down_filters_from_parents(&remaining_filters)? 
{ - result_node = result.inner; + node = result.inner; for (parent_filter_index, support) in remaining_filter_indexes.iter().zip(result.support) { - // If any of the remaining filters are not exact, mark them as inexact - if !matches!(support, FilterPushdownSupport::Exact) { - filter_pushdown_result[*parent_filter_index] = - FilterPushdownSupport::Inexact; - } + filter_pushdown_result[*parent_filter_index] = filter_pushdown_result + [*parent_filter_index] + .combine_with_other(&support) } } } - Ok(Some(ExecutionPlanFilterPushdownResult::new( - result_node, - filter_pushdown_result[..parent_filters.len()].to_vec(), // only return the support for the original parent filters - ))) + let support = filter_pushdown_result[..parent_filters.len()] + .iter() + .map(|s| match s { + FilterPushdownSupportState::ChildExact => FilterPushdownSupport::Exact, + FilterPushdownSupportState::ChildInexact => FilterPushdownSupport::Inexact, + FilterPushdownSupportState::NoChild => FilterPushdownSupport::Inexact, + }) + .collect::>(); + Ok(Some(ExecutionPlanFilterPushdownResult::new(node, support))) } #[derive(Debug)] diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index f0eb5f290094..8a26f2c892b2 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -529,6 +529,8 @@ impl PruningPredicate { /// See the struct level documentation on [`PruningPredicate`] for more /// details. pub fn try_new(expr: Arc, schema: SchemaRef) -> Result { + // Get a (simpler) snapshot of the physical expr here to use with `PruningPredicate` + // which does not handle dynamic exprs in general let expr = snapshot_physical_expr(expr)?; let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _; From d178dc9c59cd9babf069d72dbfc1520ba0597a3c Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 22:46:14 -0500 Subject: [PATCH 17/27] fix parquet tests --- datafusion/datasource-parquet/src/source.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index bc8cb48f24a0..3a1ce61525d1 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -596,9 +596,17 @@ impl FileSource for ParquetSource { match predicate { Some(new_predicate) if !new_predicate.eq(&lit(true)) => { conf.predicate = Some(new_predicate); + // Respect the current pushdown filters setting, + // otherwise we would mark filters as exact but then not filter at the row level + // because the setting gets checked again inside the ParquetOpener! 
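                // In other words: reporting `Exact` allows the `FilterExec` above to drop the
                // predicate entirely because the scan guarantees row-level filtering, while
                // `Inexact` means the scan only uses it for pruning (row groups, pages, bloom
                // filters) and the parent must keep re-applying the predicate to returned rows.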
+ let support = if self.table_parquet_options.global.pushdown_filters { + vec![FilterPushdownSupport::Exact; filters.len()] + } else { + vec![FilterPushdownSupport::Inexact; filters.len()] + }; Ok(Some(FileSourceFilterPushdownResult::new( Arc::new(conf), - vec![FilterPushdownSupport::Exact; filters.len()], + support, ))) } _no_op_predicate => Ok(None), From 615283cc10ecce22874f3b9ec47c62eec82689b0 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 1 Apr 2025 23:46:53 -0500 Subject: [PATCH 18/27] better test --- .../tests/fuzz_cases/topk_filter_pushdown.rs | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index aafb38a5d542..2b1b905d9390 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -21,6 +21,7 @@ use std::sync::{Arc, LazyLock}; use arrow::array::{Int32Array, StringArray, StringDictionaryBuilder}; use arrow::datatypes::Int32Type; use arrow::record_batch::RecordBatch; +use arrow::util::pretty::pretty_format_batches; use arrow_schema::{DataType, Field, Schema}; use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig}; use datafusion::prelude::{SessionConfig, SessionContext}; @@ -200,6 +201,20 @@ struct RunQueryResult { expected: Vec, } +impl RunQueryResult { + fn expected_formated(&self) -> String { + format!("{}", pretty_format_batches(&self.expected).unwrap()) + } + + fn result_formated(&self) -> String { + format!("{}", pretty_format_batches(&self.result).unwrap()) + } + + fn is_ok(&self) -> bool { + self.expected_formated() == self.result_formated() + } +} + async fn run_query( query: String, cfg: SessionConfig, @@ -318,7 +333,22 @@ async fn test_fuzz_topk_filter_pushdown() { results.sort_unstable_by(|a, b| a.query.cmp(&b.query)); println!("Ran {} test cases in {:?}", results.len(), start.elapsed()); - for result in results { - assert_eq!(result.result, result.expected, "Query: {}", result.query); + let failures = results + .iter() + .filter(|result| !result.is_ok()) + .collect::>(); + + for failure in &failures { + println!("Failure:"); + println!("Query:\n{}", failure.query); + println!("\nExpected:\n{}", failure.expected_formated()); + println!("\nResult:\n{}", failure.result_formated()); + println!("\n\n"); + } + + if !failures.is_empty() { + panic!("Some test cases failed"); + } else { + println!("All test cases passed"); } } From cf24b10ed0b656ea9bd0baf22d1f58d7d9c72da0 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:09:35 -0500 Subject: [PATCH 19/27] fix more --- .../core/src/datasource/listing/table.rs | 24 +-- datafusion/core/tests/parquet/mod.rs | 6 +- datafusion/core/tests/sql/path_partition.rs | 18 +- datafusion/datasource-parquet/src/opener.rs | 19 +- datafusion/datasource-parquet/src/source.rs | 1 + datafusion/physical-plan/src/sorts/sort.rs | 2 +- .../physical-plan/src/sorts/sort_filters.rs | 196 ++++++++---------- 7 files changed, 113 insertions(+), 153 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 61eeb419a480..5595931c1006 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -36,22 +36,19 @@ use 
datafusion_catalog::TableProvider; use datafusion_common::{config_err, DataFusionError, Result}; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_expr::dml::InsertOp; -use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; +use datafusion_expr::{Expr, TableProviderFilterPushDown}; use datafusion_expr::{SortExpr, TableType}; use datafusion_physical_plan::empty::EmptyExec; use datafusion_physical_plan::{ExecutionPlan, Statistics}; use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; use datafusion_common::{ - config_datafusion_err, internal_err, plan_err, project_schema, Constraints, - SchemaExt, ToDFSchema, + config_datafusion_err, internal_err, plan_err, project_schema, Constraints, SchemaExt, }; use datafusion_execution::cache::{ cache_manager::FileStatisticsCache, cache_unit::DefaultFileStatisticsCache, }; -use datafusion_physical_expr::{ - create_physical_expr, LexOrdering, PhysicalSortRequirement, -}; +use datafusion_physical_expr::{LexOrdering, PhysicalSortRequirement}; use async_trait::async_trait; use datafusion_catalog::Session; @@ -918,19 +915,6 @@ impl TableProvider for ListingTable { None => {} // no ordering required }; - let filters = match conjunction(filters.to_vec()) { - Some(expr) => { - let table_df_schema = self.table_schema.as_ref().clone().to_dfschema()?; - let filters = create_physical_expr( - &expr, - &table_df_schema, - state.execution_props(), - )?; - Some(filters) - } - None => None, - }; - let Some(object_store_url) = self.table_paths.first().map(ListingTableUrl::object_store) else { @@ -955,7 +939,7 @@ impl TableProvider for ListingTable { .with_output_ordering(output_ordering) .with_table_partition_cols(table_partition_cols) .build(), - filters.as_ref(), + None, ) .await } diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 8c657d7c0c1b..3c7c1e487bf4 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -1231,7 +1231,7 @@ async fn test_topk_predicate_pushdown() { assert_batches_eq!(expected, &batches); let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=3"); + assert_contains!(&plan, "row_groups_pruned_statistics=2"); } #[tokio::test] @@ -1266,7 +1266,7 @@ async fn test_topk_predicate_pushdown_nulls_first() { assert_batches_eq!(expected, &batches); let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=5"); + assert_contains!(&plan, "row_groups_pruned_statistics=3"); } #[tokio::test] @@ -1361,7 +1361,7 @@ async fn test_topk_predicate_pushdown_nulls_last() { assert_batches_eq!(expected, &batches); let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=3"); + assert_contains!(&plan, "row_groups_pruned_statistics=0"); } #[tokio::test] diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs index bf8466d849f2..773e30914e9c 100644 --- a/datafusion/core/tests/sql/path_partition.rs +++ b/datafusion/core/tests/sql/path_partition.rs @@ -42,8 +42,7 @@ use datafusion_common::stats::Precision; use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::ScalarValue; use datafusion_execution::config::SessionConfig; -use datafusion_expr::{col, lit, Expr, Operator}; -use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; +use datafusion_expr::{col, lit, Expr}; use 
async_trait::async_trait; use bytes::Bytes; @@ -90,18 +89,9 @@ async fn parquet_partition_pruning_filter() -> Result<()> { if let Some((_, parquet_config)) = data_source_exec.downcast_to_file_source::() { - let pred = parquet_config.predicate().unwrap(); - // Only the last filter should be pushdown to TableScan - let expected = Arc::new(BinaryExpr::new( - Arc::new(Column::new_with_schema("id", &exec.schema()).unwrap()), - Operator::Gt, - Arc::new(Literal::new(ScalarValue::Int32(Some(1)))), - )); - - assert!(pred.as_any().is::()); - let pred = pred.as_any().downcast_ref::().unwrap(); - - assert_eq!(pred, expected.as_ref()); + assert!(parquet_config.predicate().is_none()); + } else { + panic!("Expected parquet source"); } Ok(()) } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 4752aaadee1d..8717d5301421 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -76,6 +76,8 @@ pub(super) struct ParquetOpener { pub enable_bloom_filter: bool, /// Schema adapter factory pub schema_adapter_factory: Arc, + /// Should row group pruning be applied + pub enable_stats_pruning: bool, } impl FileOpener for ParquetOpener { @@ -109,6 +111,7 @@ impl FileOpener for ParquetOpener { let reorder_predicates = self.reorder_filters; let pushdown_filters = self.pushdown_filters; let enable_bloom_filter = self.enable_bloom_filter; + let enable_stats_pruning = self.enable_stats_pruning; let limit = self.limit; let predicate_creation_errors = MetricBuilder::new(&self.metrics) @@ -207,13 +210,15 @@ impl FileOpener for ParquetOpener { } // If there is a predicate that can be evaluated against the metadata if let Some(predicate) = predicate.as_ref() { - row_groups.prune_by_statistics( - &file_schema, - builder.parquet_schema(), - rg_metadata, - predicate, - &file_metrics, - ); + if enable_stats_pruning { + row_groups.prune_by_statistics( + &file_schema, + builder.parquet_schema(), + rg_metadata, + predicate, + &file_metrics, + ); + } if enable_bloom_filter && !row_groups.is_empty() { row_groups diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 3a1ce61525d1..5b8581ed1223 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -489,6 +489,7 @@ impl FileSource for ParquetSource { reorder_filters: self.reorder_filters(), enable_page_index: self.enable_page_index(), enable_bloom_filter: self.bloom_filter_on_read(), + enable_stats_pruning: self.table_parquet_options.global.pruning, schema_adapter_factory, }) } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 9bb7db325da1..56540ff31d18 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -980,7 +980,7 @@ impl SortExec { pub fn new(expr: LexOrdering, input: Arc) -> Self { let preserve_partitioning = false; let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning); - let dynamic_filter_source = Arc::new(SortDynamicFilterSource::new(&expr)); + let dynamic_filter_source = Arc::new(SortDynamicFilterSource::new(expr.clone())); Self { expr, input, diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index 8ddd024ef0f0..dc2d6127488e 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -21,7 +21,6 
@@ use std::{ sync::{Arc, RwLock}, }; -use arrow_schema::SortOptions; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::Operator; use datafusion_physical_expr::{ @@ -31,17 +30,6 @@ use datafusion_physical_expr::{ use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; -/// Holds threshold value and sort order information for a column -#[derive(Debug, Clone)] -struct ColumnThreshold { - /// The current threshold value - pub value: Option, - /// The column expression - pub expr: Arc, - /// Sort options - pub sort_options: SortOptions, -} - /// Pushdown of dynamic fitlers from sort + limit operators (aka `TopK`) is used to speed up queries /// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the /// threshold values for the sort columns to the data source. @@ -79,7 +67,10 @@ struct ColumnThreshold { // So this optimization just saved us 50% of the work of scanning the data. #[derive(Debug)] pub struct SortDynamicFilterSource { - thresholds: Arc>>, + /// Sort expressions + expr: LexOrdering, + /// Current threshold values + thresholds: Arc>>>, } impl Hash for SortDynamicFilterSource { @@ -99,80 +90,78 @@ impl PartialEq for SortDynamicFilterSource { impl Eq for SortDynamicFilterSource {} impl SortDynamicFilterSource { - pub fn new(ordering: &LexOrdering) -> Self { - let thresholds = ordering - .iter() - .map(|sort_expr| ColumnThreshold { - value: None, - expr: Arc::clone(&sort_expr.expr), - sort_options: sort_expr.options, - }) - .collect(); - - let thresholds = Arc::new(RwLock::new(thresholds)); - - Self { thresholds } + pub fn new(expr: LexOrdering) -> Self { + let thresholds = Arc::new(RwLock::new(vec![None; expr.len()])); + Self { expr, thresholds } } pub fn update_values(&self, new_values: &[ScalarValue]) -> Result<()> { - let mut thresholds = self.thresholds.write().map_err(|_| { - datafusion_common::DataFusionError::Execution( - "Failed to acquire write lock on thresholds".to_string(), - ) - })?; - if new_values.len() != thresholds.len() { - return Err(datafusion_common::DataFusionError::Execution( - "The number of new values does not match the number of thresholds" - .to_string(), - )); - } - for (i, new_value) in new_values.iter().enumerate() { - let threshold = &mut thresholds[i]; - let descending = threshold.sort_options.descending; - let nulls_first = threshold.sort_options.nulls_first; - let current_value = &threshold.value; - // Check if the new value is more or less selective than the current value given the sorting - if let Some(current_value) = current_value { - let new_value_is_greater = new_value > current_value; - let new_value_is_null = new_value.is_null(); - let current_value_is_null = current_value.is_null(); - - let update_needed = match (nulls_first, descending) { - // For nulls_first + descending: update if new value is null (and current is not) or if new value is greater - (true, true) => { - (new_value_is_null && !current_value_is_null) - || (!new_value_is_null - && !current_value_is_null - && new_value_is_greater) + let replace = { + let thresholds = self.thresholds.read().map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire write lock on thresholds".to_string(), + ) + })?; + if new_values.len() != thresholds.len() { + return Err(datafusion_common::DataFusionError::Execution( + "The number of new values does not match the number of thresholds" + .to_string(), + )); + } + // We need to decide if these values replace our current values or not. 
+ // They only replace our current values if they would sort before them given our sorting expression. + // Importantly, since this may be a multi-expressions sort, we need to check that **the entire expression** + // sorts before the current set of values, not just one column. + // This means that if we have a sort expression like `a, b` and the new value is `a = 1, b = 2` + // and the current value is `a = 1, b = 3` we need to check that `a = 1, b = 2` sorts before `a = 1, b = 3` + // and not just that `a = 1` sorts before `a = 1`. + // We also have to handle ASC/DESC and NULLS FIRST/LAST for each column. + let mut replace = true; + for (i, new_value) in new_values.iter().enumerate() { + let current_value = &thresholds[i]; + let sort_expr = &self.expr[i]; + let descending = sort_expr.options.descending; + let nulls_first = sort_expr.options.nulls_first; + if let Some(current_value) = current_value { + let new_value_is_greater_than_current = new_value.gt(current_value); + let new_value_is_null = new_value.is_null(); + let current_value_is_null = current_value.is_null(); + // Handle the null cases + if current_value_is_null && !new_value_is_null && nulls_first { + replace = false; + break; } - // For nulls_first + ascending: update if new value is null (and current is not) or if new value is smaller - (true, false) => { - (new_value_is_null && !current_value_is_null) - || (!new_value_is_null - && !current_value_is_null - && !new_value_is_greater) + if new_value_is_null && !current_value_is_null && !nulls_first { + replace = false; + break; } - // For nulls_last + descending: update if new value is not null (and current is null) or if new value is greater - (false, true) => { - (!new_value_is_null && current_value_is_null) - || (!new_value_is_null - && !current_value_is_null - && new_value_is_greater) + // Handle the descending case + if descending { + if new_value_is_greater_than_current { + replace = false; + break; + } + } else if !new_value_is_greater_than_current { + replace = false; + break; } - // For nulls_last + ascending: update if new value is not null (and current is null) or if new value is smaller - (false, false) => { - (!new_value_is_null && current_value_is_null) - || (!new_value_is_null - && !current_value_is_null - && !new_value_is_greater) + // Handle the equality case + if new_value.eq(current_value) { + replace = false; + break; } - }; - - if update_needed { - threshold.value = Some(new_value.clone()); } - } else { - threshold.value = Some(new_value.clone()); + } + replace + }; + if replace { + let mut thresholds = self.thresholds.write().map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire write lock on thresholds".to_string(), + ) + })?; + for (i, new_value) in new_values.iter().enumerate() { + thresholds[i] = Some(new_value.clone()); } } Ok(()) @@ -180,15 +169,9 @@ impl SortDynamicFilterSource { pub fn as_physical_expr(self: &Arc) -> Result> { let children = self - .thresholds - .read() - .map_err(|_| { - datafusion_common::DataFusionError::Execution( - "Failed to acquire read lock on thresholds".to_string(), - ) - })? 
+ .expr .iter() - .map(|threshold| Arc::clone(&threshold.expr)) + .map(|sort_expr| Arc::clone(&sort_expr.expr)) .collect::>(); Ok(Arc::new(DynamicFilterPhysicalExpr::new( children, @@ -226,9 +209,7 @@ impl DynamicFilterSource for SortDynamicFilterSource { Vec::with_capacity(thresholds.len()); let mut prev_sort_expr: Option> = None; - for threshold in thresholds.iter() { - let value = &threshold.value; - + for (sort_expr, value) in self.expr.iter().zip(thresholds.iter()) { let Some(value) = value else { // If the value is None, we cannot create a filter for this threshold // This means we skip this column for filtering @@ -236,7 +217,7 @@ impl DynamicFilterSource for SortDynamicFilterSource { }; // Create the appropriate operator based on sort order - let op = if threshold.sort_options.descending { + let op = if sort_expr.options.descending { // For descending sort, we want col > threshold (exclude smaller values) Operator::Gt } else { @@ -247,35 +228,34 @@ impl DynamicFilterSource for SortDynamicFilterSource { let value_null = value.is_null(); let comparison = Arc::new(BinaryExpr::new( - Arc::clone(&threshold.expr), + Arc::clone(&sort_expr.expr), op, lit(value.clone()), )); - let comparison_with_null = - match (threshold.sort_options.nulls_first, value_null) { - // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) - (true, true) => lit(false), - (true, false) => Arc::new(BinaryExpr::new( - is_null(Arc::clone(&threshold.expr))?, - Operator::Or, - comparison, - )), - // For nulls last, transform to (threshold.value is null and threshold.expr is not null) - // or (threshold.value is not null and comparison) - (false, true) => is_not_null(Arc::clone(&threshold.expr))?, - (false, false) => comparison, - }; + let comparison_with_null = match (sort_expr.options.nulls_first, value_null) { + // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) + (true, true) => lit(false), + (true, false) => Arc::new(BinaryExpr::new( + is_null(Arc::clone(&sort_expr.expr))?, + Operator::Or, + comparison, + )), + // For nulls last, transform to (threshold.value is null and threshold.expr is not null) + // or (threshold.value is not null and comparison) + (false, true) => is_not_null(Arc::clone(&sort_expr.expr))?, + (false, false) => comparison, + }; let mut eq_expr = Arc::new(BinaryExpr::new( - Arc::clone(&threshold.expr), + Arc::clone(&sort_expr.expr), Operator::Eq, lit(value.clone()), )); if value_null { eq_expr = Arc::new(BinaryExpr::new( - is_null(Arc::clone(&threshold.expr))?, + is_null(Arc::clone(&sort_expr.expr))?, Operator::Or, eq_expr, )); From 054b4154d88da320c3f383231d0026bf838214c2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:47:26 -0500 Subject: [PATCH 20/27] comments --- .../core/tests/parquet/file_statistics.rs | 29 +++++++++++++++++-- .../physical-optimizer/src/filter_pushdown.rs | 8 +++-- datafusion/physical-plan/src/sorts/sort.rs | 12 ++++---- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 7e98ebed6c9a..50dbe4d787a2 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -28,6 +28,7 @@ use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; use 
datafusion::prelude::SessionContext; use datafusion_common::stats::Precision; +use datafusion_common::DFSchema; use datafusion_execution::cache::cache_manager::CacheManagerConfig; use datafusion_execution::cache::cache_unit::{ DefaultFileStatisticsCache, DefaultListFilesCache, @@ -37,6 +38,10 @@ use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::{col, lit, Expr}; use datafusion::datasource::physical_plan::FileScanConfig; +use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; +use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::ExecutionPlan; use tempfile::tempdir; #[tokio::test] @@ -55,8 +60,28 @@ async fn check_stats_precision_with_filter_pushdown() { // Scan with filter pushdown, stats are inexact let filter = Expr::gt(col("id"), lit(1)); - let exec = table.scan(&state, None, &[filter], None).await.unwrap(); - assert_eq!(exec.statistics().unwrap().num_rows, Precision::Inexact(8)); + let data_source_exec = table + .scan(&state, None, &[filter.clone()], None) + .await + .unwrap(); + let df_schema = DFSchema::try_from(table.schema()).unwrap(); + let exec = FilterExec::try_new( + state + .create_physical_expr(filter.clone(), &df_schema) + .unwrap(), + data_source_exec, + ) + .unwrap(); + let exec = FilterPushdown::new() + .optimize(Arc::new(exec), state.config().options()) + .unwrap(); + let filter_exec = exec.as_any().downcast_ref::().unwrap(); + // TODO: we need to get the FilterExec to push down its filters + // since they no longer get applied to the DataSourceExec directly. + // let data_source_exec = Arc::new( + // filter_exec.input().as_any().downcast_ref::().unwrap() + // ) as Arc; + // assert_eq!(data_source_exec.statistics().unwrap().num_rows, Precision::Inexact(8)); } #[tokio::test] diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index fbd8b43b1472..89310ed4130c 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -118,9 +118,11 @@ fn pushdown_filters( // And check if it can absorb the remaining filters let remaining_filter_indexes = (0..parent_filters.len()) - .filter(|&i| match filter_pushdown_result[i] { - FilterPushdownSupportState::ChildExact => false, - _ => true, + .filter(|&i| { + !matches!( + filter_pushdown_result[i], + FilterPushdownSupportState::ChildExact + ) }) .collect::>(); if !remaining_filter_indexes.is_empty() { diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 56540ff31d18..db1ceb2cb673 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1243,18 +1243,18 @@ impl ExecutionPlan for SortExec { &self.metrics_set, )?; let dynamic_filter_source = Arc::clone(&self.dynamic_filter_source); + let enable_dynamic_filter_pushdown = context + .session_config() + .options() + .optimizer + .enable_dynamic_filter_pushdown; Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), futures::stream::once(async move { while let Some(batch) = input.next().await { let batch = batch?; topk.insert_batch(batch)?; - if context - .session_config() - .options() - .optimizer - .enable_dynamic_filter_pushdown - { + if enable_dynamic_filter_pushdown { if let Some(values) = topk.get_threshold_values()? 
{ dynamic_filter_source.update_values(&values)?; } From ecc89f962fff6d510e57ee3844d07873727e2031 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 12:56:33 -0500 Subject: [PATCH 21/27] a lot of comments --- .../core/tests/parquet/filter_pushdown.rs | 376 ++++++++++++++++- datafusion/core/tests/parquet/mod.rs | 377 +----------------- datafusion/datasource-parquet/src/source.rs | 4 +- datafusion/datasource/src/file_groups.rs | 12 +- .../physical-optimizer/src/filter_pushdown.rs | 360 +++++++++++++++-- .../physical-plan/src/execution_plan.rs | 28 +- datafusion/physical-plan/src/filter.rs | 4 +- .../physical-plan/src/sorts/sort_filters.rs | 12 +- 8 files changed, 743 insertions(+), 430 deletions(-) diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index 02fb59740493..5108548fcae1 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -26,18 +26,31 @@ //! select * from data limit 10; //! ``` +use std::fs::{self, File}; use std::path::Path; +use std::sync::Arc; +use arrow::array::{Int64Array, StringArray, StructArray}; use arrow::compute::concat_batches; use arrow::record_batch::RecordBatch; +use arrow::util::pretty::pretty_format_batches; +use arrow_schema::{DataType, Field, Fields, Schema}; +use datafusion::assert_batches_eq; +use datafusion::config::TableParquetOptions; +use datafusion::datasource::listing::ListingOptions; use datafusion::physical_plan::collect; use datafusion::physical_plan::metrics::MetricsSet; -use datafusion::prelude::{col, lit, lit_timestamp_nano, Expr, SessionContext}; +use datafusion::prelude::{ + col, lit, lit_timestamp_nano, Expr, SessionConfig, SessionContext, +}; use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile}; use datafusion_common::instant::Instant; +use datafusion_common::{assert_contains, Result}; +use datafusion_datasource_parquet::ParquetFormat; use datafusion_expr::utils::{conjunction, disjunction, split_conjunction}; use itertools::Itertools; +use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::TempDir; use test_utils::AccessLogGenerator; @@ -597,3 +610,364 @@ fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize { } } } + +struct DynamicFilterTestCase { + query: String, + path: String, +} + +impl DynamicFilterTestCase { + fn new(query: String, path: String) -> Self { + Self { query, path } + } + + async fn run_query(&self, query: &str) -> Vec { + // Force 1 partition and 1 rg per partition because if we widen the plan + // and read all batches at once we won't get any dynamic pushdown. 
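        // (The TopK threshold only tightens as batches stream through the sort, so with a
        // single partition and one row group per file the later files are opened after the
        // dynamic filter has been updated and their row groups can actually be pruned.)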
+ let mut cfg = SessionConfig::new(); + cfg = cfg.set_u64("datafusion.execution.parquet.max_row_group_size", 1); + let ctx = SessionContext::new_with_config(cfg); + + let mut pq_options = TableParquetOptions::default(); + pq_options.global.max_row_group_size = 1; + pq_options.global.pushdown_filters = true; + let fmt = ParquetFormat::default().with_options(pq_options); + let opt = ListingOptions::new(Arc::new(fmt)).with_target_partitions(1); + ctx.register_listing_table("base_table", &self.path, opt, None, None) + .await + .unwrap(); + + ctx.sql(query).await.unwrap().collect().await.unwrap() + } + + async fn results(&self) -> Vec { + self.run_query(&self.query).await + } + + async fn explain_plan(&self) -> String { + let query = format!("EXPLAIN ANALYZE {}", self.query); + let batches = self.run_query(&query).await; + + pretty_format_batches(&batches) + .map(|s| format!("{}", s)) + .unwrap_or_else(|_| "No explain plan generated".to_string()) + } +} + +fn write_file_with_non_null_ids(file: &String, value: i64) { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![Some(value)]); + let name_array = StringArray::from(vec![Some("test")]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(id_array), Arc::new(name_array)], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); +} + +fn write_file_with_null_ids(file: &String) { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![None]); + let name_array = StringArray::from(vec![Some(format!("test{:02}", "null"))]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(id_array), Arc::new(name_array)], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); +} + +fn write_record_batch(file: &String, batch: RecordBatch) -> Result<()> { + let file = File::create(file)?; + let w_opt = WriterProperties::builder() + .set_max_row_group_size(1) + .build(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(w_opt))?; + writer.write(&batch)?; + writer.flush()?; + writer.close()?; + Ok(()) +} + +fn write_file(file: &String) { + let struct_fields = Fields::from(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ]); + let schema = Schema::new(vec![ + Field::new("struct", DataType::Struct(struct_fields.clone()), false), + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, false), + ]); + let id_array = Int64Array::from(vec![Some(2), Some(1)]); + let columns = vec![ + Arc::new(Int64Array::from(vec![3, 4])) as _, + Arc::new(StringArray::from(vec!["zzz", "aaa"])) as _, + ]; + let struct_array = StructArray::new(struct_fields, columns, None); + + let name_array = StringArray::from(vec![Some("test02"), Some("test01")]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(struct_array), + Arc::new(id_array), + Arc::new(name_array), + ], + ) + .unwrap(); + write_record_batch(file, batch).unwrap(); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + // write 2 files so that one is processed before the other + let name = format!("test{:02}.parquet", 
file); + write_file(&format!("{path}/{name}")); + } + + let query = "select name from base_table order by id desc limit 3"; + + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=2"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_nulls_first() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + // write multiple files to ensure we get pushdown of dynamic filters from one file to another + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + + let name = format!("test{:02}.parquet", 100); + write_file_with_null_ids(&format!("{path}/{name}")); + + // nulls first by default + let query = "select name from base_table order by id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----------+", + "| name |", + "+----------+", + "| testnull |", + "| test02 |", + "| test02 |", + "+----------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=2"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_multi_key() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + for file in 0..5 { + // write multiple files to ensure we get pushdown of dynamic filters from one file to another + // Ensure files are read in order + let name = format!("test{:02}.parquet", file); + write_file_with_non_null_ids(&format!("{path}/{name}"), file); + } + + let query = "select id from base_table order by name desc, id limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path.clone()); + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 0 |", + "| 1 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=1"); + + let query1 = "select id from base_table order by name desc, id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query1.to_string(), path.clone()); + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 4 |", + "| 3 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=0"); + + let query1 = "select id from base_table order by name asc, id desc limit 3"; + let test_case = DynamicFilterTestCase::new(query1.to_string(), path); + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+----+", + "| id |", + "+----+", + "| 4 |", + "| 3 |", + "| 2 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=0"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_nulls_last() { + let tmp_dir = TempDir::new().unwrap(); + let path = 
tmp_dir.path().to_str().unwrap().to_string(); + + for file in 0..5 { + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + let name = format!("test{:02}.parquet", 100); + write_file_with_null_ids(&format!("{path}/{name}")); + + let query = "select name from base_table order by id desc nulls last limit 3"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "row_groups_pruned_statistics=3"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_single_file() { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + + write_file(&format!("{path}/test.parquet")); + + let query = "select name from base_table order by id desc nulls last limit 1"; + let test_case = DynamicFilterTestCase::new(query.to_string(), path); + + let batches = test_case.results().await; + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let plan = test_case.explain_plan().await; + assert_contains!(&plan, "pushdown_rows_pruned=1"); +} + +#[tokio::test] +async fn test_topk_predicate_pushdown_ignores_partition_columns() { + // The TopK operator will try to push down predicates on `file_id`. + // But since `file_id` is a partition column and not part of the file itself + // we cannot actually do any filtering on it at the file level. + // Thus it has to be ignored by `ParquetSource`. + // This test only shows that this does not result in any errors or panics, + // it is expected that "nothing exciting" happens here. + // I do think in the future it would be interesting to re-design how partition columns + // get handled, in particular by pushing them into SchemaAdapter so that the table schema == file schema + // and we can do predicate pushdown on them as well without relying on each TableProvider to + // do special handling of partition columns. 
+ + let ctx = SessionContext::new(); + let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) + .with_table_partition_cols(vec![("file_id".to_string(), DataType::UInt32)]) + // We need to force 1 partition because TopK predicate pushdown happens on a per-partition basis + // If we had 1 file per partition (as an example) no pushdown would happen + .with_target_partitions(1); + + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + for file in 0..5 { + // crete a directory for the partition + fs::create_dir_all(format!("{path}/file_id={file}")).unwrap(); + let name = format!("file_id={file}/test.parquet"); + write_file(&format!("{path}/{name}")); + } + ctx.register_listing_table("base_table", path, opt, None, None) + .await + .unwrap(); + + let query = "select file_id from base_table order by file_id asc limit 3"; + + let batches = ctx.sql(query).await.unwrap().collect().await.unwrap(); + #[rustfmt::skip] + let expected = [ + "+---------+", + "| file_id |", + "+---------+", + "| 0 |", + "| 0 |", + "| 1 |", + "+---------+", + ]; + assert_batches_eq!(expected, &batches); + + let sql = format!("explain analyze {query}"); + let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); + let explain_plan = format!("{}", pretty_format_batches(&batches).unwrap()); + assert_contains!(explain_plan, "row_groups_pruned_statistics=0"); // just documenting current behavior +} diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 3c7c1e487bf4..1085c58317b2 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -22,7 +22,7 @@ use arrow::{ make_array, Array, ArrayRef, BinaryArray, Date32Array, Date64Array, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, - StringArray, StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, + StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }, @@ -30,25 +30,17 @@ use arrow::{ record_batch::RecordBatch, util::pretty::pretty_format_batches, }; -use arrow_schema::Fields; use chrono::{Datelike, Duration, TimeDelta}; use datafusion::{ - assert_batches_eq, - config::TableParquetOptions, - datasource::{listing::ListingOptions, provider_as_source, TableProvider}, + datasource::{provider_as_source, TableProvider}, physical_plan::metrics::MetricsSet, prelude::{ParquetReadOptions, SessionConfig, SessionContext}, }; -use datafusion_common::{assert_contains, Result}; -use datafusion_datasource_parquet::ParquetFormat; use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder}; use parquet::arrow::ArrowWriter; use parquet::file::properties::{EnabledStatistics, WriterProperties}; -use std::{ - fs::{self, File}, - sync::Arc, -}; -use tempfile::{NamedTempFile, TempDir}; +use std::{fs::File, sync::Arc}; +use tempfile::NamedTempFile; mod custom_reader; mod external_access_plan; @@ -1080,364 +1072,3 @@ async fn make_test_file_page(scenario: Scenario, row_per_page: usize) -> NamedTe writer.close().unwrap(); output_file } - -struct DynamicFilterTestCase { - query: String, - path: String, -} - -impl DynamicFilterTestCase { - fn new(query: String, path: String) -> Self { - Self { query, path } - } - - async fn run_query(&self, query: &str) -> Vec { - // Force 1 partition and 1 rg per partition because if we widen the plan - 
// and read all batches at once we won't get any dynamic pushdown. - let mut cfg = SessionConfig::new(); - cfg = cfg.set_u64("datafusion.execution.parquet.max_row_group_size", 1); - let ctx = SessionContext::new_with_config(cfg); - - let mut pq_options = TableParquetOptions::default(); - pq_options.global.max_row_group_size = 1; - pq_options.global.pushdown_filters = true; - let fmt = ParquetFormat::default().with_options(pq_options); - let opt = ListingOptions::new(Arc::new(fmt)).with_target_partitions(1); - ctx.register_listing_table("base_table", &self.path, opt, None, None) - .await - .unwrap(); - - ctx.sql(query).await.unwrap().collect().await.unwrap() - } - - async fn results(&self) -> Vec { - self.run_query(&self.query).await - } - - async fn explain_plan(&self) -> String { - let query = format!("EXPLAIN ANALYZE {}", self.query); - let batches = self.run_query(&query).await; - - pretty_format_batches(&batches) - .map(|s| format!("{}", s)) - .unwrap_or_else(|_| "No explain plan generated".to_string()) - } -} - -fn write_file_with_non_null_ids(file: &String, value: i64) { - let schema = Schema::new(vec![ - Field::new("id", DataType::Int64, true), - Field::new("name", DataType::Utf8, false), - ]); - let id_array = Int64Array::from(vec![Some(value)]); - let name_array = StringArray::from(vec![Some("test")]); - let schema = Arc::new(schema); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(id_array), Arc::new(name_array)], - ) - .unwrap(); - write_record_batch(file, batch).unwrap(); -} - -fn write_file_with_null_ids(file: &String) { - let schema = Schema::new(vec![ - Field::new("id", DataType::Int64, true), - Field::new("name", DataType::Utf8, false), - ]); - let id_array = Int64Array::from(vec![None]); - let name_array = StringArray::from(vec![Some(format!("test{:02}", "null"))]); - let schema = Arc::new(schema); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(id_array), Arc::new(name_array)], - ) - .unwrap(); - write_record_batch(file, batch).unwrap(); -} - -fn write_record_batch(file: &String, batch: RecordBatch) -> Result<()> { - let file = File::create(file)?; - let w_opt = WriterProperties::builder() - .set_max_row_group_size(1) - .build(); - let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(w_opt))?; - writer.write(&batch)?; - writer.flush()?; - writer.close()?; - Ok(()) -} - -fn write_file(file: &String) { - let struct_fields = Fields::from(vec![ - Field::new("id", DataType::Int64, false), - Field::new("name", DataType::Utf8, false), - ]); - let schema = Schema::new(vec![ - Field::new("struct", DataType::Struct(struct_fields.clone()), false), - Field::new("id", DataType::Int64, true), - Field::new("name", DataType::Utf8, false), - ]); - let id_array = Int64Array::from(vec![Some(2), Some(1)]); - let columns = vec![ - Arc::new(Int64Array::from(vec![3, 4])) as _, - Arc::new(StringArray::from(vec!["zzz", "aaa"])) as _, - ]; - let struct_array = StructArray::new(struct_fields, columns, None); - - let name_array = StringArray::from(vec![Some("test02"), Some("test01")]); - let schema = Arc::new(schema); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(struct_array), - Arc::new(id_array), - Arc::new(name_array), - ], - ) - .unwrap(); - write_record_batch(file, batch).unwrap(); -} - -#[tokio::test] -async fn test_topk_predicate_pushdown() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - for file in 0..5 { - // write 2 files so that one is 
processed before the other - let name = format!("test{:02}.parquet", file); - write_file(&format!("{path}/{name}")); - } - - let query = "select name from base_table order by id desc limit 3"; - - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+--------+", - "| name |", - "+--------+", - "| test02 |", - "| test02 |", - "| test02 |", - "+--------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=2"); -} - -#[tokio::test] -async fn test_topk_predicate_pushdown_nulls_first() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - for file in 0..5 { - // write multiple files to ensure we get pushdown of dynamic filters from one file to another - let name = format!("test{:02}.parquet", file); - write_file(&format!("{path}/{name}")); - } - - let name = format!("test{:02}.parquet", 100); - write_file_with_null_ids(&format!("{path}/{name}")); - - // nulls first by default - let query = "select name from base_table order by id desc limit 3"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----------+", - "| name |", - "+----------+", - "| testnull |", - "| test02 |", - "| test02 |", - "+----------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=3"); -} - -#[tokio::test] -async fn test_topk_predicate_pushdown_multi_key() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - for file in 0..5 { - // write multiple files to ensure we get pushdown of dynamic filters from one file to another - // Ensure files are read in order - let name = format!("test{:02}.parquet", file); - write_file_with_non_null_ids(&format!("{path}/{name}"), file); - } - - let query = "select id from base_table order by name desc, id limit 3"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path.clone()); - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----+", - "| id |", - "+----+", - "| 0 |", - "| 1 |", - "| 2 |", - "+----+", - ]; - assert_batches_eq!(expected, &batches); - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=0"); - - let query1 = "select id from base_table order by name desc, id desc limit 3"; - let test_case = DynamicFilterTestCase::new(query1.to_string(), path.clone()); - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----+", - "| id |", - "+----+", - "| 4 |", - "| 3 |", - "| 2 |", - "+----+", - ]; - assert_batches_eq!(expected, &batches); - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=1"); - - let query1 = "select id from base_table order by name asc, id desc limit 3"; - let test_case = DynamicFilterTestCase::new(query1.to_string(), path); - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+----+", - "| id |", - "+----+", - "| 4 |", - "| 3 |", - "| 2 |", - "+----+", - ]; - assert_batches_eq!(expected, &batches); - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=1"); -} - -#[tokio::test] -async fn 
test_topk_predicate_pushdown_nulls_last() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - for file in 0..5 { - let name = format!("test{:02}.parquet", file); - write_file(&format!("{path}/{name}")); - } - let name = format!("test{:02}.parquet", 100); - write_file_with_null_ids(&format!("{path}/{name}")); - - let query = "select name from base_table order by id desc nulls last limit 3"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+--------+", - "| name |", - "+--------+", - "| test02 |", - "| test02 |", - "| test02 |", - "+--------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "row_groups_pruned_statistics=0"); -} - -#[tokio::test] -async fn test_topk_predicate_pushdown_single_file() { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - - write_file(&format!("{path}/test.parquet")); - - let query = "select name from base_table order by id desc nulls last limit 1"; - let test_case = DynamicFilterTestCase::new(query.to_string(), path); - - let batches = test_case.results().await; - #[rustfmt::skip] - let expected = [ - "+--------+", - "| name |", - "+--------+", - "| test02 |", - "+--------+", - ]; - assert_batches_eq!(expected, &batches); - - let plan = test_case.explain_plan().await; - assert_contains!(&plan, "pushdown_rows_pruned=1"); -} - -#[tokio::test] -async fn test_topk_predicate_pushdown_ignores_partition_columns() { - // The TopK operator will try to push down predicates on `file_id`. - // But since `file_id` is a partition column and not part of the file itself - // we cannot actually do any filtering on it at the file level. - // Thus it has to be ignored by `ParquetSource`. - // This test only shows that this does not result in any errors or panics, - // it is expected that "nothing exciting" happens here. - // I do think in the future it would be interesting to re-design how partition columns - // get handled, in particular by pushing them into SchemaAdapter so that the table schema == file schema - // and we can do predicate pushdown on them as well without relying on each TableProvider to - // do special handling of partition columns. 
- - let ctx = SessionContext::new(); - let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) - .with_table_partition_cols(vec![("file_id".to_string(), DataType::UInt32)]) - // We need to force 1 partition because TopK predicate pushdown happens on a per-partition basis - // If we had 1 file per partition (as an example) no pushdown would happen - .with_target_partitions(1); - - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().to_str().unwrap().to_string(); - for file in 0..5 { - // crete a directory for the partition - fs::create_dir_all(format!("{path}/file_id={file}")).unwrap(); - let name = format!("file_id={file}/test.parquet"); - write_file(&format!("{path}/{name}")); - } - ctx.register_listing_table("base_table", path, opt, None, None) - .await - .unwrap(); - - let query = "select file_id from base_table order by file_id asc limit 3"; - - let batches = ctx.sql(query).await.unwrap().collect().await.unwrap(); - #[rustfmt::skip] - let expected = [ - "+---------+", - "| file_id |", - "+---------+", - "| 0 |", - "| 0 |", - "| 1 |", - "+---------+", - ]; - assert_batches_eq!(expected, &batches); - - let sql = format!("explain analyze {query}"); - let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); - let explain_plan = format!("{}", pretty_format_batches(&batches).unwrap()); - assert_contains!(explain_plan, "row_groups_pruned_statistics=0"); // just documenting current behavior -} diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 5b8581ed1223..a46097259f94 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -601,9 +601,9 @@ impl FileSource for ParquetSource { // otherwise we would mark filters as exact but then not filter at the row level // because the setting gets checked again inside the ParquetOpener! let support = if self.table_parquet_options.global.pushdown_filters { - vec![FilterPushdownSupport::Exact; filters.len()] + vec![FilterPushdownSupport::HandledExact; filters.len()] } else { - vec![FilterPushdownSupport::Inexact; filters.len()] + vec![FilterPushdownSupport::Unhandled; filters.len()] }; Ok(Some(FileSourceFilterPushdownResult::new( Arc::new(conf), diff --git a/datafusion/datasource/src/file_groups.rs b/datafusion/datasource/src/file_groups.rs index 4068c048e67c..5fe3e25eaa1f 100644 --- a/datafusion/datasource/src/file_groups.rs +++ b/datafusion/datasource/src/file_groups.rs @@ -426,16 +426,8 @@ impl FileGroup { // ObjectStore::list does not guarantee any consistent order and for some // implementations such as LocalFileSystem, it may be inconsistent. Thus - // Sort files by last_modified desc, path asc to ensure consistent plans when run more than once. - // The choice of sorting by last_modified desc is somewhat arbitrary, but the idea is that it will - // help speed up queries such as `select * from t1 order by timestamp_column desc limit 100` - // and that wanting "the latest" data is generally more common and latency senstive than wanting "the oldest" data. - self.files.sort_by(|a, b| { - b.object_meta - .last_modified - .cmp(&a.object_meta.last_modified) - .then_with(|| a.path().cmp(b.path())) - }); + // Sort files by path to ensure consistent plans when run more than once. 
+        self.files.sort_by(|a, b| a.path().cmp(b.path()));
 
         // effectively this is div with rounding up instead of truncating
         let chunk_size = self.len().div_ceil(n);
diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs
index 89310ed4130c..a89b22e3fbce 100644
--- a/datafusion/physical-optimizer/src/filter_pushdown.rs
+++ b/datafusion/physical-optimizer/src/filter_pushdown.rs
@@ -26,43 +26,66 @@ use datafusion_physical_plan::{
 
 use crate::PhysicalOptimizerRule;
 
+/// The state of filter pushdown support for a given filter.
 #[derive(Clone, Copy, Debug)]
-enum FilterPushdownSupportState {
+enum PushdownState {
+    /// A child said it can handle the filter exactly.
     ChildExact,
+    /// A child exists and took a look at the filter.
+    /// It may partially handle it or not handle it at all.
+    /// The parent still needs to re-apply the filter.
     ChildInexact,
+    /// No child exists, there is no one to handle the filter.
+    /// This is the default / initial state.
    NoChild,
 }
 
-impl FilterPushdownSupportState {
-    fn combine_with_other(
-        &self,
-        other: &FilterPushdownSupport,
-    ) -> FilterPushdownSupportState {
+impl PushdownState {
+    /// Combine the current state with another state.
+    /// This is used to combine the results of multiple children.
+    fn combine_with_other(&self, other: &FilterPushdownSupport) -> PushdownState {
         match (other, self) {
-            (FilterPushdownSupport::Exact, FilterPushdownSupportState::NoChild) => {
-                FilterPushdownSupportState::ChildExact
+            (FilterPushdownSupport::HandledExact, PushdownState::NoChild) => {
+                PushdownState::ChildExact
+            }
+            (FilterPushdownSupport::HandledExact, PushdownState::ChildInexact) => {
+                PushdownState::ChildInexact
             }
-            (FilterPushdownSupport::Exact, FilterPushdownSupportState::ChildInexact) => {
-                FilterPushdownSupportState::ChildInexact
+            (FilterPushdownSupport::Unhandled, PushdownState::NoChild) => {
+                PushdownState::ChildInexact
             }
-            (FilterPushdownSupport::Inexact, FilterPushdownSupportState::NoChild) => {
-                FilterPushdownSupportState::ChildInexact
+            (FilterPushdownSupport::Unhandled, PushdownState::ChildExact) => {
+                PushdownState::ChildInexact
             }
-            (FilterPushdownSupport::Inexact, FilterPushdownSupportState::ChildExact) => {
-                FilterPushdownSupportState::ChildInexact
+            (FilterPushdownSupport::Unhandled, PushdownState::ChildInexact) => {
+                PushdownState::ChildInexact
             }
-            (
-                FilterPushdownSupport::Inexact,
-                FilterPushdownSupportState::ChildInexact,
-            ) => FilterPushdownSupportState::ChildInexact,
-            (FilterPushdownSupport::Exact, FilterPushdownSupportState::ChildExact) => {
+            (FilterPushdownSupport::HandledExact, PushdownState::ChildExact) => {
                 // If both are exact, keep it as exact
-                FilterPushdownSupportState::ChildExact
+                PushdownState::ChildExact
             }
         }
     }
 }
 
+/// Recursively pushes a collection of filters down through the execution plan tree in a depth-first manner.
+///
+/// For each filter we try to push it down to children as far down as possible, keeping track of whether the children
+/// can handle the filter or not.
+///
+/// If a child can handle the filter, we mark it as handled exact and parent nodes (including the source of the filter)
+/// can decide to discard it / not re-apply it themselves.
+/// If a child cannot handle the filter or may return false positives (aka "inexact" handling) we mark it as handled inexact.
+/// If a child does not allow filter pushdown at all (e.g. an aggregation node) we keep recursing but clear the current set of filters
+/// we are pushing down.
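+///
+/// As a rough illustration of how the per-filter answers from the children are folded together
+/// (a self-contained sketch with toy types, not the actual types used by this rule):
+///
+/// ```rust
+/// /// Toy stand-in for the per-child answer tracked by this rule.
+/// #[derive(Clone, Copy, PartialEq, Debug)]
+/// enum Answer {
+///     Exact,
+///     Inexact,
+/// }
+///
+/// /// A filter is only exact overall if every child handled it exactly;
+/// /// a missing child or any inexact child forces the parent to re-apply the filter.
+/// fn combine(children: &[Answer]) -> Answer {
+///     if !children.is_empty() && children.iter().all(|a| *a == Answer::Exact) {
+///         Answer::Exact
+///     } else {
+///         Answer::Inexact
+///     }
+/// }
+///
+/// assert_eq!(combine(&[Answer::Exact, Answer::Exact]), Answer::Exact);
+/// assert_eq!(combine(&[Answer::Exact, Answer::Inexact]), Answer::Inexact);
+/// assert_eq!(combine(&[]), Answer::Inexact); // no child to handle the filter
+/// ```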
+/// +/// As we recurse back up the tree we combine the results of the children to determine if the overall result is exact or inexact: +/// - For nodes with a single child we just take the child's result. +/// - For nodes with multiple children we combine the results of the children to determine if the overall result is exact or inexact. +/// We do this by checking if all children are exact (we return exact up) or if any child is inexact (we return inexact). +/// - If a node has no children this is equivalent to inexact handling (there is no child to handle the filter). +/// +/// See [`FilterPushdown`] for more details on how this works in practice. fn pushdown_filters( node: &Arc, parent_filters: &[Arc], @@ -75,8 +98,7 @@ fn pushdown_filters( .chain(node_filters.iter()) .cloned() .collect::>(); - let mut filter_pushdown_result = - vec![FilterPushdownSupportState::NoChild; all_filters.len()]; + let mut filter_pushdown_result = vec![PushdownState::NoChild; all_filters.len()]; for child in children { if child.supports_filter_pushdown() { if let Some(result) = pushdown_filters(child, &all_filters)? { @@ -105,9 +127,9 @@ fn pushdown_filters( let pushdown_result = filter_pushdown_result[parent_filters.len()..] .iter() .map(|s| match s { - FilterPushdownSupportState::ChildExact => FilterPushdownSupport::Exact, - FilterPushdownSupportState::ChildInexact => FilterPushdownSupport::Inexact, - FilterPushdownSupportState::NoChild => FilterPushdownSupport::Inexact, + PushdownState::ChildExact => FilterPushdownSupport::HandledExact, + PushdownState::ChildInexact => FilterPushdownSupport::Unhandled, + PushdownState::NoChild => FilterPushdownSupport::Unhandled, }) .collect::>(); if let Some(new_node) = @@ -118,12 +140,7 @@ fn pushdown_filters( // And check if it can absorb the remaining filters let remaining_filter_indexes = (0..parent_filters.len()) - .filter(|&i| { - !matches!( - filter_pushdown_result[i], - FilterPushdownSupportState::ChildExact - ) - }) + .filter(|&i| !matches!(filter_pushdown_result[i], PushdownState::ChildExact)) .collect::>(); if !remaining_filter_indexes.is_empty() { let remaining_filters = remaining_filter_indexes @@ -144,14 +161,289 @@ fn pushdown_filters( let support = filter_pushdown_result[..parent_filters.len()] .iter() .map(|s| match s { - FilterPushdownSupportState::ChildExact => FilterPushdownSupport::Exact, - FilterPushdownSupportState::ChildInexact => FilterPushdownSupport::Inexact, - FilterPushdownSupportState::NoChild => FilterPushdownSupport::Inexact, + PushdownState::ChildExact => FilterPushdownSupport::HandledExact, + PushdownState::ChildInexact => FilterPushdownSupport::Unhandled, + PushdownState::NoChild => FilterPushdownSupport::Unhandled, }) .collect::>(); Ok(Some(ExecutionPlanFilterPushdownResult::new(node, support))) } +/// A physical optimizer rule that pushes down filters in the execution plan. +/// For example, consider the following plan: +/// +/// ```text +// ┌──────────────────────┐ +// │ CoalesceBatchesExec │ +// └──────────────────────┘ +// │ +// ▼ +// ┌──────────────────────┐ +// │ FilterExec │ +// │ filters = [ id=1] │ +// └──────────────────────┘ +// │ +// ▼ +// ┌──────────────────────┐ +// │ DataSourceExec │ +// │ projection = * │ +// └──────────────────────┘ +/// ``` +/// +/// Our goal is to move the `id = 1` filter from the `FilterExec` node to the `DataSourceExec` node. +/// If this filter is selective it can avoid massive amounts of data being read from the source (the projection is `*` so all matching columns are read). 
+/// In this simple case we:
+/// 1. Enter the recursion with no filters.
+/// 2. We find the `FilterExec` node and it tells us that it has a filter (see [`ExecutionPlan::filters_for_pushdown`] and `datafusion::physical_plan::filter::FilterExec`).
+/// 3. We recurse down into its children (the `DataSourceExec` node) now carrying the filters `[id = 1]`.
+/// 4. The `DataSourceExec` node tells us that it can handle the filter and we mark it as handled exact (see [`ExecutionPlan::push_down_filters_from_parents`]).
+/// 5. Since the `DataSourceExec` node has no children we recurse back up the tree.
+/// 6. We now tell the `FilterExec` node that it has a child that can handle the filter and we mark it as handled exact (see [`ExecutionPlan::with_filter_pushdown_result`]).
+///    The `FilterExec` node can now return a new execution plan, either a copy of itself without that filter or, if it has no work left to do, it can even return the child node directly.
+/// 7. We recurse back up to `CoalesceBatchesExec` and do nothing there since it had no filters to push down.
+///
+/// The new plan looks like:
+///
+/// ```text
+/// ┌──────────────────────┐
+/// │ CoalesceBatchesExec  │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    DataSourceExec    │
+/// │    projection = *    │
+/// │   filters = [ id=1]  │
+/// └──────────────────────┘
+/// ```
+///
+/// Let's consider a more complex example involving a `ProjectionExec` node in between the `FilterExec` and `DataSourceExec` nodes that creates a new column that the filter depends on.
+///
+/// ```text
+/// ┌──────────────────────┐
+/// │ CoalesceBatchesExec  │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │      FilterExec      │
+/// │      filters =       │
+/// │   [cost>50,id=1]     │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    ProjectionExec    │
+/// │  cost = price * 1.2  │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    DataSourceExec    │
+/// │    projection = *    │
+/// └──────────────────────┘
+/// ```
+///
+/// We want to push down the filters `[id=1]` to the [`DataSourceExec`] node, but can't push down `[cost>50]` because it requires the `ProjectionExec` node to be executed first:
+///
+/// ```text
+/// ┌──────────────────────┐
+/// │ CoalesceBatchesExec  │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │      FilterExec      │
+/// │      filters =       │
+/// │     [cost>50]        │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    ProjectionExec    │
+/// │  cost = price * 1.2  │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    DataSourceExec    │
+/// │    projection = *    │
+/// │   filters = [ id=1]  │
+/// └──────────────────────┘
+/// ```
+///
+/// There are also cases where we may be able to push down filters within a subtree but not the entire tree.
+/// A good example of this is aggregation nodes:
+///
+/// projection -> aggregate -> filter -> scan
+///
+/// ```text
+/// ┌──────────────────────┐
+/// │    ProjectionExec    │
+/// │    projection = *    │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │      FilterExec      │
+/// │ filters = [sum > 10] │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌───────────────────────┐
+/// │     AggregateExec     │
+/// │    group by = [id]    │
+/// │     aggregate =       │
+/// │     [sum(price)]      │
+/// └───────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │      FilterExec      │
+/// │   filters = [id=1]   │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    DataSourceExec    │
+/// │    projection = *    │
+/// └──────────────────────┘
+/// ```
+///
+/// The transformation here is to push down the `[id=1]` filter to the `DataSourceExec` node:
+///
+/// ```text
+/// ┌──────────────────────┐
+/// │    ProjectionExec    │
+/// │    projection = *    │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │      FilterExec      │
+/// │ filters = [sum > 10] │
+/// └──────────────────────┘
+///             │
+///             ▼
+/// ┌───────────────────────┐
+/// │     AggregateExec     │
+/// │    group by = [id]    │
+/// │     aggregate =       │
+/// │     [sum(price)]      │
+/// └───────────────────────┘
+///             │
+///             ▼
+/// ┌──────────────────────┐
+/// │    DataSourceExec    │
+/// │    projection = *    │
+/// │   filters = [id=1]   │
+/// └──────────────────────┘
+/// ```
+///
+/// The point here is that:
+/// 1. We cannot push down `sum > 10` through the `AggregateExec` node into the `DataSourceExec` node.
+///    Any filters above the `AggregateExec` node are not pushed down.
+///    This is determined by calling [`ExecutionPlan::supports_filter_pushdown`] on the `AggregateExec` node.
+/// 2. We need to keep recursing into the tree so that we can discover the other `FilterExec` node and push down the `[id=1]` filter.
+///
+/// It is also possible to push down filters through joins and from joins.
+/// For example, a hash join where we build a hash table of the left side and probe the right side
+/// (ignoring why we would choose this order, typically it depends on the size of each table, etc.).
+///
+/// ```text
+///          ┌─────────────────────┐
+///          │      FilterExec     │
+///          │ filters =           │
+///          │  [d.size > 100]     │
+///          └─────────────────────┘
+///                     │
+///                     │
+///          ┌──────────▼──────────┐
+///          │                     │
+///          │    HashJoinExec     │
+///          │ [u.dept@hash(d.id)] │
+///          │                     │
+///          └─────────────────────┘
+///                     │
+///        ┌────────────┴────────────┐
+/// ┌──────────▼──────────┐ ┌──────────▼──────────┐
+/// │    DataSourceExec   │ │    DataSourceExec   │
+/// │  alias [users as u] │ │  alias [dept as d]  │
+/// │                     │ │                     │
+/// └─────────────────────┘ └─────────────────────┘
+/// ```
+///
+/// There are two pushdowns we can do here:
+/// 1. Push down the `[d.size > 100]` filter through the `HashJoinExec` node to the `DataSourceExec` node for the `departments` table.
+/// 2. Push down the hash table state from the `HashJoinExec` node to the `DataSourceExec` node to avoid reading
+///    rows from the `users` table that will be eliminated by the join.
+///    This can be done via a bloom filter or similar.
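+///
+/// As a deliberately simplified, std-only sketch of that second idea (a `HashSet` standing in for
+/// the bloom filter / hash table state; none of these names are real DataFusion APIs):
+///
+/// ```rust
+/// use std::collections::HashSet;
+///
+/// // Build side: the `dept` ids that survived `d.size > 100`.
+/// let build_keys: HashSet<i64> = [1_i64, 5, 9].into_iter().collect();
+///
+/// // Probe side: a membership test derived from the build side is applied while scanning `users`.
+/// // A real bloom filter may return false positives (but never false negatives), so the join
+/// // must still treat this as an inexact filter and re-check the keys itself.
+/// let probe_rows = vec![(1_i64, "alice"), (2, "bob"), (5, "carol")];
+/// let surviving: Vec<_> = probe_rows
+///     .into_iter()
+///     .filter(|(dept, _)| build_keys.contains(dept))
+///     .collect();
+/// assert_eq!(surviving, vec![(1, "alice"), (5, "carol")]);
+/// ```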
+/// +/// ```text +/// ┌─────────────────────┐ +/// │ │ +/// │ HashJoinExec │ +/// │ [u.dept@hash(d.id)] │ +/// │ │ +/// └─────────────────────┘ +/// │ +/// ┌────────────┴────────────┐ +/// ┌──────────▼──────────┐ ┌──────────▼──────────┐ +/// │ DataSourceExec │ │ DataSourceExec │ +/// │ alias [users as u] │ │ alias [dept as d] │ +/// │ filters = │ │ filters = │ +/// │ [depg@hash(d.id)] │ │ [ d.size > 100] │ +/// └─────────────────────┘ └─────────────────────┘ +/// ``` +/// +/// You may notice in this case that the filter is *dynamic*: the hash table is built +/// _after_ the `departments` table is read and at runtime. +/// We don't have a concrete `InList` filter or similar to push down at optimization time. +/// These sorts of dynamic filters are handled by building a specialized [`PhysicalExpr`] that +/// internally maintains a reference to the hash table or other state. +/// To make working with these sorts of dynamic filters more tractable we have the method [`PhysicalExpr::snapshot`] +/// which attempts to simplify a dynamic filter into a "basic" non-dynamic filter. +/// For a join this could mean converting it to an `InList` filter or a min/max filter for example. +/// See `datafusion/physical-plan/src/dynamic_filters.rs` for more details. +/// +/// Another form of dyanmic filter is pushing down the state of a `TopK` operator for queries like +/// `SELECT * FROM t ORDER BY id LIMIT 10`: +/// +/// ```text +/// ┌──────────────────────┐ +/// │ TopK │ +/// │ limit = 10 │ +/// │ order by = [id] │ +/// └──────────────────────┘ +/// │ +/// ▼ +/// ┌──────────────────────┐ +/// │ DataSourceExec │ +/// │ projection = * │ +/// └──────────────────────┘ +/// ``` +/// +/// We can avoid large amounts of data processing by transforming this into: +/// +/// ```text +/// ┌──────────────────────┐ +/// │ TopK │ +/// │ limit = 10 │ +/// │ order by = [id] │ +/// └──────────────────────┘ +/// │ +/// ▼ +/// ┌──────────────────────┐ +/// │ DataSourceExec │ +/// │ projection = * │ +/// │ filters = │ +/// │ [id < @ TopKHeap] │ +/// └──────────────────────┘ +/// ``` +/// +/// Now as we fill our `TopK` heap we can push down the state of the heap to the `DataSourceExec` node +/// to avoid reading files / row groups / pages / rows that could not possibly be in the top 10. +/// This is implemented in datafusion/physical-plan/src/sorts/sort_filters.rs. #[derive(Debug)] pub struct FilterPushdown {} diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index f093b8bce817..a5bcfcc0dd72 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -472,6 +472,18 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// For example, a `TopK` operator may produce dynamic filters that reference it's currrent state, /// while a `FilterExec` will just hand of the filters it has as is. /// The default implementation returns an empty vector. + /// These filters are applied row-by row and any that return `false` or `NULL` will be + /// filtered out and any that return `true` will be kept. + /// The expressions returned **must** always return `true` or `false`; + /// other truthy or falsy values are not allowed (e.g. `0`, `1`). + /// + /// # Returns + /// A vector of filters that this operator would like to push down. + /// These should be treated as the split conjunction of a `WHERE` clause. + /// That is, a query such as `WHERE a = 1 AND b = 2` would return two + /// filters: `a = 1` and `b = 2`. 
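+    ///
+    /// For illustration, the "split conjunction" shape with a toy expression type (the real
+    /// filters are `Arc<dyn PhysicalExpr>`s; this is only a sketch of the idea):
+    ///
+    /// ```rust
+    /// /// Toy expression tree standing in for a physical expression.
+    /// enum Expr {
+    ///     And(Box<Expr>, Box<Expr>),
+    ///     Pred(&'static str),
+    /// }
+    ///
+    /// /// Flatten nested `AND`s into the individual conjuncts.
+    /// fn split(expr: &Expr, out: &mut Vec<&'static str>) {
+    ///     match expr {
+    ///         Expr::And(l, r) => {
+    ///             split(l, out);
+    ///             split(r, out);
+    ///         }
+    ///         Expr::Pred(p) => out.push(p),
+    ///     }
+    /// }
+    ///
+    /// let filter = Expr::And(Box::new(Expr::Pred("a = 1")), Box::new(Expr::Pred("b = 2")));
+    /// let mut conjuncts = Vec::new();
+    /// split(&filter, &mut conjuncts);
+    /// assert_eq!(conjuncts, vec!["a = 1", "b = 2"]);
+    /// ```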
+ /// They can always be assembled into a single filter using + /// [`crate::physical_expr::split_conjunction`]. fn filters_for_pushdown(&self) -> Result>> { Ok(Vec::new()) } @@ -507,10 +519,20 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { } } +/// The result of pushing down each filter. +/// When a parent plan tries to push down a filter to a child it needs to know if the child +/// can handle the filter or not to determine if it still needs to apply the filter itself. #[derive(Debug, Clone, Copy)] pub enum FilterPushdownSupport { - Inexact, - Exact, + /// Filter may not have been pushed down to the child plan, or the child plan + /// can only partially apply the filter but may have false positives (but not false negatives). + /// In this case the parent **must** behave as if the filter was not pushed down + /// and must apply the filter itself. + Unhandled, + /// Filter was pushed down to the child plan and the child plan promises that + /// it will apply the filter correctly with no false positives or false negatives. + /// The parent can safely drop the filter. + HandledExact, } pub struct FilterPushdownResult { @@ -529,7 +551,7 @@ impl FilterPushdownResult { pub fn is_exact(&self) -> bool { self.support .iter() - .all(|s| matches!(s, FilterPushdownSupport::Exact)) + .all(|s| matches!(s, FilterPushdownSupport::HandledExact)) } } diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 0e63acfbd841..e885f41a3909 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -458,7 +458,7 @@ impl ExecutionPlan for FilterExec { .iter() .zip(pushdown.iter()) .filter_map(|(f, p)| { - if matches!(p, FilterPushdownSupport::Exact) { + if matches!(p, FilterPushdownSupport::HandledExact) { // Exact pushdown support means we keep discard filter None } else { @@ -498,7 +498,7 @@ impl ExecutionPlan for FilterExec { predicate: new_predicates, ..self.clone() }), - support: vec![FilterPushdownSupport::Exact; filters.len()], + support: vec![FilterPushdownSupport::HandledExact; filters.len()], })) } } diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index dc2d6127488e..ae4e5029498b 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -30,11 +30,11 @@ use datafusion_physical_expr::{ use crate::dynamic_filters::{DynamicFilterPhysicalExpr, DynamicFilterSource}; -/// Pushdown of dynamic fitlers from sort + limit operators (aka `TopK`) is used to speed up queries +/// Pushdown of dynamic filters from sort + limit operators (aka `TopK`) is used to speed up queries /// such as `SELECT * FROM table ORDER BY col DESC LIMIT 10` by pushing down the /// threshold values for the sort columns to the data source. /// That is, the TopK operator will keep track of the top 10 values for the sort -/// and before a new file is opened it's statitics will be checked against the +/// and before a new file is opened its statistics will be checked against the /// threshold values to determine if the file can be skipped and predicate pushdown /// will use these to skip rows during the scan. 
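+///
+/// A self-contained sketch of the idea (plain `i64` values, a single descending sort key and a
+/// strict `>` threshold; the real implementation tracks full sort keys, null ordering and ties):
+///
+/// ```rust
+/// use std::cmp::Reverse;
+/// use std::collections::BinaryHeap;
+///
+/// /// Toy TopK state for `ORDER BY col DESC LIMIT k`: keeps the k largest values seen so far.
+/// struct TopK {
+///     k: usize,
+///     // Min-heap over the current top-k, so the root is the current threshold.
+///     heap: BinaryHeap<Reverse<i64>>,
+/// }
+///
+/// impl TopK {
+///     fn new(k: usize) -> Self {
+///         Self { k, heap: BinaryHeap::new() }
+///     }
+///
+///     fn insert(&mut self, value: i64) {
+///         if self.heap.len() < self.k {
+///             self.heap.push(Reverse(value));
+///         } else if Some(value) > self.threshold() {
+///             self.heap.pop();
+///             self.heap.push(Reverse(value));
+///         }
+///     }
+///
+///     /// The smallest of the current top-k values, once the heap is full.
+///     fn threshold(&self) -> Option<i64> {
+///         if self.heap.len() < self.k {
+///             None // not enough rows yet, nothing can be pruned
+///         } else {
+///             self.heap.peek().map(|Reverse(v)| *v)
+///         }
+///     }
+///
+///     /// A file whose maximum value does not exceed the threshold cannot change the result.
+///     fn can_skip_file(&self, file_max: i64) -> bool {
+///         matches!(self.threshold(), Some(t) if file_max <= t)
+///     }
+/// }
+///
+/// let mut topk = TopK::new(3);
+/// for v in [5, 1, 9, 7, 3] {
+///     topk.insert(v);
+/// }
+/// assert_eq!(topk.threshold(), Some(5)); // current top 3 is [9, 7, 5]
+/// assert!(topk.can_skip_file(4));
+/// assert!(!topk.can_skip_file(8));
+/// ```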
/// @@ -141,9 +141,11 @@ impl SortDynamicFilterSource { replace = false; break; } - } else if !new_value_is_greater_than_current { - replace = false; - break; + } else { + if !new_value_is_greater_than_current { + replace = false; + break; + } } // Handle the equality case if new_value.eq(current_value) { From e59aac5ae18f510ce2377071120fe52ce03783a1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 15:17:04 -0500 Subject: [PATCH 22/27] refactoring --- datafusion/datasource-parquet/src/source.rs | 12 +- datafusion/datasource/src/file.rs | 2 +- datafusion/datasource/src/file_scan_config.rs | 2 +- datafusion/datasource/src/source.rs | 25 ++- .../physical-optimizer/src/filter_pushdown.rs | 189 ++++++++++-------- .../physical-plan/src/coalesce_batches.rs | 8 +- .../physical-plan/src/execution_plan.rs | 86 +++++--- datafusion/physical-plan/src/filter.rs | 96 +++++---- .../physical-plan/src/repartition/mod.rs | 8 +- datafusion/physical-plan/src/sorts/sort.rs | 10 +- 10 files changed, 247 insertions(+), 191 deletions(-) diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index a46097259f94..66695c3e4864 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -42,7 +42,7 @@ use datafusion_physical_expr::{conjunction, expressions::lit}; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; -use datafusion_physical_plan::execution_plan::FilterPushdownSupport; +use datafusion_physical_plan::execution_plan::FilterSupport; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; use datafusion_physical_plan::DisplayFormatType; @@ -581,7 +581,7 @@ impl FileSource for ParquetSource { fn push_down_filters( &self, - filters: &[&Arc], + filters: &[Arc], ) -> datafusion_common::Result> { let mut conf = self.clone(); let predicate = match self.predicate.as_ref() { @@ -589,10 +589,10 @@ impl FileSource for ParquetSource { // Combine existing predicate with new filters Some(conjunction( std::iter::once(Arc::clone(existing_predicate)) - .chain(filters.iter().cloned().cloned()), + .chain(filters.iter().map(Arc::clone)), )) } - None => Some(conjunction(filters.iter().cloned().cloned())), + None => Some(conjunction(filters.iter().map(Arc::clone))), }; match predicate { Some(new_predicate) if !new_predicate.eq(&lit(true)) => { @@ -601,9 +601,9 @@ impl FileSource for ParquetSource { // otherwise we would mark filters as exact but then not filter at the row level // because the setting gets checked again inside the ParquetOpener! 
let support = if self.table_parquet_options.global.pushdown_filters { - vec![FilterPushdownSupport::HandledExact; filters.len()] + vec![FilterSupport::HandledExact; filters.len()] } else { - vec![FilterPushdownSupport::Unhandled; filters.len()] + vec![FilterSupport::Unhandled; filters.len()] }; Ok(Some(FileSourceFilterPushdownResult::new( Arc::new(conf), diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 8dbc43a59a6d..6ddcfff840c8 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -97,7 +97,7 @@ pub trait FileSource: Send + Sync { fn push_down_filters( &self, - _filters: &[&Arc], + _filters: &[Arc], ) -> datafusion_common::Result> { Ok(None) } diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 1f50a8e998df..0c264a1f1fec 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -587,7 +587,7 @@ impl DataSource for FileScanConfig { fn push_down_filters( &self, - filters: &[&Arc], + filters: &[Arc], ) -> Result> { if let Some(file_source_result) = self.file_source.push_down_filters(filters)? { let mut new_self = self.clone(); diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 285dab25b8ae..623c5467c319 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use datafusion_physical_plan::execution_plan::{ Boundedness, EmissionType, ExecutionPlanFilterPushdownResult, FilterPushdownResult, + FilterSupport, }; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use datafusion_physical_plan::projection::ProjectionExec; @@ -87,7 +88,7 @@ pub trait DataSource: Send + Sync + Debug { /// data source and the support level for each filter (exact or inexact). fn push_down_filters( &self, - _filters: &[&Arc], + _filters: &[Arc], ) -> datafusion_common::Result> { Ok(None) } @@ -207,23 +208,25 @@ impl ExecutionPlan for DataSourceExec { self.data_source.try_swapping_with_projection(projection) } - fn supports_filter_pushdown(&self) -> bool { - true // DataSourceExec can receive filter pushdowns from upstream operators - } - - fn push_down_filters_from_parents( - &self, - filters: &[&Arc], + fn with_filter_pushdown_result( + self: Arc, + own_filters_result: &[FilterSupport], + parent_filters_remaining: &[Arc], ) -> datafusion_common::Result> { - // we forward filter pushdown to our data source - if let Some(pushdown_result) = self.data_source.push_down_filters(filters)? { + assert!(own_filters_result.is_empty()); // We didn't give out any filters, this should be empty! + // Forward filter pushdown to our data source. + if let Some(pushdown_result) = self + .data_source + .push_down_filters(parent_filters_remaining)? 
+ { let new_self = Arc::new(DataSourceExec::new(pushdown_result.inner)); return Ok(Some(ExecutionPlanFilterPushdownResult::new( new_self, pushdown_result.support, ))); + } else { + return Ok(None); } - Ok(None) } } diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index a89b22e3fbce..8bba6154d9aa 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -20,7 +20,9 @@ use std::sync::Arc; use datafusion_common::{config::ConfigOptions, Result}; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::{ - execution_plan::{ExecutionPlanFilterPushdownResult, FilterPushdownSupport}, + execution_plan::{ + ExecutionPlanFilterPushdownResult, FilterPushdownAllowed, FilterSupport, + }, with_new_children_if_necessary, ExecutionPlan, }; @@ -28,7 +30,7 @@ use crate::PhysicalOptimizerRule; /// The state of filter pushdown support for a given filter. #[derive(Clone, Copy, Debug)] -enum PushdownState { +enum ChildPushdownState { /// A child said it can handle the filter exactly. ChildExact, /// A child exists and took a look at the filter. @@ -40,34 +42,64 @@ enum PushdownState { NoChild, } -impl PushdownState { +impl ChildPushdownState { /// Combine the current state with another state. /// This is used to combine the results of multiple children. - fn combine_with_other(&self, other: &FilterPushdownSupport) -> PushdownState { + fn combine_with_other(&self, other: &FilterSupport) -> ChildPushdownState { match (other, self) { - (FilterPushdownSupport::HandledExact, PushdownState::NoChild) => { - PushdownState::ChildExact + (FilterSupport::HandledExact, ChildPushdownState::NoChild) => { + ChildPushdownState::ChildExact } - (FilterPushdownSupport::HandledExact, PushdownState::ChildInexact) => { - PushdownState::ChildInexact + (FilterSupport::HandledExact, ChildPushdownState::ChildInexact) => { + ChildPushdownState::ChildInexact } - (FilterPushdownSupport::Unhandled, PushdownState::NoChild) => { - PushdownState::ChildInexact + (FilterSupport::Unhandled, ChildPushdownState::NoChild) => { + ChildPushdownState::ChildInexact } - (FilterPushdownSupport::Unhandled, PushdownState::ChildExact) => { - PushdownState::ChildInexact + (FilterSupport::Unhandled, ChildPushdownState::ChildExact) => { + ChildPushdownState::ChildInexact } - (FilterPushdownSupport::Unhandled, PushdownState::ChildInexact) => { - PushdownState::ChildInexact + (FilterSupport::Unhandled, ChildPushdownState::ChildInexact) => { + ChildPushdownState::ChildInexact } - (FilterPushdownSupport::HandledExact, PushdownState::ChildExact) => { + (FilterSupport::HandledExact, ChildPushdownState::ChildExact) => { // If both are exact, keep it as exact - PushdownState::ChildExact + ChildPushdownState::ChildExact } } } } +fn push_down_into_children( + node: &Arc, + filters: &[Arc], +) -> Result { + let children = node.children(); + let mut new_children = Vec::with_capacity(children.len()); + let mut filter_pushdown_result = vec![ChildPushdownState::NoChild; filters.len()]; + for child in children { + if let Some(result) = pushdown_filters(child, &filters)? 
{ + new_children.push(result.inner); + for (idx, support) in result.support.iter().enumerate() { + filter_pushdown_result[idx] = + filter_pushdown_result[idx].combine_with_other(support) + } + } else { + new_children.push(Arc::clone(child)); + } + } + let support = filter_pushdown_result + .iter() + .map(|s| match s { + ChildPushdownState::ChildExact => FilterSupport::HandledExact, + ChildPushdownState::ChildInexact => FilterSupport::Unhandled, + ChildPushdownState::NoChild => FilterSupport::Unhandled, + }) + .collect::>(); + let node = with_new_children_if_necessary(Arc::clone(node), new_children)?; + Ok(ExecutionPlanFilterPushdownResult::new(node, support)) +} + /// Recursively a collection of filters down through the execution plan tree in a depth-first manner. /// /// For each filter we try to push it down to children as far down as possible, keeping track of if the children @@ -90,83 +122,71 @@ fn pushdown_filters( node: &Arc, parent_filters: &[Arc], ) -> Result> { + // Gather the filters from the current node. + // These are the filters the current node "owns" or "produces" and wants to push down. let node_filters = node.filters_for_pushdown()?; - let children = node.children(); - let mut new_children = Vec::with_capacity(children.len()); - let all_filters = parent_filters + // Check which nodes from parents this node is okay with us trying to push down to it's children. + let parent_pushdown_request_result = node.filter_pushdown_request(&parent_filters)?; + // Do some index masking so that we only ever call nodes with the filters relevant to them / that they're allowed to touch. + // But we still need to reconstruct the full result for our caller. + let parent_filter_for_pushdown_indices = parent_pushdown_request_result .iter() - .chain(node_filters.iter()) - .cloned() - .collect::>(); - let mut filter_pushdown_result = vec![PushdownState::NoChild; all_filters.len()]; - for child in children { - if child.supports_filter_pushdown() { - if let Some(result) = pushdown_filters(child, &all_filters)? { - new_children.push(result.inner); - for (all_filters_idx, support) in result.support.iter().enumerate() { - filter_pushdown_result[all_filters_idx] = filter_pushdown_result - [all_filters_idx] - .combine_with_other(support) - } - } else { - new_children.push(Arc::clone(child)); - } - } else { - // Reset the filters we are pushing down. - if let Some(result) = pushdown_filters(child, &Vec::new())? { - new_children.push(result.inner); + .enumerate() + .filter_map(|(i, s)| { + if matches!(s, FilterPushdownAllowed::Allowed(_)) { + Some(i) } else { - new_children.push(Arc::clone(child)); + None } - }; - } - - let mut node = with_new_children_if_necessary(Arc::clone(node), new_children)?; - - // Now update the node with the result of the pushdown of it's filters - let pushdown_result = filter_pushdown_result[parent_filters.len()..] - .iter() - .map(|s| match s { - PushdownState::ChildExact => FilterPushdownSupport::HandledExact, - PushdownState::ChildInexact => FilterPushdownSupport::Unhandled, - PushdownState::NoChild => FilterPushdownSupport::Unhandled, }) .collect::>(); - if let Some(new_node) = - Arc::clone(&node).with_filter_pushdown_result(&pushdown_result)? 
- { - node = new_node; - }; - - // And check if it can absorb the remaining filters - let remaining_filter_indexes = (0..parent_filters.len()) - .filter(|&i| !matches!(filter_pushdown_result[i], PushdownState::ChildExact)) + let parent_filters_to_push_down = parent_filter_for_pushdown_indices + .iter() + .map(|&i| Arc::clone(&parent_filters[i])) .collect::>(); - if !remaining_filter_indexes.is_empty() { - let remaining_filters = remaining_filter_indexes - .iter() - .map(|&i| &parent_filters[i]) - .collect::>(); - if let Some(result) = node.push_down_filters_from_parents(&remaining_filters)? { - node = result.inner; - for (parent_filter_index, support) in - remaining_filter_indexes.iter().zip(result.support) - { - filter_pushdown_result[*parent_filter_index] = filter_pushdown_result - [*parent_filter_index] - .combine_with_other(&support) - } - } + let all_filters_to_push_down = node_filters + .iter() + .chain(parent_filters_to_push_down.iter()) + .map(|f| Arc::clone(f)) + .collect::>(); + // Push down into children + let child_pushdown_result = push_down_into_children(node, &all_filters_to_push_down)?; + let mut node = child_pushdown_result.inner; + // A bit more index masking to construct the final result for our caller. + let node_filters_pushdown_result = + child_pushdown_result.support[..node_filters.len()].to_vec(); + let mut parent_filter_pushdown_result = + vec![FilterSupport::Unhandled; parent_filters.len()]; + for (parent_filter_idx, support) in parent_filter_for_pushdown_indices + .iter() + .zip(child_pushdown_result.support[node_filters.len()..].iter()) + { + parent_filter_pushdown_result[*parent_filter_idx] = *support; } - let support = filter_pushdown_result[..parent_filters.len()] + // Collect the remaining unhandled parent filters + let unhandled_parent_filter_indices = (0..parent_filters.len()) + .filter(|&i| matches!(parent_filter_pushdown_result[i], FilterSupport::Unhandled)) + .collect::>(); + let unhandled_parent_filters = unhandled_parent_filter_indices .iter() - .map(|s| match s { - PushdownState::ChildExact => FilterPushdownSupport::HandledExact, - PushdownState::ChildInexact => FilterPushdownSupport::Unhandled, - PushdownState::NoChild => FilterPushdownSupport::Unhandled, - }) + .map(|&i| Arc::clone(&parent_filters[i])) .collect::>(); - Ok(Some(ExecutionPlanFilterPushdownResult::new(node, support))) + // Check if the node can handle the filters + if let Some(result) = Arc::clone(&node).with_filter_pushdown_result( + &node_filters_pushdown_result, + &unhandled_parent_filters, + )? { + node = result.inner; + for (parent_filter_index, support) in + unhandled_parent_filter_indices.iter().zip(result.support) + { + parent_filter_pushdown_result[*parent_filter_index] = support; + } + } + Ok(Some(ExecutionPlanFilterPushdownResult::new( + node, + parent_filter_pushdown_result, + ))) } /// A physical optimizer rule that pushes down filters in the execution plan. @@ -275,8 +295,6 @@ fn pushdown_filters( /// There are also cases where we may be able to push down filters within a subtree but not the entire tree. /// A good exmaple of this is aggreagation nodes: /// -/// projection -> aggregate -> filter -> scan -/// /// ```text /// ┌──────────────────────┐ /// │ ProjectionExec │ @@ -466,6 +484,7 @@ impl PhysicalOptimizerRule for FilterPushdown { _config: &ConfigOptions, ) -> Result> { if let Some(result) = pushdown_filters(&plan, &[])? 
{ + println!("new plan: {:?}", result.inner); Ok(result.inner) } else { Ok(plan) diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 341ad347a836..1c5a0500e5d0 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -34,7 +34,7 @@ use datafusion_common::Result; use datafusion_execution::TaskContext; use crate::coalesce::{BatchCoalescer, CoalescerState}; -use crate::execution_plan::CardinalityEffect; +use crate::execution_plan::{CardinalityEffect, TransparentFilterPushdown}; use futures::ready; use futures::stream::{Stream, StreamExt}; @@ -212,12 +212,10 @@ impl ExecutionPlan for CoalesceBatchesExec { fn cardinality_effect(&self) -> CardinalityEffect { CardinalityEffect::Equal } - - fn supports_filter_pushdown(&self) -> bool { - true // CoalesceBatchesExec does not itself accept any filters but it is happy to have them pushed to it's children - } } +impl TransparentFilterPushdown for CoalesceBatchesExec {} + /// Stream for [`CoalesceBatchesExec`]. See [`CoalesceBatchesExec`] for more details. struct CoalesceBatchesStream { /// The input plan diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index a5bcfcc0dd72..e61b0e8b75c9 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -488,42 +488,48 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { Ok(Vec::new()) } + /// Checks which filters this node allows to be pushed down through it from a parent to a child. + /// For example, a `ProjectionExec` node can allow filters that only refernece + /// columns it did not create through but filters that reference columns it is creating cannot be pushed down any further. + /// That is, it only allows some filters through because it changes the schema of the data. + /// Aggregation nodes may not allow any filters to be pushed down as they change the cardinality of the data. + /// RepartitionExec nodes allow all filters to be pushed down as they don't change the schema or cardinality. + fn filter_pushdown_request( + &self, + filters: &[Arc], + ) -> Result> { + Ok(vec![FilterPushdownAllowed::Disallowed; filters.len()]) + } + /// After we've attempted to push down filters into this node's children - /// this will be called with the result for each filter that this node gave in `filters_for_pushdown`. - /// The node should update itself to possibly drop filters that were pushed down as `Exact`. + /// this will be called with the result for each filter that this node gave in `filters_for_pushdown` + /// **and** any filters that children could not handle. fn with_filter_pushdown_result( self: Arc, - _pushdown: &[FilterPushdownSupport], - ) -> Result>> { - Ok(None) - } - - /// Push down the given filters into this `ExecutionPlan`. - /// This is called after `with_filter_pushdown_result`. - /// Operators can accept filters from their parents, either as Exact or Unsupported. - /// If the operator accepts a filter as Exact, it should return a new `ExecutionPlan` with the filter applied - /// and the parent that generated the filter might not apply it anymore. - fn push_down_filters_from_parents( - &self, - _filters: &[&Arc], + _own_filters_result: &[FilterSupport], + _parent_filters_remaining: &[Arc], ) -> Result> { Ok(None) } +} - /// Returns `true` if this `ExecutionPlan` allows filter pushdown to flow throught it and `false` otherwise. 
- /// Nodes such as aggregations cannot have filters pushed down through them, so they return `false`. - /// On the other hand nodes such as repartitions can have filters pushed down through them, so they return `true`. - /// The default implementation returns `false`. - fn supports_filter_pushdown(&self) -> bool { - false - } +/// The answer to the question: "Can this filter be pushed down through this plan?" +/// Note that this is different from [`FilterSupport`] which is the answer to "Can *this* plan handle this filter?" +#[derive(Debug, Clone)] +pub enum FilterPushdownAllowed { + /// The operator allows this filter to be pushed down to its children. + /// The operator may choose to return a *different* filter expression + /// that is equivalent to the original filter, e.g. to deal with column indexes in a projection + /// or because the original filter can't be pushed down as is but a less-selective filter can be. + Allowed(Arc), + /// The operator does not allow this filter to be pushed down to its children. + Disallowed, } -/// The result of pushing down each filter. -/// When a parent plan tries to push down a filter to a child it needs to know if the child -/// can handle the filter or not to determine if it still needs to apply the filter itself. +/// The answer to the question: "Can this operator handle this filter itself?" +/// Note that this is different from [`FilterPushdownAllowed`] which is the answer to "Can *this* plan handle this filter?" #[derive(Debug, Clone, Copy)] -pub enum FilterPushdownSupport { +pub enum FilterSupport { /// Filter may not have been pushed down to the child plan, or the child plan /// can only partially apply the filter but may have false positives (but not false negatives). /// In this case the parent **must** behave as if the filter was not pushed down @@ -535,13 +541,35 @@ pub enum FilterPushdownSupport { HandledExact, } +/// An extension trait to provide a default implementation of [`ExecutionPlan::supports_filter_pushdown`] +/// that allows all filters to be pushed down. +/// This is useful for nodes that don't modify the schema or cardinality of the data. +/// For example, `RepartitionExec` and `CoalescePartitionsExec` can push down all filters. +pub trait TransparentFilterPushdown { + /// Returns a vector of [`FilterPushdownAllowed`] for each filter. + /// The default implementation returns [`FilterPushdownAllowed::Allowed`] for all filters. + fn supports_filter_pushdown( + &self, + filters: &[Arc], + ) -> Result> { + Ok(filters + .iter() + .map(|f| FilterPushdownAllowed::Allowed(Arc::clone(f))) + .collect()) + } +} + +/// The combined result of a filter pushdown operation. +/// This includes: +/// * The inner plan that was produced by the pushdown operation. +/// * The support for each filter that was pushed down. 
pub struct FilterPushdownResult { pub inner: T, - pub support: Vec, + pub support: Vec, } impl FilterPushdownResult { - pub fn new(plan: T, support: Vec) -> Self { + pub fn new(plan: T, support: Vec) -> Self { Self { inner: plan, support, @@ -551,7 +579,7 @@ impl FilterPushdownResult { pub fn is_exact(&self) -> bool { self.support .iter() - .all(|s| matches!(s, FilterPushdownSupport::HandledExact)) + .all(|s| matches!(s, FilterSupport::HandledExact)) } } diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index e885f41a3909..9ebd2b3fecc5 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -26,7 +26,8 @@ use super::{ }; use crate::common::can_project; use crate::execution_plan::{ - CardinalityEffect, ExecutionPlanFilterPushdownResult, FilterPushdownSupport, + CardinalityEffect, ExecutionPlanFilterPushdownResult, FilterPushdownAllowed, + FilterSupport, }; use crate::projection::{ make_with_child, try_embed_projection, update_expr, EmbeddedProjection, @@ -51,7 +52,7 @@ use datafusion_expr::Operator; use datafusion_physical_expr::equivalence::ProjectionMapping; use datafusion_physical_expr::expressions::BinaryExpr; use datafusion_physical_expr::intervals::utils::check_support; -use datafusion_physical_expr::utils::collect_columns; +use datafusion_physical_expr::utils::{collect_columns, reassign_predicate_columns}; use datafusion_physical_expr::{ analyze, conjunction, split_conjunction, AcrossPartitions, AnalysisContext, ConstExpr, ExprBoundaries, PhysicalExpr, @@ -437,8 +438,23 @@ impl ExecutionPlan for FilterExec { try_embed_projection(projection, self) } - fn supports_filter_pushdown(&self) -> bool { - true // FilterExec both accepts filters and is happy for them to be pushed onto its children + fn filter_pushdown_request( + &self, + filters: &[Arc], + ) -> Result> { + // Note: we don't have to worry about / deal with the projection here because + // `FilterExec`'s projection can only remove columns, not add them. + // Thus if a filter was valid applied to our output it should be valid applied to our input. + // We do however need to remap the columns. + let input_schema = self.input.schema(); + let filters = filters + .into_iter() + .map(|f| reassign_predicate_columns(Arc::clone(f), &input_schema, false)) + .collect::>>()?; + Ok(filters + .into_iter() + .map(|f| FilterPushdownAllowed::Allowed(f)) + .collect()) } fn filters_for_pushdown(&self) -> Result>> { @@ -450,15 +466,16 @@ impl ExecutionPlan for FilterExec { fn with_filter_pushdown_result( self: Arc, - pushdown: &[FilterPushdownSupport], - ) -> Result>> { + own_filters_result: &[FilterSupport], + parent_filters_remaining: &[Arc], + ) -> Result> { // Only keep filters who's index maps to the pushdown result Unsupported - let new_filters = self - .filters_for_pushdown()? 
+ let filters_for_pushdown = self.filters_for_pushdown()?; + let new_filters = filters_for_pushdown .iter() - .zip(pushdown.iter()) + .zip(own_filters_result.iter()) .filter_map(|(f, p)| { - if matches!(p, FilterPushdownSupport::HandledExact) { + if matches!(p, FilterSupport::HandledExact) { // Exact pushdown support means we keep discard filter None } else { @@ -466,40 +483,33 @@ impl ExecutionPlan for FilterExec { Some(Arc::clone(f)) } }) - .collect::>(); - - let predicate = conjunction(new_filters); - - if predicate.eq(&lit(true)) && self.projection.is_none() { - return Ok(Some(Arc::clone(self.input()))); + // Combine that with any leftover filters from parents that our children couldn't handle + .chain(parent_filters_remaining.iter().map(|f| Arc::clone(f))); + + let new_predicate = conjunction(new_filters); + + if new_predicate.eq(&lit(true)) && self.projection.is_none() { + // We can remove ourselves from the execution tree + Ok(Some(ExecutionPlanFilterPushdownResult::new( + Arc::clone(&self.input), + vec![FilterSupport::HandledExact; parent_filters_remaining.len()], + ))) + } else { + Ok(Some(ExecutionPlanFilterPushdownResult { + inner: Arc::new(Self { + predicate: new_predicate, + input: Arc::clone(&self.input), + metrics: self.metrics.clone(), + default_selectivity: self.default_selectivity, + cache: self.cache.clone(), + projection: self.projection.clone(), + }), + support: vec![ + FilterSupport::HandledExact; + parent_filters_remaining.len() + ], + })) } - - let new = FilterExec { - predicate, - input: Arc::clone(self.input()), - metrics: self.metrics.clone(), - default_selectivity: self.default_selectivity, - cache: self.cache.clone(), - projection: self.projection.clone(), - }; - Ok(Some(Arc::new(new))) - } - - fn push_down_filters_from_parents( - &self, - filters: &[&Arc], - ) -> Result> { - let new_predicates = conjunction( - std::iter::once(Arc::clone(&self.predicate)) - .chain(filters.iter().map(|f| Arc::clone(f))), - ); - Ok(Some(ExecutionPlanFilterPushdownResult { - inner: Arc::new(Self { - predicate: new_predicates, - ..self.clone() - }), - support: vec![FilterPushdownSupport::HandledExact; filters.len()], - })) } } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index f3306e49b04b..544b311e7f0c 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -29,7 +29,7 @@ use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use super::{ DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream, }; -use crate::execution_plan::CardinalityEffect; +use crate::execution_plan::{CardinalityEffect, TransparentFilterPushdown}; use crate::hash_utils::create_hashes; use crate::metrics::BaselineMetrics; use crate::projection::{all_columns, make_with_child, update_expr, ProjectionExec}; @@ -723,12 +723,10 @@ impl ExecutionPlan for RepartitionExec { new_partitioning, )?))) } - - fn supports_filter_pushdown(&self) -> bool { - true // RepartitionExec does not accept filters itself but is happy for them to be pushed down to its children - } } +impl TransparentFilterPushdown for RepartitionExec {} + impl RepartitionExec { /// Create a new RepartitionExec, that produces output `partitioning`, and /// does not preserve the order of the input (see [`Self::with_preserve_order`] diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index db1ceb2cb673..acefdd140828 100644 --- 
a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -25,7 +25,9 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use crate::common::spawn_buffered; -use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType}; +use crate::execution_plan::{ + Boundedness, CardinalityEffect, EmissionType, TransparentFilterPushdown, +}; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; use crate::metrics::{ @@ -1351,12 +1353,10 @@ impl ExecutionPlan for SortExec { fn filters_for_pushdown(&self) -> Result>> { Ok(vec![self.dynamic_filter_source.as_physical_expr()?]) } - - fn supports_filter_pushdown(&self) -> bool { - true // SortExec doesn't accept filters itself but it's happy for them to be forwarded down to it's children - } } +impl TransparentFilterPushdown for SortExec {} + #[cfg(test)] mod tests { use std::collections::HashMap; From a7ce3bcec2d4a893fe9a75b072f9ec593286b98e Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 15:23:27 -0500 Subject: [PATCH 23/27] fixes --- datafusion/core/tests/parquet/file_statistics.rs | 1 + datafusion/datasource/src/source.rs | 6 +++--- datafusion/physical-optimizer/src/filter_pushdown.rs | 8 ++++---- datafusion/physical-plan/src/filter.rs | 6 +++--- datafusion/physical-plan/src/sorts/sort_filters.rs | 8 +++----- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 50dbe4d787a2..3f3eefd537c4 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -75,6 +75,7 @@ async fn check_stats_precision_with_filter_pushdown() { let exec = FilterPushdown::new() .optimize(Arc::new(exec), state.config().options()) .unwrap(); + println!("exec: {:?}", exec); let filter_exec = exec.as_any().downcast_ref::().unwrap(); // TODO: we need to get the FilterExec to push down its filters // since they no longer get applied to the DataSourceExec directly. diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 623c5467c319..9c5f73f7857b 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -220,12 +220,12 @@ impl ExecutionPlan for DataSourceExec { .push_down_filters(parent_filters_remaining)? { let new_self = Arc::new(DataSourceExec::new(pushdown_result.inner)); - return Ok(Some(ExecutionPlanFilterPushdownResult::new( + Ok(Some(ExecutionPlanFilterPushdownResult::new( new_self, pushdown_result.support, - ))); + ))) } else { - return Ok(None); + Ok(None) } } } diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 8bba6154d9aa..c3a0c14bcc7d 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -70,6 +70,7 @@ impl ChildPushdownState { } } +/// See [`pushdown_filters`] for more details. fn push_down_into_children( node: &Arc, filters: &[Arc], @@ -78,7 +79,7 @@ fn push_down_into_children( let mut new_children = Vec::with_capacity(children.len()); let mut filter_pushdown_result = vec![ChildPushdownState::NoChild; filters.len()]; for child in children { - if let Some(result) = pushdown_filters(child, &filters)? { + if let Some(result) = pushdown_filters(child, filters)? 
{ new_children.push(result.inner); for (idx, support) in result.support.iter().enumerate() { filter_pushdown_result[idx] = @@ -126,7 +127,7 @@ fn pushdown_filters( // These are the filters the current node "owns" or "produces" and wants to push down. let node_filters = node.filters_for_pushdown()?; // Check which nodes from parents this node is okay with us trying to push down to it's children. - let parent_pushdown_request_result = node.filter_pushdown_request(&parent_filters)?; + let parent_pushdown_request_result = node.filter_pushdown_request(parent_filters)?; // Do some index masking so that we only ever call nodes with the filters relevant to them / that they're allowed to touch. // But we still need to reconstruct the full result for our caller. let parent_filter_for_pushdown_indices = parent_pushdown_request_result @@ -147,7 +148,7 @@ fn pushdown_filters( let all_filters_to_push_down = node_filters .iter() .chain(parent_filters_to_push_down.iter()) - .map(|f| Arc::clone(f)) + .map(Arc::clone) .collect::>(); // Push down into children let child_pushdown_result = push_down_into_children(node, &all_filters_to_push_down)?; @@ -484,7 +485,6 @@ impl PhysicalOptimizerRule for FilterPushdown { _config: &ConfigOptions, ) -> Result> { if let Some(result) = pushdown_filters(&plan, &[])? { - println!("new plan: {:?}", result.inner); Ok(result.inner) } else { Ok(plan) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 9ebd2b3fecc5..56c89324c37c 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -448,12 +448,12 @@ impl ExecutionPlan for FilterExec { // We do however need to remap the columns. let input_schema = self.input.schema(); let filters = filters - .into_iter() + .iter() .map(|f| reassign_predicate_columns(Arc::clone(f), &input_schema, false)) .collect::>>()?; Ok(filters .into_iter() - .map(|f| FilterPushdownAllowed::Allowed(f)) + .map(FilterPushdownAllowed::Allowed) .collect()) } @@ -484,7 +484,7 @@ impl ExecutionPlan for FilterExec { } }) // Combine that with any leftover filters from parents that our children couldn't handle - .chain(parent_filters_remaining.iter().map(|f| Arc::clone(f))); + .chain(parent_filters_remaining.iter().map(Arc::clone)); let new_predicate = conjunction(new_filters); diff --git a/datafusion/physical-plan/src/sorts/sort_filters.rs b/datafusion/physical-plan/src/sorts/sort_filters.rs index ae4e5029498b..3e37cd6b3cbd 100644 --- a/datafusion/physical-plan/src/sorts/sort_filters.rs +++ b/datafusion/physical-plan/src/sorts/sort_filters.rs @@ -141,11 +141,9 @@ impl SortDynamicFilterSource { replace = false; break; } - } else { - if !new_value_is_greater_than_current { - replace = false; - break; - } + } else if !new_value_is_greater_than_current { + replace = false; + break; } // Handle the equality case if new_value.eq(current_value) { From 5ebba12a64d880146641ada4aca9f145c3539c63 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 15:30:25 -0500 Subject: [PATCH 24/27] fix additional test --- .../core/tests/parquet/file_statistics.rs | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 3f3eefd537c4..0137f8532b3c 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -53,36 +53,38 @@ async fn 
check_stats_precision_with_filter_pushdown() { let opt = ListingOptions::new(Arc::new(ParquetFormat::default())); let table = get_listing_table(&table_path, None, &opt).await; let (_, _, state) = get_cache_runtime_state(); + + let filter = Expr::gt(col("id"), lit(1)); + // Scan without filter, stats are exact let exec = table.scan(&state, None, &[], None).await.unwrap(); assert_eq!(exec.statistics().unwrap().num_rows, Precision::Exact(8)); - // Scan with filter pushdown, stats are inexact - let filter = Expr::gt(col("id"), lit(1)); - - let data_source_exec = table - .scan(&state, None, &[filter.clone()], None) - .await - .unwrap(); + // Apply filter pushdown, this should make the estimate inexact because we don't know + // how many rows will be filtered out by the predicate. let df_schema = DFSchema::try_from(table.schema()).unwrap(); let exec = FilterExec::try_new( state .create_physical_expr(filter.clone(), &df_schema) .unwrap(), - data_source_exec, + exec, ) .unwrap(); let exec = FilterPushdown::new() .optimize(Arc::new(exec), state.config().options()) .unwrap(); - println!("exec: {:?}", exec); - let filter_exec = exec.as_any().downcast_ref::().unwrap(); - // TODO: we need to get the FilterExec to push down its filters - // since they no longer get applied to the DataSourceExec directly. - // let data_source_exec = Arc::new( - // filter_exec.input().as_any().downcast_ref::().unwrap() - // ) as Arc; - // assert_eq!(data_source_exec.statistics().unwrap().num_rows, Precision::Inexact(8)); + let data_source_exec = exec + .as_any() + .downcast_ref::() + .unwrap() + .input() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + data_source_exec.statistics().unwrap().num_rows, + Precision::Inexact(8) + ); } #[tokio::test] From d4238660f4cd964e2bb58e9ccb49680f3e4ff966 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 2 Apr 2025 16:32:12 -0400 Subject: [PATCH 25/27] Chore: Add basic filter pushdown tests (#16) --- .../physical_optimizer/filter_pushdown.rs | 213 ++++++++++++++++++ .../core/tests/physical_optimizer/mod.rs | 1 + datafusion/datasource/src/source.rs | 10 + .../physical-optimizer/src/filter_pushdown.rs | 32 +-- 4 files changed, 241 insertions(+), 15 deletions(-) create mode 100644 datafusion/core/tests/physical_optimizer/filter_pushdown.rs diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs new file mode 100644 index 000000000000..a3aebf315fef --- /dev/null +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::{ + datasource::object_store::ObjectStoreUrl, + logical_expr::Operator, + physical_plan::{ + expressions::{BinaryExpr, Column, Literal}, + PhysicalExpr, + }, + scalar::ScalarValue, +}; +use datafusion_common::config::{ConfigOptions, TableParquetOptions}; +use datafusion_common::internal_err; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource_parquet::source::ParquetSource; +use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; +use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::{displayable, ExecutionPlan}; +use insta; +use std::fmt::{Display, Formatter}; +use std::sync::{Arc, OnceLock}; + +#[test] +fn test_pushdown_into_scan() { + let scan = parquet_scan(); + let predicate = col_lit_predicate("a", "foo", schema()); + let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()); + + // expect the predicate to be pushed down into the DataSource + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown{}), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet + output: + Ok: + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_parquet_pushdown() { + // filter should be pushed down into the parquet scan with two filters + let scan = parquet_scan(); + let predicate1 = col_lit_predicate("a", "foo", schema()); + let filter1 = Arc::new(FilterExec::try_new(predicate1, scan).unwrap()); + let predicate2 = col_lit_predicate("b", "bar", schema()); + let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown{}), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet + output: + Ok: + - FilterExec: b@1 = bar + - FilterExec: a@0 = foo AND b@1 = bar + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=b@1 = bar AND a@0 = foo + " + ); +} + +/// Schema: +/// a: String +/// b: String +/// c: f64 +static TEST_SCHEMA: OnceLock = OnceLock::new(); + +fn schema() -> &'static SchemaRef { + TEST_SCHEMA.get_or_init(|| { + let fields = vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("c", DataType::Float64, false), + ]; + Arc::new(Schema::new(fields)) + }) +} + +/// Return a execution plan that reads from a parquet file +fn parquet_scan() -> Arc { + let schema = schema(); + let source = ParquetSource::new(TableParquetOptions::default()) + .with_schema(Arc::clone(schema)); + let base_config = FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test://").unwrap(), + Arc::clone(schema), + source, + ) + .build(); + DataSourceExec::from_data_source(base_config) +} + +/// Returns a predicate that is a binary expression col = lit +fn col_lit_predicate( + column_name: &str, + scalar_value: impl Into, + schema: &Schema, +) -> Arc { + let scalar_value = scalar_value.into(); + Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema(column_name, schema).unwrap()), + 
Operator::Eq, + Arc::new(Literal::new(scalar_value)), + )) +} + +/// A harness for testing physical optimizers. +/// +/// You can use this to test the output of a physical optimizer rule using insta snapshots +#[derive(Debug)] +pub struct OptimizationTest { + input: Vec, + output: Result, String>, +} + +impl OptimizationTest { + pub fn new(input_plan: Arc, opt: O) -> Self + where + O: PhysicalOptimizerRule, + { + Self::new_with_config(input_plan, opt, &ConfigOptions::default()) + } + + pub fn new_with_config( + input_plan: Arc, + opt: O, + config: &ConfigOptions, + ) -> Self + where + O: PhysicalOptimizerRule, + { + let input = format_execution_plan(&input_plan); + + let input_schema = input_plan.schema(); + + let output_result = opt.optimize(input_plan, config); + let output = output_result + .and_then(|plan| { + if opt.schema_check() && (plan.schema() != input_schema) { + internal_err!( + "Schema mismatch:\n\nBefore:\n{:?}\n\nAfter:\n{:?}", + input_schema, + plan.schema() + ) + } else { + Ok(plan) + } + }) + .map(|plan| format_execution_plan(&plan)) + .map_err(|e| e.to_string()); + + Self { input, output } + } +} + +impl Display for OptimizationTest { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + writeln!(f, "OptimizationTest:")?; + writeln!(f, " input:")?; + for line in &self.input { + writeln!(f, " - {line}")?; + } + writeln!(f, " output:")?; + match &self.output { + Ok(output) => { + writeln!(f, " Ok:")?; + for line in output { + writeln!(f, " - {line}")?; + } + } + Err(err) => { + writeln!(f, " Err: {err}")?; + } + } + Ok(()) + } +} + +pub fn format_execution_plan(plan: &Arc) -> Vec { + format_lines(&displayable(plan.as_ref()).indent(false).to_string()) +} + +fn format_lines(s: &str) -> Vec { + s.trim().split('\n').map(|s| s.to_string()).collect() +} diff --git a/datafusion/core/tests/physical_optimizer/mod.rs b/datafusion/core/tests/physical_optimizer/mod.rs index 7d5d07715eeb..fe7b9decfebf 100644 --- a/datafusion/core/tests/physical_optimizer/mod.rs +++ b/datafusion/core/tests/physical_optimizer/mod.rs @@ -21,6 +21,7 @@ mod aggregate_statistics; mod combine_partial_final_agg; mod enforce_distribution; mod enforce_sorting; +mod filter_pushdown; mod join_selection; mod limit_pushdown; mod limited_distinct_aggregation; diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 9c5f73f7857b..c54c893e545b 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -290,3 +290,13 @@ impl DataSourceExec { }) } } + +/// Create a new `DataSourceExec` from a `DataSource` +impl From for DataSourceExec +where + S: DataSource + 'static, +{ + fn from(source: S) -> Self { + Self::new(Arc::new(source)) + } +} diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index c3a0c14bcc7d..47f7bd341c73 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -194,21 +194,21 @@ fn pushdown_filters( /// For example, consider the following plan: /// /// ```text -// ┌──────────────────────┐ -// │ CoalesceBatchesExec │ -// └──────────────────────┘ -// │ -// ▼ -// ┌──────────────────────┐ -// │ FilterExec │ -// │ filters = [ id=1] │ -// └──────────────────────┘ -// │ -// ▼ -// ┌──────────────────────┐ -// │ DataSourceExec │ -// │ projection = * │ -// └──────────────────────┘ +/// ┌──────────────────────┐ +/// │ CoalesceBatchesExec │ +/// └──────────────────────┘ +/// │ +/// ▼ +/// 
┌──────────────────────┐ +/// │ FilterExec │ +/// │ filters = [ id=1] │ +/// └──────────────────────┘ +/// │ +/// ▼ +/// ┌──────────────────────┐ +/// │ DataSourceExec │ +/// │ projection = * │ +/// └──────────────────────┘ /// ``` /// /// Our goal is to move the `id = 1` filter from the `FilterExec` node to the `DataSourceExec` node. @@ -463,6 +463,8 @@ fn pushdown_filters( /// Now as we fill our `TopK` heap we can push down the state of the heap to the `DataSourceExec` node /// to avoid reading files / row groups / pages / rows that could not possibly be in the top 10. /// This is implemented in datafusion/physical-plan/src/sorts/sort_filters.rs. +/// +// TODO potentially rename this to align with logical optimizer `PushdownFilter` #[derive(Debug)] pub struct FilterPushdown {} From b683507786526c75dc142a5706e2e63e995ac7e1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 15:35:32 -0500 Subject: [PATCH 26/27] fix new tests --- datafusion/core/tests/parquet/file_statistics.rs | 4 ++-- .../core/tests/physical_optimizer/filter_pushdown.rs | 11 +++++------ datafusion/physical-optimizer/src/filter_pushdown.rs | 10 ++++------ datafusion/physical-optimizer/src/optimizer.rs | 4 ++-- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 0137f8532b3c..58a9148029bf 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -38,7 +38,7 @@ use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::{col, lit, Expr}; use datafusion::datasource::physical_plan::FileScanConfig; -use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; +use datafusion_physical_optimizer::filter_pushdown::PushdownFilter; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::ExecutionPlan; @@ -70,7 +70,7 @@ async fn check_stats_precision_with_filter_pushdown() { exec, ) .unwrap(); - let exec = FilterPushdown::new() + let exec = PushdownFilter::new() .optimize(Arc::new(exec), state.config().options()) .unwrap(); let data_source_exec = exec diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs index a3aebf315fef..012b2bbab087 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs @@ -31,7 +31,7 @@ use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_datasource_parquet::source::ParquetSource; -use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; +use datafusion_physical_optimizer::filter_pushdown::PushdownFilter; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::{displayable, ExecutionPlan}; @@ -47,7 +47,7 @@ fn test_pushdown_into_scan() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}), + OptimizationTest::new(plan, PushdownFilter{}), @r" OptimizationTest: input: @@ -71,7 +71,7 @@ fn test_parquet_pushdown() { let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap()); 
insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}), + OptimizationTest::new(plan, PushdownFilter{}), @r" OptimizationTest: input: @@ -80,9 +80,8 @@ fn test_parquet_pushdown() { - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet output: Ok: - - FilterExec: b@1 = bar - - FilterExec: a@0 = foo AND b@1 = bar - - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=b@1 = bar AND a@0 = foo + - FilterExec: a@0 = foo AND b@1 = bar + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo AND b@1 = bar " ); } diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 47f7bd341c73..cbbc4c737de3 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -463,24 +463,22 @@ fn pushdown_filters( /// Now as we fill our `TopK` heap we can push down the state of the heap to the `DataSourceExec` node /// to avoid reading files / row groups / pages / rows that could not possibly be in the top 10. /// This is implemented in datafusion/physical-plan/src/sorts/sort_filters.rs. -/// -// TODO potentially rename this to align with logical optimizer `PushdownFilter` #[derive(Debug)] -pub struct FilterPushdown {} +pub struct PushdownFilter {} -impl Default for FilterPushdown { +impl Default for PushdownFilter { fn default() -> Self { Self::new() } } -impl FilterPushdown { +impl PushdownFilter { pub fn new() -> Self { Self {} } } -impl PhysicalOptimizerRule for FilterPushdown { +impl PhysicalOptimizerRule for PushdownFilter { fn optimize( &self, plan: Arc, diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 78d3e2ad8873..8bd22cbf1bda 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -25,7 +25,7 @@ use crate::coalesce_batches::CoalesceBatches; use crate::combine_partial_final_agg::CombinePartialFinalAggregate; use crate::enforce_distribution::EnforceDistribution; use crate::enforce_sorting::EnforceSorting; -use crate::filter_pushdown::FilterPushdown; +use crate::filter_pushdown::PushdownFilter; use crate::join_selection::JoinSelection; use crate::limit_pushdown::LimitPushdown; use crate::limited_distinct_aggregation::LimitedDistinctAggregation; @@ -125,7 +125,7 @@ impl PhysicalOptimizer { // The FilterPushdown rule tries to push down filters as far as it can. // For example, it will push down filtering from a `FilterExec` to // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. - Arc::new(FilterPushdown::new()), + Arc::new(PushdownFilter::new()), // The LimitPushdown rule tries to push limits down as far as possible, // replacing operators with fetching variants, or adding limits // past operators that support limit pushdown. 
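
For reference, here is a minimal sketch of driving the renamed rule by hand, outside the default optimizer pipeline, in the same way the tests above call it. The helper name `apply_filter_pushdown` is invented for this illustration and is not part of the patch:

```rust
use std::sync::Arc;

use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;
use datafusion_physical_optimizer::filter_pushdown::PushdownFilter;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use datafusion_physical_plan::ExecutionPlan;

/// Apply the filter pushdown rule to an already-built physical plan.
/// This is a hypothetical helper, not an API added by this patch series.
fn apply_filter_pushdown(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
    // Default options are sufficient here: `PushdownFilter::optimize` does not read
    // its config argument at this point in the series.
    let config = ConfigOptions::default();
    PushdownFilter::new().optimize(plan, &config)
}
```

Most users never call the rule directly; the built-in pipeline runs it through the `PhysicalOptimizer` rule list shown above.
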
From fbf93a2bdd0a5c1532336026dfa71ac7305c1655 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 2 Apr 2025 23:42:38 -0500 Subject: [PATCH 27/27] fixes --- .../src/datasource/physical_plan/parquet.rs | 1 - .../physical_optimizer/filter_pushdown.rs | 71 ++++++++++++++++--- datafusion/datasource/src/source.rs | 5 +- .../physical-optimizer/src/filter_pushdown.rs | 12 ++-- .../physical-plan/src/coalesce_batches.rs | 15 +++- .../physical-plan/src/execution_plan.rs | 18 ----- datafusion/physical-plan/src/filter.rs | 29 ++++---- .../physical-plan/src/repartition/mod.rs | 14 +++- datafusion/physical-plan/src/sorts/sort.rs | 14 +++- .../test_files/information_schema.slt | 2 - 10 files changed, 122 insertions(+), 59 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 94f2804587b5..74cbf164720e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -68,7 +68,6 @@ mod tests { use chrono::{TimeZone, Utc}; use datafusion_datasource::file_groups::FileGroup; use futures::StreamExt; - use insta; use insta::assert_snapshot; use object_store::local::LocalFileSystem; use object_store::path::Path; diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs index 012b2bbab087..6c6cb0f20af3 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs @@ -16,6 +16,7 @@ // under the License. use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow_schema::SortOptions; use datafusion::{ datasource::object_store::ObjectStoreUrl, logical_expr::Operator, @@ -31,11 +32,11 @@ use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_datasource_parquet::source::ParquetSource; +use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion_physical_optimizer::filter_pushdown::PushdownFilter; use datafusion_physical_optimizer::PhysicalOptimizerRule; -use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::{displayable, ExecutionPlan}; -use insta; +use datafusion_physical_plan::{filter::FilterExec, sorts::sort::SortExec}; use std::fmt::{Display, Formatter}; use std::sync::{Arc, OnceLock}; @@ -55,8 +56,7 @@ fn test_pushdown_into_scan() { - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet output: Ok: - - FilterExec: a@0 = foo - - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo " ); } @@ -80,8 +80,62 @@ fn test_parquet_pushdown() { - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet output: Ok: - - FilterExec: a@0 = foo AND b@1 = bar - - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo AND b@1 = bar + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo AND b@1 = bar + " + ); +} + +#[test] +fn test_topk_pushdown() { + // filter should be pushed down into the parquet scan with two filters + let scan = parquet_scan(); + let predicate = 
col_lit_predicate("a", "foo", schema()); + let filter = + Arc::new(FilterExec::try_new(Arc::clone(&predicate), Arc::clone(&scan)).unwrap()); + let plan = Arc::new(SortExec::new( + LexOrdering::new(vec![PhysicalSortExpr::new( + Arc::new(Column::new_with_schema("a", schema()).unwrap()), + SortOptions::default(), + )]), + filter, + )); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownFilter{}), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=a@0 = foo AND DynamicFilterPhysicalExpr [ SortDynamicFilterSource[ ] ] + " + ); + + let sort = Arc::new(SortExec::new( + LexOrdering::new(vec![PhysicalSortExpr::new( + Arc::new(Column::new_with_schema("a", schema()).unwrap()), + SortOptions::default(), + )]), + Arc::clone(&scan), + )); + let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownFilter{}), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={0 groups: []}, projection=[a, b, c], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ SortDynamicFilterSource[ ] ] AND a@0 = foo " ); } @@ -106,8 +160,9 @@ fn schema() -> &'static SchemaRef { /// Return a execution plan that reads from a parquet file fn parquet_scan() -> Arc { let schema = schema(); - let source = ParquetSource::new(TableParquetOptions::default()) - .with_schema(Arc::clone(schema)); + let mut options = TableParquetOptions::default(); + options.global.pushdown_filters = true; + let source = ParquetSource::new(options).with_schema(Arc::clone(schema)); let base_config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test://").unwrap(), Arc::clone(schema), diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index c54c893e545b..b5fe5d2d80dd 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -213,8 +213,9 @@ impl ExecutionPlan for DataSourceExec { own_filters_result: &[FilterSupport], parent_filters_remaining: &[Arc], ) -> datafusion_common::Result> { - assert!(own_filters_result.is_empty()); // We didn't give out any filters, this should be empty! - // Forward filter pushdown to our data source. + // We didn't give out any filters, this should be empty! + assert!(own_filters_result.is_empty()); + // Forward filter pushdown to our data source. if let Some(pushdown_result) = self .data_source .push_down_filters(parent_filters_remaining)? diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index cbbc4c737de3..21610564306c 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -118,7 +118,7 @@ fn push_down_into_children( /// We do this by checking if all children are exact (we return exact up) or if any child is inexact (we return inexact). 
/// - If a node has no children this is equivalent to inexact handling (there is no child to handle the filter). /// -/// See [`FilterPushdown`] for more details on how this works in practice. +/// See [`PushdownFilter`] for more details on how this works in practice. fn pushdown_filters( node: &Arc, parent_filters: &[Arc], @@ -217,7 +217,7 @@ fn pushdown_filters( /// 1. Enter the recursion with no filters. /// 2. We find the `FilterExec` node and it tells us that it has a filter (see [`ExecutionPlan::filters_for_pushdown`] and `datafusion::physical_plan::filter::FilterExec`). /// 3. We recurse down into it's children (the `DataSourceExec` node) now carrying the filters `[id = 1]`. -/// 4. The `DataSourceExec` node tells us that it can handle the filter and we mark it as handled exact (see [`ExecutionPlan::push_down_filters_from_parents`]). +/// 4. The `DataSourceExec` node tells us that it can handle the filter and we mark it as handled exact (see [`ExecutionPlan::with_filter_pushdown_result`]). /// 5. Since the `DataSourceExec` node has no children we recurse back up the tree. /// 6. We now tell the `FilterExec` node that it has a child that can handle the filter and we mark it as handled exact (see [`ExecutionPlan::with_filter_pushdown_result`]). /// The `FilterExec` node can now return a new execution plan, either a copy of itself without that filter or if has no work left to do it can even return the child node directly. @@ -265,7 +265,7 @@ fn pushdown_filters( // └──────────────────────┘ /// ``` /// -/// We want to push down the filters `[id=1]` to the [`DataSourceExec`] node, but can't push down `[cost>50]` because it requires the `ProjectionExec` node to be executed first: +/// We want to push down the filters [id=1] to the `DataSourceExec` node, but can't push down `cost>50` because it requires the `ProjectionExec` node to be executed first: /// /// ```text // ┌──────────────────────┐ @@ -362,8 +362,8 @@ fn pushdown_filters( /// The point here is that: /// 1. We cannot push down `sum > 10` through the `AggregateExec` node into the `DataSourceExec` node. /// Any filters above the `AggregateExec` node are not pushed down. -/// This is determined by calling [`ExecutionPlan::supports_filter_pushdown`] on the `AggregateExec` node. -/// 2. We need to keep recursing into the tree so that we can discover the other `FilterExec` node and push down the `[id=1]` filter. +/// This is determined by calling [`ExecutionPlan::filter_pushdown_request`] on the `AggregateExec` node. +/// 2. We need to keep recursing into the tree so that we can discover the other `FilterExec` node and push down the [id=1] filter. /// /// It is also possible to push down filters through joins and from joins. /// For example, a hash join where we build a hash table of the left side and probe the right side @@ -393,7 +393,7 @@ fn pushdown_filters( /// ``` /// /// There are two pushdowns we can do here: -/// 1. Push down the `[d.size > 100]` filter through the `HashJoinExec` node to the `DataSourceExec` node for the `departments` table. +/// 1. Push down the `d.size > 100` filter through the `HashJoinExec` node to the `DataSourceExec` node for the `departments` table. /// 2. Push down the hash table state from the `HashJoinExec` node to the `DataSourceExec` node to avoid reading /// rows from teh `users` table that will be eliminated by the join. /// This can be done via a bloom filter or similar. 
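
To make the `filter_pushdown_request` contract described in the comments above concrete, here is a hedged sketch of how a projection-like operator might answer it using the `FilterPushdownAllowed` enum introduced earlier in this series. The free-function shape and the `passthrough_columns` parameter are illustrative only, and a real operator would also remap column indexes (as `FilterExec` does with `reassign_predicate_columns`):

```rust
use std::sync::Arc;

use datafusion_common::Result;
use datafusion_physical_expr::utils::collect_columns;
use datafusion_physical_expr::PhysicalExpr;
use datafusion_physical_plan::execution_plan::FilterPushdownAllowed;

/// Decide, filter by filter, whether a hypothetical projection-like operator lets a
/// parent's predicate pass through to its child. `passthrough_columns` stands in for
/// the output columns that map 1:1 onto input columns; anything else is computed by
/// the operator itself and therefore blocks pushdown.
fn filter_pushdown_request_sketch(
    passthrough_columns: &[&str],
    filters: &[Arc<dyn PhysicalExpr>],
) -> Result<Vec<FilterPushdownAllowed>> {
    Ok(filters
        .iter()
        .map(|filter| {
            let only_passthrough_columns = collect_columns(filter)
                .iter()
                .all(|col| passthrough_columns.contains(&col.name()));
            if only_passthrough_columns {
                // Let the expression through unchanged; this assumes the column indexes
                // line up between parent and child, which a real operator would fix up.
                FilterPushdownAllowed::Allowed(Arc::clone(filter))
            } else {
                // The filter references a column this operator produces, so it must
                // stay above us.
                FilterPushdownAllowed::Disallowed
            }
        })
        .collect())
}
```
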
diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 1c5a0500e5d0..10d0ebacf41d 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -32,9 +32,10 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_common::Result; use datafusion_execution::TaskContext; +use datafusion_physical_expr::PhysicalExpr; use crate::coalesce::{BatchCoalescer, CoalescerState}; -use crate::execution_plan::{CardinalityEffect, TransparentFilterPushdown}; +use crate::execution_plan::{CardinalityEffect, FilterPushdownAllowed}; use futures::ready; use futures::stream::{Stream, StreamExt}; @@ -212,9 +213,17 @@ impl ExecutionPlan for CoalesceBatchesExec { fn cardinality_effect(&self) -> CardinalityEffect { CardinalityEffect::Equal } -} -impl TransparentFilterPushdown for CoalesceBatchesExec {} + fn filter_pushdown_request( + &self, + filters: &[Arc], + ) -> Result> { + Ok(filters + .iter() + .map(|f| FilterPushdownAllowed::Allowed(Arc::clone(f))) + .collect()) + } +} /// Stream for [`CoalesceBatchesExec`]. See [`CoalesceBatchesExec`] for more details. struct CoalesceBatchesStream { diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index e61b0e8b75c9..595e1afd32c6 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -541,24 +541,6 @@ pub enum FilterSupport { HandledExact, } -/// An extension trait to provide a default implementation of [`ExecutionPlan::supports_filter_pushdown`] -/// that allows all filters to be pushed down. -/// This is useful for nodes that don't modify the schema or cardinality of the data. -/// For example, `RepartitionExec` and `CoalescePartitionsExec` can push down all filters. -pub trait TransparentFilterPushdown { - /// Returns a vector of [`FilterPushdownAllowed`] for each filter. - /// The default implementation returns [`FilterPushdownAllowed::Allowed`] for all filters. - fn supports_filter_pushdown( - &self, - filters: &[Arc], - ) -> Result> { - Ok(filters - .iter() - .map(|f| FilterPushdownAllowed::Allowed(Arc::clone(f))) - .collect()) - } -} - /// The combined result of a filter pushdown operation. /// This includes: /// * The inner plan that was produced by the pushdown operation. diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 56c89324c37c..f15849f95d9f 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -442,15 +442,16 @@ impl ExecutionPlan for FilterExec { &self, filters: &[Arc], ) -> Result> { - // Note: we don't have to worry about / deal with the projection here because - // `FilterExec`'s projection can only remove columns, not add them. - // Thus if a filter was valid applied to our output it should be valid applied to our input. - // We do however need to remap the columns. - let input_schema = self.input.schema(); - let filters = filters - .iter() - .map(|f| reassign_predicate_columns(Arc::clone(f), &input_schema, false)) - .collect::>>()?; + let filters = if self.projection.is_some() { + let input_schema = self.input.schema(); + filters + .iter() + .map(|f| reassign_predicate_columns(Arc::clone(f), &input_schema, false)) + .collect::>>()? 
+ } else { + filters.to_vec() + }; + Ok(filters .into_iter() .map(FilterPushdownAllowed::Allowed) @@ -458,10 +459,12 @@ impl ExecutionPlan for FilterExec { } fn filters_for_pushdown(&self) -> Result>> { - Ok(split_conjunction(self.predicate()) - .iter() - .map(|f| Arc::clone(f)) - .collect()) + let predicate = reassign_predicate_columns( + Arc::clone(&self.predicate), + &self.input.schema(), + false, + )?; + Ok(vec![predicate]) } fn with_filter_pushdown_result( diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 544b311e7f0c..bf070a8c9348 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -29,7 +29,7 @@ use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use super::{ DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream, }; -use crate::execution_plan::{CardinalityEffect, TransparentFilterPushdown}; +use crate::execution_plan::{CardinalityEffect, FilterPushdownAllowed}; use crate::hash_utils::create_hashes; use crate::metrics::BaselineMetrics; use crate::projection::{all_columns, make_with_child, update_expr, ProjectionExec}; @@ -723,9 +723,17 @@ impl ExecutionPlan for RepartitionExec { new_partitioning, )?))) } -} -impl TransparentFilterPushdown for RepartitionExec {} + fn filter_pushdown_request( + &self, + filters: &[Arc], + ) -> Result> { + Ok(filters + .iter() + .map(|f| FilterPushdownAllowed::Allowed(Arc::clone(f))) + .collect()) + } +} impl RepartitionExec { /// Create a new RepartitionExec, that produces output `partitioning`, and diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index acefdd140828..d55b3a730720 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -26,7 +26,7 @@ use std::sync::Arc; use crate::common::spawn_buffered; use crate::execution_plan::{ - Boundedness, CardinalityEffect, EmissionType, TransparentFilterPushdown, + Boundedness, CardinalityEffect, EmissionType, FilterPushdownAllowed, }; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; @@ -1353,9 +1353,17 @@ impl ExecutionPlan for SortExec { fn filters_for_pushdown(&self) -> Result>> { Ok(vec![self.dynamic_filter_source.as_physical_expr()?]) } -} -impl TransparentFilterPushdown for SortExec {} + fn filter_pushdown_request( + &self, + filters: &[Arc], + ) -> Result> { + Ok(filters + .iter() + .map(|f| FilterPushdownAllowed::Allowed(Arc::clone(f))) + .collect()) + } +} #[cfg(test)] mod tests { diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f709c3875a9a..496f24abf6ed 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -241,7 +241,6 @@ datafusion.explain.show_statistics false datafusion.optimizer.allow_symmetric_joins_without_pruning true datafusion.optimizer.default_filter_selectivity 20 datafusion.optimizer.enable_distinct_aggregation_soft_limit true -datafusion.optimizer.enable_dynamic_filter_pushdown true datafusion.optimizer.enable_round_robin_repartition true datafusion.optimizer.enable_topk_aggregation true datafusion.optimizer.expand_views_at_output false @@ -341,7 +340,6 @@ datafusion.explain.show_statistics false When set to true, the explain statement datafusion.optimizer.allow_symmetric_joins_without_pruning true Should 
DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. datafusion.optimizer.default_filter_selectivity 20 The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. -datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. datafusion.optimizer.enable_round_robin_repartition true When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores datafusion.optimizer.enable_topk_aggregation true When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible datafusion.optimizer.expand_views_at_output false When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.
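
Finally, a hedged end-to-end sketch of the user-facing behaviour this series enables: with Parquet filter pushdown turned on, the TopK operator built for an `ORDER BY ... LIMIT` query can feed its current threshold back into the scan. The table name and file path below are placeholders, and the example relies on the existing `datafusion.execution.parquet.pushdown_filters` setting (the new tests enable the equivalent flag on `TableParquetOptions` directly):

```rust
use datafusion::arrow::util::pretty::pretty_format_batches;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // Let the Parquet reader evaluate pushed-down predicates during the scan.
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.parquet.pushdown_filters", true);
    let ctx = SessionContext::new_with_config(config);

    // "data/t.parquet" and the table name "t" are placeholders for this sketch.
    ctx.register_parquet("t", "data/t.parquet", ParquetReadOptions::default())
        .await?;

    // The TopK operator produced for this query can push the smallest timestamp
    // currently in its heap down to the scan as a dynamic filter, so files, row
    // groups and pages that cannot contain one of the top 10 rows are skipped.
    let batches = ctx
        .sql("SELECT * FROM t ORDER BY timestamp DESC LIMIT 10")
        .await?
        .collect()
        .await?;
    println!("{}", pretty_format_batches(&batches)?);
    Ok(())
}
```
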