From b15467006c72918f76f9c4c28ea76ad38d9f5376 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 01:08:05 +1000 Subject: [PATCH 01/17] Require Debug for PhysicalOptimizerRule --- datafusion/core/src/physical_optimizer/coalesce_batches.rs | 2 +- datafusion/core/src/physical_optimizer/enforce_distribution.rs | 2 +- datafusion/core/src/physical_optimizer/enforce_sorting.rs | 2 +- datafusion/core/src/physical_optimizer/join_selection.rs | 2 +- datafusion/core/src/physical_optimizer/optimizer.rs | 2 +- datafusion/core/src/physical_optimizer/projection_pushdown.rs | 2 +- datafusion/core/src/physical_optimizer/sanity_checker.rs | 2 +- datafusion/core/src/physical_optimizer/update_aggr_exprs.rs | 2 +- datafusion/physical-optimizer/src/aggregate_statistics.rs | 2 +- datafusion/physical-optimizer/src/combine_partial_final_agg.rs | 2 +- datafusion/physical-optimizer/src/limit_pushdown.rs | 2 +- .../physical-optimizer/src/limited_distinct_aggregation.rs | 1 + datafusion/physical-optimizer/src/optimizer.rs | 3 ++- datafusion/physical-optimizer/src/topk_aggregation.rs | 1 + 14 files changed, 15 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/coalesce_batches.rs b/datafusion/core/src/physical_optimizer/coalesce_batches.rs index da0e44c8de4e..2f834813ede9 100644 --- a/datafusion/core/src/physical_optimizer/coalesce_batches.rs +++ b/datafusion/core/src/physical_optimizer/coalesce_batches.rs @@ -34,7 +34,7 @@ use datafusion_physical_optimizer::PhysicalOptimizerRule; /// Optimizer rule that introduces CoalesceBatchesExec to avoid overhead with small batches that /// are produced by highly selective filters -#[derive(Default)] +#[derive(Default, Debug)] pub struct CoalesceBatches {} impl CoalesceBatches { diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index ceb701ad00c9..c971e6150633 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -176,7 +176,7 @@ use itertools::izip; /// /// This rule only chooses the exact match and satisfies the Distribution(a, b, c) /// by a HashPartition(a, b, c). -#[derive(Default)] +#[derive(Default, Debug)] pub struct EnforceDistribution {} impl EnforceDistribution { diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 14afe3546633..aa28f9d6b6aa 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -72,7 +72,7 @@ use itertools::izip; /// This rule inspects [`SortExec`]'s in the given physical plan and removes the /// ones it can prove unnecessary. -#[derive(Default)] +#[derive(Default, Debug)] pub struct EnforceSorting {} impl EnforceSorting { diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs index 2643ade8f481..499fb9cbbcf0 100644 --- a/datafusion/core/src/physical_optimizer/join_selection.rs +++ b/datafusion/core/src/physical_optimizer/join_selection.rs @@ -46,7 +46,7 @@ use datafusion_physical_optimizer::PhysicalOptimizerRule; /// The [`JoinSelection`] rule tries to modify a given plan so that it can /// accommodate infinite sources and optimize joins in the plan according to /// available statistical information, if there is any. -#[derive(Default)] +#[derive(Default, Debug)] pub struct JoinSelection {} impl JoinSelection { diff --git a/datafusion/core/src/physical_optimizer/optimizer.rs b/datafusion/core/src/physical_optimizer/optimizer.rs index e09d7b28bf5f..7a6f991121ef 100644 --- a/datafusion/core/src/physical_optimizer/optimizer.rs +++ b/datafusion/core/src/physical_optimizer/optimizer.rs @@ -35,7 +35,7 @@ use crate::physical_optimizer::sanity_checker::SanityCheckPlan; use crate::physical_optimizer::topk_aggregation::TopKAggregation; /// A rule-based physical optimizer. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct PhysicalOptimizer { /// All rules to apply pub rules: Vec>, diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 4ca59da55bad..f75f4965f76e 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -60,7 +60,7 @@ use itertools::Itertools; /// This rule inspects [`ProjectionExec`]'s in the given physical plan and tries to /// remove or swap with its child. -#[derive(Default)] +#[derive(Default, Debug)] pub struct ProjectionPushdown {} impl ProjectionPushdown { diff --git a/datafusion/core/src/physical_optimizer/sanity_checker.rs b/datafusion/core/src/physical_optimizer/sanity_checker.rs index e392105fbcb7..65a712cc69c2 100644 --- a/datafusion/core/src/physical_optimizer/sanity_checker.rs +++ b/datafusion/core/src/physical_optimizer/sanity_checker.rs @@ -41,7 +41,7 @@ use itertools::izip; /// are not satisfied by their children. /// 2. Plans that use pipeline-breaking operators on infinite input(s), /// it is impossible to execute such queries (they will never generate output nor finish) -#[derive(Default)] +#[derive(Default, Debug)] pub struct SanityCheckPlan {} impl SanityCheckPlan { diff --git a/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs b/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs index 8b5084c67e42..c0d9140c025e 100644 --- a/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs +++ b/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs @@ -48,7 +48,7 @@ use datafusion_physical_plan::{ /// This rule analyzes aggregate expressions of type `Beneficial` to see whether /// their input ordering requirements are satisfied. If this is the case, the /// aggregators are modified to run in a more efficient mode. -#[derive(Default)] +#[derive(Default, Debug)] pub struct OptimizeAggregateOrder {} impl OptimizeAggregateOrder { diff --git a/datafusion/physical-optimizer/src/aggregate_statistics.rs b/datafusion/physical-optimizer/src/aggregate_statistics.rs index 863c5ab2d288..71f129be984d 100644 --- a/datafusion/physical-optimizer/src/aggregate_statistics.rs +++ b/datafusion/physical-optimizer/src/aggregate_statistics.rs @@ -33,7 +33,7 @@ use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::udaf::AggregateFunctionExpr; /// Optimizer that uses available statistics for aggregate functions -#[derive(Default)] +#[derive(Default, Debug)] pub struct AggregateStatistics {} impl AggregateStatistics { diff --git a/datafusion/physical-optimizer/src/combine_partial_final_agg.rs b/datafusion/physical-optimizer/src/combine_partial_final_agg.rs index 67e40c9b507e..4e352e25b52c 100644 --- a/datafusion/physical-optimizer/src/combine_partial_final_agg.rs +++ b/datafusion/physical-optimizer/src/combine_partial_final_agg.rs @@ -37,7 +37,7 @@ use datafusion_physical_expr::{physical_exprs_equal, PhysicalExpr}; /// /// This rule should be applied after the EnforceDistribution and EnforceSorting rules /// -#[derive(Default)] +#[derive(Default, Debug)] pub struct CombinePartialFinalAggregate {} impl CombinePartialFinalAggregate { diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs index 15d210e1b10b..8f392b683077 100644 --- a/datafusion/physical-optimizer/src/limit_pushdown.rs +++ b/datafusion/physical-optimizer/src/limit_pushdown.rs @@ -33,7 +33,7 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; /// This rule inspects [`ExecutionPlan`]'s and pushes down the fetch limit from /// the parent to the child if applicable. -#[derive(Default)] +#[derive(Default, Debug)] pub struct LimitPushdown {} /// This is a "data class" we use within the [`LimitPushdown`] rule to push diff --git a/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs b/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs index 8653ad19da77..7833324f64fa 100644 --- a/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs +++ b/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs @@ -35,6 +35,7 @@ use itertools::Itertools; /// rows in the group to be processed for correctness. Example queries fitting this description are: /// - `SELECT distinct l_orderkey FROM lineitem LIMIT 10;` /// - `SELECT l_orderkey FROM lineitem GROUP BY l_orderkey LIMIT 10;` +#[derive(Debug)] pub struct LimitedDistinctAggregation {} impl LimitedDistinctAggregation { diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 885dc4a95b8c..609890e2d43f 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -20,6 +20,7 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::Result; use datafusion_physical_plan::ExecutionPlan; +use std::fmt::Debug; use std::sync::Arc; /// `PhysicalOptimizerRule` transforms one ['ExecutionPlan'] into another which @@ -29,7 +30,7 @@ use std::sync::Arc; /// `PhysicalOptimizerRule`s. /// /// [`SessionState::add_physical_optimizer_rule`]: https://docs.rs/datafusion/latest/datafusion/execution/session_state/struct.SessionState.html#method.add_physical_optimizer_rule -pub trait PhysicalOptimizerRule { +pub trait PhysicalOptimizerRule: Debug { /// Rewrite `plan` to an optimized form fn optimize( &self, diff --git a/datafusion/physical-optimizer/src/topk_aggregation.rs b/datafusion/physical-optimizer/src/topk_aggregation.rs index 0cb76d9c2393..804dd165d335 100644 --- a/datafusion/physical-optimizer/src/topk_aggregation.rs +++ b/datafusion/physical-optimizer/src/topk_aggregation.rs @@ -37,6 +37,7 @@ use crate::PhysicalOptimizerRule; use itertools::Itertools; /// An optimizer rule that passes a `limit` hint to aggregations if the whole result is not needed +#[derive(Debug)] pub struct TopKAggregation {} impl TopKAggregation { From f69d73c0f5c130d78f0b3bcf0967185466fd9c50 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 01:12:06 +1000 Subject: [PATCH 02/17] Add reference to meet &JoinType type required --- datafusion/core/src/physical_optimizer/enforce_distribution.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index c971e6150633..72df7b15bd27 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -298,7 +298,7 @@ fn adjust_input_keys_ordering( right.clone(), new_conditions.0, filter.clone(), - join_type, + &join_type, // TODO: although projection is not used in the join here, because projection pushdown is after enforce_distribution. Maybe we need to handle it later. Same as filter. projection.clone(), PartitionMode::Partitioned, From 41d4788b7eadf34470eba2e35e979837fbf8028a Mon Sep 17 00:00:00 2001 From: Anthony Date: Thu, 26 Sep 2024 02:29:13 +1000 Subject: [PATCH 03/17] Revert "Add reference to meet &JoinType type required" as clippy lint informs this is unnecessary This reverts commit f69d73c0f5c130d78f0b3bcf0967185466fd9c50. --- datafusion/core/src/physical_optimizer/enforce_distribution.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 72df7b15bd27..c971e6150633 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -298,7 +298,7 @@ fn adjust_input_keys_ordering( right.clone(), new_conditions.0, filter.clone(), - &join_type, + join_type, // TODO: although projection is not used in the join here, because projection pushdown is after enforce_distribution. Maybe we need to handle it later. Same as filter. projection.clone(), PartitionMode::Partitioned, From a58847eeabfd13e61c65f07c7715c3d9648e969f Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 17:29:10 +1000 Subject: [PATCH 04/17] Require `Debug` for `CatalogProvider`, `CatalogProviderList`, UrlTableFactory --- datafusion/catalog/src/catalog.rs | 5 +++-- datafusion/catalog/src/dynamic_file/catalog.rs | 6 +++++- datafusion/catalog/src/schema.rs | 3 ++- datafusion/catalog/src/session.rs | 1 + datafusion/core/src/catalog_common/information_schema.rs | 1 + datafusion/core/src/catalog_common/listing_schema.rs | 1 + datafusion/core/src/catalog_common/memory.rs | 4 ++++ datafusion/core/src/datasource/dynamic_file.rs | 2 +- 8 files changed, 18 insertions(+), 5 deletions(-) diff --git a/datafusion/catalog/src/catalog.rs b/datafusion/catalog/src/catalog.rs index 9ee94e8f1fc3..048a7f14ed37 100644 --- a/datafusion/catalog/src/catalog.rs +++ b/datafusion/catalog/src/catalog.rs @@ -16,6 +16,7 @@ // under the License. use std::any::Any; +use std::fmt::Debug; use std::sync::Arc; pub use crate::schema::SchemaProvider; @@ -101,7 +102,7 @@ use datafusion_common::Result; /// /// [`TableProvider`]: crate::TableProvider -pub trait CatalogProvider: Sync + Send { +pub trait CatalogProvider: Debug + Sync + Send { /// Returns the catalog provider as [`Any`] /// so that it can be downcast to a specific implementation. fn as_any(&self) -> &dyn Any; @@ -152,7 +153,7 @@ pub trait CatalogProvider: Sync + Send { /// /// Please see the documentation on `CatalogProvider` for details of /// implementing a custom catalog. -pub trait CatalogProviderList: Sync + Send { +pub trait CatalogProviderList: Debug + Sync + Send { /// Returns the catalog list as [`Any`] /// so that it can be downcast to a specific implementation. fn as_any(&self) -> &dyn Any; diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index cd586446f82c..ccccb9762eb4 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -20,9 +20,11 @@ use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; use async_trait::async_trait; use std::any::Any; +use std::fmt::Debug; use std::sync::Arc; /// Wrap another catalog provider list +#[derive(Debug)] pub struct DynamicFileCatalog { /// The inner catalog provider list inner: Arc, @@ -67,6 +69,7 @@ impl CatalogProviderList for DynamicFileCatalog { } /// Wraps another catalog provider +#[derive(Debug)] struct DynamicFileCatalogProvider { /// The inner catalog provider inner: Arc, @@ -114,6 +117,7 @@ impl CatalogProvider for DynamicFileCatalogProvider { /// /// The provider will try to create a table provider from the file path if the table provider /// isn't exist in the inner schema provider. +#[derive(Debug)] pub struct DynamicFileSchemaProvider { /// The inner schema provider inner: Arc, @@ -174,7 +178,7 @@ impl SchemaProvider for DynamicFileSchemaProvider { /// [UrlTableFactory] is a factory that can create a table provider from the given url. #[async_trait] -pub trait UrlTableFactory: Sync + Send { +pub trait UrlTableFactory: Debug + Sync + Send { /// create a new table provider from the provided url async fn try_new( &self, diff --git a/datafusion/catalog/src/schema.rs b/datafusion/catalog/src/schema.rs index 21bca9fa828d..5b37348fd742 100644 --- a/datafusion/catalog/src/schema.rs +++ b/datafusion/catalog/src/schema.rs @@ -21,6 +21,7 @@ use async_trait::async_trait; use datafusion_common::{exec_err, DataFusionError}; use std::any::Any; +use std::fmt::Debug; use std::sync::Arc; use crate::table::TableProvider; @@ -32,7 +33,7 @@ use datafusion_common::Result; /// /// [`CatalogProvider`]: super::CatalogProvider #[async_trait] -pub trait SchemaProvider: Sync + Send { +pub trait SchemaProvider: Debug + Sync + Send { /// Returns the owner of the Schema, default is None. This value is reported /// as part of `information_tables.schemata fn owner_name(&self) -> Option<&str> { diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index 61d9c2d8a71e..db49529ac43f 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -139,6 +139,7 @@ impl From<&dyn Session> for TaskContext { } type SessionRefLock = Arc>>>>; /// The state store that stores the reference of the runtime session state. +#[derive(Debug)] pub struct SessionStore { session: SessionRefLock, } diff --git a/datafusion/core/src/catalog_common/information_schema.rs b/datafusion/core/src/catalog_common/information_schema.rs index df4257504b1d..163ce8738931 100644 --- a/datafusion/core/src/catalog_common/information_schema.rs +++ b/datafusion/core/src/catalog_common/information_schema.rs @@ -57,6 +57,7 @@ pub const INFORMATION_SCHEMA_TABLES: &[&str] = /// demand. This means that if more tables are added to the underlying /// providers, they will appear the next time the `information_schema` /// table is queried. +#[derive(Debug)] pub struct InformationSchemaProvider { config: InformationSchemaConfig, } diff --git a/datafusion/core/src/catalog_common/listing_schema.rs b/datafusion/core/src/catalog_common/listing_schema.rs index 5b91f963ca24..e45c8a8d4aeb 100644 --- a/datafusion/core/src/catalog_common/listing_schema.rs +++ b/datafusion/core/src/catalog_common/listing_schema.rs @@ -48,6 +48,7 @@ use object_store::ObjectStore; /// - `s3://host.example.com:3000/data/tpch/customer/_delta_log/` /// /// [`ObjectStore`]: object_store::ObjectStore +#[derive(Debug)] pub struct ListingSchemaProvider { authority: String, path: object_store::path::Path, diff --git a/datafusion/core/src/catalog_common/memory.rs b/datafusion/core/src/catalog_common/memory.rs index 6d8bddec4547..f25146616891 100644 --- a/datafusion/core/src/catalog_common/memory.rs +++ b/datafusion/core/src/catalog_common/memory.rs @@ -28,6 +28,7 @@ use std::any::Any; use std::sync::Arc; /// Simple in-memory list of catalogs +#[derive(Debug)] pub struct MemoryCatalogProviderList { /// Collection of catalogs containing schemas and ultimately TableProviders pub catalogs: DashMap>, @@ -71,6 +72,7 @@ impl CatalogProviderList for MemoryCatalogProviderList { } /// Simple in-memory implementation of a catalog. +#[derive(Debug)] pub struct MemoryCatalogProvider { schemas: DashMap>, } @@ -136,6 +138,7 @@ impl CatalogProvider for MemoryCatalogProvider { } /// Simple in-memory implementation of a schema. +#[derive(Debug)] pub struct MemorySchemaProvider { tables: DashMap>, } @@ -248,6 +251,7 @@ mod test { #[test] fn default_register_schema_not_supported() { // mimic a new CatalogProvider and ensure it does not support registering schemas + #[derive(Debug)] struct TestProvider {} impl CatalogProvider for TestProvider { fn as_any(&self) -> &dyn Any { diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index a95f3abb939b..3c409af29703 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -30,7 +30,7 @@ use crate::error::Result; use crate::execution::context::SessionState; /// [DynamicListTableFactory] is a factory that can create a [ListingTable] from the given url. -#[derive(Default)] +#[derive(Default, Debug)] pub struct DynamicListTableFactory { /// The session store that contains the current session. session_store: SessionStore, From 2d4a92d77c6e7a0ae79c8401f36f15abd93a42e1 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 17:30:41 +1000 Subject: [PATCH 05/17] Add derive Debug to meet api-change --- datafusion-examples/examples/catalog.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion-examples/examples/catalog.rs b/datafusion-examples/examples/catalog.rs index 8c2b1aad56c6..f40f1dfb5a15 100644 --- a/datafusion-examples/examples/catalog.rs +++ b/datafusion-examples/examples/catalog.rs @@ -135,6 +135,7 @@ struct DirSchemaOpts<'a> { format: Arc, } /// Schema where every file with extension `ext` in a given `dir` is a table. +#[derive(Debug)] struct DirSchema { ext: String, tables: RwLock>>, @@ -218,6 +219,7 @@ impl SchemaProvider for DirSchema { } } /// Catalog holds multiple schemas +#[derive(Debug)] struct DirCatalog { schemas: RwLock>>, } @@ -259,6 +261,7 @@ impl CatalogProvider for DirCatalog { } } /// Catalog lists holds multiple catalog providers. Each context has a single catalog list. +#[derive(Debug)] struct CustomCatalogProviderList { catalogs: RwLock>>, } From 4154da53c2393429d4996112966d72071cc20d74 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 17:51:05 +1000 Subject: [PATCH 06/17] Add derive Debug in datafusion-cli to support api-change of CatalogProviderList --- datafusion-cli/src/catalog.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index 9b9afc1c2420..ceb72dbc546b 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -34,6 +34,7 @@ use dirs::home_dir; use parking_lot::RwLock; /// Wraps another catalog, automatically register require object stores for the file locations +#[derive(Debug)] pub struct DynamicObjectStoreCatalog { inner: Arc, state: Weak>, @@ -74,6 +75,7 @@ impl CatalogProviderList for DynamicObjectStoreCatalog { } /// Wraps another catalog provider +#[derive(Debug)] struct DynamicObjectStoreCatalogProvider { inner: Arc, state: Weak>, @@ -115,6 +117,7 @@ impl CatalogProvider for DynamicObjectStoreCatalogProvider { /// Wraps another schema provider. [DynamicObjectStoreSchemaProvider] is responsible for registering the required /// object stores for the file locations. +#[derive(Debug)] struct DynamicObjectStoreSchemaProvider { inner: Arc, state: Weak>, From b507fd93ec3ec0a3b7fa2249a6e311b83a9be9ff Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 17:56:53 +1000 Subject: [PATCH 07/17] Require Debug for ExprPlanner --- datafusion/core/tests/user_defined/expr_planner.rs | 1 + datafusion/expr/src/planner.rs | 3 ++- datafusion/functions-nested/src/planner.rs | 2 ++ datafusion/functions/src/core/planner.rs | 2 +- datafusion/functions/src/planner.rs | 2 +- 5 files changed, 7 insertions(+), 3 deletions(-) diff --git a/datafusion/core/tests/user_defined/expr_planner.rs b/datafusion/core/tests/user_defined/expr_planner.rs index 1b23bf9ab2ef..ad9c1280d6b1 100644 --- a/datafusion/core/tests/user_defined/expr_planner.rs +++ b/datafusion/core/tests/user_defined/expr_planner.rs @@ -29,6 +29,7 @@ use datafusion_expr::expr::Alias; use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawBinaryExpr}; use datafusion_expr::BinaryExpr; +#[derive(Debug)] struct MyCustomPlanner; impl ExprPlanner for MyCustomPlanner { diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index 24f589c41582..7dd7360e478f 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -17,6 +17,7 @@ //! [`ContextProvider`] and [`ExprPlanner`] APIs to customize SQL query planning +use std::fmt::Debug; use std::sync::Arc; use arrow::datatypes::{DataType, Field, SchemaRef}; @@ -88,7 +89,7 @@ pub trait ContextProvider { } /// This trait allows users to customize the behavior of the SQL planner -pub trait ExprPlanner: Send + Sync { +pub trait ExprPlanner: Debug + Send + Sync { /// Plan the binary operation between two expressions, returns original /// BinaryExpr if not possible fn plan_binary_op( diff --git a/datafusion/functions-nested/src/planner.rs b/datafusion/functions-nested/src/planner.rs index 4cd8faa3ca98..9ae2fa781d87 100644 --- a/datafusion/functions-nested/src/planner.rs +++ b/datafusion/functions-nested/src/planner.rs @@ -34,6 +34,7 @@ use crate::{ make_array::make_array, }; +#[derive(Debug)] pub struct NestedFunctionPlanner; impl ExprPlanner for NestedFunctionPlanner { @@ -130,6 +131,7 @@ impl ExprPlanner for NestedFunctionPlanner { } } +#[derive(Debug)] pub struct FieldAccessPlanner; impl ExprPlanner for FieldAccessPlanner { diff --git a/datafusion/functions/src/core/planner.rs b/datafusion/functions/src/core/planner.rs index 889f191d592f..33b4d3386eba 100644 --- a/datafusion/functions/src/core/planner.rs +++ b/datafusion/functions/src/core/planner.rs @@ -24,7 +24,7 @@ use datafusion_expr::{lit, Expr}; use super::named_struct; -#[derive(Default)] +#[derive(Default, Debug)] pub struct CoreFunctionPlanner {} impl ExprPlanner for CoreFunctionPlanner { diff --git a/datafusion/functions/src/planner.rs b/datafusion/functions/src/planner.rs index ad42c5edd6e6..93edec7ece30 100644 --- a/datafusion/functions/src/planner.rs +++ b/datafusion/functions/src/planner.rs @@ -24,7 +24,7 @@ use datafusion_expr::{ Expr, }; -#[derive(Default)] +#[derive(Default, Debug)] pub struct UserDefinedFunctionPlanner; impl ExprPlanner for UserDefinedFunctionPlanner { From 4abab5f3b93fa3b333479c662e03187aa583338f Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 18:04:23 +1000 Subject: [PATCH 08/17] Require Debug for QueryPlanner --- datafusion/core/src/execution/context/mod.rs | 3 ++- datafusion/core/src/execution/session_state.rs | 1 + datafusion/core/tests/user_defined/user_defined_plan.rs | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 53eb7c431b47..420223f7d776 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1550,7 +1550,7 @@ impl From for SessionStateBuilder { /// A planner used to add extensions to DataFusion logical and physical plans. #[async_trait] -pub trait QueryPlanner { +pub trait QueryPlanner: Debug { /// Given a `LogicalPlan`, create an [`ExecutionPlan`] suitable for execution async fn create_physical_plan( &self, @@ -2132,6 +2132,7 @@ mod tests { } } + #[derive(Debug)] struct MyQueryPlanner {} #[async_trait] diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 3e6577a48608..5e8662970865 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1795,6 +1795,7 @@ impl From<&SessionState> for TaskContext { } /// The query planner used if no user defined planner is provided +#[derive(Debug)] struct DefaultQueryPlanner {} #[async_trait] diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index caf639434a99..e51adbc4ddc1 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -312,6 +312,7 @@ fn make_topk_context() -> SessionContext { // ------ The implementation of the TopK code follows ----- +#[derive(Debug)] struct TopKQueryPlanner {} #[async_trait] From 5459c46f043ded7dac22b74f36c00791ae726da7 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 18:11:37 +1000 Subject: [PATCH 09/17] Require Debug for TableFunctionImpl --- datafusion-cli/src/functions.rs | 1 + datafusion-examples/examples/simple_udtf.rs | 1 + datafusion/core/src/datasource/function.rs | 4 +++- .../core/tests/user_defined/user_defined_table_functions.rs | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index dd56b0196dd5..c622463de033 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -315,6 +315,7 @@ fn fixed_len_byte_array_to_string(val: Option<&FixedLenByteArray>) -> Option Result>; } /// A table that uses a function to generate data +#[derive(Debug)] pub struct TableFunction { /// Name of the table function name: String, diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs b/datafusion/core/tests/user_defined/user_defined_table_functions.rs index fe57752db52e..0cc156866d4d 100644 --- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs @@ -192,6 +192,7 @@ impl SimpleCsvTable { } } +#[derive(Debug)] struct SimpleCsvTableFunc {} impl TableFunctionImpl for SimpleCsvTableFunc { From a14ec640520ed75d4ab03a7ae15f81fdd6c716a3 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 18:17:04 +1000 Subject: [PATCH 10/17] Require Debug for SerializerRegistry --- datafusion/core/src/execution/context/mod.rs | 1 + datafusion/expr/src/registry.rs | 7 ++++--- datafusion/substrait/tests/cases/roundtrip_logical_plan.rs | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 420223f7d776..7ed6bd3b7284 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1586,6 +1586,7 @@ pub enum RegisterFunction { /// Default implementation of [SerializerRegistry] that throws unimplemented error /// for all requests. +#[derive(Debug)] pub struct EmptySerializerRegistry; impl SerializerRegistry for EmptySerializerRegistry { diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 988dc0f5aeda..6d3457f70d4c 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -21,8 +21,9 @@ use crate::expr_rewriter::FunctionRewrite; use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; use datafusion_common::{not_impl_err, plan_datafusion_err, Result}; -use std::collections::HashMap; -use std::{collections::HashSet, sync::Arc}; +use std::collections::{HashMap, HashSet}; +use std::fmt::Debug; +use std::sync::Arc; /// A registry knows how to build logical expressions out of user-defined function' names pub trait FunctionRegistry { @@ -123,7 +124,7 @@ pub trait FunctionRegistry { } /// Serializer and deserializer registry for extensions like [UserDefinedLogicalNode]. -pub trait SerializerRegistry: Send + Sync { +pub trait SerializerRegistry: Debug + Send + Sync { /// Serialize this node to a byte array. This serialization should not include /// input plans. fn serialize_logical_plan( diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index ea85092f7a6c..f7686bec5435 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -45,6 +45,7 @@ use substrait::proto::extensions::SimpleExtensionDeclaration; use substrait::proto::rel::RelType; use substrait::proto::{plan_rel, Plan, Rel}; +#[derive(Debug)] struct MockSerializerRegistry; impl SerializerRegistry for MockSerializerRegistry { From c259a94697c6465e3eec99b7c64ff0d3a8924b68 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 18:19:29 +1000 Subject: [PATCH 11/17] Require Debug for FunctionFactory --- datafusion/core/src/execution/context/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 7ed6bd3b7284..0e0049b1eb7b 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1563,7 +1563,7 @@ pub trait QueryPlanner: Debug { /// and interact with [SessionState] to registers new udf, udaf or udwf. #[async_trait] -pub trait FunctionFactory: Sync + Send { +pub trait FunctionFactory: Debug + Sync + Send { /// Handles creation of user defined function specified in [CreateFunction] statement async fn create( &self, From 58587e004f137e8f55cd1a2426422456e2fbcf6c Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 18:21:34 +1000 Subject: [PATCH 12/17] Derive `Debug` on `SessionStateBuilder` --- datafusion/core/src/execution/session_state.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 5e8662970865..42158beb6222 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -909,6 +909,7 @@ impl SessionState { /// be used for all values unless explicitly provided. /// /// See example on [`SessionState`] +#[derive(Debug)] pub struct SessionStateBuilder { session_id: Option, analyzer: Option, From c9da519ac61b0a000a41ef27b355a0a5d3cbfc52 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 21:40:47 +1000 Subject: [PATCH 13/17] Implement `Debug` for `SessionStateBuilder` to reorder output fields, keep consistent Debug field order with `SessionState` --- .../core/src/execution/session_state.rs | 52 +++++++++++++++---- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 42158beb6222..cffb63f52047 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -177,23 +177,23 @@ impl Debug for SessionState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("SessionState") .field("session_id", &self.session_id) - .field("analyzer", &"...") + .field("config", &self.config) + .field("runtime_env", &self.runtime_env) + .field("catalog_list", &"...") + .field("serializer_registry", &"...") + .field("execution_props", &self.execution_props) + .field("table_options", &self.table_options) + .field("table_factories", &"...") + .field("function_factory", &"...") .field("expr_planners", &"...") + .field("query_planner", &"...") + .field("analyzer", &"...") .field("optimizer", &"...") .field("physical_optimizers", &"...") - .field("query_planner", &"...") - .field("catalog_list", &"...") .field("table_functions", &"...") .field("scalar_functions", &self.scalar_functions) .field("aggregate_functions", &self.aggregate_functions) .field("window_functions", &self.window_functions) - .field("serializer_registry", &"...") - .field("config", &self.config) - .field("table_options", &self.table_options) - .field("execution_props", &self.execution_props) - .field("table_factories", &"...") - .field("runtime_env", &self.runtime_env) - .field("function_factory", &"...") .finish_non_exhaustive() } } @@ -909,7 +909,6 @@ impl SessionState { /// be used for all values unless explicitly provided. /// /// See example on [`SessionState`] -#[derive(Debug)] pub struct SessionStateBuilder { session_id: Option, analyzer: Option, @@ -1520,6 +1519,37 @@ impl SessionStateBuilder { } } +impl Debug for SessionStateBuilder { + /// Prefer having short fields at the top and long vector fields near the end + /// Group fields by + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SessionStateBuilder") + .field("session_id", &self.session_id) + .field("config", &self.config) + .field("runtime_env", &self.runtime_env) + .field("catalog_list", &self.catalog_list) + .field("serializer_registry", &self.serializer_registry) + .field("file_formats", &self.file_formats) + .field("execution_props", &self.execution_props) + .field("table_options", &self.table_options) + .field("table_factories", &self.table_factories) + .field("function_factory", &self.function_factory) + .field("expr_planners", &self.expr_planners) + .field("query_planners", &self.query_planner) + .field("analyzer_rules", &self.analyzer_rules) + .field("analyzer", &self.analyzer) + .field("optimizer_rules", &self.optimizer_rules) + .field("optimizer", &self.optimizer) + .field("physical_optimizer_rules", &self.physical_optimizer_rules) + .field("physical_optimizers", &self.physical_optimizers) + .field("table_functions", &self.table_functions) + .field("scalar_functions", &self.scalar_functions) + .field("aggregate_functions", &self.aggregate_functions) + .field("window_functions", &self.window_functions) + .finish() + } +} + impl Default for SessionStateBuilder { fn default() -> Self { Self::new() From cdadebea43e4102bf1e28a667aee3082c70d31ef Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Thu, 26 Sep 2024 21:56:48 +1000 Subject: [PATCH 14/17] Settle TODO for displaying `Debug` of `InformationSchemaConfig` after `CatalogProviderList` requires `Debug` --- .../core/src/catalog_common/information_schema.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/datafusion/core/src/catalog_common/information_schema.rs b/datafusion/core/src/catalog_common/information_schema.rs index 163ce8738931..180994b1cbe8 100644 --- a/datafusion/core/src/catalog_common/information_schema.rs +++ b/datafusion/core/src/catalog_common/information_schema.rs @@ -26,7 +26,7 @@ use arrow::{ }; use async_trait::async_trait; use datafusion_common::DataFusionError; -use std::fmt::{Debug, Formatter}; +use std::fmt::Debug; use std::{any::Any, sync::Arc}; use crate::catalog::{CatalogProviderList, SchemaProvider, TableProvider}; @@ -71,20 +71,11 @@ impl InformationSchemaProvider { } } -#[derive(Clone)] +#[derive(Clone, Debug)] struct InformationSchemaConfig { catalog_list: Arc, } -impl Debug for InformationSchemaConfig { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InformationSchemaConfig") - // TODO it would be great to print the catalog list here - // but that would require CatalogProviderList to implement Debug - .finish_non_exhaustive() - } -} - impl InformationSchemaConfig { /// Construct the `information_schema.tables` virtual table async fn make_tables( From 4fd973817293af35238d0f6d3ec4908c519ab551 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Sun, 29 Sep 2024 00:47:28 +1000 Subject: [PATCH 15/17] Implement `Debug` for `SessionState` to be in sync with `SessionStateBuilder` now that the fields all require `Debug` --- .../core/src/execution/session_state.rs | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index cffb63f52047..a2fc5cfb7bea 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -174,27 +174,30 @@ pub struct SessionState { } impl Debug for SessionState { + /// Prefer having short fields at the top and long vector fields near the end + /// Group fields by fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SessionState") + f.debug_struct("SessionStateBuilder") .field("session_id", &self.session_id) .field("config", &self.config) .field("runtime_env", &self.runtime_env) - .field("catalog_list", &"...") - .field("serializer_registry", &"...") + .field("catalog_list", &self.catalog_list) + .field("serializer_registry", &self.serializer_registry) + .field("file_formats", &self.file_formats) .field("execution_props", &self.execution_props) .field("table_options", &self.table_options) - .field("table_factories", &"...") - .field("function_factory", &"...") - .field("expr_planners", &"...") - .field("query_planner", &"...") - .field("analyzer", &"...") - .field("optimizer", &"...") - .field("physical_optimizers", &"...") - .field("table_functions", &"...") + .field("table_factories", &self.table_factories) + .field("function_factory", &self.function_factory) + .field("expr_planners", &self.expr_planners) + .field("query_planners", &self.query_planner) + .field("analyzer", &self.analyzer) + .field("optimizer", &self.optimizer) + .field("physical_optimizers", &self.physical_optimizers) + .field("table_functions", &self.table_functions) .field("scalar_functions", &self.scalar_functions) .field("aggregate_functions", &self.aggregate_functions) .field("window_functions", &self.window_functions) - .finish_non_exhaustive() + .finish() } } From e16d08c885a48feea04089f6ffe6b7595fb77d77 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Sun, 29 Sep 2024 00:47:28 +1000 Subject: [PATCH 16/17] Implement `Debug` for `SessionState` to be in sync with `SessionStateBuilder` now that the fields have `Debug` available --- .../core/src/execution/session_state.rs | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index cffb63f52047..a2fc5cfb7bea 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -174,27 +174,30 @@ pub struct SessionState { } impl Debug for SessionState { + /// Prefer having short fields at the top and long vector fields near the end + /// Group fields by fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SessionState") + f.debug_struct("SessionStateBuilder") .field("session_id", &self.session_id) .field("config", &self.config) .field("runtime_env", &self.runtime_env) - .field("catalog_list", &"...") - .field("serializer_registry", &"...") + .field("catalog_list", &self.catalog_list) + .field("serializer_registry", &self.serializer_registry) + .field("file_formats", &self.file_formats) .field("execution_props", &self.execution_props) .field("table_options", &self.table_options) - .field("table_factories", &"...") - .field("function_factory", &"...") - .field("expr_planners", &"...") - .field("query_planner", &"...") - .field("analyzer", &"...") - .field("optimizer", &"...") - .field("physical_optimizers", &"...") - .field("table_functions", &"...") + .field("table_factories", &self.table_factories) + .field("function_factory", &self.function_factory) + .field("expr_planners", &self.expr_planners) + .field("query_planners", &self.query_planner) + .field("analyzer", &self.analyzer) + .field("optimizer", &self.optimizer) + .field("physical_optimizers", &self.physical_optimizers) + .field("table_functions", &self.table_functions) .field("scalar_functions", &self.scalar_functions) .field("aggregate_functions", &self.aggregate_functions) .field("window_functions", &self.window_functions) - .finish_non_exhaustive() + .finish() } } From 251a52f361a98e4bdab57cbd20b61f12d1215e83 Mon Sep 17 00:00:00 2001 From: "Anthony Zh. Oon" Date: Sun, 29 Sep 2024 01:03:58 +1000 Subject: [PATCH 17/17] Correct name in `Debug` impl for `SessionState` --- datafusion/core/src/execution/session_state.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index a2fc5cfb7bea..4953eecd66e3 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -177,7 +177,7 @@ impl Debug for SessionState { /// Prefer having short fields at the top and long vector fields near the end /// Group fields by fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SessionStateBuilder") + f.debug_struct("SessionState") .field("session_id", &self.session_id) .field("config", &self.config) .field("runtime_env", &self.runtime_env)