Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: infer equal columns from the query #168

Merged
merged 18 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions optd-datafusion-repr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ assert_approx_eq = "1.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_with = {version = "3.7.0", features = ["json"]}
bincode = "1.3.3"
union-find = { git = "https://github.com/Gun9niR/union-find-rs.git", rev = "3ffda352c2c3d9a74daed1546c0e71c97b732bf1" }
Gun9niR marked this conversation as resolved.
Show resolved Hide resolved
43 changes: 41 additions & 2 deletions optd-datafusion-repr/src/cost/base_cost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ mod join;
mod limit;
pub(crate) mod stats;

use crate::{plan_nodes::OptRelNodeTyp, properties::column_ref::ColumnRef};
use crate::{
plan_nodes::OptRelNodeTyp,
properties::column_ref::{BaseTableColumnRef, ColumnRef},
};
use itertools::Itertools;
use optd_core::{
cascades::{CascadesOptimizer, RelNodeContext},
Expand Down Expand Up @@ -207,7 +210,7 @@ impl<
&self,
col_ref: &ColumnRef,
) -> Option<&ColumnCombValueStats<M, D>> {
if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref {
if let ColumnRef::BaseTableColumnRef(BaseTableColumnRef { table, col_idx }) = col_ref {
self.get_column_comb_stats(table, &[*col_idx])
} else {
None
Expand Down Expand Up @@ -314,6 +317,7 @@ mod tests {

pub const TABLE1_NAME: &str = "table1";
pub const TABLE2_NAME: &str = "table2";
pub const TABLE3_NAME: &str = "table3";

// one column is sufficient for all filter selectivity tests
pub fn create_one_column_cost_model(per_column_stats: TestPerColumnStats) -> TestOptCostModel {
Expand All @@ -340,6 +344,41 @@ mod tests {
)
}

/// Create a cost model with three tables, each with stats for a single column (index 0)
Gun9niR marked this conversation as resolved.
Show resolved Hide resolved
/// Builds a test cost model over three tables: `TABLE1_NAME`, `TABLE2_NAME` and `TABLE3_NAME`.
///
/// Each table is registered through `TableStats::new(100, ..)` with exactly one stats entry,
/// keyed by the column combination `[0]` and taken from the matching argument
/// (`tbl1_per_column_stats` for table 1, and so on).
pub fn create_three_table_cost_model(
    tbl1_per_column_stats: TestPerColumnStats,
    tbl2_per_column_stats: TestPerColumnStats,
    tbl3_per_column_stats: TestPerColumnStats,
) -> TestOptCostModel {
    // Pair every table name with the stats of its only column, then build all
    // entries the same way instead of spelling each one out by hand.
    let tables = vec![
        (TABLE1_NAME, tbl1_per_column_stats),
        (TABLE2_NAME, tbl2_per_column_stats),
        (TABLE3_NAME, tbl3_per_column_stats),
    ];

    let per_table_stats = tables
        .into_iter()
        .map(|(table_name, per_column_stats)| {
            // Only the column combination `[0]` carries stats in this fixture.
            let column_comb_stats = std::iter::once((vec![0], per_column_stats)).collect();
            (
                String::from(table_name),
                TableStats::new(100, column_comb_stats),
            )
        })
        .collect();

    OptCostModel::new(per_table_stats)
}

/// We need custom row counts because some join algorithms rely on the row count
pub fn create_two_table_cost_model_custom_row_cnts(
tbl1_per_column_stats: TestPerColumnStats,
Expand Down
15 changes: 8 additions & 7 deletions optd-datafusion-repr/src/cost/base_cost/agg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ use optd_core::{
use serde::{de::DeserializeOwned, Serialize};

use crate::{
cost::{
base_cost::stats::{Distribution, MostCommonValues},
base_cost::DEFAULT_NUM_DISTINCT,
cost::base_cost::{
stats::{Distribution, MostCommonValues},
DEFAULT_NUM_DISTINCT,
},
plan_nodes::{ExprList, OptRelNode, OptRelNodeTyp},
properties::column_ref::{ColumnRef, ColumnRefPropertyBuilder},
properties::column_ref::{BaseTableColumnRef, ColumnRef, ColumnRefPropertyBuilder},
};

use super::{OptCostModel, DEFAULT_UNK_SEL};
Expand Down Expand Up @@ -61,13 +61,14 @@ impl<
} else {
// Multiply the n-distinct of all the group by columns.
// TODO: improve with multi-dimensional n-distinct
let base_table_col_refs = optimizer
let group_col_refs = optimizer
.get_property_by_group::<ColumnRefPropertyBuilder>(context.group_id, 1);
base_table_col_refs
group_col_refs
.column_refs()
.iter()
.take(group_by.len())
.map(|col_ref| match col_ref {
ColumnRef::BaseTableColumnRef { table, col_idx } => {
ColumnRef::BaseTableColumnRef(BaseTableColumnRef { table, col_idx }) => {
let table_stats = self.per_table_stats_map.get(table);
let column_stats = table_stats.and_then(|table_stats| {
table_stats.column_comb_stats.get(&vec![*col_idx])
Expand Down
Loading
Loading