Merge branch 'main' into zhidong/adaptive-perftest

cmu-db · Apr 29, 2024 · 3ff7285 · 3ff7285
2 parents 7bf69b5 + 5958b3d
commit 3ff7285
Show file tree

Hide file tree

Showing 20 changed files with 1,511 additions and 449 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/dev_scripts/which_queries_work.sh b/dev_scripts/which_queries_work.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 benchmark_name=$1
-USAGE="Usage: $0 [job|tpch]"
+USAGE="Usage: $0 [job|joblight|tpch]"
 
 if [ $# -ne 1 ]; then
     echo >&2 $USAGE
@@ -9,8 +9,13 @@ fi
 
 if [[ "$benchmark_name" == "job" ]]; then
     all_ids="1a,1b,1c,1d,2a,2b,2c,2d,3a,3b,3c,4a,4b,4c,5a,5b,5c,6a,6b,6c,6d,6e,6f,7a,7b,7c,8a,8b,8c,8d,9a,9b,9c,9d,10a,10b,10c,11a,11b,11c,11d,12a,12b,12c,13a,13b,13c,13d,14a,14b,14c,15a,15b,15c,15d,16a,16b,16c,16d,17a,17b,17c,17d,17e,17f,18a,18b,18c,19a,19b,19c,19d,20a,20b,20c,21a,21b,21c,22a,22b,22c,22d,23a,23b,23c,24a,24b,25a,25b,25c,26a,26b,26c,27a,27b,27c,28a,28b,28c,29a,29b,29c,30a,30b,30c,31a,31b,31c,32a,32b,33a,33b,33c"
+    vec_var_name="WORKING_JOB_QUERY_IDS"  # name of the Rust const printed under "Useful Outputs"
+elif [[ "$benchmark_name" == "joblight" ]]; then
+    all_ids="1a,1b,1c,1d,2a,3a,3b,3c,4a,4b,4c,5a,5b,5c,6a,6b,6c,6d,6e,7a,7b,7c,8a,8b,8c,9a,9b,10a,10b,10c,11a,11b,11c,12a,12b,12c,13a,14a,14b,14c,15a,15b,15c,16a,17a,17b,17c,18a,18b,18c,19a,19b,20a,20b,20c,21a,21b,22a,22b,22c,23a,23b,24a,24b,25a,26a,26b,27a,27b,28a"
+    vec_var_name="WORKING_JOBLIGHT_QUERY_IDS"
 elif [[ "$benchmark_name" == "tpch" ]]; then
     all_ids="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22"
+    vec_var_name="WORKING_QUERY_IDS"
 else
     echo >&2 $USAGE
     exit 1
@@ -19,7 +24,7 @@ fi
 successful_ids=()
 IFS=','
 for id in $all_ids; do
-    cargo run --bin optd-perftest cardtest --benchmark-name $benchmark_name --query-ids $id &>/dev/null
+    cargo run --bin optd-perftest cardtest $benchmark_name --query-ids $id &>/dev/null
 
     if [ $? -eq 0 ]; then
         echo >&2 $id succeeded
@@ -32,7 +37,7 @@ done
 echo >&2
 echo " Useful Outputs"
 echo "================"
-working_query_ids_vec="pub const WORKING_QUERY_IDS: &[&str] = &[\"${successful_ids[0]}\""
+working_query_ids_vec="pub const ${vec_var_name}: &[&str] = &[\"${successful_ids[0]}\""
 IFS=" "
 for id in "${successful_ids[@]:1}"; do
     working_query_ids_vec+=", \"$id\""

diff --git a/optd-datafusion-repr/Cargo.toml b/optd-datafusion-repr/Cargo.toml
@@ -25,3 +25,5 @@ assert_approx_eq = "1.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_with = {version = "3.7.0", features = ["json"]}
 bincode = "1.3.3"
+union-find = { git = "https://github.com/Gun9niR/union-find-rs.git", rev = "794821514f7daefcbb8d5f38ef04e62fc18b5665" }
+test-case = "3.3"
diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -4,7 +4,10 @@ mod join;
 mod limit;
 pub(crate) mod stats;
 
-use crate::{plan_nodes::OptRelNodeTyp, properties::column_ref::ColumnRef};
+use crate::{
+    plan_nodes::OptRelNodeTyp,
+    properties::column_ref::{BaseTableColumnRef, ColumnRef},
+};
 use itertools::Itertools;
 use optd_core::{
     cascades::{CascadesOptimizer, RelNodeContext},
@@ -207,7 +210,7 @@ impl<
         &self,
         col_ref: &ColumnRef,
     ) -> Option<&ColumnCombValueStats<M, D>> {
-        if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref {
+        if let ColumnRef::BaseTableColumnRef(BaseTableColumnRef { table, col_idx }) = col_ref {
             self.get_column_comb_stats(table, &[*col_idx])
         } else {
             None
@@ -314,6 +317,8 @@ mod tests {
 
     pub const TABLE1_NAME: &str = "table1";
     pub const TABLE2_NAME: &str = "table2";
+    pub const TABLE3_NAME: &str = "table3";
+    pub const TABLE4_NAME: &str = "table4";
 
     // one column is sufficient for all filter selectivity tests
     pub fn create_one_column_cost_model(per_column_stats: TestPerColumnStats) -> TestOptCostModel {
@@ -327,7 +332,7 @@ mod tests {
         )
     }
 
-    /// Two columns is sufficient for all join selectivity tests
+    /// Create a cost model with two columns, one for each table. Each column has 100 values.
     pub fn create_two_table_cost_model(
         tbl1_per_column_stats: TestPerColumnStats,
         tbl2_per_column_stats: TestPerColumnStats,
@@ -340,6 +345,84 @@ mod tests {
         )
     }
 
+    /// Create a cost model with three tables (TABLE1_NAME..TABLE3_NAME), each holding stats for a single column (index 0). Every table is built with `TableStats::new(100, ..)`, presumably a row count of 100.
+    pub fn create_three_table_cost_model(
+        tbl1_per_column_stats: TestPerColumnStats,
+        tbl2_per_column_stats: TestPerColumnStats,
+        tbl3_per_column_stats: TestPerColumnStats,
+    ) -> TestOptCostModel {
+        OptCostModel::new(
+            vec![
+                (
+                    String::from(TABLE1_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+                (
+                    String::from(TABLE2_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+                (
+                    String::from(TABLE3_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+            ]
+            .into_iter()
+            .collect(),
+        )
+    }
+
+    /// Create a cost model with four tables (TABLE1_NAME..TABLE4_NAME), each holding stats for a single column (index 0). Every table is built with `TableStats::new(100, ..)`, presumably a row count of 100.
+    pub fn create_four_table_cost_model(
+        tbl1_per_column_stats: TestPerColumnStats,
+        tbl2_per_column_stats: TestPerColumnStats,
+        tbl3_per_column_stats: TestPerColumnStats,
+        tbl4_per_column_stats: TestPerColumnStats,
+    ) -> TestOptCostModel {
+        OptCostModel::new(
+            vec![
+                (
+                    String::from(TABLE1_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+                (
+                    String::from(TABLE2_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+                (
+                    String::from(TABLE3_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+                (
+                    String::from(TABLE4_NAME),
+                    TableStats::new(
+                        100,
+                        vec![(vec![0], tbl4_per_column_stats)].into_iter().collect(),
+                    ),
+                ),
+            ]
+            .into_iter()
+            .collect(),
+        )
+    }
+
     /// We need custom row counts because some join algorithms rely on the row cnt
     pub fn create_two_table_cost_model_custom_row_cnts(
         tbl1_per_column_stats: TestPerColumnStats,

diff --git a/optd-datafusion-repr/src/cost/base_cost/agg.rs b/optd-datafusion-repr/src/cost/base_cost/agg.rs
@@ -8,12 +8,12 @@ use optd_core::{
 use serde::{de::DeserializeOwned, Serialize};
 
 use crate::{
-    cost::{
-        base_cost::stats::{Distribution, MostCommonValues},
-        base_cost::DEFAULT_NUM_DISTINCT,
+    cost::base_cost::{
+        stats::{Distribution, MostCommonValues},
+        DEFAULT_NUM_DISTINCT,
     },
     plan_nodes::{ExprList, OptRelNode, OptRelNodeTyp},
-    properties::column_ref::{ColumnRef, ColumnRefPropertyBuilder},
+    properties::column_ref::{BaseTableColumnRef, ColumnRef, ColumnRefPropertyBuilder},
 };
 
 use super::{OptCostModel, DEFAULT_UNK_SEL};
@@ -61,13 +61,14 @@ impl<
             } else {
                 // Multiply the n-distinct of all the group by columns.
                 // TODO: improve with multi-dimensional n-distinct
-                let base_table_col_refs = optimizer
+                let group_col_refs = optimizer
                     .get_property_by_group::<ColumnRefPropertyBuilder>(context.group_id, 1);
-                base_table_col_refs
+                group_col_refs
+                    .column_refs()
                     .iter()
                     .take(group_by.len())
                     .map(|col_ref| match col_ref {
-                        ColumnRef::BaseTableColumnRef { table, col_idx } => {
+                        ColumnRef::BaseTableColumnRef(BaseTableColumnRef { table, col_idx }) => {
                             let table_stats = self.per_table_stats_map.get(table);
                             let column_stats = table_stats.and_then(|table_stats| {
                                 table_stats.column_comb_stats.get(&vec![*col_idx])