Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adaptive cost model functionality working #174

Merged
merged 31 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3c7aba5
added actual set of working job light queries
wangpatrick57 Apr 27, 2024
c2c99d8
fmt and clip
wangpatrick57 Apr 27, 2024
ec81ca6
changed precision of mg and hll, getting us from 35 -> 40 queries ahead
wangpatrick57 Apr 27, 2024
af00dd6
test_inner_redundant_predicate -> test_add_edge_to_multi_equal_graph_…
wangpatrick57 Apr 27, 2024
28a3d26
wrote test_three_table_join_for_join1_on_cond test. not passing yet
wangpatrick57 Apr 27, 2024
9104a81
turned get_join_selectivity_from_most_selective_predicates into get_j…
wangpatrick57 Apr 27, 2024
91791f7
fixed bug of adding new predicate that touches one col of existing pred
wangpatrick57 Apr 27, 2024
d9a8f29
generalized three table join test to allow for arbitrary initial join…
wangpatrick57 Apr 27, 2024
42a9162
added test_join_which_connects_two_components_together
wangpatrick57 Apr 27, 2024
0474fcd
fmt and clip
wangpatrick57 Apr 28, 2024
4fc54c1
changed comment and name of what is now get_join_selectivity_adjustme…
wangpatrick57 Apr 28, 2024
e5b2adf
inclusion principle comment
wangpatrick57 Apr 28, 2024
f9dda6b
poc working group card caching
wangpatrick57 Apr 29, 2024
90105b2
skeleton for load data or create table
wangpatrick57 Apr 29, 2024
9974637
wrote create tables for job
wangpatrick57 Apr 29, 2024
2258af5
load job data no stats
Gun9niR Apr 29, 2024
2d0b848
added execute query
wangpatrick57 Apr 29, 2024
f96224b
Merge remote-tracking branch 'refs/remotes/origin/phw0/df-execute' in…
wangpatrick57 Apr 29, 2024
ef458de
moved execute query
wangpatrick57 Apr 29, 2024
ececf10
Merge branch 'main' into phw2/df-execute
wangpatrick57 Apr 29, 2024
ab20bc9
integrated adaptive option with code
wangpatrick57 Apr 29, 2024
0eed5f2
added adaptive to which_queries_work.sh
wangpatrick57 Apr 29, 2024
349b6b5
removed debug from log_explain
wangpatrick57 Apr 29, 2024
7750438
specify schema when creating external table
Gun9niR Apr 29, 2024
11e00fb
Merge branch 'phw2/df-execute' of https://github.com/cmu-db/optd into…
Gun9niR Apr 29, 2024
2fbfe9b
use register_csv
Gun9niR Apr 29, 2024
4ce15c9
misc
Gun9niR Apr 29, 2024
3b82853
now creating table in temporary schema
wangpatrick57 Apr 29, 2024
ca9ddc1
fixed job
wangpatrick57 Apr 30, 2024
37197af
moved execute to after estcard
wangpatrick57 Apr 30, 2024
b7dd7e6
fmt and clippy
wangpatrick57 Apr 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions dev_scripts/which_queries_work.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ fi

if [[ "$benchmark_name" == "job" ]]; then
all_ids="1a,1b,1c,1d,2a,2b,2c,2d,3a,3b,3c,4a,4b,4c,5a,5b,5c,6a,6b,6c,6d,6e,6f,7a,7b,7c,8a,8b,8c,8d,9a,9b,9c,9d,10a,10b,10c,11a,11b,11c,11d,12a,12b,12c,13a,13b,13c,13d,14a,14b,14c,15a,15b,15c,15d,16a,16b,16c,16d,17a,17b,17c,17d,17e,17f,18a,18b,18c,19a,19b,19c,19d,20a,20b,20c,21a,21b,21c,22a,22b,22c,22d,23a,23b,23c,24a,24b,25a,25b,25c,26a,26b,26c,27a,27b,27c,28a,28b,28c,29a,29b,29c,30a,30b,30c,31a,31b,31c,32a,32b,33a,33b,33c"
vec_var_name="WORKING_QUERY_IDS"
vec_var_name="WORKING_JOB_QUERY_IDS"
elif [[ "$benchmark_name" == "joblight" ]]; then
all_ids="1a,1b,1c,1d,2a,3a,3b,3c,4a,4b,4c,5a,5b,5c,6a,6b,6c,6d,6e,7a,7b,7c,8a,8b,8c,9a,9b,10a,10b,10c,11a,11b,11c,12a,12b,12c,13a,14a,14b,14c,15a,15b,15c,16a,17a,17b,17c,18a,18b,18c,19a,19b,20a,20b,20c,21a,21b,22a,22b,22c,23a,23b,24a,24b,25a,26a,26b,27a,27b,28a"
vec_var_name="WORKING_JOBLIGHT_QUERY_IDS"
elif [[ "$benchmark_name" == "tpch" ]]; then
all_ids="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22"
vec_var_name="WORKING_JOB_QUERY_IDS"
vec_var_name="WORKING_QUERY_IDS"
else
echo >&2 $USAGE
exit 1
Expand All @@ -24,7 +24,8 @@ fi
successful_ids=()
IFS=','
for id in $all_ids; do
cargo run --bin optd-perftest cardtest $benchmark_name --query-ids $id &>/dev/null
# make sure to execute with --adaptive so that we actually run the query in datafusion
cargo run --bin optd-perftest cardtest $benchmark_name --query-ids $id --adaptive &>/dev/null

if [ $? -eq 0 ]; then
echo >&2 $id succeeded
Expand Down
153 changes: 126 additions & 27 deletions optd-perftest/src/datafusion_dbms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use datafusion::{
execution::{
config::SessionConfig,
context::{SessionContext, SessionState},
options::CsvReadOptions,
runtime_env::{RuntimeConfig, RuntimeEnv},
},
sql::{parser::DFParser, sqlparser::dialect::GenericDialect},
Expand All @@ -36,9 +37,12 @@ pub struct DatafusionDBMS {
workspace_dpath: PathBuf,
rebuild_cached_stats: bool,
adaptive: bool,
ctx: SessionContext,
ctx: Option<SessionContext>,
}

const WITH_LOGICAL_FOR_TPCH: bool = true;
const WITH_LOGICAL_FOR_JOB: bool = false;

#[async_trait]
impl CardtestRunnerDBMSHelper for DatafusionDBMS {
fn get_name(&self) -> &str {
Expand All @@ -50,18 +54,22 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS {
benchmark: &Benchmark,
) -> anyhow::Result<Vec<usize>> {
let base_table_stats = self.get_benchmark_stats(benchmark).await?;
self.clear_state(Some(base_table_stats)).await?;
// clear_state() is how we "load" the stats into datafusion
self.clear_state(Some(base_table_stats), benchmark).await?;

if self.adaptive {
// We need to load the stats if we're doing adaptivity because that involves executing the queries in datafusion.
// This function also calls create_tables().
self.load_benchmark_data_no_stats(benchmark).await?;
} else {
// We only create the tables so that the optimizer doesn't work. However, we can save on the time of loading
// the data if we're not doing adaptivity because we won't be executing queries.
self.create_benchmark_tables(benchmark).await?;
}

match benchmark {
Benchmark::Tpch(tpch_kit_config) => {
// Create the tables. This must be done after clear_state because that clears everything
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
self.create_tpch_tables(&tpch_kit).await?;
self.eval_tpch_estcards(tpch_kit_config).await
}
Benchmark::Tpch(tpch_kit_config) => self.eval_tpch_estcards(tpch_kit_config).await,
Benchmark::Job(job_kit_config) | Benchmark::Joblight(job_kit_config) => {
let job_kit = JobKit::build(&self.workspace_dpath)?;
self.create_job_tables(&job_kit).await?;
self.eval_job_estcards(job_kit_config).await
}
}
Expand All @@ -78,7 +86,7 @@ impl DatafusionDBMS {
workspace_dpath: workspace_dpath.as_ref().to_path_buf(),
rebuild_cached_stats,
adaptive,
ctx: Self::new_session_ctx(None, adaptive).await?,
ctx: None,
})
}

Expand All @@ -87,16 +95,30 @@ impl DatafusionDBMS {
///
/// A more ideal way to generate statistics would be to use the `ANALYZE`
/// command in SQL, but DataFusion does not support that yet.
async fn clear_state(&mut self, stats: Option<DataFusionBaseTableStats>) -> anyhow::Result<()> {
self.ctx = Self::new_session_ctx(stats, self.adaptive).await?;
async fn clear_state(
    &mut self,
    stats: Option<DataFusionBaseTableStats>,
    benchmark: &Benchmark,
) -> anyhow::Result<()> {
    // Whether DataFusion's logical optimizer runs differs per benchmark suite.
    let with_logical = match benchmark {
        Benchmark::Job(_) | Benchmark::Joblight(_) => WITH_LOGICAL_FOR_JOB,
        Benchmark::Tpch(_) => WITH_LOGICAL_FOR_TPCH,
    };
    // Replacing the context wholesale is how the (optional) stats get "loaded".
    let fresh_ctx = Self::new_session_ctx(stats, self.adaptive, with_logical).await?;
    self.ctx = Some(fresh_ctx);
    Ok(())
}

async fn new_session_ctx(
stats: Option<DataFusionBaseTableStats>,
adaptive: bool,
with_logical: bool,
) -> anyhow::Result<SessionContext> {
let session_config = SessionConfig::from_env()?.with_information_schema(true);
let mut session_config = SessionConfig::from_env()?.with_information_schema(true);

if !with_logical {
session_config.options_mut().optimizer.max_passes = 0;
}

let rn_config = RuntimeConfig::new();
let runtime_env = RuntimeEnv::new(rn_config.clone())?;
let ctx = {
Expand Down Expand Up @@ -166,6 +188,11 @@ impl DatafusionDBMS {
let sql = fs::read_to_string(sql_fpath)?;
let estcard = self.eval_query_estcard(&sql).await?;
estcards.push(estcard);

if self.adaptive {
// If we're in adaptive mode, execute the query to fill the true cardinality cache.
self.execute_query(&sql).await?;
}
}

Ok(estcards)
Expand All @@ -189,6 +216,11 @@ impl DatafusionDBMS {
let sql = fs::read_to_string(sql_fpath)?;
let estcard = self.eval_query_estcard(&sql).await?;
estcards.push(estcard);

if self.adaptive {
// Execute the query to fill the true cardinality cache.
self.execute_query(&sql).await?;
}
}

Ok(estcards)
Expand All @@ -204,11 +236,15 @@ impl DatafusionDBMS {
log::info!("{} {}", self.get_name(), explain_str);
}

/// Returns the active DataFusion session context.
///
/// # Panics
///
/// Panics if the context has not been initialized yet; `clear_state()` must
/// have been called first to create it.
fn get_ctx(&self) -> &SessionContext {
    // expect() over unwrap(): state the invariant so a violation is debuggable.
    self.ctx
        .as_ref()
        .expect("SessionContext not initialized; call clear_state() first")
}

async fn eval_query_estcard(&self, sql: &str) -> anyhow::Result<usize> {
lazy_static! {
static ref ROW_CNT_RE: Regex = Regex::new(r"row_cnt=(\d+\.\d+)").unwrap();
}
let explains = Self::execute(&self.ctx, &format!("explain verbose {}", sql)).await?;
let explains = Self::execute(self.get_ctx(), &format!("explain verbose {}", sql)).await?;
self.log_explain(&explains);
// Find first occurrence of row_cnt=... in the output.
let row_cnt = explains
Expand All @@ -228,19 +264,27 @@ impl DatafusionDBMS {
Ok(row_cnt)
}

/// Executes `sql` to completion, discarding its rows. Running the query is what
/// feeds the true cardinalities back into optd for the adaptive cost model.
async fn execute_query(&self, sql: &str) -> anyhow::Result<()> {
    Self::execute(self.get_ctx(), sql).await.map(|_| ())
}

/// Load the data into DataFusion without building the stats used by optd.
/// Unlike Postgres, where both data and stats are used by the same program, for this class the
/// data is used by DataFusion while the stats are used by optd. That is why there are two
/// separate functions to load them.
#[allow(dead_code)]
async fn load_benchmark_data_no_stats(&mut self, benchmark: &Benchmark) -> anyhow::Result<()> {
match benchmark {
Benchmark::Tpch(tpch_kit_config) => self.load_tpch_data_no_stats(tpch_kit_config).await,
_ => unimplemented!(),
Benchmark::Job(job_kit_config) | Benchmark::Joblight(job_kit_config) => {
self.load_job_data_no_stats(job_kit_config).await
}
}
}

/// Build the stats that optd's cost model uses.
/// Build the stats that optd's cost model uses, or get the stats from the cache.
async fn get_benchmark_stats(
&mut self,
benchmark: &Benchmark,
Expand Down Expand Up @@ -270,6 +314,21 @@ impl DatafusionDBMS {
}
}

/// Creates the benchmark's tables (DDL only) without loading any data.
async fn create_benchmark_tables(&mut self, benchmark: &Benchmark) -> anyhow::Result<()> {
    // Each branch builds its kit from the workspace, then runs that kit's DDL.
    match benchmark {
        Benchmark::Tpch(_) => {
            let kit = TpchKit::build(&self.workspace_dpath)?;
            self.create_tpch_tables(&kit).await
        }
        Benchmark::Job(_) | Benchmark::Joblight(_) => {
            let kit = JobKit::build(&self.workspace_dpath)?;
            Self::create_job_tables(self.get_ctx(), &kit).await
        }
    }
}

async fn create_tpch_tables(&mut self, tpch_kit: &TpchKit) -> anyhow::Result<()> {
let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?;
let ddls = ddls
Expand All @@ -278,25 +337,24 @@ impl DatafusionDBMS {
.filter(|s| !s.is_empty())
.collect::<Vec<_>>();
for ddl in ddls {
Self::execute(&self.ctx, ddl).await?;
Self::execute(self.get_ctx(), ddl).await?;
}
Ok(())
}

async fn create_job_tables(&mut self, job_kit: &JobKit) -> anyhow::Result<()> {
async fn create_job_tables(ctx: &SessionContext, job_kit: &JobKit) -> anyhow::Result<()> {
let ddls = fs::read_to_string(&job_kit.schema_fpath)?;
let ddls = ddls
.split(';')
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>();
for ddl in ddls {
Self::execute(&self.ctx, ddl).await?;
Self::execute(ctx, ddl).await?;
}
Ok(())
}

#[allow(dead_code)]
async fn load_tpch_data_no_stats(
&mut self,
tpch_kit_config: &TpchKitConfig,
Expand All @@ -313,7 +371,7 @@ impl DatafusionDBMS {
for tbl_fpath in tbl_fpath_iter {
let tbl_name = tbl_fpath.file_stem().unwrap().to_str().unwrap();
Self::execute(
&self.ctx,
self.get_ctx(),
&format!(
"create external table {}_tbl stored as csv delimiter '|' location '{}';",
tbl_name,
Expand All @@ -324,7 +382,7 @@ impl DatafusionDBMS {

// Get the number of columns of this table.
let schema = self
.ctx
.get_ctx()
.catalog("datafusion")
.unwrap()
.schema("public")
Expand All @@ -338,7 +396,7 @@ impl DatafusionDBMS {
.collect::<Vec<_>>()
.join(", ");
Self::execute(
&self.ctx,
self.get_ctx(),
&format!(
"insert into {} select {} from {}_tbl;",
tbl_name, projection_list, tbl_name,
Expand All @@ -350,6 +408,47 @@ impl DatafusionDBMS {
Ok(())
}

// Loads the JOB tables into DataFusion from .csv files, without building optd stats.
// A throwaway session is created solely to run the DDL and recover each table's
// schema; the CSVs are then registered on the real context with that schema.
async fn load_job_data_no_stats(
    &mut self,
    job_kit_config: &JobKitConfig,
) -> anyhow::Result<()> {
    // Scratch context used only for schema discovery.
    let schema_ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?;

    // Fetch the table files.
    let kit = JobKit::build(&self.workspace_dpath)?;
    kit.download_tables(job_kit_config)?;

    // Run the DDL so the scratch context knows every table's schema.
    Self::create_job_tables(&schema_ctx, &kit).await?;

    // Register each CSV on the real context via register_csv().
    for path in kit.get_tbl_fpath_iter().unwrap() {
        let name = path.file_stem().unwrap().to_str().unwrap();
        let schema = schema_ctx
            .catalog("datafusion")
            .unwrap()
            .schema("public")
            .unwrap()
            .table(name)
            .await
            .unwrap()
            .schema();
        let options = CsvReadOptions::new()
            .schema(&schema)
            .delimiter(b',')
            .escape(b'\\');
        self.get_ctx()
            .register_csv(name, path.to_str().unwrap(), options)
            .await?;
    }
    Ok(())
}

async fn get_tpch_stats(
&mut self,
tpch_kit_config: &TpchKitConfig,
Expand All @@ -359,7 +458,7 @@ impl DatafusionDBMS {
tpch_kit.gen_tables(tpch_kit_config)?;

// To get the schema of each table.
let ctx = Self::new_session_ctx(None, self.adaptive).await?;
let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_TPCH).await?;
let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?;
let ddls = ddls
.split(';')
Expand Down Expand Up @@ -419,7 +518,7 @@ impl DatafusionDBMS {
job_kit.download_tables(job_kit_config)?;

// To get the schema of each table.
let ctx = Self::new_session_ctx(None, self.adaptive).await?;
let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?;
let ddls = fs::read_to_string(&job_kit.schema_fpath)?;
let ddls = ddls
.split(';')
Expand Down
2 changes: 1 addition & 1 deletion optd-perftest/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ enum Commands {
rebuild_cached_optd_stats: bool,

#[clap(long)]
#[clap(action)]
#[clap(help = "Whether to enable adaptivity for optd")]
#[clap(default_value = "true")]
adaptive: bool,

#[clap(long)]
Expand Down
5 changes: 2 additions & 3 deletions optd-perftest/src/tpch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ use std::path::{Path, PathBuf};
const TPCH_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/tpch-kit.git";
pub const TPCH_KIT_POSTGRES: &str = "POSTGRESQL";
const NUM_TPCH_QUERIES: usize = 22;
pub const WORKING_QUERY_IDS: &[&str] = &[
"2", "3", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "17", "19",
];
pub const WORKING_QUERY_IDS: &[&str] =
&["2", "3", "5", "7", "8", "9", "10", "12", "13", "14", "17"];

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TpchKitConfig {
Expand Down
Loading