From 076372634db618dea4fbec63b684f8d30022f5a8 Mon Sep 17 00:00:00 2001
From: mikivee <87154654+mikivee@users.noreply.github.com>
Date: Tue, 11 Feb 2025 21:07:06 +0100
Subject: [PATCH] Write out megastock table with sample size tag (#31)

Co-authored-by: mikivee
---
 scripts/megastock/README.md                   |  8 +++-----
 scripts/megastock/feature_extract_02.py       | 18 ++++++++----------
 .../write_databricks_to_bigquery_03.py        |  8 ++++----
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/scripts/megastock/README.md b/scripts/megastock/README.md
index 4194e01..33121dd 100644
--- a/scripts/megastock/README.md
+++ b/scripts/megastock/README.md
@@ -8,7 +8,7 @@ A. Generate resstock building samples using the resstock repo.
 - See the [resstock github repo](https://github.com/NREL/resstock/tree/develop?tab=readme-ov-file), and the [relevant documentation](https://resstock.readthedocs.io/en/latest/basic_tutorial/architecture.html#sampling).
 - Follow their installation instructions -- you'll have to install OpenStudio and the appropriate version of ruby to match what is defined in the resstock repo. They use [rbenv](https://github.com/rbenv/rbenv#readme) to manage ruby versions.
 - generate building metadata csv files using their sampling script
-  - Sampled files using v3.3.0 are currently on GCS at `the-cube/data/processed/sampling_resstock/resstock_v3.3.0`. There are files corresponding to multiple sample sizes including N=10k, 1M, 2M, and 5M.
+  - Sampled files using v3.3.0 are currently on GCS at `the-cube/data/processed/sampling_resstock/resstock_v3.3.0`. There are files corresponding to multiple sample sizes, including N=10k, 1M, 2M, 5M, 10M, 15M, and 20M.
 
 B. Run the [MegaStock Job](https://4617764665359845.5.gcp.databricks.com/jobs/724743198057405?o=4617764665359845) with the job parameter `n_sample_tag` set to the sample size suffix of the CSV from step 1. (e.g, '5M'). This will perform the following:
 
@@ -18,10 +18,8 @@ B. Run the [MegaStock Job](https://4617764665359845.5.gcp.databricks.com/jobs/72
 2. Run `feature_extract_02`, referencing appropriate file names based on the job parameter. There are functions/code which:
   - transform building features and add upgrades and weather city
   - write out building metadata and upgrades to the feature store
-3. Run `write_databricks_to_bigquery_03`, , referencing appropriate file names based on the job parameter. There code will write out two tables to BQ, *which will overwrite the current tables based on whatever the chosen sample size is*.
-  - `cube-machine-learning.ds_api_datasets.megastock_metadata`
-  - `cube-machine-learning.ds_api_datasets.megastock_features`
-
+3. Run `write_databricks_to_bigquery_03`, referencing appropriate file names based on the job parameter. The code will write the following table to BQ:
+  - `cube-machine-learning.ds_api_datasets.megastock_combined_baseline_{n_sample_tag}`
 
 ## Useful info
 - [Reference figma diagram](https://www.figma.com/board/HbgKjS4P6tHGDLmz84fxTK/SuMo%2FDoyho?node-id=9-429&node-type=section&t=UCFHhbgvIyBZKoQM-0)
\ No newline at end of file
diff --git a/scripts/megastock/feature_extract_02.py b/scripts/megastock/feature_extract_02.py
index 564ad7e..1083852 100644
--- a/scripts/megastock/feature_extract_02.py
+++ b/scripts/megastock/feature_extract_02.py
@@ -84,13 +84,11 @@
 # DBTITLE 1,Write out building metadata feature store
 table_name = f"ml.megastock.building_features_{N_SAMPLE_TAG}"
 df = building_metadata_upgrades
-if spark.catalog.tableExists(table_name):
-    fe.write_table(name=table_name, df=df, mode="merge")
-else:
-    fe.create_table(
-        name=table_name,
-        primary_keys=["building_id", "upgrade_id", "weather_file_city"],
-        df=df,
-        schema=df.schema,
-        description="megastock building metadata features",
-    )
+spark.sql(f"DROP TABLE IF EXISTS {table_name}")
+fe.create_table(
+    name=table_name,
+    primary_keys=["building_id", "upgrade_id", "weather_file_city"],
+    df=df,
+    schema=df.schema,
+    description="megastock building metadata features",
+)
diff --git a/scripts/megastock/write_databricks_to_bigquery_03.py b/scripts/megastock/write_databricks_to_bigquery_03.py
index 9b2a26b..55b09cc 100644
--- a/scripts/megastock/write_databricks_to_bigquery_03.py
+++ b/scripts/megastock/write_databricks_to_bigquery_03.py
@@ -16,7 +16,7 @@
 # MAGIC - `ml.megastock.building_features_{n_sample_tag}`
 # MAGIC
 # MAGIC ## Outputs: tables on BigQuery
-# MAGIC - `cube-machine-learning.ds_api_datasets.megastock_combined_baseline`
+# MAGIC - `cube-machine-learning.ds_api_datasets.megastock_combined_baseline_{n_sample_tag}`
 # MAGIC
 
 # COMMAND ----------
@@ -62,7 +62,7 @@
 # set up paths to write to
 bq_project = "cube-machine-learning"
 bq_dataset = "ds_api_datasets"
-bq_megastock_table = 'megastock_combined_baseline'
+bq_megastock_table = f'megastock_combined_baseline_{N_SAMPLE_TAG}'
 bq_write_path = f"{bq_project}.{bq_dataset}.{bq_megastock_table}"
 
 # COMMAND ----------
@@ -110,7 +110,7 @@
 # optimize the table by partitioning and clustering
 query = f"""
 CREATE TABLE `{bq_write_path}_optimized`
-PARTITION BY RANGE_BUCKET(climate_zone_int__m, GENERATE_ARRAY(1, {len(climate_zone_mapping)+1}, 1))
+PARTITION BY RANGE_BUCKET(climate_zone_int__m, GENERATE_ARRAY(1, {len(CLIMATE_ZONE_TO_INDEX)+1}, 1))
 CLUSTER BY heating_fuel__m, geometry_building_type_acs__m, geometry_floor_area__m, vintage__m
 AS SELECT *,
 FROM `{bq_write_path}`
@@ -144,7 +144,7 @@
 rows = query_job.result()  # Waits for query to finish
 
 for row in rows:
-    print(row) #3,234,218
+    print(row)
 
 # COMMAND ----------
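To make the tagged naming concrete, here is a minimal sketch of how a notebook in this job might read the `n_sample_tag` job parameter and derive the table names this patch introduces. It assumes the parameter is exposed to each Databricks notebook as a widget named `n_sample_tag`; the variable names are illustrative, not necessarily the repo's exact code.

```python
# Minimal sketch, assuming a Databricks notebook where the job parameter
# is available as a widget named "n_sample_tag" (e.g. "5M").
N_SAMPLE_TAG = dbutils.widgets.get("n_sample_tag")

# Feature store table written by feature_extract_02
feature_table_name = f"ml.megastock.building_features_{N_SAMPLE_TAG}"

# BigQuery table written by write_databricks_to_bigquery_03
bq_project = "cube-machine-learning"
bq_dataset = "ds_api_datasets"
bq_megastock_table = f"megastock_combined_baseline_{N_SAMPLE_TAG}"
bq_write_path = f"{bq_project}.{bq_dataset}.{bq_megastock_table}"
```

With this pattern, each sample size gets its own feature store and BigQuery table rather than overwriting a single shared table.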
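Similarly, a rough sketch of the partition/cluster optimization query after the rename, using a made-up placeholder for `CLIMATE_ZONE_TO_INDEX` (not the project's real mapping) to show what `GENERATE_ARRAY(1, len(...)+1, 1)` expands to:

```python
# Hypothetical stand-in for CLIMATE_ZONE_TO_INDEX: climate zone label -> integer index.
CLIMATE_ZONE_TO_INDEX = {"1A": 1, "2A": 2, "2B": 3, "3A": 4, "3B": 5}

# Example write path for a 5M-sample run (illustrative only).
bq_write_path = "cube-machine-learning.ds_api_datasets.megastock_combined_baseline_5M"

# GENERATE_ARRAY(1, N+1, 1) yields split points 1..N+1, so each of the N
# climate-zone indexes gets its own integer-range partition; values outside
# that range land in BigQuery's __UNPARTITIONED__ partition.
query = f"""
CREATE TABLE `{bq_write_path}_optimized`
PARTITION BY RANGE_BUCKET(climate_zone_int__m, GENERATE_ARRAY(1, {len(CLIMATE_ZONE_TO_INDEX)+1}, 1))
CLUSTER BY heating_fuel__m, geometry_building_type_acs__m, geometry_floor_area__m, vintage__m
AS SELECT *
FROM `{bq_write_path}`
"""
print(query)
```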