From 076372634db618dea4fbec63b684f8d30022f5a8 Mon Sep 17 00:00:00 2001
From: mikivee <87154654+mikivee@users.noreply.github.com>
Date: Tue, 11 Feb 2025 21:07:06 +0100
Subject: [PATCH] Write out megastock table with sample size tag (#31)

Co-authored-by: mikivee
---
 scripts/megastock/README.md                   |  8 +++-----
 scripts/megastock/feature_extract_02.py       | 18 ++++++++----------
 .../write_databricks_to_bigquery_03.py        |  8 ++++----
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/scripts/megastock/README.md b/scripts/megastock/README.md
index 4194e01..33121dd 100644
--- a/scripts/megastock/README.md
+++ b/scripts/megastock/README.md
@@ -8,7 +8,7 @@ A. Generate resstock building samples using the resstock repo.
 - See the [resstock github repo](https://github.com/NREL/resstock/tree/develop?tab=readme-ov-file), and the [relevant documentation](https://resstock.readthedocs.io/en/latest/basic_tutorial/architecture.html#sampling).
 - Follow their installation instructions -- you'll have to install OpenStudio and the appropriate version of ruby to match what is defined in the resstock repo. They use [rbenv](https://github.com/rbenv/rbenv#readme) to manage ruby versions.
 - generate building metadata csv files using their sampling script
-  - Sampled files using v3.3.0 are currently on GCS at `the-cube/data/processed/sampling_resstock/resstock_v3.3.0`. There are files corresponding to multiple sample sizes including N=10k, 1M, 2M, and 5M.
+  - Sampled files using v3.3.0 are currently on GCS at `the-cube/data/processed/sampling_resstock/resstock_v3.3.0`. There are files corresponding to multiple sample sizes, including N=10k, 1M, 2M, 5M, 10M, 15M, and 20M.
 
 B. Run the [MegaStock Job](https://4617764665359845.5.gcp.databricks.com/jobs/724743198057405?o=4617764665359845) with the job parameter `n_sample_tag` set to the sample size suffix of the CSV from step 1. (e.g, '5M'). This will perform the following:
 
@@ -18,10 +18,8 @@ B. Run the [MegaStock Job](https://4617764665359845.5.gcp.databricks.com/jobs/72
 2. Run `feature_extract_02`, referencing appropriate file names based on the job parameter. There are functions/code which:
   - transform building features and add upgrades and weather city
   - write out building metadata and upgrades to the feature store
-3. Run `write_databricks_to_bigquery_03`, , referencing appropriate file names based on the job parameter. There code will write out two tables to BQ, *which will overwrite the current tables based on whatever the chosen sample size is*.
-  - `cube-machine-learning.ds_api_datasets.megastock_metadata`
-  - `cube-machine-learning.ds_api_datasets.megastock_features`
-
+3. Run `write_databricks_to_bigquery_03`, referencing appropriate file names based on the job parameter. The code will write the following table to BQ:
+  - `cube-machine-learning.ds_api_datasets.megastock_combined_baseline_{n_sample_tag}`
 
 ## Useful info
 - [Reference figma diagram](https://www.figma.com/board/HbgKjS4P6tHGDLmz84fxTK/SuMo%2FDoyho?node-id=9-429&node-type=section&t=UCFHhbgvIyBZKoQM-0)
\ No newline at end of file
diff --git a/scripts/megastock/feature_extract_02.py b/scripts/megastock/feature_extract_02.py
index 564ad7e..1083852 100644
--- a/scripts/megastock/feature_extract_02.py
+++ b/scripts/megastock/feature_extract_02.py
@@ -84,13 +84,11 @@
 # DBTITLE 1,Write out building metadata feature store
 table_name = f"ml.megastock.building_features_{N_SAMPLE_TAG}"
 df = building_metadata_upgrades
-if spark.catalog.tableExists(table_name):
-    fe.write_table(name=table_name, df=df, mode="merge")
-else:
-    fe.create_table(
-        name=table_name,
-        primary_keys=["building_id", "upgrade_id", "weather_file_city"],
-        df=df,
-        schema=df.schema,
-        description="megastock building metadata features",
-    )
+spark.sql(f"DROP TABLE IF EXISTS {table_name}")
+fe.create_table(
+    name=table_name,
+    primary_keys=["building_id", "upgrade_id", "weather_file_city"],
+    df=df,
+    schema=df.schema,
+    description="megastock building metadata features",
+)
diff --git a/scripts/megastock/write_databricks_to_bigquery_03.py b/scripts/megastock/write_databricks_to_bigquery_03.py
index 9b2a26b..55b09cc 100644
--- a/scripts/megastock/write_databricks_to_bigquery_03.py
+++ b/scripts/megastock/write_databricks_to_bigquery_03.py
@@ -16,7 +16,7 @@
 # MAGIC - `ml.megastock.building_features_{n_sample_tag}`
 # MAGIC
 # MAGIC ## Outputs: tables on BigQuery
-# MAGIC - `cube-machine-learning.ds_api_datasets.megastock_combined_baseline`
+# MAGIC - `cube-machine-learning.ds_api_datasets.megastock_combined_baseline_{n_sample_tag}`
 # MAGIC
 
 # COMMAND ----------
@@ -62,7 +62,7 @@
 # set up paths to write to
 bq_project = "cube-machine-learning"
 bq_dataset = "ds_api_datasets"
-bq_megastock_table = 'megastock_combined_baseline'
+bq_megastock_table = f'megastock_combined_baseline_{N_SAMPLE_TAG}'
 bq_write_path = f"{bq_project}.{bq_dataset}.{bq_megastock_table}"
 
 # COMMAND ----------
@@ -110,7 +110,7 @@
 # optimize the table by partitioning and clustering
 query = f"""
 CREATE TABLE `{bq_write_path}_optimized`
-PARTITION BY RANGE_BUCKET(climate_zone_int__m, GENERATE_ARRAY(1, {len(climate_zone_mapping)+1}, 1))
+PARTITION BY RANGE_BUCKET(climate_zone_int__m, GENERATE_ARRAY(1, {len(CLIMATE_ZONE_TO_INDEX)+1}, 1))
 CLUSTER BY heating_fuel__m, geometry_building_type_acs__m, geometry_floor_area__m, vintage__m
 AS SELECT *,
 FROM `{bq_write_path}`
@@ -144,7 +144,7 @@
 rows = query_job.result()  # Waits for query to finish
 
 for row in rows:
-    print(row) #3,234,218
+    print(row)
 
 # COMMAND ----------
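To make the tagged naming concrete, here is a minimal sketch of how a notebook in this job might read the `n_sample_tag` job parameter and derive the table names this patch introduces. It assumes the parameter is exposed to each Databricks notebook as a widget named `n_sample_tag`; the variable names are illustrative, not necessarily the repo's exact code.

```python
# Minimal sketch, assuming a Databricks notebook where the job parameter
# is available as a widget named "n_sample_tag" (e.g. "5M").
N_SAMPLE_TAG = dbutils.widgets.get("n_sample_tag")

# Feature store table written by feature_extract_02
feature_table_name = f"ml.megastock.building_features_{N_SAMPLE_TAG}"

# BigQuery table written by write_databricks_to_bigquery_03
bq_project = "cube-machine-learning"
bq_dataset = "ds_api_datasets"
bq_megastock_table = f"megastock_combined_baseline_{N_SAMPLE_TAG}"
bq_write_path = f"{bq_project}.{bq_dataset}.{bq_megastock_table}"
```

With this pattern, each sample size gets its own feature store and BigQuery table rather than overwriting a single shared table.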
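Similarly, a rough sketch of the partition/cluster optimization query after the rename, using a made-up placeholder for `CLIMATE_ZONE_TO_INDEX` (not the project's real mapping) to show what `GENERATE_ARRAY(1, len(...)+1, 1)` expands to:

```python
# Hypothetical stand-in for CLIMATE_ZONE_TO_INDEX: climate zone label -> integer index.
CLIMATE_ZONE_TO_INDEX = {"1A": 1, "2A": 2, "2B": 3, "3A": 4, "3B": 5}

# Example write path for a 5M-sample run (illustrative only).
bq_write_path = "cube-machine-learning.ds_api_datasets.megastock_combined_baseline_5M"

# GENERATE_ARRAY(1, N+1, 1) yields split points 1..N+1, so each of the N
# climate-zone indexes gets its own integer-range partition; values outside
# that range land in BigQuery's __UNPARTITIONED__ partition.
query = f"""
CREATE TABLE `{bq_write_path}_optimized`
PARTITION BY RANGE_BUCKET(climate_zone_int__m, GENERATE_ARRAY(1, {len(CLIMATE_ZONE_TO_INDEX)+1}, 1))
CLUSTER BY heating_fuel__m, geometry_building_type_acs__m, geometry_floor_area__m, vintage__m
AS SELECT *
FROM `{bq_write_path}`
"""
print(query)
```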