From 30d9cfdb5c28e1bd19157df56a82431a0c7188e9 Mon Sep 17 00:00:00 2001
From: Michael Conan <67560023+michaelconan@users.noreply.github.com>
Date: Wed, 5 Feb 2025 19:31:22 +0000
Subject: [PATCH 1/3] Update dbt.py

---
 dags/michael/dbt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dags/michael/dbt.py b/dags/michael/dbt.py
index a847aa4..ea3c7f2 100755
--- a/dags/michael/dbt.py
+++ b/dags/michael/dbt.py
@@ -16,6 +16,7 @@
 
 
 @dag(
+    dag_id="dbt__michael",
     # Run after source datasets refreshed
     schedule=[NOTION_DAILY_HABITS_DS, NOTION_WEEKLY_HABITS_DS],
     catchup=False,

From 5b2cbbbbf0d12285acab7ca88dd366664d8c8bdf Mon Sep 17 00:00:00 2001
From: Michael Conan <67560023+michaelconan@users.noreply.github.com>
Date: Wed, 5 Feb 2025 19:32:13 +0000
Subject: [PATCH 2/3] Update migrate.py

---
 dags/michael/migrate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/michael/migrate.py b/dags/michael/migrate.py
index f811233..c8c6132 100755
--- a/dags/michael/migrate.py
+++ b/dags/michael/migrate.py
@@ -19,7 +19,7 @@
 DATASET = os.getenv("ADMIN_DATASET", "admin")
 
 with DAG(
-    "migrate_raw_tables",
+    "bq__migrate_schema",
     schedule="@once",  # also consider "None"
     start_date=datetime(1970, 1, 1),
     params={"command": "upgrade", "revision": "head"},

From 398ae6c768bccd9b951611de48e020da221c13de Mon Sep 17 00:00:00 2001
From: Michael Conan <67560023+michaelconan@users.noreply.github.com>
Date: Wed, 5 Feb 2025 19:40:11 +0000
Subject: [PATCH 3/3] Update README.md

---
 README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6d8688e..54dd5ed 100755
--- a/README.md
+++ b/README.md
@@ -54,6 +54,15 @@ graph TB
 2. [Airflow](https://airflow.apache.org/) to orchestrate data loading scripts and additional automated workflows
 3. [DBT core](https://docs.getdbt.com/) to define data models and transformations, again orchestrated by Airflow (via CLI / bash TaskFlow)
 
+## Standards
+
+The project has been structured and designed with inspiration from [dbt project recommendations](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) and other sources.
+
+- DBT projects stored in a separate subdirectory from DAGs (at least for now)
+- DAGs and DBT projects organised at the top level by owner (should more people get involved)
+- Further organisation by data source and/or function
+- Naming generally follows the DBT-recommended `[layer]_[source]__[entity]`, adapted for Airflow DAGs with `__[refresh-type]` and other modifications as needed.
+
 ## Setup
@@ -73,8 +82,6 @@ To run Airflow on a single instance, I used Honcho to run multiple processes via
    - `AIRFLOW__CORE__FERNET_KEY={generated-key}` following [this guidance](https://airflow.apache.org/docs/apache-airflow/1.10.8/howto/secure-connections.html) to encrypt connection data
    - `AIRFLOW__CORE__INTERNAL_API_SECRET_KEY={generated-secret1}` following [this guidance](https://flask.palletsprojects.com/en/stable/config/#SECRET_KEY)
    - `AIRFLOW__WEBSERVER__SECRET_KEY={generated-secret2}` following guidance above
-   - `AIRFLOW__WEBSERVER__BASE_URL={deployed-url}`
-   - `AIRFLOW__CLI__ENDPOINT_URL={deployed-url}`
    - `AIRFLOW__WEBSERVER__INSTANCE_NAME=MY INSTANCE!`
 4. Generate Publish Profile file and deploy application code from GitHub
 5. Set startup command to use the `startup.txt` file
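
For context on how these renamed identifiers are declared, the sketch below shows a minimal, hypothetical Airflow module combining both patterns touched by the patches: the TaskFlow `@dag` decorator with an explicit `dag_id` (patch 1/3) and the `DAG` constructor with the new `bq__migrate_schema` ID (patch 2/3). The dataset URIs, start dates, and task bodies are placeholders assumed for illustration; they are not the project's actual `dbt.py` or `migrate.py` contents.

```python
# Minimal illustrative sketch (assumed, not the project's actual files) of the
# two DAG-naming patterns used in these patches.
from datetime import datetime

from airflow import DAG, Dataset
from airflow.decorators import dag

# Placeholder datasets standing in for the source-refresh datasets referenced in dbt.py.
NOTION_DAILY_HABITS_DS = Dataset("bigquery://raw/notion_daily_habits")
NOTION_WEEKLY_HABITS_DS = Dataset("bigquery://raw/notion_weekly_habits")


@dag(
    dag_id="dbt__michael",  # explicit ID instead of the function-name default
    schedule=[NOTION_DAILY_HABITS_DS, NOTION_WEEKLY_HABITS_DS],
    start_date=datetime(2025, 1, 1),  # assumed value
    catchup=False,
)
def dbt_michael():
    ...  # dbt build tasks live here in the real DAG


dbt_michael()


with DAG(
    "bq__migrate_schema",  # renamed from "migrate_raw_tables"
    schedule="@once",
    start_date=datetime(1970, 1, 1),
    params={"command": "upgrade", "revision": "head"},
):
    ...  # schema-migration tasks live here in the real DAG
```

Setting the ID explicitly keeps the scheduler-visible name aligned with the owner- and `[layer]_[source]__[entity]`-style conventions described in the README, rather than relying on the decorated function's name.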