diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 33b2bd11..46eb8873 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,3 +35,18 @@ terraform-docs . ``` This will replace the readme file at `deploy/infrastructure/README.md` and `deploy/meltano/README.md` with any changes made to the module and header docs. + +## Developing transforms + +To develop transforms, you'll need to duplicate `.env.template` as `.env` and ensure that at least these env vars are declared: + +- `SNOWFLAKE_USER` +- `SNOWFLAKE_PASSWORD` + +## Incremental build with the `--defer` option + +```console +meltano invoke dbt-snowflake:seed +meltano invoke dbt-snowflake:snapshot +meltano invoke dbt-snowflake:build +``` diff --git a/data/.env.template b/data/.env.template index 50819b03..342d6683 100644 --- a/data/.env.template +++ b/data/.env.template @@ -19,10 +19,9 @@ PERMISSION_BOT_PASSWORD="****" PERMISSION_BOT_ACCOUNT="****" # Snowflake MELTANO User Password +SNOWFLAKE_USER="...." SNOWFLAKE_PASSWORD="****" - -# dbt Snowflake Password -DBT_SNOWFLAKE_PASSWORD="****" +SNOWFLAKE_ROLE="DEVELOPER" # Slack Webhooks TARGET_APPRISE_SINGER_ACTIVITY_URIS=["https://hooks.slack.com/services/{}/{}/{}"] diff --git a/data/.sqlfluff b/data/.sqlfluff index e764e243..d9cf2193 100644 --- a/data/.sqlfluff +++ b/data/.sqlfluff @@ -17,7 +17,9 @@ profile = meltano tab_space_size = 4 max_line_length = 80 indent_unit = space -comma_style = trailing + +[sqlfluff:layout:type:comma] +line_position = trailing [sqlfluff:rules:L010] # Keywords capitalisation_policy = upper diff --git a/data/environments/dbt_dev.meltano.yml b/data/environments/dbt_dev.meltano.yml new file mode 100644 index 00000000..00fa72c9 --- /dev/null +++ b/data/environments/dbt_dev.meltano.yml @@ -0,0 +1,42 @@ +environments: +- name: dbt_dev + annotations: + docs: + description: > + Environment for developing dbt transforms separately from EL. 
+ config: + plugins: + utilities: + - name: dbt-snowflake + config: + user: ${SNOWFLAKE_USER} + role: ${SNOWFLAKE_USER} + warehouse: CORE + skip_pre_invoke: true + database: USERDEV_PROD + database_prep: USERDEV_PREP + target_schema_prefix: ${SNOWFLAKE_USER}_ + - name: sqlfluff + config: + user: ${SNOWFLAKE_USER} + - name: great_expectations + config: + prod_database: USERDEV_PROD + raw_database: USERDEV_RAW + username: ${SNOWFLAKE_USER} + role: ${SNOWFLAKE_USER} + warehouse: CORE + env: + USER_PREFIX: ${SNOWFLAKE_USER} + SUPERSET_API_URL: http://localhost:8088 + SUPERSET_USER: admin + SUPERSET_PASS: admin + # https://airflow.apache.org/docs/apache-airflow/stable/configurations-ref.html + AIRFLOW__CORE__PLUGINS_FOLDER: $MELTANO_PROJECT_ROOT/orchestrate/plugins_local + AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL: '30' + AIRFLOW_VAR_MELTANO_ENVIRONMENT: userdev + AIRFLOW_VAR_OPERATOR_TYPE: bash + # Secrets via KMS + KMS_PUBLIC_KEY_PATH: utilities/kms/Publickey.pem + KMS_DOTENV_PATH: .env + KMS_SECRETS_PATH: secrets.yml diff --git a/data/environments/userdev.meltano.yml b/data/environments/userdev.meltano.yml index 79e8f760..bbd8ced4 100644 --- a/data/environments/userdev.meltano.yml +++ b/data/environments/userdev.meltano.yml @@ -22,8 +22,8 @@ environments: - name: tap-snowflake config: dbname: USERDEV_PROD - user: ${USER_PREFIX} - role: ${USER_PREFIX} + user: ${SNOWFLAKE_USER} + role: ${SNOWFLAKE_USER} warehouse: CORE - name: tap-snowflake-metrics-legacy config: @@ -85,7 +85,7 @@ environments: role: ${USER_PREFIX} warehouse: CORE env: - USER_PREFIX: PNADOLNY + USER_PREFIX: ${SNOWFLAKE_USER} SUPERSET_API_URL: http://localhost:8088 SUPERSET_USER: admin SUPERSET_PASS: admin @@ -97,4 +97,4 @@ environments: # Secrets via KMS KMS_PUBLIC_KEY_PATH: utilities/kms/Publickey.pem KMS_DOTENV_PATH: .env - KMS_SECTRETS_PATH: secrets.yml + KMS_SECRETS_PATH: secrets.yml diff --git a/data/meltano.yml b/data/meltano.yml index 6122bd80..9ad0fb0f 100644 --- a/data/meltano.yml +++ 
b/data/meltano.yml @@ -1,5 +1,5 @@ version: 1 -default_environment: userdev +default_environment: dbt_dev send_anonymous_usage_stats: false project_id: c15e971a-d318-4a9d-979b-1039ce5fd1b1 include_paths: @@ -9,3 +9,5 @@ include_paths: - ./orchestrate/*.meltano.yml - ./transform/*.meltano.yml - ./utilities/*.meltano.yml +ff: + strict_env_var_mode: true diff --git a/data/transform/models/marts/engineering/list_issues.sql b/data/transform/models/marts/engineering/list_issues.sql new file mode 100644 index 00000000..19931289 --- /dev/null +++ b/data/transform/models/marts/engineering/list_issues.sql @@ -0,0 +1,21 @@ +SELECT + repo_name, + organization_name, + state, + author_association, + labels, + reactions, + assignees, + milestone, + issue_id, + issue_number, + comment_count, + is_locked, + author_id, + author_username, + assignee_id, + assignee_username, + last_updated_ts, + created_at_ts, + closed_at_ts +FROM {{ref('stg_github__issues')}} diff --git a/data/transform/models/marts/engineering/schema.yml b/data/transform/models/marts/engineering/schema.yml new file mode 100644 index 00000000..c5951e54 --- /dev/null +++ b/data/transform/models/marts/engineering/schema.yml @@ -0,0 +1,15 @@ +version: 2 + +models: + - name: list_issues + description: One row per item, describing all GitHub issues and PRs. + columns: + - name: issue_number + description: The PR or Issue number. + tests: + - not_null + - name: issue_id + description: The unique key for the issue or PR. 
+ tests: + - not_null + - unique diff --git a/data/transform/models/marts/telemetry/base/unstructured_parsing/event_unstruct.sql b/data/transform/models/marts/telemetry/base/unstructured_parsing/event_unstruct.sql index b24e7eac..b1059db4 100644 --- a/data/transform/models/marts/telemetry/base/unstructured_parsing/event_unstruct.sql +++ b/data/transform/models/marts/telemetry/base/unstructured_parsing/event_unstruct.sql @@ -1,3 +1,9 @@ +{{ + config( + materialized='table' + ) +}} + WITH base AS ( SELECT diff --git a/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql b/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql index 05ecace4..92dcc663 100644 --- a/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql +++ b/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql @@ -1,3 +1,9 @@ +{{ + config( + materialized='table' + ) +}} + WITH reparse_1 AS ( -- Incident: new schema not registered to SnowcatCloud versions 2.14.0 and -- 2.15.0 (partial) diff --git a/data/transform/models/staging/snowplow/stg_snowplow__events.sql b/data/transform/models/staging/snowplow/stg_snowplow__events.sql index ab5672ae..cabccf84 100644 --- a/data/transform/models/staging/snowplow/stg_snowplow__events.sql +++ b/data/transform/models/staging/snowplow/stg_snowplow__events.sql @@ -1,212 +1,22 @@ {{ config( - materialized='incremental' + materialized='table' ) }} -WITH blended_source AS ( - SELECT - *, - FALSE AS snowplow_bad_parsed - {% if env_var("MELTANO_ENVIRONMENT") == "cicd" %} - - FROM raw.snowplow.events - WHERE derived_tstamp::TIMESTAMP >= DATEADD('day', -3, CURRENT_DATE) - {% else %} - - FROM {{ source('snowplow', 'events') }} - - {% if is_incremental() %} - - WHERE UPLOADED_AT >= (SELECT max(UPLOADED_AT) FROM {{ this }} WHERE snowplow_bad_parsed = FALSE) - - {% endif %} - {% endif %} - - UNION ALL - - SELECT - *, - TRUE AS snowplow_bad_parsed - FROM {{ ref('snowplow_bad_parsed') }} - {% if is_incremental() %} - - WHERE event_id NOT IN 
(SELECT DISTINCT event_id FROM {{ this }} WHERE snowplow_bad_parsed = TRUE) - - {% endif %} - -), - -source AS ( +WITH source AS ( SELECT *, ROW_NUMBER() OVER ( PARTITION BY event_id - ORDER BY derived_tstamp::TIMESTAMP DESC - ) AS row_num - FROM blended_source - -), - -clean_new_source AS ( - - SELECT * - FROM source - WHERE row_num = 1 - {% if is_incremental() %} - - AND event_id NOT IN ( - SELECT DISTINCT event_id FROM {{ this }} - ) - {% endif %} - -), - -renamed AS ( - - SELECT -- noqa: L034 - -- only meltano events. For the first ~6 months no app_id was - -- sent from Meltano. So nulls are from meltano. - COALESCE(app_id, 'meltano') AS app_id, - platform, - etl_tstamp::TIMESTAMP AS etl_enriched_at, - collector_tstamp::TIMESTAMP AS collector_received_at, - dvce_created_tstamp::TIMESTAMP AS device_created_at, - dvce_sent_tstamp::TIMESTAMP AS device_sent_at, - refr_dvce_tstamp::TIMESTAMP AS refr_device_processed_at, - derived_tstamp::TIMESTAMP AS event_created_at, - derived_tstamp::DATE AS event_created_date, - true_tstamp::TIMESTAMP AS true_sent_at, - uploaded_at, - event, - event_id, - txn_id, - name_tracker, - v_tracker, - v_collector, - v_etl, - user_id, - user_ipaddress, - user_fingerprint, - domain_userid, - domain_sessionidx, - network_userid, - geo_country, - geo_region, - geo_city, - geo_zipcode, - geo_latitude, - geo_longitude, - geo_region_name, - ip_isp, - ip_organization, - ip_domain, - ip_netspeed, - page_url, - page_title, - page_referrer, - page_urlscheme, - page_urlhost, - page_urlport, - page_urlpath, - page_urlquery, - page_urlfragment, - refr_urlscheme, - refr_urlhost, - refr_urlport, - refr_urlpath, - refr_urlquery, - refr_urlfragment, - refr_medium, - refr_source, - refr_term, - mkt_medium, - mkt_source, - mkt_term, - mkt_content, - mkt_campaign, - contexts, - se_category, - se_action, - se_label, - se_property, - se_value, - unstruct_event, - tr_orderid, - tr_affiliation, - tr_total, - tr_tax, - tr_shipping, - tr_city, - tr_state, - 
tr_country, - ti_orderid, - ti_sku, - ti_name, - ti_category, - ti_price, - ti_quantity, - pp_xoffset_min, - pp_xoffset_max, - pp_yoffset_min, - pp_yoffset_max, - useragent, - br_name, - br_family, - br_version, - br_type, - br_renderengine, - br_lang, - br_features_pdf, - br_features_flash, - br_features_java, - br_features_director, - br_features_quicktime, - br_features_realplayer, - br_features_windowsmedia, - br_features_gears, - br_features_silverlight, - br_cookies, - br_colordepth, - br_viewwidth, - br_viewheight, - os_name, - os_family, - os_manufacturer, - os_timezone, - dvce_type, - dvce_ismobile, - dvce_screenwidth, - dvce_screenheight, - doc_charset, - doc_width, - doc_height, - tr_currency, - tr_total_base, - tr_tax_base, - tr_shipping_base, - ti_currency, - ti_price_base, - base_currency, - geo_timezone, - mkt_clickid, - mkt_network, - etl_tags, - refr_domain_userid, - derived_contexts::VARIANT AS derived_contexts, - domain_sessionid, - event_vendor, - event_name, - event_format, - event_version, - event_fingerprint, - MD5(user_ipaddress) AS ip_address_hash, - snowplow_bad_parsed - FROM clean_new_source + ORDER BY event_created_at::TIMESTAMP DESC + ) AS dedupe_rank + FROM {{ ref('stg_snowplow__events_union_all') }} ) SELECT * -FROM renamed +FROM source +WHERE dedupe_rank = 1 diff --git a/data/transform/models/staging/snowplow/stg_snowplow__events_union_all.sql b/data/transform/models/staging/snowplow/stg_snowplow__events_union_all.sql new file mode 100644 index 00000000..7907e955 --- /dev/null +++ b/data/transform/models/staging/snowplow/stg_snowplow__events_union_all.sql @@ -0,0 +1,187 @@ +{{ + config( + materialized='incremental', + cluster_by=['event_id'] + ) +}} + +WITH blended_source AS ( + + SELECT + *, + FALSE AS snowplow_bad_parsed + {% if env_var("MELTANO_ENVIRONMENT") == "cicd" %} + + FROM raw.snowplow.events + WHERE derived_tstamp::TIMESTAMP >= DATEADD('day', -3, CURRENT_DATE) + {% else %} + + FROM {{ source('snowplow', 'events') }} + + {% 
if is_incremental() %} + + WHERE UPLOADED_AT >= (SELECT max(UPLOADED_AT) FROM {{ this }} WHERE snowplow_bad_parsed = FALSE) + + {% endif %} + {% endif %} + + UNION ALL + + SELECT + *, + TRUE AS snowplow_bad_parsed + FROM {{ ref('snowplow_bad_parsed') }} + {% if is_incremental() %} + + WHERE event_id NOT IN (SELECT DISTINCT event_id FROM {{ this }} WHERE snowplow_bad_parsed = TRUE) + + {% endif %} + +), + +renamed AS ( + + SELECT -- noqa: L034 + -- only meltano events. For the first ~6 months no app_id was + -- sent from Meltano. So nulls are from meltano. + COALESCE(app_id, 'meltano') AS app_id, + platform, + etl_tstamp::TIMESTAMP AS etl_enriched_at, + collector_tstamp::TIMESTAMP AS collector_received_at, + dvce_created_tstamp::TIMESTAMP AS device_created_at, + dvce_sent_tstamp::TIMESTAMP AS device_sent_at, + refr_dvce_tstamp::TIMESTAMP AS refr_device_processed_at, + derived_tstamp::TIMESTAMP AS event_created_at, + derived_tstamp::DATE AS event_created_date, + true_tstamp::TIMESTAMP AS true_sent_at, + uploaded_at, + event, + event_id, + txn_id, + name_tracker, + v_tracker, + v_collector, + v_etl, + user_id, + user_ipaddress, + user_fingerprint, + domain_userid, + domain_sessionidx, + network_userid, + geo_country, + geo_region, + geo_city, + geo_zipcode, + geo_latitude, + geo_longitude, + geo_region_name, + ip_isp, + ip_organization, + ip_domain, + ip_netspeed, + page_url, + page_title, + page_referrer, + page_urlscheme, + page_urlhost, + page_urlport, + page_urlpath, + page_urlquery, + page_urlfragment, + refr_urlscheme, + refr_urlhost, + refr_urlport, + refr_urlpath, + refr_urlquery, + refr_urlfragment, + refr_medium, + refr_source, + refr_term, + mkt_medium, + mkt_source, + mkt_term, + mkt_content, + mkt_campaign, + contexts, + se_category, + se_action, + se_label, + se_property, + se_value, + unstruct_event, + tr_orderid, + tr_affiliation, + tr_total, + tr_tax, + tr_shipping, + tr_city, + tr_state, + tr_country, + ti_orderid, + ti_sku, + ti_name, + ti_category, 
+ ti_price, + ti_quantity, + pp_xoffset_min, + pp_xoffset_max, + pp_yoffset_min, + pp_yoffset_max, + useragent, + br_name, + br_family, + br_version, + br_type, + br_renderengine, + br_lang, + br_features_pdf, + br_features_flash, + br_features_java, + br_features_director, + br_features_quicktime, + br_features_realplayer, + br_features_windowsmedia, + br_features_gears, + br_features_silverlight, + br_cookies, + br_colordepth, + br_viewwidth, + br_viewheight, + os_name, + os_family, + os_manufacturer, + os_timezone, + dvce_type, + dvce_ismobile, + dvce_screenwidth, + dvce_screenheight, + doc_charset, + doc_width, + doc_height, + tr_currency, + tr_total_base, + tr_tax_base, + tr_shipping_base, + ti_currency, + ti_price_base, + base_currency, + geo_timezone, + mkt_clickid, + mkt_network, + etl_tags, + refr_domain_userid, + derived_contexts::VARIANT AS derived_contexts, + domain_sessionid, + event_vendor, + event_name, + event_format, + event_version, + event_fingerprint, + MD5(user_ipaddress) AS ip_address_hash, + snowplow_bad_parsed + FROM blended_source + +) + +SELECT * +FROM renamed diff --git a/data/transform/profiles/snowflake/profiles.yml b/data/transform/profiles/snowflake/profiles.yml index df7b0fbb..23b47043 100644 --- a/data/transform/profiles/snowflake/profiles.yml +++ b/data/transform/profiles/snowflake/profiles.yml @@ -18,6 +18,16 @@ meltano: warehouse: "{{ env_var('DBT_SNOWFLAKE_WAREHOUSE') }}" database: "{{ env_var('DBT_SNOWFLAKE_DATABASE') }}" schema: "DEFAULT" + dbt_dev: + type: snowflake + threads: "{{ env_var('DBT_THREADS', 6) | as_number }}" + account: "{{ env_var('DBT_SNOWFLAKE_ACCOUNT') }}" + user: "{{ env_var('SNOWFLAKE_USER') }}" + password: "{{ env_var('SNOWFLAKE_PASSWORD') }}" + role: "{{ env_var('DBT_SNOWFLAKE_ROLE', env_var('SNOWFLAKE_USER', )) }}" + warehouse: "{{ env_var('DBT_SNOWFLAKE_WAREHOUSE') }}" + database: "{{ env_var('DBT_SNOWFLAKE_DATABASE') }}" + schema: "DEFAULT" userdev: type: snowflake threads: 6 diff --git 
a/data/transform/transformers.meltano.yml b/data/transform/transformers.meltano.yml index 21e91564..792062cf 100644 --- a/data/transform/transformers.meltano.yml +++ b/data/transform/transformers.meltano.yml @@ -38,6 +38,7 @@ plugins: run_slack_notifications: run --select +publish.slack_notifications.* run_hubspot_publish: run --select publish.hubspot.* test_hubspot_publish: test --select publish.hubspot.* + build: run --fail-fast --defer --state=${MELTANO_PROJECT_ROOT}/.meltano/transformers/dbt/target/ config: account: epa06486 diff --git a/data/utilities/utilities.meltano.yml b/data/utilities/utilities.meltano.yml index 50ed631f..b1460469 100644 --- a/data/utilities/utilities.meltano.yml +++ b/data/utilities/utilities.meltano.yml @@ -68,9 +68,9 @@ plugins: executable: kms commands: encrypt: encrypt $KMS_PUBLIC_KEY_PATH --dotenv-path $KMS_DOTENV_PATH --output-path - $KMS_SECTRETS_PATH + $KMS_SECRETS_PATH decrypt: decrypt $KMS_KEY_ID --input-path $KMS_SECRETS_PATH --output-path $KMS_DOTENV_PATH env: KMS_PUBLIC_KEY_PATH: utilities/kms/Publickey.pem KMS_DOTENV_PATH: .env - KMS_SECTRETS_PATH: secrets.yml + KMS_SECRETS_PATH: secrets.yml