From 3a7589ea2431fe5ed0e94e3f95d97095f9e32c37 Mon Sep 17 00:00:00 2001 From: pnadolny13 Date: Tue, 2 Aug 2022 15:15:15 -0400 Subject: [PATCH 1/4] cleanup environments --- .../telemetry/base/cli_executions_base.sql | 5 +-- .../marts/telemetry/base/environment_dim.sql | 40 +++++++++++++++++++ .../marts/telemetry/base/environments.sql | 14 ------- .../telemetry/base/execution_env_map.sql | 24 +++++++++++ .../telemetry/base/pipeline_executions.sql | 6 ++- .../models/marts/telemetry/base/schema.yml | 17 +++++++- .../marts/telemetry/fact_cli_executions.sql | 7 +++- 7 files changed, 92 insertions(+), 21 deletions(-) create mode 100644 data/transform/models/marts/telemetry/base/environment_dim.sql delete mode 100644 data/transform/models/marts/telemetry/base/environments.sql create mode 100644 data/transform/models/marts/telemetry/base/execution_env_map.sql diff --git a/data/transform/models/marts/telemetry/base/cli_executions_base.sql b/data/transform/models/marts/telemetry/base/cli_executions_base.sql index ef53dae4..2aa4d43e 100644 --- a/data/transform/models/marts/telemetry/base/cli_executions_base.sql +++ b/data/transform/models/marts/telemetry/base/cli_executions_base.sql @@ -80,7 +80,6 @@ combined AS ( NULL AS machine, NULL AS system_release, NULL AS is_dev_build, - NULL AS environment_name_hash, NULL AS python_implementation, NULL AS system_name, NULL AS system_version, @@ -158,7 +157,6 @@ combined AS ( unstructured_executions.machine, unstructured_executions.system_release, unstructured_executions.is_dev_build, - unstructured_executions.environment_name_hash, unstructured_executions.python_implementation, unstructured_executions.system_name, unstructured_executions.system_version, @@ -218,7 +216,7 @@ combined AS ( COALESCE(unstruct_prep.is_plugin_great_ex, FALSE) AS is_plugin_great_ex, -- OS Features COALESCE( - unstructured_executions.environment_name_hash IS NOT NULL, FALSE + unstructured_executions.env_hash IS NOT NULL, FALSE ) AS is_os_feature_environments, COALESCE( unstruct_prep.is_os_feature_mappers, @@ -262,7 +260,6 @@ SELECT combined.machine, combined.system_release, combined.is_dev_build, - combined.environment_name_hash, combined.python_implementation, combined.system_name, combined.system_version, diff --git a/data/transform/models/marts/telemetry/base/environment_dim.sql b/data/transform/models/marts/telemetry/base/environment_dim.sql new file mode 100644 index 00000000..ba935680 --- /dev/null +++ b/data/transform/models/marts/telemetry/base/environment_dim.sql @@ -0,0 +1,40 @@ +SELECT + {{ dbt_utils.surrogate_key( + [ + 'structured_executions.project_id', + 'cmd_parsed_all.environment' + ] + ) }} AS environment_pk, + structured_executions.project_id, + cmd_parsed_all.environment AS env_hash, + COALESCE( + hash_lookup.unhashed_value, + cmd_parsed_all.environment + ) AS env_name +FROM {{ ref('structured_executions') }} +LEFT JOIN + {{ ref('cmd_parsed_all') }} ON + structured_executions.command = cmd_parsed_all.command +LEFT JOIN {{ ref('hash_lookup') }} + ON cmd_parsed_all.environment = hash_lookup.hash_value + AND hash_lookup.category = 'environment' + +UNION ALL + +SELECT + {{ dbt_utils.surrogate_key( + [ + 'unstructured_executions.project_id', + 'unstructured_executions.environment_name_hash' + ] + ) }} AS environment_pk, + unstructured_executions.project_id, + unstructured_executions.environment_name_hash AS env_hash, + COALESCE( + hash_lookup.unhashed_value, + unstructured_executions.environment_name_hash + ) AS env_name +FROM {{ ref('unstructured_executions') }} +LEFT JOIN {{ ref('hash_lookup') }} + ON unstructured_executions.environment_name_hash = hash_lookup.hash_value + AND hash_lookup.category = 'environment' diff --git a/data/transform/models/marts/telemetry/base/environments.sql b/data/transform/models/marts/telemetry/base/environments.sql deleted file mode 100644 index 5ac9d456..00000000 --- a/data/transform/models/marts/telemetry/base/environments.sql +++ /dev/null @@ -1,14 +0,0 @@ -SELECT - structured_executions.execution_id, - cmd_parsed_all.environment AS env_hash, - hash_lookup.unhashed_value AS env_name, - NULL AS is_ephemeral, - NULL AS is_cicd, - NULL AS is_cloud -FROM {{ ref('structured_executions') }} -LEFT JOIN - {{ ref('cmd_parsed_all') }} ON - structured_executions.command = cmd_parsed_all.command -LEFT JOIN {{ ref('hash_lookup') }} - ON cmd_parsed_all.environment = hash_lookup.hash_value - AND hash_lookup.category = 'environment' diff --git a/data/transform/models/marts/telemetry/base/execution_env_map.sql b/data/transform/models/marts/telemetry/base/execution_env_map.sql new file mode 100644 index 00000000..13f29e6a --- /dev/null +++ b/data/transform/models/marts/telemetry/base/execution_env_map.sql @@ -0,0 +1,24 @@ +SELECT + {{ dbt_utils.surrogate_key( + [ + 'structured_executions.project_id', + 'cmd_parsed_all.environment' + ] + ) }} AS environment_fk, + structured_executions.execution_id +FROM {{ ref('structured_executions') }} +LEFT JOIN + {{ ref('cmd_parsed_all') }} ON + structured_executions.command = cmd_parsed_all.command + +UNION ALL + +SELECT + {{ dbt_utils.surrogate_key( + [ + 'unstructured_executions.project_id', + 'unstructured_executions.environment_name_hash' + ] + ) }} AS environment_fk, + unstructured_executions.execution_id +FROM {{ ref('unstructured_executions') }} diff --git a/data/transform/models/marts/telemetry/base/pipeline_executions.sql b/data/transform/models/marts/telemetry/base/pipeline_executions.sql index d4374b4b..a86b994a 100644 --- a/data/transform/models/marts/telemetry/base/pipeline_executions.sql +++ b/data/transform/models/marts/telemetry/base/pipeline_executions.sql @@ -2,7 +2,7 @@ WITH plugin_prep AS ( SELECT plugin_executions.execution_id, cli_executions_base.project_id, - cli_executions_base.environment_name_hash AS env_id, + environment_dim.env_hash AS env_id, ARRAY_AGG( COALESCE( plugin_executions.plugin_surrogate_key, @@ -12,6 +12,10 @@ WITH plugin_prep AS ( FROM {{ ref('plugin_executions') }} LEFT JOIN {{ ref('cli_executions_base') }} ON plugin_executions.execution_id = cli_executions_base.execution_id + LEFT JOIN {{ ref('execution_env_map') }} + ON plugin_executions.execution_id = execution_env_map.execution_id + LEFT JOIN {{ ref('environment_dim') }} + ON execution_env_map.environment_fk = environment_dim.environment_pk WHERE cli_executions_base.cli_command IN ('elt', 'run') GROUP BY 1, 2, 3 ) diff --git a/data/transform/models/marts/telemetry/base/schema.yml b/data/transform/models/marts/telemetry/base/schema.yml index 42db1094..574569f6 100644 --- a/data/transform/models/marts/telemetry/base/schema.yml +++ b/data/transform/models/marts/telemetry/base/schema.yml @@ -134,4 +134,19 @@ models: - name: event_date tests: - not_null - \ No newline at end of file + + - name: environment_dim + description: This table contains attributes about project environments. + columns: + - name: environment_pk + tests: + - not_null + - unique + + - name: execution_env_map + columns: + - name: environment_fk + tests: + - relationships: + to: ref('environment_dim') + field: environment_pk diff --git a/data/transform/models/marts/telemetry/fact_cli_executions.sql b/data/transform/models/marts/telemetry/fact_cli_executions.sql index 882b67d1..80c0c736 100644 --- a/data/transform/models/marts/telemetry/fact_cli_executions.sql +++ b/data/transform/models/marts/telemetry/fact_cli_executions.sql @@ -9,7 +9,8 @@ SELECT cli_executions_base.python_version, ip_address_dim.ip_address_hash, ip_address_dim.cloud_provider, - ip_address_dim.execution_location + ip_address_dim.execution_location, + environment_dim.env_name FROM {{ ref('cli_executions_base') }} LEFT JOIN {{ ref('pipeline_executions') }} ON cli_executions_base.execution_id = pipeline_executions.execution_id @@ -19,3 +20,7 @@ LEFT JOIN {{ ref('date_dim') }} ON cli_executions_base.event_date = date_dim.date_day LEFT JOIN {{ ref('ip_address_dim') }} ON cli_executions_base.ip_address_hash = ip_address_dim.ip_address_hash +LEFT JOIN {{ ref('execution_env_map') }} + ON cli_executions_base.execution_id = execution_env_map.execution_id +LEFT JOIN {{ ref('environment_dim') }} + ON execution_env_map.environment_fk = environment_dim.environment_pk From 8c195425ead3db8116139496eb3da76ed8f4f33a Mon Sep 17 00:00:00 2001 From: pnadolny13 Date: Tue, 2 Aug 2022 15:24:02 -0400 Subject: [PATCH 2/4] use sampling of snowplow events --- .../models/staging/snowplow/stg_snowplow__events.sql | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/data/transform/models/staging/snowplow/stg_snowplow__events.sql b/data/transform/models/staging/snowplow/stg_snowplow__events.sql index 0f2c1351..764f7558 100644 --- a/data/transform/models/staging/snowplow/stg_snowplow__events.sql +++ b/data/transform/models/staging/snowplow/stg_snowplow__events.sql @@ -16,10 +16,9 @@ WITH source AS ( {% if env_var("MELTANO_ENVIRONMENT") == "cicd" %} - FROM raw.snowplow.events - WHERE derived_tstamp::TIMESTAMP >= DATEADD('day', -7, CURRENT_DATE) - -- filter test events - AND app_id != 'test' + FROM raw.snowplow.events SAMPLE ROW (100000 ROWS) + WHERE COALESCE(app_id, '') != 'test' + {% else %} FROM {{ source('snowplow', 'events') }} From 376b58c97b280eb398efe39d833798c25a08b2a2 Mon Sep 17 00:00:00 2001 From: pnadolny13 Date: Tue, 2 Aug 2022 16:56:34 -0400 Subject: [PATCH 3/4] fix invalid reference --- .../models/marts/telemetry/base/cli_executions_base.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/transform/models/marts/telemetry/base/cli_executions_base.sql b/data/transform/models/marts/telemetry/base/cli_executions_base.sql index 2aa4d43e..3b7c4f76 100644 --- a/data/transform/models/marts/telemetry/base/cli_executions_base.sql +++ b/data/transform/models/marts/telemetry/base/cli_executions_base.sql @@ -216,7 +216,7 @@ combined AS ( COALESCE(unstruct_prep.is_plugin_great_ex, FALSE) AS is_plugin_great_ex, -- OS Features COALESCE( - unstructured_executions.env_hash IS NOT NULL, FALSE + unstructured_executions.environment_name_hash IS NOT NULL, FALSE ) AS is_os_feature_environments, COALESCE( unstruct_prep.is_os_feature_mappers, From 64b3e32fe6c5a64c3e9e563c74e7f1aae007bd5e Mon Sep 17 00:00:00 2001 From: pnadolny13 Date: Tue, 2 Aug 2022 17:55:59 -0400 Subject: [PATCH 4/4] materialize for speed --- .../models/marts/telemetry/base/environment_dim.sql | 4 ++++ .../models/marts/telemetry/base/execution_env_map.sql | 4 ++++ .../models/marts/telemetry/fact_plugin_usage.sql | 11 +++++------ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/data/transform/models/marts/telemetry/base/environment_dim.sql b/data/transform/models/marts/telemetry/base/environment_dim.sql index ba935680..3944239d 100644 --- a/data/transform/models/marts/telemetry/base/environment_dim.sql +++ b/data/transform/models/marts/telemetry/base/environment_dim.sql @@ -1,3 +1,7 @@ +{{ + config(materialized='table') +}} + SELECT {{ dbt_utils.surrogate_key( [ diff --git a/data/transform/models/marts/telemetry/base/execution_env_map.sql b/data/transform/models/marts/telemetry/base/execution_env_map.sql index 13f29e6a..dc278c64 100644 --- a/data/transform/models/marts/telemetry/base/execution_env_map.sql +++ b/data/transform/models/marts/telemetry/base/execution_env_map.sql @@ -1,3 +1,7 @@ +{{ + config(materialized='table') +}} + SELECT {{ dbt_utils.surrogate_key( [ diff --git a/data/transform/models/marts/telemetry/fact_plugin_usage.sql b/data/transform/models/marts/telemetry/fact_plugin_usage.sql index 32aa45ec..bd8429ca 100644 --- a/data/transform/models/marts/telemetry/fact_plugin_usage.sql +++ b/data/transform/models/marts/telemetry/fact_plugin_usage.sql @@ -20,8 +20,7 @@ SELECT plugin_executions.plugin_surrogate_key, -- CLI Attributes cli_executions_base.cli_command, - cli_executions_base.environment_name_hash AS env_id, - hash_lookup.unhashed_value AS env_name, + environment_dim.env_name, cli_executions_base.exit_code AS cli_exit_code, cli_executions_base.meltano_version, cli_executions_base.num_cpu_cores_available, @@ -58,7 +57,7 @@ LEFT JOIN {{ ref('project_dim') }} ON cli_executions_base.project_id = project_dim.project_id LEFT JOIN {{ ref('ip_address_dim') }} ON cli_executions_base.ip_address_hash = ip_address_dim.ip_address_hash --- TODO: move this parsing up stream -LEFT JOIN {{ ref('hash_lookup') }} - ON cli_executions_base.environment_name_hash = hash_lookup.hash_value - AND hash_lookup.category = 'environment' +LEFT JOIN {{ ref('execution_env_map') }} + ON cli_executions_base.execution_id = execution_env_map.execution_id +LEFT JOIN {{ ref('environment_dim') }} + ON execution_env_map.environment_fk = environment_dim.environment_pk