From 0b03e6b6da644ceac7671e36748525253bcf52a1 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 9 Feb 2022 18:37:22 +1300 Subject: [PATCH] Metrics package (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * dbt-client compatibility changes * Create helper macros for primary and secondary aggregation * average rename * Add comment discussing difference between primary and secondary calc macro dispatch strategy * Hide original file from dbt * Swap out dynamic macro dispatch for hardcoded list that actually works * Move `how` to PoP macro as that’s the only place it happens * Move debug macros, change helpers to use consistent signature * Pull get_metric out to its own macro * Swap in a proper table * Update dbt_project.yml, add slack metric def * Change metric calculation to support arbitrary calendar properties * Protect against not execute * Metrics namespacing * Access relations without using ref * Import utils * Count * if no expression provided * Remove redundant files and add to gitignore * Remove redundant aggregate references in secondary calcs * whitespace management and remove obsolete TODOs * Remove redundant metric_name arg (available on the main metric object) * Moving stuff around * Pull through date columns needed for secondary calcs * Give aliases to calculations, protect against pulling a whole table for a count(*) * Everything uses refs now, but it is very bad * Rename metrics file * Splitting into smaller files * add todo * Forgot to bring through calculation alias * validate metrics queries make sense (legal grains, aggregates) * Moving stuff around, run legality tests * Protect against missing key * Use joiner for prettier error message * Goodbye debug file * Actually rename debug file * swap one todo for another * Swap out loop for fancy one-liner * Add builtin calendar * swap out fancy one-liner for a good-old-fashioned loop * Add integration tests project * Protect against missing meta configs * Remove duplicate average key * Add defaults to metric call * Write README * warnings about experimental behaviour * add secondary calcs shoutout * Broader utils support * Cleanup todos, protect against whitespace sql agg * readme tweaks * Update get_metric_sql.sql * Cross-db support * Add ci, circle for now * Update get_metric_sql.sql * Revert "Add ci, circle for now" This reverts commit a7e8940b66fc491cc150de0ed03cb906507140d4. * Set up end to end testing with GHA (#3) * first pass at setting up postgres integration tests * try number 2 * try number 3 * try number 4 * try number 5 * try number 6 * try number 7 * add snowflake integration tests * use snowflake target * add rest of 'em * try again please * fix target arg * fix syntax * use json for bigquery keyfile * fix env var * rename workflow * fix bigquery * more fix bigquery * more fix bigquery * re run * test * asdf * aha! 
* Remove star macro * Remove debug command * Properly remove star macro * Add comment expanding on problem * Add support for min aggregate (#4) * fix a bunch of badly named stuff * meta accessing isn't dependent on index anymore * Update readme to contain info on secondary calcs * tweak TOC builder file * Update create-table-of-contents.yml * Update README.md * Auto update table of contents * Update README.md * Auto update table of contents Co-authored-by: Jeremy Cohen Co-authored-by: Kyle Wigley Co-authored-by: joellabes --- .github/actions/end-to-end-test/action.yml | 35 +++ .../end-to-end-test/sample.profiles.yml | 48 +++ .github/workflows/ci.yml | 125 ++++++++ .../workflows/create-table-of-contents.yml | 22 ++ .gitignore | 2 + README.md | 161 +++++++++- dbt_project.yml | 8 +- integration_tests/.gitignore | 4 + integration_tests/README.md | 16 + .../analyses}/.gitkeep | 0 integration_tests/dbt_project.yml | 22 ++ integration_tests/macros/.gitkeep | 0 .../metric_definitions/slack_joiners.yml | 24 ++ .../models/metric_transformation.sql | 28 ++ .../models/metric_transformation.yml | 6 + integration_tests/packages.yml | 2 + integration_tests/seeds/.gitkeep | 0 .../seeds/expected/slack_users_expected.csv | 9 + .../seeds/source/seed_slack_users.csv | 5 + integration_tests/snapshots/.gitkeep | 0 integration_tests/tests/.gitkeep | 0 macros/aggregate_primary_metric.sql | 57 ++++ macros/get_metric_relation.sql | 57 ++++ macros/get_metric_sql.sql | 176 +++++++++++ macros/is_dim_from_model.sql | 14 + macros/metric.sql | 39 +++ macros/metrics.sql | 274 ------------------ .../generate_secondary_calculation_alias.sql | 27 ++ .../perform_secondary_calculation.sql | 26 ++ ...condary_calculation_period_over_period.sql | 54 ++++ .../secondary_calculation_period_to_date.sql | 38 +++ .../secondary_calculation_rolling.sql | 37 +++ .../validate_aggregate_coherence.sql | 23 ++ .../validate_grain_order.sql | 23 ++ models/dbt_metrics_default_calendar.sql | 23 ++ models/fct_orders.sql | 8 - models/schema.yml | 61 ---- packages.yml | 3 + 38 files changed, 1109 insertions(+), 348 deletions(-) create mode 100644 .github/actions/end-to-end-test/action.yml create mode 100644 .github/actions/end-to-end-test/sample.profiles.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/create-table-of-contents.yml create mode 100644 integration_tests/.gitignore create mode 100644 integration_tests/README.md rename {macros => integration_tests/analyses}/.gitkeep (100%) create mode 100644 integration_tests/dbt_project.yml create mode 100644 integration_tests/macros/.gitkeep create mode 100644 integration_tests/models/metric_definitions/slack_joiners.yml create mode 100644 integration_tests/models/metric_transformation.sql create mode 100644 integration_tests/models/metric_transformation.yml create mode 100644 integration_tests/packages.yml create mode 100644 integration_tests/seeds/.gitkeep create mode 100644 integration_tests/seeds/expected/slack_users_expected.csv create mode 100644 integration_tests/seeds/source/seed_slack_users.csv create mode 100644 integration_tests/snapshots/.gitkeep create mode 100644 integration_tests/tests/.gitkeep create mode 100644 macros/aggregate_primary_metric.sql create mode 100644 macros/get_metric_relation.sql create mode 100644 macros/get_metric_sql.sql create mode 100644 macros/is_dim_from_model.sql create mode 100644 macros/metric.sql delete mode 100644 macros/metrics.sql create mode 100644 macros/secondary_calculations/generate_secondary_calculation_alias.sql 
create mode 100644 macros/secondary_calculations/perform_secondary_calculation.sql create mode 100644 macros/secondary_calculations/secondary_calculation_period_over_period.sql create mode 100644 macros/secondary_calculations/secondary_calculation_period_to_date.sql create mode 100644 macros/secondary_calculations/secondary_calculation_rolling.sql create mode 100644 macros/secondary_calculations/validate_aggregate_coherence.sql create mode 100644 macros/secondary_calculations/validate_grain_order.sql create mode 100644 models/dbt_metrics_default_calendar.sql delete mode 100644 models/fct_orders.sql delete mode 100644 models/schema.yml create mode 100644 packages.yml diff --git a/.github/actions/end-to-end-test/action.yml b/.github/actions/end-to-end-test/action.yml new file mode 100644 index 00000000..4fd62f70 --- /dev/null +++ b/.github/actions/end-to-end-test/action.yml @@ -0,0 +1,35 @@ +name: "End to end testing" +description: "Set up profile and run dbt with test project" +inputs: + dbt-project: + description: "Location of test project" + required: false + default: "integration_tests" + dbt-target: + description: "Name of target to use when running dbt" + required: true + database-adapter-package: + description: "Name of database adapter to install" + required: true +runs: + using: "composite" + steps: + - name: Install python dependencies + shell: bash + run: | + pip install --user --upgrade pip + pip --version + pip install --pre ${{ inputs.database-adapter-package }} + + - name: Setup dbt profile + shell: bash + run: | + mkdir -p $HOME/.dbt + cp ${{ github.action_path }}/sample.profiles.yml $HOME/.dbt/profiles.yml + + - name: Run dbt + shell: bash + run: | + cd ${{ inputs.dbt-project }} + dbt deps --target ${{ inputs.dbt-target }} + dbt build --target ${{ inputs.dbt-target }} --full-refresh diff --git a/.github/actions/end-to-end-test/sample.profiles.yml b/.github/actions/end-to-end-test/sample.profiles.yml new file mode 100644 index 00000000..f703717d --- /dev/null +++ b/.github/actions/end-to-end-test/sample.profiles.yml @@ -0,0 +1,48 @@ +# HEY! This file is used in the dbt_metrics integration tests with GHA. +# You should __NEVER__ check credentials into version control.
Thanks for reading :) + +config: + send_anonymous_usage_stats: False + use_colors: True + +dbt_metrics_integration_tests: + target: postgres + outputs: + postgres: + type: postgres + host: "{{ env_var('POSTGRES_TEST_HOST') }}" + user: "{{ env_var('POSTGRES_TEST_USER') }}" + pass: "{{ env_var('POSTGRES_TEST_PASSWORD') }}" + port: "{{ env_var('POSTGRES_TEST_PORT') | as_number }}" + dbname: "{{ env_var('POSTGRES_TEST_DB') }}" + schema: dbt_metrics_integration_tests + threads: 5 + + redshift: + type: redshift + host: "{{ env_var('REDSHIFT_TEST_HOST') }}" + user: "{{ env_var('REDSHIFT_TEST_USER') }}" + pass: "{{ env_var('REDSHIFT_TEST_PASS') }}" + dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" + port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" + schema: dbt_metrics_integration_tests + threads: 5 + + bigquery: + type: bigquery + method: service-account + keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" + project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" + schema: dbt_metrics_integration_tests + threads: 10 + + snowflake: + type: snowflake + account: "{{ env_var('SNOWFLAKE_TEST_ACCOUNT') }}" + user: "{{ env_var('SNOWFLAKE_TEST_USER') }}" + password: "{{ env_var('SNOWFLAKE_TEST_PASSWORD') }}" + role: "{{ env_var('SNOWFLAKE_TEST_ROLE') }}" + database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" + warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" + schema: dbt_metrics_integration_tests + threads: 10 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..43d3853e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,125 @@ +name: Continuous Integration + +on: + push: + branches: + - "main" + pull_request: + +jobs: + postgres: + runs-on: ubuntu-latest + + # set up env vars so that we can use them to start an instance of postgres + env: + POSTGRES_TEST_USER: postgres + POSTGRES_TEST_PASSWORD: postgres + POSTGRES_TEST_DB: gha_test + POSTGRES_TEST_PORT: 5432 + POSTGRES_TEST_HOST: localhost + + services: + postgres: + image: postgres + env: + POSTGRES_USER: ${{ env.POSTGRES_TEST_USER }} + POSTGRES_PASSWORD: ${{ env.POSTGRES_TEST_PASSWORD }} + POSTGRES_DB: ${{ env.POSTGRES_TEST_DB }} + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + steps: + - name: Check out the repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - uses: ./.github/actions/end-to-end-test + with: + dbt-target: postgres + database-adapter-package: dbt-postgres + + snowflake: + needs: postgres + runs-on: ubuntu-latest + steps: + - name: Check out the repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - uses: ./.github/actions/end-to-end-test + env: + SNOWFLAKE_TEST_ACCOUNT: ${{ secrets.SNOWFLAKE_TEST_ACCOUNT }} + SNOWFLAKE_TEST_USER: ${{ secrets.SNOWFLAKE_TEST_USER }} + SNOWFLAKE_TEST_PASSWORD: ${{ secrets.SNOWFLAKE_TEST_PASSWORD }} + SNOWFLAKE_TEST_ROLE: ${{ secrets.SNOWFLAKE_TEST_ROLE }} + SNOWFLAKE_TEST_DATABASE: ${{ secrets.SNOWFLAKE_TEST_DATABASE }} + SNOWFLAKE_TEST_WAREHOUSE: ${{ secrets.SNOWFLAKE_TEST_WAREHOUSE }} + with: + dbt-target: snowflake + database-adapter-package: dbt-snowflake + + redshift: + needs: postgres + runs-on: ubuntu-latest + steps: + - name: Check out the repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - uses: 
./.github/actions/end-to-end-test + env: + REDSHIFT_TEST_HOST: ${{ secrets.REDSHIFT_TEST_HOST }} + REDSHIFT_TEST_USER: ${{ secrets.REDSHIFT_TEST_USER }} + REDSHIFT_TEST_PASS: ${{ secrets.REDSHIFT_TEST_PASS }} + REDSHIFT_TEST_DBNAME: ${{ secrets.REDSHIFT_TEST_DBNAME }} + REDSHIFT_TEST_PORT: ${{ secrets.REDSHIFT_TEST_PORT }} + with: + dbt-target: redshift + database-adapter-package: dbt-redshift + + bigquery: + needs: postgres + runs-on: ubuntu-latest + steps: + - name: Check out the repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Set up service key file + id: keyfile + env: + BIGQUERY_TEST_SERVICE_ACCOUNT_JSON: ${{ secrets.BIGQUERY_TEST_SERVICE_ACCOUNT_JSON }} + run: | + mkdir -p $HOME/.dbt + KEYFILE_PATH=$HOME/.dbt/bigquery-service-key.json + echo $BIGQUERY_TEST_SERVICE_ACCOUNT_JSON > $KEYFILE_PATH + echo ::set-output name=path::$KEYFILE_PATH + + - uses: ./.github/actions/end-to-end-test + env: + BIGQUERY_SERVICE_KEY_PATH: ${{ steps.keyfile.outputs.path }} + BIGQUERY_TEST_DATABASE: ${{ secrets.BIGQUERY_TEST_DATABASE }} + with: + dbt-target: bigquery + database-adapter-package: dbt-bigquery diff --git a/.github/workflows/create-table-of-contents.yml b/.github/workflows/create-table-of-contents.yml new file mode 100644 index 00000000..aa672e0b --- /dev/null +++ b/.github/workflows/create-table-of-contents.yml @@ -0,0 +1,22 @@ +name: Update table of contents + +# Controls when the workflow will run +on: + push: + branches: '*' + paths: ['README.md'] + +jobs: + build: + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - uses: actions/checkout@v2 + - run: | + curl https://raw.githubusercontent.com/ekalinin/github-markdown-toc/master/gh-md-toc -o gh-md-toc + chmod a+x gh-md-toc + ./gh-md-toc --insert --no-backup README.md + rm ./gh-md-toc + - uses: stefanzweifel/git-auto-commit-action@v4 + with: + commit_message: Auto update table of contents diff --git a/.gitignore b/.gitignore index dad33a45..a7a41faf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ target/ dbt_modules/ +dbt_packages/ logs/ +.DS_Store diff --git a/README.md b/README.md index e5bc1494..a34a44de 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,161 @@ +# dbt_metrics + + +* [dbt_metrics](#dbt_metrics) +* [About](#about) + * [A note on refs](#warning-a-note-on-refs) +* [Usage](#usage) +* [Secondary calculations](#secondary-calculations) + * [Period over Period (source)](#period-over-period-source) + * [Period to Date (source)](#period-to-date-source) + * [Rolling (source)](#rolling-source) +* [Customisation](#customisation) + * [Calendar](#calendar) + * [Time Grains](#time-grains) + * [Custom aggregations](#custom-aggregations) + * [Secondary calculation column aliases](#secondary-calculation-column-aliases) +* [Experimental behaviour](#-experimental-behaviour) + * [Dimensions on calendar tables](#dimensions-on-calendar-tables) -This repo calculates metrics. + + + + +# About +This dbt package generates queries based on [metrics](https://docs.getdbt.com/docs/building-a-dbt-project/metrics), introduced to dbt Core in v1.0. + +## :warning: A note on `ref`s +To enable the dynamic referencing of models necessary for macro queries through the dbt Server, queries generated by this package do not participate in the DAG and `ref`'d nodes will not necessarily be built before they are accessed. Refer to the docs on [forcing dependencies](https://docs.getdbt.com/reference/dbt-jinja-functions/ref#forcing-dependencies) for more details. 
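+ +For example, a model that queries a metric can force the metric's underlying model to be built first with a dependency comment, as those docs describe (a minimal sketch; `fct_orders` is a placeholder for whatever model your metric is defined on): +```sql +-- depends on: {{ ref('fct_orders') }} + +select * +from {{ metrics.metric(metric_name='new_customers', grain='week') }} +```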
+ +# Usage +Access metrics [like any other macro](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros#using-a-macro-from-a-package): +```sql +select * +from {{ metrics.metric( + metric_name='new_customers', + grain='week', + dimensions=['plan', 'country'], + secondary_calculations=[ + metrics.period_over_period(comparison_strategy="ratio", interval=1, alias="pop_1wk"), + metrics.period_over_period(comparison_strategy="difference", interval=1), + + metrics.period_to_date(aggregate="average", period="month", alias="this_month_average"), + metrics.period_to_date(aggregate="sum", period="year"), + + metrics.rolling(aggregate="average", interval=4, alias="avg_past_4wks"), + metrics.rolling(aggregate="min", interval=4) + ] +) }} +``` + +# Secondary calculations +Secondary calculations are window functions which act on the primary metric. You can use them to compare a metric's value to an earlier period and calculate year-to-date sums or rolling averages. + +Calculations are passed in as a list of dictionaries, which can be created manually: +```python +[ + {"calculation": "period_over_period", "interval": 1, "comparison_strategy": "difference", "alias": "pop_1mth"}, + {"calculation": "rolling", "interval": 3, "aggregate": "sum"} +] +``` +or by using the convenience [constructor](https://en.wikipedia.org/wiki/Constructor_(object-oriented_programming)) macros. + +Column aliases are [automatically generated](#secondary-calculation-column-aliases), but you can override them by setting `alias`. + +## Period over Period ([source](/macros/secondary_calculations/secondary_calculation_period_over_period.sql)) + +Constructor: `period_over_period(comparison_strategy, interval [, alias])` + +- `comparison_strategy`: How to calculate the delta between the two periods. One of [`"ratio"`, `"difference"`]. Required +- `interval`: The number of periods to look back. Required +- `alias`: The column alias for the resulting calculation. Optional + +## Period to Date ([source](/macros/secondary_calculations/secondary_calculation_period_to_date.sql)) + +Constructor: `period_to_date(aggregate, period [, alias])` + +- `aggregate`: The aggregation to use in the window function. Options vary based on the primary aggregation and are enforced in [validate_aggregate_coherence()](/macros/secondary_calculations/validate_aggregate_coherence.sql). Required +- `period`: The time grain to aggregate to. One of [`"day"`, `"week"`, `"month"`, `"quarter"`, `"year"`]. Must be at the same or a coarser grain than the metric's grain (see [Time Grains](#time-grains) below). Required +- `alias`: The column alias for the resulting calculation. Optional + +## Rolling ([source](/macros/secondary_calculations/secondary_calculation_rolling.sql)) + +Constructor: `rolling(aggregate, interval [, alias])` + +- `aggregate`: The aggregation to use in the window function. Options vary based on the primary aggregation and are enforced in [validate_aggregate_coherence()](/macros/secondary_calculations/validate_aggregate_coherence.sql). Required +- `interval`: The number of periods to look back. Required +- `alias`: The column alias for the resulting calculation. Optional + + +# Customisation +Most behaviour in the package can be overridden or customised. + +## Calendar +The package comes with a [basic calendar table](/models/dbt_metrics_default_calendar.sql), running between 2010-01-01 and 2029-12-31 inclusive. You can replace it with any custom calendar table which meets the following requirements: +- Contains a `date_day` column. 
+- Contains the following columns: `date_week`, `date_month`, `date_quarter`, `date_year`, or equivalents. +- Additional date columns need to be prefixed with `date_`, e.g. `date_4_5_4_month` for a 4-5-4 retail calendar date set. Dimensions can have any name (see [dimensions on calendar tables](#dimensions-on-calendar-tables)). + +To do this, set the value of the `dbt_metrics_calendar_model` variable in your `dbt_project.yml` file: +```yaml +#dbt_project.yml +config-version: 2 +[...] +vars: + dbt_metrics_calendar_model: ref('my_custom_table') +``` + +## Time Grains +The package protects against nonsensical secondary calculations, such as a month-to-date aggregate of data which has been rolled up to the quarter. If you customise your calendar (for example by adding a [4-5-4 retail calendar](https://nrf.com/resources/4-5-4-calendar) month), you will need to override the [`get_grain_order()`](/macros/secondary_calculations/validate_grain_order.sql) macro. In that case, you might remove `month` and replace it with `month_4_5_4`. All date columns must be prefixed with `date_` in the table. Do not include the prefix when defining your metric; it will be added automatically. + +## Custom aggregations +To create a custom primary aggregation (as exposed through the `type` config of a metric), create a macro of the form `metric_my_aggregate(expression)`, then override the [`aggregate_primary_metric()`](/macros/aggregate_primary_metric.sql) macro to add it to the dispatch list. The package also protects against nonsensical secondary calculations such as an average of an average; you will need to override the [`get_metric_allowlist()`](/macros/secondary_calculations/validate_aggregate_coherence.sql) macro to both add your new aggregate to the existing aggregations' allowlists, and to make an allowlist for your new aggregation: +``` + {% do return ({ + "average": ['max', 'min'], + "count": ['max', 'min', 'average', 'my_new_aggregate'], + [...] + "my_new_aggregate": ['max', 'min', 'sum', 'average', 'my_new_aggregate'] + }) %} +``` + +To create a custom secondary aggregation (as exposed through the `secondary_calculations` parameter in the `metric` macro), create a macro of the form `secondary_calculation_my_calculation(metric_name, dimensions, calc_config)`, then override the [`perform_secondary_calculation()`](/macros/secondary_calculations/perform_secondary_calculation.sql) macro. + +## Secondary calculation column aliases +Aliases can be set for a secondary calculation. If no alias is provided, one will be automatically generated. To modify the existing alias logic, or add support for a custom secondary calculation, override [`generate_secondary_calculation_alias()`](/macros/secondary_calculations/generate_secondary_calculation_alias.sql). + +# 🧪 Experimental behaviour +:warning: This behaviour is subject to change in future versions of dbt Core and this package. + +## Dimensions on calendar tables +You may want to aggregate metrics by a dimension in your custom calendar table, for example `is_weekend`. _In addition to_ the primary `dimensions` list, add the following `meta` properties to your metric: +```yaml +version: 2 +metrics: + - name: new_customers + [...] 
+ + dimensions: + - plan + - country + + meta: + dimensions: + - type: model + columns: + - plan + - country + - type: calendar + columns: + - is_weekend +``` + +You can then access the additional dimensions as normal: +```sql +select * +from {{ metrics.metric( + metric_name='new_customers', + grain='week', + dimensions=['plan', 'country', 'is_weekend'], + secondary_calculations=[] +) }} +``` \ No newline at end of file diff --git a/dbt_project.yml b/dbt_project.yml index b90c7687..f1acd395 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -7,19 +7,19 @@ version: '1.0.0' config-version: 2 # This setting configures which "profile" dbt uses for this project. -profile: 'default' +profile: 'user' # These configurations specify where dbt should look for different types of files. # The `source-paths` config, for example, states that models in this project can be # found in the "models/" directory. You probably won't need to change these! model-paths: ["models"] -analysis-paths: ["analysis"] +analysis-paths: ["analyses"] test-paths: ["tests"] -seed-paths: ["data"] +seed-paths: ["seeds"] macro-paths: ["macros"] snapshot-paths: ["snapshots"] target-path: "target" # directory which will store compiled SQL files clean-targets: # directories to be removed by `dbt clean` - "target" - - "dbt_modules" + - "dbt_packages" diff --git a/integration_tests/.gitignore b/integration_tests/.gitignore new file mode 100644 index 00000000..49f147cb --- /dev/null +++ b/integration_tests/.gitignore @@ -0,0 +1,4 @@ + +target/ +dbt_packages/ +logs/ diff --git a/integration_tests/README.md b/integration_tests/README.md new file mode 100644 index 00000000..92d6523c --- /dev/null +++ b/integration_tests/README.md @@ -0,0 +1,16 @@ +Welcome to your new dbt project! + +### Using the starter project + +Try running the following commands: + +- dbt run +- dbt test + +### Resources: + +- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) +- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers +- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support +- Find [dbt events](https://events.getdbt.com) near you +- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices diff --git a/macros/.gitkeep b/integration_tests/analyses/.gitkeep similarity index 100% rename from macros/.gitkeep rename to integration_tests/analyses/.gitkeep diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml new file mode 100644 index 00000000..9849f6cd --- /dev/null +++ b/integration_tests/dbt_project.yml @@ -0,0 +1,22 @@ +# Name your project! Project names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: "dbt_metrics_integration_tests" +version: "1.0.0" +config-version: 2 + +# This setting configures which "profile" dbt uses for this project. 
+profile: "dbt_metrics_integration_tests" + +model-paths: ["models"] +analysis-paths: ["analyses"] +test-paths: ["tests"] +seed-paths: ["seeds"] +macro-paths: ["macros"] +snapshot-paths: ["snapshots"] + +target-path: "target" +clean-targets: + - "target" + - "dbt_packages" + - "logs" diff --git a/integration_tests/macros/.gitkeep b/integration_tests/macros/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/integration_tests/models/metric_definitions/slack_joiners.yml b/integration_tests/models/metric_definitions/slack_joiners.yml new file mode 100644 index 00000000..b4e1a057 --- /dev/null +++ b/integration_tests/models/metric_definitions/slack_joiners.yml @@ -0,0 +1,24 @@ +version: 2 +metrics: + - name: slack_joiners + label: Members of Community Slack + model: ref('seed_slack_users') + + type: count + sql: "*" + timestamp: joined_at + time_grains: [day, week, month] #does it make sense to have num weekly active members over a month? + + dimensions: + - is_active_past_quarter + - has_messaged + + meta: + dimensions: + - type: model + columns: + - is_active_past_quarter + - has_messaged + - type: calendar + columns: + - is_weekend \ No newline at end of file diff --git a/integration_tests/models/metric_transformation.sql b/integration_tests/models/metric_transformation.sql new file mode 100644 index 00000000..4a99ad5b --- /dev/null +++ b/integration_tests/models/metric_transformation.sql @@ -0,0 +1,28 @@ +select + + period, + has_messaged, + slack_joiners, + ytd_sum, + max_for_month, + pop_1mth, + ratio_to_1_month_ago, + trunc(avg_3mth, 3) as avg_3mth, -- different databases return this ratio differently + rolling_sum_3_month + +from +{{ metrics.metric( + metric_name="slack_joiners", + grain='month', + dimensions=['has_messaged'], + secondary_calculations=[ + {"calculation": "period_to_date", "aggregate": "sum", "period": "year", "alias": "ytd_sum"}, + {"calculation": "period_to_date", "aggregate": "max", "period": "month"}, + {"calculation": "period_over_period", "interval": 1, "comparison_strategy": "difference", "alias": "pop_1mth"}, + {"calculation": "period_over_period", "interval": 1, "comparison_strategy": "ratio"}, + {"calculation": "rolling", "interval": 3, "aggregate": "average", "alias": "avg_3mth"}, + {"calculation": "rolling", "interval": 3, "aggregate": "sum"}, + ]) +}} + +where period >= '2021-01-01' and period < '2021-05-01' \ No newline at end of file diff --git a/integration_tests/models/metric_transformation.yml b/integration_tests/models/metric_transformation.yml new file mode 100644 index 00000000..4f2ba2fa --- /dev/null +++ b/integration_tests/models/metric_transformation.yml @@ -0,0 +1,6 @@ +version: 2 +models: + - name: metric_transformation + tests: + - dbt_utils.equality: + compare_model: ref('slack_users_expected') diff --git a/integration_tests/packages.yml b/integration_tests/packages.yml new file mode 100644 index 00000000..4a6b9c19 --- /dev/null +++ b/integration_tests/packages.yml @@ -0,0 +1,2 @@ +packages: + - local: ../ diff --git a/integration_tests/seeds/.gitkeep b/integration_tests/seeds/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/integration_tests/seeds/expected/slack_users_expected.csv b/integration_tests/seeds/expected/slack_users_expected.csv new file mode 100644 index 00000000..7ebca2e8 --- /dev/null +++ b/integration_tests/seeds/expected/slack_users_expected.csv @@ -0,0 +1,9 @@ +period,has_messaged,slack_joiners,ytd_sum,max_for_month,pop_1mth,ratio_to_1_month_ago,avg_3mth,rolling_sum_3_month +2021-01-01,FALSE,0,0,0,0,,0.000,0 +2021-01-01,TRUE,1,1,1,1,,0.333,1 +2021-02-01,FALSE,0,0,0,0,,0.000,0 
+2021-02-01,TRUE,1,2,1,0,1,0.666,2 +2021-03-01,FALSE,0,0,0,0,,0.000,0 +2021-03-01,TRUE,0,2,0,-1,0,0.666,2 +2021-04-01,FALSE,2,2,2,2,,0.666,2 +2021-04-01,TRUE,0,2,0,0,,0.333,1 \ No newline at end of file diff --git a/integration_tests/seeds/source/seed_slack_users.csv b/integration_tests/seeds/source/seed_slack_users.csv new file mode 100644 index 00000000..1dc55313 --- /dev/null +++ b/integration_tests/seeds/source/seed_slack_users.csv @@ -0,0 +1,5 @@ +user_id,joined_at,is_active_past_quarter,has_messaged +1,2021-01-01 14:18:27,true,true +2,2021-02-03 17:18:55,false,true +3,2021-04-01 11:01:28,false,false +4,2021-04-08 22:43:09,false,false \ No newline at end of file diff --git a/integration_tests/snapshots/.gitkeep b/integration_tests/snapshots/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/integration_tests/tests/.gitkeep b/integration_tests/tests/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/macros/aggregate_primary_metric.sql b/macros/aggregate_primary_metric.sql new file mode 100644 index 00000000..dc08f522 --- /dev/null +++ b/macros/aggregate_primary_metric.sql @@ -0,0 +1,57 @@ + +--TODO: Do we have a list of aggregations that we're supporting on day one? +{% macro aggregate_primary_metric(aggregate, expression) %} + {{ return(adapter.dispatch('aggregate_primary_metric', 'metrics')(aggregate, expression)) }} +{% endmacro %} + +{% macro default__aggregate_primary_metric(aggregate, expression) %} + {% if aggregate == 'count' %} + {{ return(adapter.dispatch('metric_count', 'metrics')(expression)) }} + + {% elif aggregate == 'count_distinct' %} + {{ return(adapter.dispatch('metric_count_distinct', 'metrics')(expression)) }} + + {% elif aggregate == 'average' %} + {{ return(adapter.dispatch('metric_average', 'metrics')(expression)) }} + + {% elif aggregate == 'max' %} + {{ return(adapter.dispatch('metric_max', 'metrics')(expression)) }} + + {% elif aggregate == 'min' %} + {{ return(adapter.dispatch('metric_min', 'metrics')(expression)) }} + + {% elif aggregate == 'sum' %} + {{ return(adapter.dispatch('metric_sum', 'metrics')(expression)) }} + + {% else %} + {% do exceptions.raise_compiler_error("Unknown aggregation style: " ~ aggregate) %} + {% endif %} +{% endmacro %} + +{% macro default__metric_count(expression) %} + count({{ expression }}) +{% endmacro %} + +{% macro default__metric_count_distinct(expression) %} + count(distinct {{ expression }}) +{% endmacro %} + +{% macro default__metric_average(expression) %} + avg({{ expression }}) +{% endmacro %} + +{% macro redshift__metric_average(expression) %} + avg(cast({{ expression }} as float)) +{% endmacro %} + +{% macro default__metric_max(expression) %} + max({{ expression }}) +{% endmacro %} + +{% macro default__metric_min(expression) %} + min({{ expression }}) +{% endmacro %} + +{% macro default__metric_sum(expression) %} + sum({{ expression }}) +{% endmacro %} \ No newline at end of file diff --git a/macros/get_metric_relation.sql b/macros/get_metric_relation.sql new file mode 100644 index 00000000..cdf9a651 --- /dev/null +++ b/macros/get_metric_relation.sql @@ -0,0 +1,57 @@ +{% macro get_metric_relation(ref_name) %} + {% if execute %} + {% set model_ref_node = graph.nodes.values() | selectattr('name', 'equalto', ref_name[0]) | first %} + {% set relation = api.Relation.create( + database = model_ref_node.database, + schema = model_ref_node.schema, + identifier = model_ref_node.alias + ) + %} + {% do return(relation) %} + {% else %} + {% do return(api.Relation.create()) %} + {% endif %} +{% 
endmacro %} + +{% macro get_metric_calendar(ref_name) %} + /* + TODO: this is HORRID. + Short version: How do we properly handle refs[0] for the metric's model, and the ref() syntax for the calendar table? + */ + + /* + Long version: even though the metric yml file has its model as a full ref + + - name: slack_joiners + model: ref('dim_slack_users_2') + + the refs array from the graph contains just the string, inside a second array: + + { + "fqn":["joel_sandbox","metrics","slack_joiners"], + "unique_id":"metric.joel_sandbox.slack_joiners", + "time_grains":["day", "week", "month"], + "dimensions":["has_messaged"], + "resource_type":"metric", + "refs":[ + [ + "dim_slack_users_2" + ] + ], + "created_at":1642578505.5324879 + } + + + Whereas the calendar variable: + vars: + dbt_metrics_calendar_model: ref('all_days_extended_2') + + comes through as the entire ref string (it hasn't been parsed or processed yet). + This splits on the ' character, takes the second element, and wraps it inside an array, + to have the same shape as get_metric_relation expects, + which is written to expect the metric's `model`. + */ + + {% set split_ref_name = ref_name.split("'")[1] %} + {% do return(metrics.get_metric_relation([split_ref_name])) %} +{% endmacro %} \ No newline at end of file diff --git a/macros/get_metric_sql.sql b/macros/get_metric_sql.sql new file mode 100644 index 00000000..04032619 --- /dev/null +++ b/macros/get_metric_sql.sql @@ -0,0 +1,176 @@ +/* + Core metric query generation logic. + TODO: + - validate that the requested dim is actually an option (or fail at query execution instead of at compilation if they don't exist? is it a problem to expose columns that exist in the table but aren't "blessed" for the metric?) + - allow start/end dates on metrics. Maybe special-case "today"? + - allow passing in a seed with targets for a metric's value +*/ + + +{%- macro get_metric_sql(metric, grain, dimensions, secondary_calculations) %} +{%- if not execute %} + {%- do return("not execute") %} +{%- endif %} + +{%- if not metric %} + {%- do exceptions.raise_compiler_error("No metric provided") %} +{%- endif %} + +{%- if not grain %} + {%- do exceptions.raise_compiler_error("No date grain provided") %} +{%- endif %} + +{#-/* TODO: This refs[0][0] stuff is totally ick */#} +{%- set model = metrics.get_metric_relation(metric.refs[0] if execute else "") %} +{%- set calendar_tbl = metrics.get_metric_calendar(var('dbt_metrics_calendar_model', "ref('dbt_metrics_default_calendar')")) %} + +{#- /* TODO: Do I need to validate that the requested grain is defined on the metric? */ #} +{#- /* TODO: build a list of failures and return them all at once*/ #} +{%- for calc_config in secondary_calculations if calc_config.aggregate %} + {%- do metrics.validate_aggregate_coherence(metric.type, calc_config.aggregate) %} +{%- endfor %} + +{#- /* TODO: build a list of failures and return them all at once*/ #} +{%- for calc_config in secondary_calculations if calc_config.period %} + {%- do metrics.validate_grain_order(grain, calc_config.period) %} +{%- endfor %} + +{%- set relevant_periods = [] %} +{%- for calc_config in secondary_calculations if calc_config.period and calc_config.period not in relevant_periods %} + {%- set _ = relevant_periods.append(calc_config.period) %} +{%- endfor -%} + +with source_query as ( + + select + /* Always trunc to the day, then use dimensions on calendar table to achieve the _actual_ desired aggregates. 
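E.g. a metric queried at week grain is still joined to the calendar spine on date_day, and the rollup to weeks happens through the calendar's date_week column (selected as `period` below).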
*/ + /* Need to cast as a date otherwise we get values like 2021-01-01 and 2021-01-01T00:00:00+00:00 that don't join :( */ + cast({{ dbt_utils.date_trunc('day', 'cast(' ~ metric.timestamp ~ ' as date)') }} as date) as date_day, + + {% for dim in dimensions %} + {%- if metrics.is_dim_from_model(metric, dim) -%} + {{ dim }}, + {% endif -%} + + {%- endfor %} + + {#- /*When metric.sql is undefined or '*' for a count, + it's unnecessary to pull through the whole table */ #} + {%- if metric.sql and metric.sql | replace('*', '') | trim != '' -%} + {{ metric.sql }} as property_to_aggregate + {%- elif metric.type == 'count' -%} + 1 as property_to_aggregate /*a specific expression to aggregate wasn't provided, so this effectively creates count(*) */ + {%- else -%} + {%- do exceptions.raise_compiler_error("Expression to aggregate is required for non-count aggregation in metric `" ~ metric.name ~ "`") -%} + {%- endif %} + + from {{ model }} + where 1=1 + {%- for filter in metric.filters %} + and {{ filter.field }} {{ filter.operator }} {{ filter.value }} + {%- endfor %} +), + + spine__time as ( + select + /* this could be the same as date_day if grain is day. That's OK! + They're used for different things: date_day for joining to the spine, period for aggregating.*/ + date_{{ grain }} as period, + {% for period in relevant_periods %} + date_{{ period }}, + {% endfor %} + {% for dim in dimensions if not metrics.is_dim_from_model(metric, dim) %} + {{ dim }}, + {% endfor %} + date_day + + from {{ calendar_tbl }} + + ), + +{%- for dim in dimensions -%} + {%- if metrics.is_dim_from_model(metric, dim) %} + + spine__values__{{ dim }} as ( + + select distinct {{ dim }} + from source_query + + ), + {% endif -%} + + +{%- endfor %} + +spine as ( + + select * + from spine__time + {%- for dim in dimensions -%} + + {%- if metrics.is_dim_from_model(metric, dim) %} + cross join spine__values__{{ dim }} + {%- endif %} + {%- endfor %} + +), + +joined as ( + select + spine.period, + {% for period in relevant_periods %} + spine.date_{{ period }}, + {% endfor %} + {% for dim in dimensions %} + spine.{{ dim }}, + {% endfor %} + + -- has to be done down here to allow dimensions coming from the calendar table + {{- metrics.aggregate_primary_metric(metric.type, 'source_query.property_to_aggregate') }} as {{ metric.name }} + + from spine + left outer join source_query on source_query.date_day = spine.date_day + {% for dim in dimensions %} + {%- if metrics.is_dim_from_model(metric, dim) %} + and source_query.{{ dim }} = spine.{{ dim }} + {%- endif %} + {% endfor %} + + {#- /* Add 1 twice to account for 1) timeseries dim and 2) to be inclusive of the last dim */ #} + group by {{ range(1, (dimensions | length) + (relevant_periods | length) + 1 + 1) | join (", ") }} + + +), + +secondary_calculations as ( + + select * + + {% for calc_config in secondary_calculations -%} + + , {{ metrics.perform_secondary_calculation(metric.name, dimensions, calc_config) -}} as {{ metrics.generate_secondary_calculation_alias(calc_config, grain) }} + + {% endfor %} + + from joined + +), + +final as ( + select + period + {% for dim in dimensions %} + , {{ dim }} + {% endfor %} + , coalesce({{ metric.name }}, 0) as {{ metric.name }} + {% for calc_config in secondary_calculations %} + , {{ metrics.generate_secondary_calculation_alias(calc_config, grain) }} + {% endfor %} + + from secondary_calculations + order by {{ range(1, (dimensions | length) + 1 + 1) | join (", ") }} +) + +select * from final + +{% endmacro %} \ No newline at end of file diff --git 
a/macros/is_dim_from_model.sql b/macros/is_dim_from_model.sql new file mode 100644 index 00000000..3dfaf68d --- /dev/null +++ b/macros/is_dim_from_model.sql @@ -0,0 +1,14 @@ +{% macro is_dim_from_model(metric, dim_name) %} + {% if execute %} + -- For now, time dimensions have to be encoded in the meta tag. + -- If there's no meta config, then assume all dimensions belong to the main model. + {% if not metric['meta']['dimensions'] %} + {% do return(True) %} + {% endif %} + + {% set model_dims = (metric['meta']['dimensions'] | selectattr('type', '==', 'model') | first)['columns'] %} + {% do return (dim_name in model_dims) %} + {% else %} + {% do return (False) %} + {% endif %} +{% endmacro %} \ No newline at end of file diff --git a/macros/metric.sql b/macros/metric.sql new file mode 100644 index 00000000..4866cd9c --- /dev/null +++ b/macros/metric.sql @@ -0,0 +1,39 @@ +{% macro metric(metric_name, grain, dimensions=[], secondary_calculations=[]) -%} + -- Need this here, since the actual ref is nested within loops/conditions: + -- depends on: {{ ref('dbt_metrics_default_calendar') }} + + {%- if not execute %} + {%- do return("not execute") %} + {%- endif %} + + {%- set metric = metrics.get_metric(metric_name) %} + + {%- set sql = metrics.get_metric_sql( + metric=metric, + grain=grain, + dimensions=dimensions, + secondary_calculations=secondary_calculations + ) %} + ({{ sql }}) metric_subq +{%- endmacro %} + +{% macro get_metric(metric_name) %} + {% if not execute %} + {% do return(None) %} + {% else %} + {% set metric_info = namespace(metric_id=none) %} + {% for metric in graph.metrics.values() %} + {% if metric.name == metric_name %} + {% set metric_info.metric_id = metric.unique_id %} + {% endif %} + {% endfor %} + + {% if metric_info.metric_id is none %} + {% do exceptions.raise_compiler_error("Metric named '" ~ metric_name ~ "' not found") %} + {% endif %} + + + {% do return(graph.metrics[metric_info.metric_id]) %} + {% endif %} + +{% endmacro %} \ No newline at end of file diff --git a/macros/metrics.sql b/macros/metrics.sql deleted file mode 100644 index e2250dbd..00000000 --- a/macros/metrics.sql +++ /dev/null @@ -1,274 +0,0 @@ -/* - Core metric query generation logic. - TODO: - - Lots of stuff! - - account for adding timeseries calcs (eg. period-over-period change, Year-to-date, etc) - - add support for defining filters (either in metric definition or in user query) - - do something smarter about the date spine used below - - think harder about how we actually calculate aggregates... -*/ - -{% macro get_metric_sql(metric, grain, dims, calcs, metric_name='metric_value') %} - -with source_query as ( - - select - -- requested dimensions - - -- DEBUG: date_trunc(month, created_at)::date as month, - -- DEBUG: Don't hard-code the time dimension? hmm.... - -- DEBUG: Need to cast as a date otherwise we get values like 2021-01-01 and 2021-01-01T00:00:00+00:00 that don't join :( - date_trunc({{ grain }}, {{ metric.timestamp }})::date as period - - {% for dim in dims %} - , {{ dim }} - {% endfor %} - - -- aggregate - -- DEBUG: , count(*) as metric_value - -- TODO : Handle count distinct - , {{ metric.type }}( {{ metric.sql }} ) as metric_value - - from {{ ref(metric.model) }} - where 1=1 - -- via metric definition - -- DEBUG: and customers.plan != 'free' - - -- user-supplied. 
Filters that are not present in the - -- list of selected dimensions are applied at the source query - -- DEBUG: and not customers.is_training_account - -- DEBUG: Add 1 twice to account for 1) timeseries dim and 2) to be inclusive of the last dim - group by {{ range(1, (dims | length) + 1 + 1) | join (", ") }} - -), - --- DEBUG: This is a total hack - we'll need some sort of a days table... - spine__time as ( - - select distinct - date_trunc({{ grain }}, dateadd(day, '-' || seq4(), current_date())) as period - from table(generator(rowcount => 365)) - - ), - -{% for dim in dims %} - - spine__values__{{ dim }} as ( - - select distinct {{ dim }} - from source_query - - ), - -{% endfor %} - -spine as ( - - select * - from spine__time - {% for dim in dims %} - cross join spine__values__{{ dim }} - {% endfor %} - -), - -joined as ( - - select - spine.period - -- TODO : Exclude time grains that are finer grained - -- than the specified time grain for the metric - , date_trunc(day, spine.period) as period__day - , date_trunc(week, spine.period) as period__week - , date_trunc(month, spine.period) as period__month - , date_trunc(quarter, spine.period) as period__quarter - , date_trunc(year, spine.period) as period__year - - {% for dim in dims %} - , spine.{{ dim }} - {% endfor %} - , metric_value as {{ metric_name }} - -from spine -left join source_query on source_query.period = spine.period -{% for dim in dims %} - and source_query.{{ dim }} = spine.{{ dim }} -{% endfor %} - -), - -with_calcs as ( - - select * - - /* - TODO: - - Make sure denominators are all nonzero - - Make sure division happens with floats, not ints - - Wrap these expressions up in macros in this package - */ - - {% for calc in calcs %} - {% if calc.type == 'period_over_period' and calc.how == 'difference'%} - , coalesce({{ metric_name }}, 0) - coalesce( - lag({{ metric_name }}, {{ calc.lag }}) over ( - partition by {% for dim in dims -%} - {{ dim }} {% if not loop.last %} , {% endif %} - {%- endfor %} - order by period - ), 0 - ) as calc_{{ loop.index }} - - {% elif calc.type == 'period_over_period' and calc.how == 'ratio'%} - , coalesce({{ metric_name }}, 0) / nullif( - lag({{ metric_name }}, {{ calc.lag }}) over ( - partition by {% for dim in dims -%} - {{ dim }} {% if not loop.last %} , {% endif %} - {%- endfor %} - order by period - ), 0 - )::float as calc_{{ loop.index }} - - {% elif calc.type == 'period_over_period' %} - {% do exceptions.raise_compiler_error("Bad 'how' for period_over_period: " ~ calc.how) %} - - {% elif calc.type == 'rolling' %} - , {{ calc.aggregate }}({{ metric_name }}) over ( - partition by {% for dim in dims -%} - {{ dim }} {% if not loop.last %} , {% endif %} - {%- endfor %} - order by period - rows between {{ calc.window - 1 }} preceding and current row - ) as calc_{{ loop.index }} - - {% elif calc.type == 'period_to_date' %} - , {{ calc.aggregate }}({{ metric_name }}) over ( - partition by period__{{ calc.period }} - {% for dim in dims -%} - , {{ dim }} - {%- endfor %} - order by period - rows between unbounded preceding and current row - ) as calc_{{ loop.index }} - - {% endif %} - {% endfor %} - - from joined - -) - -select - period - {% for dim in dims %} - , {{ dim }} - {% endfor %} - , coalesce({{ metric_name }}, 0) as {{ metric_name }} - {% for calc in calcs %} - , calc_{{ loop.index }} - {% endfor %} - -from with_calcs -order by {{ range(1, (dims | length) + 1 + 1) | join (", ") }} - -{% endmacro %} - -/* -------------------------------------------------- */ - -/* - Small helper to look up 
a metric definition and call the macro - to get the resulting SQL query. Sort of a stub for how we'd want - to define / call metrics longer-term - TODO: - - Delete this? -*/ -{% macro metric(metric_name, by, grain) %} - {% set def = get_metric(metric_name) %} - {% set sql = get_metric_sql( - table = ref(def['table']), - aggregate = def['aggregate'], - expression = def['expression'], - datetime = def['datetime'], - - grain = grain, - dims = by - ) %} - {% do return(sql) %} -{% endmacro %} - - -/* - Small helper to return the metric subquery as a Selectable, ie. - a thing that can be selected from, rather than a select statement itself -*/ -{% macro query(metric_name, grain) %} - {% set def = get_metric(metric_name) %} - {% set sql = get_metric_sql( - table = ref(def['table']), - aggregate = def['aggregate'], - expression = def['expression'], - datetime = def['datetime'], - - grain = grain, - dims = varargs - ) %} - - {% set as_subquery %} - ( - {{ sql }} - ) - {% endset %} - - {% do return(as_subquery) %} -{% endmacro %} - -/* -------------------------------------------------- */ - -/* - For debugging, prints out a rendered query for a metric + params - TODO: - - Delete this? -*/ -{% macro debug_metric(metric_name, by, grain) %} - - {% set query = metric(metric_name, by, grain) %} - {% do log(query, info=True) %} - {% set res = run_query(query) %} - - {% do res.print_table() %} - -{% endmacro %} - -{% macro debug(metric_name) %} - {% set metric_info = namespace(metric_id=none) %} - {% for metric in graph.metrics.values() %} - {% if metric.name == metric_name %} - {% set metric_info.metric_id = metric.unique_id %} - {% endif %} - {% endfor %} - - {% if metric_info.metric_id is none %} - {% do exceptions.raise_compiler_error("Metric named '" ~ metric_name ~ "' not found") %} - {% endif %} - - - {% set metric = graph.metrics[metric_info.metric_id] %} - {% do log(metric, info=true) %} - {% set sql = get_metric_sql( - metric, - grain='day', - dims=['order_total_band'], - calcs=[ - {"type": "period_over_period", "lag": 1, "how": "difference"}, - {"type": "period_over_period", "lag": 1, "how": "ratio"}, - {"type": "rolling", "window": 3, "aggregate": "avg"}, - {"type": "rolling", "window": 3, "aggregate": "sum"}, - {"type": "period_to_date", "aggregate": "sum", "period": "year"}, - {"type": "period_to_date", "aggregate": "max", "period": "month"}, - ] - ) %} - {% set res = run_query('select * from (' ~ sql ~ ') order by 2,1') %} - {% do res.print_table(max_columns=none, max_rows=10) %} - -{% endmacro %} - diff --git a/macros/secondary_calculations/generate_secondary_calculation_alias.sql b/macros/secondary_calculations/generate_secondary_calculation_alias.sql new file mode 100644 index 00000000..30780d3a --- /dev/null +++ b/macros/secondary_calculations/generate_secondary_calculation_alias.sql @@ -0,0 +1,27 @@ +{% macro generate_secondary_calculation_alias(calc_config, grain) %} + + {{ return(adapter.dispatch('generate_secondary_calculation_alias', 'metrics')(calc_config, grain)) }} + +{% endmacro %} + +{% macro default__generate_secondary_calculation_alias(calc_config, grain) %} + {% if calc_config.alias %} + {% do return(calc_config.alias) %} + {% endif %} + + {%- set calc_type = calc_config.calculation %} + {%- if calc_type == 'period_over_period' %} + {%- do return(calc_config.comparison_strategy ~ "_to_" ~ calc_config.interval ~ "_" ~ grain ~ "_ago") %} + + {%- elif calc_type == 'rolling' %} + {%- do return("rolling_" ~ calc_config.aggregate ~ "_" ~ calc_config.interval ~ "_" ~ grain) %} + + 
{%- elif calc_type == 'period_to_date' %} + {%- do return(calc_config.aggregate ~ "_for_" ~ calc_config.period) %} + + {%- else %} + {%- do exceptions.raise_compiler_error("Can't generate alias for unknown secondary calculation: " ~ calc_type ~ ". calc_config: " ~ calc_config) %} + {%- endif %} + +{% endmacro %} \ No newline at end of file diff --git a/macros/secondary_calculations/perform_secondary_calculation.sql b/macros/secondary_calculations/perform_secondary_calculation.sql new file mode 100644 index 00000000..661b4cdf --- /dev/null +++ b/macros/secondary_calculations/perform_secondary_calculation.sql @@ -0,0 +1,26 @@ +{% macro perform_secondary_calculation(metric_name, dimensions, calc_config) %} + + {{ return(adapter.dispatch('perform_secondary_calculation', 'metrics')(metric_name, dimensions, calc_config)) }} + +{% endmacro %} + +{% macro default__perform_secondary_calculation(metric_name, dimensions, calc_config) %} + {%- set calc_type = calc_config.calculation %} + {%- set calc_sql = '' %} + + {%- if calc_type == 'period_over_period' %} + {%- set calc_sql = adapter.dispatch('secondary_calculation_period_over_period', 'metrics')(metric_name, dimensions, calc_config) %} + + {%- elif calc_type == 'rolling' %} + {%- set calc_sql = adapter.dispatch('secondary_calculation_rolling', 'metrics')(metric_name, dimensions, calc_config) %} + + {%- elif calc_type == 'period_to_date' %} + {%- set calc_sql = adapter.dispatch('secondary_calculation_period_to_date', 'metrics')(metric_name, dimensions, calc_config) %} + + {%- else %} + {%- do exceptions.raise_compiler_error("Unknown secondary calculation: " ~ calc_type ~ ". calc_config: " ~ calc_config) %} + {%- endif %} + + {{- calc_sql }} + +{% endmacro %} \ No newline at end of file diff --git a/macros/secondary_calculations/secondary_calculation_period_over_period.sql b/macros/secondary_calculations/secondary_calculation_period_over_period.sql new file mode 100644 index 00000000..e7d91e17 --- /dev/null +++ b/macros/secondary_calculations/secondary_calculation_period_over_period.sql @@ -0,0 +1,54 @@ +{% macro default__secondary_calculation_period_over_period(metric_name, dimensions, calc_config) %} + {% set calc_sql %} + lag( + {{- metric_name }}, {{ calc_config.interval -}} + ) over ( + {% if dimensions -%} + partition by {{ dimensions | join(", ") }} + {% endif -%} + order by period + ) + {% endset %} + + + {% if calc_config.comparison_strategy == 'difference' %} + {% do return (adapter.dispatch('metric_comparison_strategy_difference', 'metrics')(metric_name, calc_sql)) %} + + {% elif calc_config.comparison_strategy == 'ratio' %} + {% do return (adapter.dispatch('metric_comparison_strategy_ratio', 'metrics')(metric_name, calc_sql)) %} + + {% else %} + {% do exceptions.raise_compiler_error("Bad comparison_strategy for period_over_period: " ~ calc_config.comparison_strategy) %} + {% endif %} + +{% endmacro %} + +{% macro default__metric_comparison_strategy_difference(metric_name, calc_sql) %} + coalesce({{ metric_name }}, 0) - coalesce({{ calc_sql }}, 0) +{% endmacro %} + +{% macro default__metric_comparison_strategy_ratio(metric_name, calc_sql) %} + cast(coalesce({{ metric_name }}, 0) / nullif({{ calc_sql }}, 0) as {{ dbt_utils.type_float() }}) +{% endmacro %} + +{% macro period_over_period(comparison_strategy, interval, alias) %} + + {% set missing_args = [] %} + {% if not comparison_strategy %} + {% set _ = missing_args.append("comparison_strategy") %} + {% endif %} + {% if not interval %} + {% set _ = 
missing_args.append("interval") %} + {% endif %} + {% if missing_args | length > 0 %} + {% do exceptions.raise_compiler_error( missing_args | join(", ") ~ ' not provided to period_over_period') %} + {% endif %} + + {% do return ({ + "calculation": "period_over_period", + "comparison_strategy": comparison_strategy, + "interval": interval, + "alias": alias + }) + %} +{% endmacro %} \ No newline at end of file diff --git a/macros/secondary_calculations/secondary_calculation_period_to_date.sql b/macros/secondary_calculations/secondary_calculation_period_to_date.sql new file mode 100644 index 00000000..1817d121 --- /dev/null +++ b/macros/secondary_calculations/secondary_calculation_period_to_date.sql @@ -0,0 +1,38 @@ +{% macro default__secondary_calculation_period_to_date(metric_name, dimensions, calc_config) %} + {%- set calc_sql %} + {{- adapter.dispatch('aggregate_primary_metric', 'metrics')(calc_config.aggregate, metric_name) -}} + over ( + partition by date_{{ calc_config.period }} + {% if dimensions -%} + , {{ dimensions | join(", ") }} + {%- endif %} + order by period + rows between unbounded preceding and current row + ) + {%- endset %} + + {%- do return (calc_sql) %} + +{% endmacro %} + +{% macro period_to_date(aggregate, period, alias) %} + + {% set missing_args = [] %} + {% if not aggregate %} + {% set _ = missing_args.append("aggregate") %} + {% endif %} + {% if not period %} + {% set _ = missing_args.append("period") %} + {% endif %} + {% if missing_args | length > 0 %} + {% do exceptions.raise_compiler_error( missing_args | join(", ") ~ ' not provided to period_to_date') %} + {% endif %} + + {% do return ({ + "calculation": "period_to_date", + "aggregate": aggregate, + "period": period, + "alias": alias + }) + %} +{% endmacro %} \ No newline at end of file diff --git a/macros/secondary_calculations/secondary_calculation_rolling.sql b/macros/secondary_calculations/secondary_calculation_rolling.sql new file mode 100644 index 00000000..8b37a0ba --- /dev/null +++ b/macros/secondary_calculations/secondary_calculation_rolling.sql @@ -0,0 +1,37 @@ +{% macro default__secondary_calculation_rolling(metric_name, dimensions, calc_config) %} + {% set calc_sql %} + {{ adapter.dispatch('aggregate_primary_metric', 'metrics')(calc_config.aggregate, metric_name) }} + over ( + {% if dimensions -%} + partition by {{ dimensions | join(", ") }} + {% endif -%} + order by period + rows between {{ calc_config.interval - 1 }} preceding and current row + ) + {% endset %} + + {% do return (calc_sql) %} + +{% endmacro %} + +{% macro rolling(aggregate, interval, alias) %} + + {% set missing_args = [] %} + {% if not aggregate %} + {% set _ = missing_args.append("aggregate") %} + {% endif %} + {% if not interval %} + {% set _ = missing_args.append("interval") %} + {% endif %} + {% if missing_args | length > 0 %} + {% do exceptions.raise_compiler_error( missing_args | join(", ") ~ ' not provided to rolling') %} + {% endif %} + + {% do return ({ + "calculation": "rolling", + "aggregate": aggregate, + "interval": interval, + "alias": alias + }) + %} +{% endmacro %} \ No newline at end of file diff --git a/macros/secondary_calculations/validate_aggregate_coherence.sql b/macros/secondary_calculations/validate_aggregate_coherence.sql new file mode 100644 index 00000000..5c5e1eb3 --- /dev/null +++ b/macros/secondary_calculations/validate_aggregate_coherence.sql @@ -0,0 +1,23 @@ +{% macro validate_aggregate_coherence(metric_aggregate, calculation_aggregate) %} + {% set allowlist
= metrics.get_metric_allowlist()[metric_aggregate] %} + + {% if (calculation_aggregate not in allowlist) %} + {% do exceptions.raise_compiler_error("Can't calculate secondary aggregate " ~ calculation_aggregate ~ " when metric's aggregation is " ~ metric_aggregate ~ ". Allowed options are " ~ allowlist ~ ".") %} + {% endif %} +{% endmacro %} + +{% macro get_metric_allowlist() %} + {{ return(adapter.dispatch('get_metric_allowlist', 'metrics')()) }} +{% endmacro %} + +{% macro default__get_metric_allowlist() %} + {# Keys are the primary aggregation, values are the permitted aggregations to run in secondary calculations. #} + {% do return ({ + "average": ['min', 'max'], + "count": ['min', 'max', 'sum', 'average'], + "count_distinct": ['min', 'max', 'sum', 'average'], + "sum": ['min', 'max', 'sum', 'average'], + "max": ['min', 'max', 'sum', 'average'], + "min": ['min', 'max', 'sum', 'average'], + }) %} +{% endmacro %} \ No newline at end of file diff --git a/macros/secondary_calculations/validate_grain_order.sql b/macros/secondary_calculations/validate_grain_order.sql new file mode 100644 index 00000000..a7da9f86 --- /dev/null +++ b/macros/secondary_calculations/validate_grain_order.sql @@ -0,0 +1,23 @@ +{% macro validate_grain_order(metric_grain, calculation_grain) %} + {% set grains = metrics.get_grain_order() %} + + {% if metric_grain not in grains or calculation_grain not in grains %} + {% set comma = joiner(", ") %} + {% do exceptions.raise_compiler_error("Unknown grains: [" ~ (comma() ~ metric_grain if metric_grain not in grains) ~ (comma() ~ calculation_grain if calculation_grain not in grains) ~ "]") %} + {% endif %} + + {% set metric_grain_index = grains.index(metric_grain) %} + {% set calculation_grain_index = grains.index(calculation_grain) %} + + {% if (calculation_grain_index < metric_grain_index) %} + {% do exceptions.raise_compiler_error("Can't calculate secondary metric at " ~ calculation_grain ~"-level when metric is at " ~ metric_grain ~ "-level") %} + {% endif %} +{% endmacro %} + +{% macro get_grain_order() %} + {{ return(adapter.dispatch('get_grain_order', 'metrics')()) }} +{% endmacro %} + +{% macro default__get_grain_order() %} + {% do return (['day', 'week', 'month', 'quarter', 'year']) %} +{% endmacro %} \ No newline at end of file diff --git a/models/dbt_metrics_default_calendar.sql b/models/dbt_metrics_default_calendar.sql new file mode 100644 index 00000000..152a7619 --- /dev/null +++ b/models/dbt_metrics_default_calendar.sql @@ -0,0 +1,23 @@ +{{ config(materialized='table') }} + +--TODO: Don't want to depend on utils long term. 
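+-- The spine below runs from 2010-01-01 through 2029-12-31 (date_spine's end_date is exclusive). +-- Override it by setting the dbt_metrics_calendar_model var to your own calendar model.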
+with days as ( + {{ dbt_utils.date_spine( + datepart="day", + start_date="cast('2010-01-01' as date)", + end_date="cast('2030-01-01' as date)" + ) + }} +), + +final as ( + select + cast(date_day as date) as date_day, + cast({{ dbt_utils.date_trunc('week', 'date_day') }} as date) as date_week, + cast({{ dbt_utils.date_trunc('month', 'date_day') }} as date) as date_month, + cast({{ dbt_utils.date_trunc('quarter', 'date_day') }} as date) as date_quarter, + cast({{ dbt_utils.date_trunc('year', 'date_day') }} as date) as date_year + from days +) + +select * from final diff --git a/models/fct_orders.sql b/models/fct_orders.sql deleted file mode 100644 index a7d17c6e..00000000 --- a/models/fct_orders.sql +++ /dev/null @@ -1,8 +0,0 @@ - -select *, - case - when order_total < 50 then 'small' - else 'large' - end as order_total_band - -from PARTNER_MODE.RAW.ORDERS diff --git a/models/schema.yml b/models/schema.yml deleted file mode 100644 index b960ea19..00000000 --- a/models/schema.yml +++ /dev/null @@ -1,61 +0,0 @@ - -version: 2 - -metrics: - - model: fct_orders - name: revenue - description: "Order revenue" - label: Revenue - type: sum - sql: order_total - timestamp: created_at - dimensions: - - company - - order_total_band - time_grains: - - day - - week - - month - - - model: fct_orders - name: average_order_value - description: "Average order Value" - label: Average Order Value - type: avg - sql: order_total - timestamp: created_at - dimensions: - - company - - order_total_band - time_grains: - - day - - week - - month - - - model: fct_orders - name: orders - description: "Total number of orders" - label: Total orders - type: count - timestamp: created_at - dimensions: - - company - - order_total_band - time_grains: - - day - - week - - month - - - model: fct_orders - name: customers - description: "Total distinct customers" - label: Total customers - type: count_distinct - sql: email - timestamp: created_at - dimensions: - - company - time_grains: - - day - - week - - month diff --git a/packages.yml b/packages.yml new file mode 100644 index 00000000..538aeec5 --- /dev/null +++ b/packages.yml @@ -0,0 +1,3 @@ +packages: + - package: dbt-labs/dbt_utils + version: [">=0.8.0", "<0.9.0"]