From be1a703cdbf369b975c596f5af221d48f1a908ec Mon Sep 17 00:00:00 2001
From: Mike Alfare <13974384+mikealfare@users.noreply.github.com>
Date: Fri, 18 Oct 2024 21:21:22 -0400
Subject: [PATCH] Add internal publish workflow (#745)
---
.github/workflows/integration-tests.yml | 2 +-
.github/workflows/publish-internal.yml | 94 +++
.github/workflows/publish-pypi.yml | 14 +
.github/workflows/publish.yml | 24 +-
dbt-athena-community/README.md | 882 ++++++++++++++++++++++++
dbt-athena-community/pyproject.toml | 6 +-
dbt-athena/README.md | 882 ++++++++++++++++++++++++
dbt-athena/pyproject.toml | 6 +-
8 files changed, 1900 insertions(+), 10 deletions(-)
create mode 100644 .github/workflows/publish-internal.yml
create mode 100644 dbt-athena-community/README.md
create mode 100644 dbt-athena/README.md
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 04c91abf..33ab6073 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -56,7 +56,7 @@ jobs:
with:
python-version: ${{ vars.DEFAULT_PYTHON_VERSION }}
- uses: pypa/hatch@install
- - uses: aws-actions/configure-aws-credentials@v2
+ - uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ secrets.ASSUMABLE_ROLE_NAME }}
aws-region: ${{ vars.DBT_TEST_ATHENA_REGION_NAME }}
diff --git a/.github/workflows/publish-internal.yml b/.github/workflows/publish-internal.yml
new file mode 100644
index 00000000..6df86dce
--- /dev/null
+++ b/.github/workflows/publish-internal.yml
@@ -0,0 +1,94 @@
+name: "Publish Internally"
+
+on:
+ workflow_call:
+ inputs:
+ package:
+ description: "Choose the package to publish"
+ type: string
+ default: "dbt-athena"
+ deploy-to:
+ description: "Choose whether to publish to test or prod"
+ type: string
+ default: "prod"
+ branch:
+ description: "Choose the branch to publish"
+ type: string
+ default: "main"
+ workflow_dispatch:
+ inputs:
+ package:
+ description: "Choose the package to publish"
+ type: string
+ default: "dbt-athena"
+ deploy-to:
+ description: "Choose whether to publish to test or prod"
+ type: string
+ default: "test"
+ branch:
+ description: "Choose the branch to publish"
+ type: string
+ default: "main"
+
+defaults:
+ run:
+ shell: bash
+
+jobs:
+ publish:
+ runs-on: ubuntu-latest
+ environment:
+ name: ${{ inputs.deploy-to }}
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ ref: ${{ inputs.branch }}
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ vars.DEFAULT_PYTHON_VERSION }}
+ - uses: pypa/hatch@install
+ - uses: aws-actions/configure-aws-credentials@v4
+ with:
+ aws-region: ${{ vars.AWS_REGION }}
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ - id: package
+ run: echo "version=$(hatch version)" >> $GITHUB_OUTPUT
+ working-directory: ./${{ inputs.package }}
+ - id: published
+ run: |
+ versions_published="$(aws codeartifact list-package-versions \
+ --domain ${{ vars.AWS_DOMAIN }} \
+ --repository ${{ vars.AWS_REPOSITORY }} \
+ --format pypi \
+ --package ${{ inputs.package }} \
+ --output json \
+ --query 'versions[*].version' | jq -r '.[]' | grep "^${{ steps.package.outputs.version }}" || true )" # suppress pipefail only here
+ echo "versions=$(echo "${versions_published[*]}"| tr '\n' ',')" >> $GITHUB_OUTPUT
+ - id: next
+ uses: dbt-labs/dbt-release/.github/actions/next-cloud-release-version@main
+ with:
+ version_number: ${{ steps.package.outputs.version }}
+ versions_published: ${{ steps.published.outputs.versions }}
+ - run: |
+ hatch version ${{ steps.next.outputs.internal_release_version }}+$(git rev-parse HEAD)
+ hatch build --clean
+ hatch run build:check-all
+ working-directory: ./${{ inputs.package }}
+ - run: |
+ export HATCH_INDEX_USER=${{ secrets.AWS_USER }}
+
+ export HATCH_INDEX_AUTH=$(aws codeartifact get-authorization-token \
+ --domain ${{ vars.AWS_DOMAIN }} \
+ --output text \
+ --query authorizationToken)
+
+ export HATCH_INDEX_REPO=$(aws codeartifact get-repository-endpoint \
+ --domain ${{ vars.AWS_DOMAIN }} \
+ --repository ${{ vars.AWS_REPOSITORY }} \
+ --format pypi \
+ --output text \
+ --query repositoryEndpoint)
+
+ hatch publish
+ working-directory: ./${{ inputs.package }}
diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml
index 35f7427f..379d8b90 100644
--- a/.github/workflows/publish-pypi.yml
+++ b/.github/workflows/publish-pypi.yml
@@ -15,6 +15,20 @@ on:
description: "Choose the branch to publish"
type: string
default: "main"
+ workflow_dispatch:
+ inputs:
+ package:
+ description: "Choose the package to publish"
+ type: string
+ default: "dbt-athena"
+ deploy-to:
+ description: "Choose whether to publish to test or prod"
+ type: string
+ default: "test"
+ branch:
+ description: "Choose the branch to publish"
+ type: string
+ default: "main"
permissions:
contents: read
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 82597c7d..c5398d9d 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,6 +11,14 @@ on:
description: "Choose the branch to release from"
type: string
default: "main"
+ pypi-internal:
+ description: "Publish Internally"
+ type: boolean
+ default: true
+ pypi-public:
+ description: "Publish to PyPI"
+ type: boolean
+ default: false
# don't attempt to release the same target in parallel
concurrency:
@@ -30,17 +38,27 @@ jobs:
repository: ${{ github.repository }}
secrets: inherit
- publish-dbt-athena:
+ publish-internal:
+ if: ${{ inputs.pypi-internal == true }}
+ needs: [unit-tests, integration-tests]
+ uses: ./.github/workflows/publish-internal.yml
+ with:
+ deploy-to: ${{ inputs.deploy-to }}
+ branch: ${{ inputs.branch }}
+
+ publish-pypi:
+ if: ${{ inputs.pypi-public == true }}
needs: [unit-tests, integration-tests]
uses: ./.github/workflows/publish-pypi.yml
with:
deploy-to: ${{ inputs.deploy-to }}
branch: ${{ inputs.branch }}
- publish-dbt-athena-community:
+ publish-pypi-dbt-athena-community:
+ if: ${{ inputs.pypi-public == true }}
# dbt-athena-community is hard pinned to dbt-athena to ensure they are the same
# this means we need to finish publishing dbt-athena before starting to build dbt-athena-community
- needs: publish-dbt-athena
+ needs: publish-pypi
uses: ./.github/workflows/publish-pypi.yml
with:
package: "dbt-athena-community"
diff --git a/dbt-athena-community/README.md b/dbt-athena-community/README.md
new file mode 100644
index 00000000..c5487c05
--- /dev/null
+++ b/dbt-athena-community/README.md
@@ -0,0 +1,882 @@
+
+- [Features](#features)
+ - [Quick start](#quick-start)
+ - [Installation](#installation)
+ - [Prerequisites](#prerequisites)
+ - [Credentials](#credentials)
+ - [Configuring your profile](#configuring-your-profile)
+ - [Additional information](#additional-information)
+ - [Models](#models)
+ - [Table configuration](#table-configuration)
+ - [Table location](#table-location)
+ - [Incremental models](#incremental-models)
+ - [On schema change](#on-schema-change)
+ - [Iceberg](#iceberg)
+ - [Highly available table (HA)](#highly-available-table-ha)
+ - [HA known issues](#ha-known-issues)
+ - [Update glue data catalog](#update-glue-data-catalog)
+ - [Snapshots](#snapshots)
+ - [Timestamp strategy](#timestamp-strategy)
+ - [Check strategy](#check-strategy)
+ - [Hard-deletes](#hard-deletes)
+ - [Working example](#working-example)
+ - [Snapshots known issues](#snapshots-known-issues)
+ - [AWS Lake Formation integration](#aws-lake-formation-integration)
+ - [Python models](#python-models)
+ - [Contracts](#contracts)
+ - [Contributing](#contributing)
+ - [Contributors ✨](#contributors-)
+
+
+# Features
+
+- Supports dbt version `1.7.*`
+- Support for Python
+- Supports [seeds][seeds]
+- Correctly detects views and their columns
+- Supports [table materialization][table]
+ - [Iceberg tables][athena-iceberg] are supported **only with Athena Engine v3** and **a unique table location**
+ (see table location section below)
+ - Hive tables are supported by both Athena engines
+- Supports [incremental models][incremental]
+ - On Iceberg tables:
+ - Supports the use of `unique_key` only with the `merge` strategy
+ - Supports the `append` strategy
+ - On Hive tables:
+ - Supports two incremental update strategies: `insert_overwrite` and `append`
+ - Does **not** support the use of `unique_key`
+- Supports [snapshots][snapshots]
+- Supports [Python models][python-models]
+
+[seeds]: https://docs.getdbt.com/docs/building-a-dbt-project/seeds
+
+[incremental]: https://docs.getdbt.com/docs/build/incremental-models
+
+[table]: https://docs.getdbt.com/docs/build/materializations#table
+
+[python-models]: https://docs.getdbt.com/docs/build/python-models#configuring-python-models
+
+[athena-iceberg]: https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html
+
+[snapshots]: https://docs.getdbt.com/docs/build/snapshots
+
+## Quick start
+
+### Installation
+
+- `pip install dbt-athena-community`
+- Or `pip install git+https://github.com/dbt-athena/dbt-athena.git`
+
+### Prerequisites
+
+To start, you will need an S3 bucket, for instance `my-bucket`, and an Athena database:
+
+```sql
+CREATE DATABASE IF NOT EXISTS analytics_dev
+COMMENT 'Analytics models generated by dbt (development)'
+LOCATION 's3://my-bucket/'
+WITH DBPROPERTIES ('creator'='Foo Bar', 'email'='foo@bar.com');
+```
+
+Notes:
+
+- Take note of your AWS region code (e.g. `us-west-2` or `eu-west-2`, etc.).
+- You can also use [AWS Glue](https://docs.aws.amazon.com/athena/latest/ug/glue-athena.html) to create and manage Athena
+ databases.
+
+### Credentials
+
+Credentials can be passed directly to the adapter, or they can
+be [determined automatically](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) based
+on `aws cli`/`boto3` conventions.
+You can either:
+
+- Configure `aws_access_key_id` and `aws_secret_access_key`
+- Configure `aws_profile_name` to match a profile defined in your AWS credentials file.
+  Check out the dbt profile configuration below for details.
+
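+For example, a minimal `~/.aws/credentials` profile that `aws_profile_name: my-profile` could reference (the key values are the AWS documentation placeholders used elsewhere in this README):
+
+```ini
+[my-profile]
+aws_access_key_id = AKIAIOSFODNN7EXAMPLE
+aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
+```
+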
+### Configuring your profile
+
+A dbt profile can be configured to run against AWS Athena using the following configuration:
+
+| Option | Description | Required? | Example |
+|-----------------------|------------------------------------------------------------------------------------------|-----------|--------------------------------------------|
+| s3_staging_dir | S3 location to store Athena query results and metadata | Required | `s3://bucket/dbt/` |
+| s3_data_dir | Prefix for storing tables, if different from the connection's `s3_staging_dir` | Optional | `s3://bucket2/dbt/` |
+| s3_data_naming | How to generate table paths in `s3_data_dir` | Optional | `schema_table_unique` |
+| s3_tmp_table_dir | Prefix for storing temporary tables, if different from the connection's `s3_data_dir` | Optional | `s3://bucket3/dbt/` |
+| region_name | AWS region of your Athena instance | Required | `eu-west-1` |
+| schema | Specify the schema (Athena database) to build models into (lowercase **only**) | Required | `dbt` |
+| database | Specify the database (Data catalog) to build models into (lowercase **only**) | Required | `awsdatacatalog` |
+| poll_interval | Interval in seconds to use for polling the status of query results in Athena | Optional | `5` |
+| debug_query_state | Flag if debug message with Athena query state is needed | Optional | `false` |
+| aws_access_key_id | Access key ID of the user performing requests | Optional | `AKIAIOSFODNN7EXAMPLE` |
+| aws_secret_access_key | Secret access key of the user performing requests | Optional | `wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY` |
+| aws_profile_name | Profile to use from your AWS shared credentials file | Optional | `my-profile` |
+| work_group | Identifier of Athena workgroup | Optional | `my-custom-workgroup` |
+| skip_workgroup_check | Indicates if the WorkGroup check (additional AWS call) can be skipped | Optional | `true` |
+| num_retries | Number of times to retry a failing query | Optional | `3` |
+| num_boto3_retries | Number of times to retry boto3 requests (e.g. deleting S3 files for materialized tables) | Optional | `5` |
+| num_iceberg_retries | Number of times to retry iceberg commit queries to fix ICEBERG_COMMIT_ERROR | Optional | `3` |
+| spark_work_group | Identifier of Athena Spark workgroup for running Python models | Optional | `my-spark-workgroup` |
+| seed_s3_upload_args | Dictionary containing boto3 ExtraArgs when uploading to S3 | Optional | `{"ACL": "bucket-owner-full-control"}` |
+| lf_tags_database | Default LF tags for new database if it's created by dbt | Optional | `tag_key: tag_value` |
+
+**Example profiles.yml entry:**
+
+```yaml
+athena:
+ target: dev
+ outputs:
+ dev:
+ type: athena
+ s3_staging_dir: s3://athena-query-results/dbt/
+ s3_data_dir: s3://your_s3_bucket/dbt/
+ s3_data_naming: schema_table
+ s3_tmp_table_dir: s3://your_s3_bucket/temp/
+ region_name: eu-west-1
+ schema: dbt
+ database: awsdatacatalog
+ threads: 4
+ aws_profile_name: my-profile
+ work_group: my-workgroup
+ spark_work_group: my-spark-workgroup
+ seed_s3_upload_args:
+ ACL: bucket-owner-full-control
+```
+
+### Additional information
+
+- `threads` is supported
+- `database` and `catalog` can be used interchangeably
+
+## Models
+
+### Table configuration
+
+- `external_location` (`default=none`)
+ - If set, the full S3 path to which the table will be saved
+ - Works only with incremental models
+ - Does not work with Hive tables with `ha` set to true
+- `partitioned_by` (`default=none`)
+ - An array list of columns by which the table will be partitioned
+ - Limited to creation of 100 partitions (*currently*)
+- `bucketed_by` (`default=none`)
+ - An array list of columns to bucket data, ignored if using Iceberg
+- `bucket_count` (`default=none`)
+ - The number of buckets for bucketing your data, ignored if using Iceberg
+- `table_type` (`default='hive'`)
+ - The type of table
+ - Supports `hive` or `iceberg`
+- `ha` (`default=false`)
+ - If the table should be built using the high-availability method. This option is only available for Hive tables,
+   since it is the default for Iceberg tables (see the section [below](#highly-available-table-ha))
+- `format` (`default='parquet'`)
+ - The data format for the table
+ - Supports `ORC`, `PARQUET`, `AVRO`, `JSON`, `TEXTFILE`
+- `write_compression` (`default=none`)
+ - The compression type to use for any storage format that allows compression to be specified. To see which options are
+ available, check out [CREATE TABLE AS][create-table-as]
+- `field_delimiter` (`default=none`)
+ - Custom field delimiter, for when format is set to `TEXTFILE`
+- `table_properties`: table properties to add to the table, valid for Iceberg only
+- `native_drop`: Relation drop operations will be performed with SQL, not direct Glue API calls. No S3 calls will be
+ made to manage data in S3. Data in S3 will only be cleared up for Iceberg
+ tables ([see AWS docs](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-managing-tables.html)). Note that
+ Iceberg DROP TABLE operations may time out if they take longer than 60 seconds.
+- `seed_by_insert` (`default=false`)
+ - Default behaviour uploads seed data to S3. This flag will create seeds using an SQL insert statement
+ - Large seed files cannot use `seed_by_insert`, as the SQL insert statement would
+ exceed [the Athena limit of 262144 bytes](https://docs.aws.amazon.com/athena/latest/ug/service-limits.html)
+- `force_batch` (`default=false`)
+ - Skip creating the table as CTAS and run the operation directly in batch insert mode
+ - This is particularly useful when the standard table creation process fails due to partition limitations,
+ allowing you to work with temporary tables and persist the dataset more efficiently
+- `unique_tmp_table_suffix` (`default=false`)
+ - For incremental models using the insert overwrite strategy on Hive tables
+ - Replaces the `__dbt_tmp` suffix used as the temporary table name suffix with a unique UUID
+ - Useful if you are looking to run multiple dbt builds inserting into the same table in parallel
+- `temp_schema` (`default=none`)
+ - For incremental models, allows you to define a schema to hold the temporary create statements
+   used in incremental model runs
+ - The schema will be created in the model's target database if it does not exist
+- `lf_tags_config` (`default=none`)
+ - [AWS Lake Formation](#aws-lake-formation-integration) tags to associate with the table and columns
+ - `enabled` (`default=False`) whether LF tags management is enabled for a model
+ - `tags` dictionary with tags and their values to assign for the model
+ - `tags_columns` dictionary with a tag key, value and list of columns they must be assigned to
+ - `lf_inherited_tags` (`default=none`)
+ - List of Lake Formation tag keys that are intended to be inherited from the database level and thus shouldn't be
+ removed during association of those defined in `lf_tags_config`
+ - i.e., the default behavior of `lf_tags_config` is to be exhaustive and first remove any pre-existing tags from
+ tables and columns before associating the ones currently defined for a given model
+ - This breaks tag inheritance, as inherited tags appear on tables and columns just like those associated directly
+
+```sql
+{{
+ config(
+ materialized='incremental',
+ incremental_strategy='append',
+ on_schema_change='append_new_columns',
+ table_type='iceberg',
+ schema='test_schema',
+ lf_tags_config={
+ 'enabled': true,
+ 'tags': {
+ 'tag1': 'value1',
+ 'tag2': 'value2'
+ },
+ 'tags_columns': {
+ 'tag1': {
+ 'value1': ['column1', 'column2'],
+ 'value2': ['column3', 'column4']
+ }
+ },
+ 'inherited_tags': ['tag1', 'tag2']
+ }
+ )
+}}
+```
+
+- Format for `dbt_project.yml`:
+
+```yaml
+ +lf_tags_config:
+ enabled: true
+ tags:
+ tag1: value1
+ tag2: value2
+ tags_columns:
+ tag1:
+ value1: [ column1, column2 ]
+ inherited_tags: [ tag1, tag2 ]
+```
+
+- `lf_grants` (`default=none`)
+ - Lake Formation grants config for data_cell filters
+ - Format:
+
+ ```python
+ lf_grants={
+ 'data_cell_filters': {
+ 'enabled': True | False,
+ 'filters': {
+ 'filter_name': {
+ 'row_filter': '',
+ 'principals': ['principal_arn1', 'principal_arn2']
+ }
+ }
+ }
+ }
+ ```
+
+> Notes:
+>
+> - `lf_tags` and `lf_tags_columns` configs support only attaching lf tags to corresponding resources.
+> We recommend managing LF Tags permissions somewhere outside dbt. For example, you may use
+> [terraform](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_permissions) or
+> [aws cdk](https://docs.aws.amazon.com/cdk/api/v1/docs/aws-lakeformation-readme.html) for such purpose.
+> - `data_cell_filters` management can't be automated outside dbt because the filter can't be attached to a table
+>   that doesn't exist yet. Once you `enable` this config, dbt will set all filters and their permissions during every
+>   dbt run. This approach keeps the row-level security configuration up to date after every dbt run and applies
+>   changes if they occur: dropping, creating, and updating filters and their permissions.
+> - Any tags listed in `lf_inherited_tags` should be strictly inherited from the database level and never overridden
+>   at the table and column level
+> - Currently `dbt-athena` does not differentiate between an inherited tag association and an override of the same tag
+>   that it made previously
+>   - e.g. if an inherited tag is overridden by an `lf_tags_config` value in one dbt run, and that override is removed
+>     prior to a subsequent run, the prior override will linger and no longer be encoded anywhere (neither in e.g.
+>     Terraform, where the inherited value is configured, nor in the dbt project where the override previously existed
+>     but is now gone)
+
+[create-table-as]: https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html#ctas-table-properties
+
+### Table location
+
+The location a table is saved to is determined by:
+
+1. If `external_location` is defined, that value is used
+2. If `s3_data_dir` is defined, the path is determined by that and `s3_data_naming`
+3. If `s3_data_dir` is not defined, data is stored under `s3_staging_dir/tables/`
+
+Here are all the options available for `s3_data_naming`:
+
+- `unique`: `{s3_data_dir}/{uuid4()}/`
+- `table`: `{s3_data_dir}/{table}/`
+- `table_unique`: `{s3_data_dir}/{table}/{uuid4()}/`
+- `schema_table`: `{s3_data_dir}/{schema}/{table}/`
+- `schema_table_unique`: `{s3_data_dir}/{schema}/{table}/{uuid4()}/`
+
+It's possible to set `s3_data_naming` globally in the target profile, overwrite the value in the table config,
+or set the value for groups of models in `dbt_project.yml`, as shown below.
+
+> Note: when using a workgroup with a default output location configured, `s3_data_naming` and any configured buckets
+> are ignored and the location configured in the workgroup is used.
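+
+For example, a hypothetical `dbt_project.yml` entry that sets the value for a group of models (the project and folder names are illustrative):
+
+```yaml
+models:
+  my_dbt_project:
+    marts:
+      +s3_data_naming: schema_table_unique
+```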
+
+### Incremental models
+
+Support for [incremental models](https://docs.getdbt.com/docs/build/incremental-models).
+
+These strategies are supported:
+
+- `insert_overwrite` (default): The insert overwrite strategy deletes the overlapping partitions from the destination
+  table, and then inserts the new records from the source (see the example after this list). This strategy depends on
+  the `partitioned_by` keyword! If no partitions are defined, dbt will fall back to the `append` strategy.
+- `append`: Insert new records without updating, deleting or overwriting any existing data. There might be duplicate
+ data (e.g. great for log or historical data).
+- `merge`: Conditionally updates, deletes, or inserts rows into an Iceberg table. Used in combination with `unique_key`.
+ Only available when using Iceberg.
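+
+As a minimal sketch of the `insert_overwrite` strategy on a partitioned Hive table (the referenced model and the `report_date` column are hypothetical):
+
+```sql
+{{ config(
+    materialized='incremental',
+    incremental_strategy='insert_overwrite',
+    partitioned_by=['report_date'],
+    format='parquet'
+) }}
+
+select *
+from {{ ref('source_events') }}
+{% if is_incremental() %}
+where report_date >= date_add('day', -3, current_date)
+{% endif %}
+```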
+
+### On schema change
+
+`on_schema_change` is an option to reflect schema changes in incremental models.
+The following options are supported:
+
+- `ignore` (default)
+- `fail`
+- `append_new_columns`
+- `sync_all_columns`
+
+For details, please refer
+to [dbt docs](https://docs.getdbt.com/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change).
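+
+A minimal sketch of an incremental model that appends new columns on schema change (the referenced model name is hypothetical):
+
+```sql
+{{ config(
+    materialized='incremental',
+    incremental_strategy='append',
+    on_schema_change='append_new_columns'
+) }}
+
+select *
+from {{ ref('source_events') }}
+```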
+
+### Iceberg
+
+The adapter supports table materialization for Iceberg.
+
+To get started just add this as your model:
+
+```sql
+{{ config(
+ materialized='table',
+ table_type='iceberg',
+ format='parquet',
+ partitioned_by=['bucket(user_id, 5)'],
+ table_properties={
+ 'optimize_rewrite_delete_file_threshold': '2'
+ }
+) }}
+
+select 'A' as user_id,
+ 'pi' as name,
+ 'active' as status,
+ 17.89 as cost,
+ 1 as quantity,
+ 100000000 as quantity_big,
+ current_date as my_date
+```
+
+Iceberg supports bucketing as hidden partitions; therefore, use the `partitioned_by` config to add specific bucketing
+conditions.
+
+Iceberg supports several table formats for data: `PARQUET`, `AVRO` and `ORC`.
+
+It is possible to use Iceberg in an incremental fashion; specifically, two strategies are supported:
+
+- `append`: New records are appended to the table, this can lead to duplicates.
+- `merge`: Performs an upsert (and optional delete), where new records are added and existing records are updated. Only
+ available with Athena engine version 3.
+ - `unique_key` **(required)**: columns that define a unique record in the source and target tables.
+ - `incremental_predicates` (optional): SQL conditions that enable custom join clauses in the merge statement. This can
+ be useful for improving performance via predicate pushdown on the target table.
+ - `delete_condition` (optional): SQL condition used to identify records that should be deleted.
+ - `update_condition` (optional): SQL condition used to identify records that should be updated.
+ - `insert_condition` (optional): SQL condition used to identify records that should be inserted.
+ - `incremental_predicates`, `delete_condition`, `update_condition` and `insert_condition` can include any column of
+ the incremental table (`src`) or the final table (`target`).
+ Column names must be prefixed by either `src` or `target` to prevent a `Column is ambiguous` error.
+
+`delete_condition` example:
+
+```sql
+{{ config(
+ materialized='incremental',
+ table_type='iceberg',
+ incremental_strategy='merge',
+ unique_key='user_id',
+ incremental_predicates=["src.quantity > 1", "target.my_date >= now() - interval '4' year"],
+ delete_condition="src.status != 'active' and target.my_date < now() - interval '2' year",
+ format='parquet'
+) }}
+
+select 'A' as user_id,
+ 'pi' as name,
+ 'active' as status,
+ 17.89 as cost,
+ 1 as quantity,
+ 100000000 as quantity_big,
+ current_date as my_date
+```
+
+`update_condition` example:
+
+```sql
+{{ config(
+ materialized='incremental',
+ incremental_strategy='merge',
+ unique_key=['id'],
+ update_condition='target.id > 1',
+ schema='sandbox'
+ )
+}}
+
+{% if is_incremental() %}
+
+select * from (
+ values
+ (1, 'v1-updated')
+ , (2, 'v2-updated')
+) as t (id, value)
+
+{% else %}
+
+select * from (
+ values
+ (-1, 'v-1')
+ , (0, 'v0')
+ , (1, 'v1')
+ , (2, 'v2')
+) as t (id, value)
+
+{% endif %}
+```
+
+`insert_condition` example:
+
+```sql
+{{ config(
+ materialized='incremental',
+ incremental_strategy='merge',
+ unique_key=['id'],
+ insert_condition='target.status != 0',
+ schema='sandbox'
+ )
+}}
+
+select * from (
+ values
+ (1, 0)
+ , (2, 1)
+) as t (id, status)
+
+```
+
+### Highly available table (HA)
+
+The current implementation of the table materialization can lead to downtime, as the target table is
+dropped and re-created. For a less destructive behavior, it's possible to use the `ha` config on
+your `table` materialized models. It leverages the table versions feature of the Glue catalog, creating
+a temp table and swapping the target table to the location of the temp table. This materialization
+is only available for `table_type=hive` and requires using unique locations. For Iceberg, high
+availability is the default.
+
+```sql
+{{ config(
+ materialized='table',
+ ha=true,
+ format='parquet',
+ table_type='hive',
+ partitioned_by=['status'],
+ s3_data_naming='table_unique'
+) }}
+
+select 'a' as user_id,
+ 'pi' as user_name,
+ 'active' as status
+union all
+select 'b' as user_id,
+ 'sh' as user_name,
+ 'disabled' as status
+```
+
+By default, the materialization keeps the last 4 table versions; you can change this by setting `versions_to_keep`.
+
+#### HA known issues
+
+- When swapping from a table with partitions to a table without (and the other way around), there could be a little
+  downtime. If high performance is needed, consider bucketing instead of partitions.
+- By default, Glue "duplicates" the versions internally, so the last two versions of a table point to the same location
+- It's recommended to set `versions_to_keep` >= 4, as this will avoid having the older location removed
+
+### Update glue data catalog
+
+Optionally persist resource descriptions as column and relation comments to the glue data catalog, and meta as
+[glue table properties](https://docs.aws.amazon.com/glue/latest/dg/tables-described.html#table-properties)
+and [column parameters](https://docs.aws.amazon.com/glue/latest/webapi/API_Column.html).
+By default, documentation persistence is disabled, but it can be enabled for specific resources or
+groups of resources as needed.
+
+For example:
+
+```yaml
+models:
+ - name: test_deduplicate
+ description: another value
+ config:
+ persist_docs:
+ relation: true
+ columns: true
+ meta:
+ test: value
+ columns:
+ - name: id
+ meta:
+ primary_key: true
+```
+
+See [persist docs](https://docs.getdbt.com/reference/resource-configs/persist_docs) for more details.
+
+## Snapshots
+
+The adapter supports snapshot materialization using either the timestamp or the check strategy. To create a snapshot,
+add a snapshot file in the snapshots directory; if the directory does not exist, create one.
+
+### Timestamp strategy
+
+To use the timestamp strategy refer to
+the [dbt docs](https://docs.getdbt.com/docs/build/snapshots#timestamp-strategy-recommended)
+
+### Check strategy
+
+To use the check strategy refer to the [dbt docs](https://docs.getdbt.com/docs/build/snapshots#check-strategy)
+
+### Hard-deletes
+
+The materialization also supports invalidating hard deletes. Check
+the [docs](https://docs.getdbt.com/docs/build/snapshots#hard-deletes-opt-in) to understand usage.
+
+### Working example
+
+seed file - employment_indicators_november_2022_csv_tables.csv
+
+```csv
+Series_reference,Period,Data_value,Suppressed
+MEIM.S1WA,1999.04,80267,
+MEIM.S1WA,1999.05,70803,
+MEIM.S1WA,1999.06,65792,
+MEIM.S1WA,1999.07,66194,
+MEIM.S1WA,1999.08,67259,
+MEIM.S1WA,1999.09,69691,
+MEIM.S1WA,1999.1,72475,
+MEIM.S1WA,1999.11,79263,
+MEIM.S1WA,1999.12,86540,
+MEIM.S1WA,2000.01,82552,
+MEIM.S1WA,2000.02,81709,
+MEIM.S1WA,2000.03,84126,
+MEIM.S1WA,2000.04,77089,
+MEIM.S1WA,2000.05,73811,
+MEIM.S1WA,2000.06,70070,
+MEIM.S1WA,2000.07,69873,
+MEIM.S1WA,2000.08,71468,
+MEIM.S1WA,2000.09,72462,
+MEIM.S1WA,2000.1,74897,
+```
+
+model.sql
+
+```sql
+{{ config(
+ materialized='table'
+) }}
+
+select row_number() over() as id
+ , *
+ , cast(from_unixtime(to_unixtime(now())) as timestamp(6)) as refresh_timestamp
+from {{ ref('employment_indicators_november_2022_csv_tables') }}
+```
+
+timestamp strategy - model_snapshot_1
+
+```sql
+{% snapshot model_snapshot_1 %}
+
+{{
+ config(
+ strategy='timestamp',
+ updated_at='refresh_timestamp',
+ unique_key='id'
+ )
+}}
+
+select *
+from {{ ref('model') }} {% endsnapshot %}
+```
+
+invalidate hard deletes - model_snapshot_2
+
+```sql
+{% snapshot model_snapshot_2 %}
+
+{{
+ config
+ (
+ unique_key='id',
+ strategy='timestamp',
+ updated_at='refresh_timestamp',
+ invalidate_hard_deletes=True,
+ )
+}}
+select *
+from {{ ref('model') }} {% endsnapshot %}
+```
+
+check strategy - model_snapshot_3
+
+```sql
+{% snapshot model_snapshot_3 %}
+
+{{
+ config
+ (
+ unique_key='id',
+ strategy='check',
+ check_cols=['series_reference','data_value']
+ )
+}}
+select *
+from {{ ref('model') }} {% endsnapshot %}
+```
+
+### Snapshots known issues
+
+- Incremental Iceberg models - Sync all columns on schema change can't remove columns used for partitioning.
+ The only way, from a dbt perspective, is to do a full-refresh of the incremental model.
+
+- Tables, schemas and database names should only be lowercase
+
+- To avoid potential conflicts, make sure [`dbt-athena-adapter`](https://github.com/Tomme/dbt-athena) is not
+  installed in the target environment.
+
+- Snapshots do not support dropping columns from the source table. If you drop a column, make sure to drop the column
+  from the snapshot as well. Another workaround is to NULL the column in the snapshot definition to preserve history.
+
+## AWS Lake Formation integration
+
+The adapter implements AWS Lake Formation tags management in the following way:
+
+- You can enable or disable lf-tags management via [config](#table-configuration) (disabled by default)
+- Once you enable the feature, lf-tags will be updated on every dbt run
+- First, all lf-tags for columns are removed to avoid inheritance issues
+- Then, all redundant lf-tags are removed from tables and actual tags from table configs are applied
+- Finally, lf-tags for columns are applied
+
+It's important to understand the following points:
+
+- dbt does not manage lf-tags for databases
+- dbt does not manage Lake Formation permissions
+
+That's why you should handle this yourself, either manually or with an automation tool such as Terraform or AWS CDK.
+You may find the following links useful to manage that:
+
+- [terraform aws_lakeformation_permissions](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_permissions)
+- [terraform aws_lakeformation_resource_lf_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_resource_lf_tags)
+
+## Python models
+
+The adapter supports Python models using [`spark`](https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark.html).
+
+### Setup
+
+- A Spark-enabled workgroup created in Athena
+- Spark execution role granted access to Athena, Glue and S3
+- The Spark workgroup is added to the `~/.dbt/profiles.yml` file and the profile to be used
+ is referenced in `dbt_project.yml`
+
+### Spark-specific table configuration
+
+- `timeout` (`default=43200`)
+ - Timeout in seconds for each Python model execution. Defaults to 12 hours (43200 seconds).
+- `spark_encryption` (`default=false`)
+ - If this flag is set to true, encrypts data in transit between Spark nodes and also encrypts data at rest stored
+ locally by Spark.
+- `spark_cross_account_catalog` (`default=false`)
+ - When using the Spark Athena workgroup, queries can only be made against catalogs located on the same
+ AWS account by default. However, sometimes you want to query another catalog located on an external AWS
+ account. Setting this additional Spark properties parameter to true will enable querying external catalogs.
+ You can use the syntax `external_catalog_id/database.table` to access the external table on the external
+ catalog (ex: `999999999999/mydatabase.cloudfront_logs` where 999999999999 is the external catalog ID)
+- `spark_requester_pays` (`default=false`)
+ - When an Amazon S3 bucket is configured as requester pays, the account of the user running the query is charged for
+ data access and data transfer fees associated with the query.
+ - If this flag is set to true, requester pays S3 buckets are enabled in Athena for Spark.
+
+### Spark notes
+
+- A session is created for each unique engine configuration defined in the models that are part of the invocation.
+- A session's idle timeout is set to 10 minutes. Within the timeout period, if there is a new calculation
+ (Spark Python model) ready for execution and the engine configuration matches, the process will reuse the same session.
+- The number of Python models running at a time depends on the `threads`. The number of sessions created for the
+ entire run depends on the number of unique engine configurations and the availability of sessions to maintain
+ thread concurrency.
+- For Iceberg tables, it is recommended to use `table_properties` configuration to set the `format_version` to 2.
+ This is to maintain compatibility between Iceberg tables created by Trino with those created by Spark.
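+
+As an illustration of the last point, a sketch of a Python model that requests Iceberg format version 2 via `table_properties` (assuming the option is passed through `dbt.config` like the other configurations shown below):
+
+```python
+import pandas as pd
+
+
+def model(dbt, session):
+    # Hypothetical sketch: ask for Iceberg format version 2 so tables created by
+    # Spark stay compatible with tables created by Trino.
+    dbt.config(
+        materialized="table",
+        table_type="iceberg",
+        table_properties={"format_version": "2"},
+    )
+    return pd.DataFrame({"A": [1, 2, 3, 4]})
+```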
+
+### Example models
+
+#### Simple pandas model
+
+```python
+import pandas as pd
+
+
+def model(dbt, session):
+ dbt.config(materialized="table")
+
+ model_df = pd.DataFrame({"A": [1, 2, 3, 4]})
+
+ return model_df
+```
+
+#### Simple spark
+
+```python
+def model(dbt, spark_session):
+ dbt.config(materialized="table")
+
+ data = [(1,), (2,), (3,), (4,)]
+
+ df = spark_session.createDataFrame(data, ["A"])
+
+ return df
+```
+
+#### Spark incremental
+
+```python
+def model(dbt, spark_session):
+ dbt.config(materialized="incremental")
+ df = dbt.ref("model")
+
+ if dbt.is_incremental:
+ max_from_this = (
+ f"select max(run_date) from {dbt.this.schema}.{dbt.this.identifier}"
+ )
+ df = df.filter(df.run_date >= spark_session.sql(max_from_this).collect()[0][0])
+
+ return df
+```
+
+#### Config spark model
+
+```python
+def model(dbt, spark_session):
+ dbt.config(
+ materialized="table",
+ engine_config={
+ "CoordinatorDpuSize": 1,
+ "MaxConcurrentDpus": 3,
+ "DefaultExecutorDpuSize": 1
+ },
+ spark_encryption=True,
+ spark_cross_account_catalog=True,
+        spark_requester_pays=True,
+ polling_interval=15,
+ timeout=120,
+ )
+
+ data = [(1,), (2,), (3,), (4,)]
+
+ df = spark_session.createDataFrame(data, ["A"])
+
+ return df
+```
+
+#### Create pySpark udf using imported external python files
+
+```python
+def model(dbt, spark_session):
+ dbt.config(
+ materialized="incremental",
+ incremental_strategy="merge",
+ unique_key="num",
+ )
+ sc = spark_session.sparkContext
+ sc.addPyFile("s3://athena-dbt/test/file1.py")
+ sc.addPyFile("s3://athena-dbt/test/file2.py")
+
+ def func(iterator):
+ from file2 import transform
+
+ return [transform(i) for i in iterator]
+
+ from pyspark.sql.functions import udf
+ from pyspark.sql.functions import col
+
+ udf_with_import = udf(func)
+
+ data = [(1, "a"), (2, "b"), (3, "c")]
+ cols = ["num", "alpha"]
+ df = spark_session.createDataFrame(data, cols)
+
+ return df.withColumn("udf_test_col", udf_with_import(col("alpha")))
+```
+
+### Known issues in Python models
+
+- Python models cannot
+ [reference Athena SQL views](https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark.html).
+- Third-party Python libraries can be used, but they must be [included in the pre-installed list][pre-installed list]
+ or [imported manually][imported manually].
+- Python models can only reference or write to tables with names meeting the
+ regular expression: `^[0-9a-zA-Z_]+$`. Dashes and special characters are not
+ supported by Spark, even though Athena supports them.
+- Incremental models do not fully utilize Spark capabilities. They depend partially on existing SQL-based logic which
+ runs on Trino.
+- Snapshot materializations are not supported.
+- Spark can only reference tables within the same catalog.
+- For tables created outside of the dbt tool, be sure to populate the location field or dbt will throw an error
+when trying to create the table.
+
+[pre-installed list]: https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark-preinstalled-python-libraries.html
+[imported manually]: https://docs.aws.amazon.com/athena/latest/ug/notebooks-import-files-libraries.html
+
+## Contracts
+
+The adapter partly supports contract definitions:
+
+- `data_type` is supported but needs to be adjusted for complex types. Types must be specified
+  entirely (for instance `array<int>`) even though they won't be checked. Indeed, as dbt recommends, we only compare
+  the broader type (array, map, int, varchar). The complete definition is used in order to check that the data types
+  defined in Athena are ok (pre-flight check). A sketch is shown after this list.
+- The adapter does not support constraints, since there is no constraint concept in Athena.
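+
+For illustration, a sketch of an enforced contract in a model `schema.yml` (the model and column names are hypothetical); the complex type is written out in full even though only the broader type is compared:
+
+```yaml
+models:
+  - name: my_contracted_model
+    config:
+      contract:
+        enforced: true
+    columns:
+      - name: id
+        data_type: int
+      - name: tags
+        data_type: array<varchar>
+```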
+
+## Contributing
+
+See [CONTRIBUTING](CONTRIBUTING.md) for more information on how to contribute to this project.
+
+## Contributors ✨
+
+Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
+
+Contributions of any kind welcome!
diff --git a/dbt-athena-community/pyproject.toml b/dbt-athena-community/pyproject.toml
index b2dfe528..b356879b 100644
--- a/dbt-athena-community/pyproject.toml
+++ b/dbt-athena-community/pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "dbt-athena-community"
description = "The athena adapter plugin for dbt (data build tool)"
-readme = "../README.md"
+readme = "README.md"
keywords = ["dbt", "adapter", "adapters", "database", "elt", "dbt-core", "dbt Core", "dbt Cloud", "dbt Labs", "athena"]
requires-python = ">=3.9.0"
authors = [
@@ -79,13 +79,13 @@ check-all = [
"- check-sdist",
]
check-wheel = [
- "twine check dist/*",
+ "check-wheel-contents dist/*.whl --ignore W007,W008",
"find ./dist/dbt_athena_community-*.whl -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/",
"pip freeze | grep dbt-athena-community",
"pip freeze | grep dbt-athena",
]
check-sdist = [
- "check-wheel-contents dist/*.whl --ignore W007,W008",
+ "twine check dist/*",
"find ./dist/dbt_athena_community-*.gz -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/",
"pip freeze | grep dbt-athena-community",
"pip freeze | grep dbt-athena",
diff --git a/dbt-athena/README.md b/dbt-athena/README.md
new file mode 100644
index 00000000..c5487c05
--- /dev/null
+++ b/dbt-athena/README.md
@@ -0,0 +1,882 @@
+
+- [Features](#features)
+ - [Quick start](#quick-start)
+ - [Installation](#installation)
+ - [Prerequisites](#prerequisites)
+ - [Credentials](#credentials)
+ - [Configuring your profile](#configuring-your-profile)
+ - [Additional information](#additional-information)
+ - [Models](#models)
+ - [Table configuration](#table-configuration)
+ - [Table location](#table-location)
+ - [Incremental models](#incremental-models)
+ - [On schema change](#on-schema-change)
+ - [Iceberg](#iceberg)
+ - [Highly available table (HA)](#highly-available-table-ha)
+ - [HA known issues](#ha-known-issues)
+ - [Update glue data catalog](#update-glue-data-catalog)
+ - [Snapshots](#snapshots)
+ - [Timestamp strategy](#timestamp-strategy)
+ - [Check strategy](#check-strategy)
+ - [Hard-deletes](#hard-deletes)
+ - [Working example](#working-example)
+ - [Snapshots known issues](#snapshots-known-issues)
+ - [AWS Lake Formation integration](#aws-lake-formation-integration)
+ - [Python models](#python-models)
+ - [Contracts](#contracts)
+ - [Contributing](#contributing)
+ - [Contributors ✨](#contributors-)
+
+
+# Features
+
+- Supports dbt version `1.7.*`
+- Support for Python
+- Supports [seeds][seeds]
+- Correctly detects views and their columns
+- Supports [table materialization][table]
+ - [Iceberg tables][athena-iceberg] are supported **only with Athena Engine v3** and **a unique table location**
+ (see table location section below)
+ - Hive tables are supported by both Athena engines
+- Supports [incremental models][incremental]
+ - On Iceberg tables:
+ - Supports the use of `unique_key` only with the `merge` strategy
+ - Supports the `append` strategy
+ - On Hive tables:
+ - Supports two incremental update strategies: `insert_overwrite` and `append`
+ - Does **not** support the use of `unique_key`
+- Supports [snapshots][snapshots]
+- Supports [Python models][python-models]
+
+[seeds]: https://docs.getdbt.com/docs/building-a-dbt-project/seeds
+
+[incremental]: https://docs.getdbt.com/docs/build/incremental-models
+
+[table]: https://docs.getdbt.com/docs/build/materializations#table
+
+[python-models]: https://docs.getdbt.com/docs/build/python-models#configuring-python-models
+
+[athena-iceberg]: https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html
+
+[snapshots]: https://docs.getdbt.com/docs/build/snapshots
+
+## Quick start
+
+### Installation
+
+- `pip install dbt-athena-community`
+- Or `pip install git+https://github.com/dbt-athena/dbt-athena.git`
+
+### Prerequisites
+
+To start, you will need an S3 bucket, for instance `my-bucket`, and an Athena database:
+
+```sql
+CREATE DATABASE IF NOT EXISTS analytics_dev
+COMMENT 'Analytics models generated by dbt (development)'
+LOCATION 's3://my-bucket/'
+WITH DBPROPERTIES ('creator'='Foo Bar', 'email'='foo@bar.com');
+```
+
+Notes:
+
+- Take note of your AWS region code (e.g. `us-west-2` or `eu-west-2`, etc.).
+- You can also use [AWS Glue](https://docs.aws.amazon.com/athena/latest/ug/glue-athena.html) to create and manage Athena
+ databases.
+
+### Credentials
+
+Credentials can be passed directly to the adapter, or they can
+be [determined automatically](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html) based
+on `aws cli`/`boto3` conventions.
+You can either:
+
+- Configure `aws_access_key_id` and `aws_secret_access_key`
+- Configure `aws_profile_name` to match a profile defined in your AWS credentials file.
+  Check out the dbt profile configuration below for details.
+
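+For example, a minimal `~/.aws/credentials` profile that `aws_profile_name: my-profile` could reference (the key values are the AWS documentation placeholders used elsewhere in this README):
+
+```ini
+[my-profile]
+aws_access_key_id = AKIAIOSFODNN7EXAMPLE
+aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
+```
+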
+### Configuring your profile
+
+A dbt profile can be configured to run against AWS Athena using the following configuration:
+
+| Option | Description | Required? | Example |
+|-----------------------|------------------------------------------------------------------------------------------|-----------|--------------------------------------------|
+| s3_staging_dir | S3 location to store Athena query results and metadata | Required | `s3://bucket/dbt/` |
+| s3_data_dir | Prefix for storing tables, if different from the connection's `s3_staging_dir` | Optional | `s3://bucket2/dbt/` |
+| s3_data_naming | How to generate table paths in `s3_data_dir` | Optional | `schema_table_unique` |
+| s3_tmp_table_dir | Prefix for storing temporary tables, if different from the connection's `s3_data_dir` | Optional | `s3://bucket3/dbt/` |
+| region_name | AWS region of your Athena instance | Required | `eu-west-1` |
+| schema | Specify the schema (Athena database) to build models into (lowercase **only**) | Required | `dbt` |
+| database | Specify the database (Data catalog) to build models into (lowercase **only**) | Required | `awsdatacatalog` |
+| poll_interval | Interval in seconds to use for polling the status of query results in Athena | Optional | `5` |
+| debug_query_state | Flag if debug message with Athena query state is needed | Optional | `false` |
+| aws_access_key_id | Access key ID of the user performing requests | Optional | `AKIAIOSFODNN7EXAMPLE` |
+| aws_secret_access_key | Secret access key of the user performing requests | Optional | `wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY` |
+| aws_profile_name | Profile to use from your AWS shared credentials file | Optional | `my-profile` |
+| work_group | Identifier of Athena workgroup | Optional | `my-custom-workgroup` |
+| skip_workgroup_check | Indicates if the WorkGroup check (additional AWS call) can be skipped | Optional | `true` |
+| num_retries | Number of times to retry a failing query | Optional | `3` |
+| num_boto3_retries | Number of times to retry boto3 requests (e.g. deleting S3 files for materialized tables) | Optional | `5` |
+| num_iceberg_retries | Number of times to retry iceberg commit queries to fix ICEBERG_COMMIT_ERROR | Optional | `3` |
+| spark_work_group | Identifier of Athena Spark workgroup for running Python models | Optional | `my-spark-workgroup` |
+| seed_s3_upload_args | Dictionary containing boto3 ExtraArgs when uploading to S3 | Optional | `{"ACL": "bucket-owner-full-control"}` |
+| lf_tags_database | Default LF tags for new database if it's created by dbt | Optional | `tag_key: tag_value` |
+
+**Example profiles.yml entry:**
+
+```yaml
+athena:
+ target: dev
+ outputs:
+ dev:
+ type: athena
+ s3_staging_dir: s3://athena-query-results/dbt/
+ s3_data_dir: s3://your_s3_bucket/dbt/
+ s3_data_naming: schema_table
+ s3_tmp_table_dir: s3://your_s3_bucket/temp/
+ region_name: eu-west-1
+ schema: dbt
+ database: awsdatacatalog
+ threads: 4
+ aws_profile_name: my-profile
+ work_group: my-workgroup
+ spark_work_group: my-spark-workgroup
+ seed_s3_upload_args:
+ ACL: bucket-owner-full-control
+```
+
+### Additional information
+
+- `threads` is supported
+- `database` and `catalog` can be used interchangeably
+
+## Models
+
+### Table configuration
+
+- `external_location` (`default=none`)
+ - If set, the full S3 path to which the table will be saved
+ - Works only with incremental models
+ - Does not work with Hive tables with `ha` set to true
+- `partitioned_by` (`default=none`)
+ - An array list of columns by which the table will be partitioned
+ - Limited to creation of 100 partitions (*currently*)
+- `bucketed_by` (`default=none`)
+ - An array list of columns to bucket data, ignored if using Iceberg
+- `bucket_count` (`default=none`)
+ - The number of buckets for bucketing your data, ignored if using Iceberg
+- `table_type` (`default='hive'`)
+ - The type of table
+ - Supports `hive` or `iceberg`
+- `ha` (`default=false`)
+ - If the table should be built using the high-availability method. This option is only available for Hive tables,
+   since it is the default for Iceberg tables (see the section [below](#highly-available-table-ha))
+- `format` (`default='parquet'`)
+ - The data format for the table
+ - Supports `ORC`, `PARQUET`, `AVRO`, `JSON`, `TEXTFILE`
+- `write_compression` (`default=none`)
+ - The compression type to use for any storage format that allows compression to be specified. To see which options are
+ available, check out [CREATE TABLE AS][create-table-as]
+- `field_delimiter` (`default=none`)
+ - Custom field delimiter, for when format is set to `TEXTFILE`
+- `table_properties`: table properties to add to the table, valid for Iceberg only
+- `native_drop`: Relation drop operations will be performed with SQL, not direct Glue API calls. No S3 calls will be
+ made to manage data in S3. Data in S3 will only be cleared up for Iceberg
+ tables ([see AWS docs](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-managing-tables.html)). Note that
+ Iceberg DROP TABLE operations may time out if they take longer than 60 seconds.
+- `seed_by_insert` (`default=false`)
+ - Default behaviour uploads seed data to S3. This flag will create seeds using an SQL insert statement
+ - Large seed files cannot use `seed_by_insert`, as the SQL insert statement would
+ exceed [the Athena limit of 262144 bytes](https://docs.aws.amazon.com/athena/latest/ug/service-limits.html)
+- `force_batch` (`default=false`)
+ - Skip creating the table as CTAS and run the operation directly in batch insert mode
+ - This is particularly useful when the standard table creation process fails due to partition limitations,
+ allowing you to work with temporary tables and persist the dataset more efficiently
+- `unique_tmp_table_suffix` (`default=false`)
+ - For incremental models using the insert overwrite strategy on Hive tables
+ - Replaces the `__dbt_tmp` suffix used as the temporary table name suffix with a unique UUID
+ - Useful if you are looking to run multiple dbt builds inserting into the same table in parallel
+- `temp_schema` (`default=none`)
+ - For incremental models, allows you to define a schema to hold the temporary create statements
+   used in incremental model runs
+ - The schema will be created in the model's target database if it does not exist
+- `lf_tags_config` (`default=none`)
+ - [AWS Lake Formation](#aws-lake-formation-integration) tags to associate with the table and columns
+ - `enabled` (`default=False`) whether LF tags management is enabled for a model
+ - `tags` dictionary with tags and their values to assign for the model
+ - `tags_columns` dictionary with a tag key, value and list of columns they must be assigned to
+ - `lf_inherited_tags` (`default=none`)
+ - List of Lake Formation tag keys that are intended to be inherited from the database level and thus shouldn't be
+ removed during association of those defined in `lf_tags_config`
+ - i.e., the default behavior of `lf_tags_config` is to be exhaustive and first remove any pre-existing tags from
+ tables and columns before associating the ones currently defined for a given model
+ - This breaks tag inheritance, as inherited tags appear on tables and columns just like those associated directly
+
+```sql
+{{
+ config(
+ materialized='incremental',
+ incremental_strategy='append',
+ on_schema_change='append_new_columns',
+ table_type='iceberg',
+ schema='test_schema',
+ lf_tags_config={
+ 'enabled': true,
+ 'tags': {
+ 'tag1': 'value1',
+ 'tag2': 'value2'
+ },
+ 'tags_columns': {
+ 'tag1': {
+ 'value1': ['column1', 'column2'],
+ 'value2': ['column3', 'column4']
+ }
+ },
+ 'inherited_tags': ['tag1', 'tag2']
+ }
+ )
+}}
+```
+
+- Format for `dbt_project.yml`:
+
+```yaml
+ +lf_tags_config:
+ enabled: true
+ tags:
+ tag1: value1
+ tag2: value2
+ tags_columns:
+ tag1:
+ value1: [ column1, column2 ]
+ inherited_tags: [ tag1, tag2 ]
+```
+
+- `lf_grants` (`default=none`)
+ - Lake Formation grants config for data_cell filters
+ - Format:
+
+ ```python
+ lf_grants={
+ 'data_cell_filters': {
+ 'enabled': True | False,
+ 'filters': {
+ 'filter_name': {
+ 'row_filter': '',
+ 'principals': ['principal_arn1', 'principal_arn2']
+ }
+ }
+ }
+ }
+ ```
+
+> Notes:
+>
+> - `lf_tags` and `lf_tags_columns` configs support only attaching lf tags to corresponding resources.
+> We recommend managing LF Tags permissions somewhere outside dbt. For example, you may use
+> [terraform](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_permissions) or
+> [aws cdk](https://docs.aws.amazon.com/cdk/api/v1/docs/aws-lakeformation-readme.html) for such purpose.
+> - `data_cell_filters` management can't be automated outside dbt because the filter can't be attached to a table
+>   that doesn't exist yet. Once you `enable` this config, dbt will set all filters and their permissions during every
+>   dbt run. This approach keeps the row-level security configuration up to date after every dbt run and applies
+>   changes if they occur: dropping, creating, and updating filters and their permissions.
+> - Any tags listed in `lf_inherited_tags` should be strictly inherited from the database level and never overridden
+>   at the table and column level
+> - Currently `dbt-athena` does not differentiate between an inherited tag association and an override of the same tag
+>   that it made previously
+>   - e.g. if an inherited tag is overridden by an `lf_tags_config` value in one dbt run, and that override is removed
+>     prior to a subsequent run, the prior override will linger and no longer be encoded anywhere (neither in e.g.
+>     Terraform, where the inherited value is configured, nor in the dbt project where the override previously existed
+>     but is now gone)
+
+[create-table-as]: https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html#ctas-table-properties
+
+### Table location
+
+The location a table is saved to is determined by:
+
+1. If `external_location` is defined, that value is used
+2. If `s3_data_dir` is defined, the path is determined by that and `s3_data_naming`
+3. If `s3_data_dir` is not defined, data is stored under `s3_staging_dir/tables/`
+
+Here are all the options available for `s3_data_naming`:
+
+- `unique`: `{s3_data_dir}/{uuid4()}/`
+- `table`: `{s3_data_dir}/{table}/`
+- `table_unique`: `{s3_data_dir}/{table}/{uuid4()}/`
+- `schema_table`: `{s3_data_dir}/{schema}/{table}/`
+- `schema_table_unique`: `{s3_data_dir}/{schema}/{table}/{uuid4()}/`
+
+It's possible to set `s3_data_naming` globally in the target profile, overwrite the value in the table config,
+or set the value for groups of models in `dbt_project.yml`, as shown below.
+
+> Note: when using a workgroup with a default output location configured, `s3_data_naming` and any configured buckets
+> are ignored and the location configured in the workgroup is used.
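+
+For example, a hypothetical `dbt_project.yml` entry that sets the value for a group of models (the project and folder names are illustrative):
+
+```yaml
+models:
+  my_dbt_project:
+    marts:
+      +s3_data_naming: schema_table_unique
+```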
+
+### Incremental models
+
+Support for [incremental models](https://docs.getdbt.com/docs/build/incremental-models).
+
+These strategies are supported:
+
+- `insert_overwrite` (default): The insert overwrite strategy deletes the overlapping partitions from the destination
+  table, and then inserts the new records from the source (see the example after this list). This strategy depends on
+  the `partitioned_by` keyword! If no partitions are defined, dbt will fall back to the `append` strategy.
+- `append`: Insert new records without updating, deleting or overwriting any existing data. There might be duplicate
+ data (e.g. great for log or historical data).
+- `merge`: Conditionally updates, deletes, or inserts rows into an Iceberg table. Used in combination with `unique_key`.
+ Only available when using Iceberg.
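+
+As a minimal sketch of the `insert_overwrite` strategy on a partitioned Hive table (the referenced model and the `report_date` column are hypothetical):
+
+```sql
+{{ config(
+    materialized='incremental',
+    incremental_strategy='insert_overwrite',
+    partitioned_by=['report_date'],
+    format='parquet'
+) }}
+
+select *
+from {{ ref('source_events') }}
+{% if is_incremental() %}
+where report_date >= date_add('day', -3, current_date)
+{% endif %}
+```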
+
+### On schema change
+
+`on_schema_change` is an option to reflect schema changes in incremental models.
+The following options are supported:
+
+- `ignore` (default)
+- `fail`
+- `append_new_columns`
+- `sync_all_columns`
+
+For details, please refer
+to [dbt docs](https://docs.getdbt.com/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change).
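+
+A minimal sketch of an incremental model that appends new columns on schema change (the referenced model name is hypothetical):
+
+```sql
+{{ config(
+    materialized='incremental',
+    incremental_strategy='append',
+    on_schema_change='append_new_columns'
+) }}
+
+select *
+from {{ ref('source_events') }}
+```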
+
+### Iceberg
+
+The adapter supports table materialization for Iceberg.
+
+To get started just add this as your model:
+
+```sql
+{{ config(
+ materialized='table',
+ table_type='iceberg',
+ format='parquet',
+ partitioned_by=['bucket(user_id, 5)'],
+ table_properties={
+ 'optimize_rewrite_delete_file_threshold': '2'
+ }
+) }}
+
+select 'A' as user_id,
+ 'pi' as name,
+ 'active' as status,
+ 17.89 as cost,
+ 1 as quantity,
+ 100000000 as quantity_big,
+ current_date as my_date
+```
+
+Iceberg supports bucketing as hidden partitions; therefore, use the `partitioned_by` config to add specific bucketing
+conditions.
+
+Iceberg supports several table formats for data: `PARQUET`, `AVRO` and `ORC`.
+
+It is possible to use Iceberg in an incremental fashion; specifically, two strategies are supported:
+
+- `append`: New records are appended to the table, this can lead to duplicates.
+- `merge`: Performs an upsert (and optional delete), where new records are added and existing records are updated. Only
+ available with Athena engine version 3.
+ - `unique_key` **(required)**: columns that define a unique record in the source and target tables.
+ - `incremental_predicates` (optional): SQL conditions that enable custom join clauses in the merge statement. This can
+ be useful for improving performance via predicate pushdown on the target table.
+ - `delete_condition` (optional): SQL condition used to identify records that should be deleted.
+ - `update_condition` (optional): SQL condition used to identify records that should be updated.
+ - `insert_condition` (optional): SQL condition used to identify records that should be inserted.
+ - `incremental_predicates`, `delete_condition`, `update_condition` and `insert_condition` can include any column of
+ the incremental table (`src`) or the final table (`target`).
+ Column names must be prefixed by either `src` or `target` to prevent a `Column is ambiguous` error.
+
+`delete_condition` example:
+
+```sql
+{{ config(
+ materialized='incremental',
+ table_type='iceberg',
+ incremental_strategy='merge',
+ unique_key='user_id',
+ incremental_predicates=["src.quantity > 1", "target.my_date >= now() - interval '4' year"],
+ delete_condition="src.status != 'active' and target.my_date < now() - interval '2' year",
+ format='parquet'
+) }}
+
+select 'A' as user_id,
+ 'pi' as name,
+ 'active' as status,
+ 17.89 as cost,
+ 1 as quantity,
+ 100000000 as quantity_big,
+ current_date as my_date
+```
+
+`update_condition` example:
+
+```sql
+{{ config(
+ materialized='incremental',
+ incremental_strategy='merge',
+ unique_key=['id'],
+ update_condition='target.id > 1',
+ schema='sandbox'
+ )
+}}
+
+{% if is_incremental() %}
+
+select * from (
+ values
+ (1, 'v1-updated')
+ , (2, 'v2-updated')
+) as t (id, value)
+
+{% else %}
+
+select * from (
+ values
+ (-1, 'v-1')
+ , (0, 'v0')
+ , (1, 'v1')
+ , (2, 'v2')
+) as t (id, value)
+
+{% endif %}
+```
+
+`insert_condition` example:
+
+```sql
+{{ config(
+ materialized='incremental',
+ incremental_strategy='merge',
+ unique_key=['id'],
+ insert_condition='target.status != 0',
+ schema='sandbox'
+ )
+}}
+
+select * from (
+ values
+ (1, 0)
+ , (2, 1)
+) as t (id, status)
+```
+
+### Highly available table (HA)
+
+The current implementation of the table materialization can lead to downtime, as the target table is
+dropped and re-created. For a less destructive behavior, it's possible to use the `ha` config on
+your `table` materialized models. It leverages the table versions feature of the Glue catalog, creating
+a temp table and swapping the target table to the location of the temp table. This materialization
+is only available for `table_type=hive` and requires using unique locations. For Iceberg, high
+availability is the default.
+
+```sql
+{{ config(
+ materialized='table',
+ ha=true,
+ format='parquet',
+ table_type='hive',
+ partitioned_by=['status'],
+ s3_data_naming='table_unique'
+) }}
+
+select 'a' as user_id,
+ 'pi' as user_name,
+ 'active' as status
+union all
+select 'b' as user_id,
+ 'sh' as user_name,
+ 'disabled' as status
+```
+
+By default, the materialization keeps the last 4 table versions; you can change this by setting `versions_to_keep`
+(see the sketch below).
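+
+For example, a sketch of an `ha` table config that keeps more versions (the value shown is illustrative):
+
+```sql
+{{ config(
+    materialized='table',
+    ha=true,
+    table_type='hive',
+    s3_data_naming='table_unique',
+    versions_to_keep=8
+) }}
+
+select 'a' as user_id,
+       'active' as status
+```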
+
+#### HA known issues
+
+- When swapping from a table with partitions to a table without (and the other way around), there could be a little
+  downtime. If high performance is needed, consider bucketing instead of partitions.
+- By default, Glue "duplicates" the versions internally, so the last two versions of a table point to the same location.
+- It's recommended to set `versions_to_keep` >= 4, as this will avoid having the older location removed.
+
+### Update glue data catalog
+
+Optionally persist resource descriptions as column and relation comments to the glue data catalog, and meta as
+[glue table properties](https://docs.aws.amazon.com/glue/latest/dg/tables-described.html#table-properties)
+and [column parameters](https://docs.aws.amazon.com/glue/latest/webapi/API_Column.html).
+By default, documentation persistence is disabled, but it can be enabled for specific resources or
+groups of resources as needed.
+
+For example:
+
+```yaml
+models:
+ - name: test_deduplicate
+ description: another value
+ config:
+ persist_docs:
+ relation: true
+ columns: true
+ meta:
+ test: value
+ columns:
+ - name: id
+ meta:
+ primary_key: true
+```
+
+See [persist docs](https://docs.getdbt.com/reference/resource-configs/persist_docs) for more details.
+
+## Snapshots
+
+The adapter supports snapshot materialization. It supports both the timestamp and check strategies. To create a
+snapshot, create a snapshot file in the snapshots directory. If the directory does not exist, create one.
+
+### Timestamp strategy
+
+To use the timestamp strategy refer to
+the [dbt docs](https://docs.getdbt.com/docs/build/snapshots#timestamp-strategy-recommended)
+
+### Check strategy
+
+To use the check strategy refer to the [dbt docs](https://docs.getdbt.com/docs/build/snapshots#check-strategy)
+
+### Hard-deletes
+
+The materialization also supports invalidating hard deletes. Check
+the [docs](https://docs.getdbt.com/docs/build/snapshots#hard-deletes-opt-in) to understand usage.
+
+### Working example
+
+seed file - employment_indicators_november_2022_csv_tables.csv
+
+```csv
+Series_reference,Period,Data_value,Suppressed
+MEIM.S1WA,1999.04,80267,
+MEIM.S1WA,1999.05,70803,
+MEIM.S1WA,1999.06,65792,
+MEIM.S1WA,1999.07,66194,
+MEIM.S1WA,1999.08,67259,
+MEIM.S1WA,1999.09,69691,
+MEIM.S1WA,1999.1,72475,
+MEIM.S1WA,1999.11,79263,
+MEIM.S1WA,1999.12,86540,
+MEIM.S1WA,2000.01,82552,
+MEIM.S1WA,2000.02,81709,
+MEIM.S1WA,2000.03,84126,
+MEIM.S1WA,2000.04,77089,
+MEIM.S1WA,2000.05,73811,
+MEIM.S1WA,2000.06,70070,
+MEIM.S1WA,2000.07,69873,
+MEIM.S1WA,2000.08,71468,
+MEIM.S1WA,2000.09,72462,
+MEIM.S1WA,2000.1,74897,
+```
+
+model.sql
+
+```sql
+{{ config(
+ materialized='table'
+) }}
+
+select row_number() over() as id
+ , *
+ , cast(from_unixtime(to_unixtime(now())) as timestamp(6)) as refresh_timestamp
+from {{ ref('employment_indicators_november_2022_csv_tables') }}
+```
+
+timestamp strategy - model_snapshot_1
+
+```sql
+{% snapshot model_snapshot_1 %}
+
+{{
+ config(
+ strategy='timestamp',
+ updated_at='refresh_timestamp',
+ unique_key='id'
+ )
+}}
+
+select *
+from {{ ref('model') }} {% endsnapshot %}
+```
+
+invalidate hard deletes - model_snapshot_2
+
+```sql
+{% snapshot model_snapshot_2 %}
+
+{{
+    config(
+        unique_key='id',
+        strategy='timestamp',
+        updated_at='refresh_timestamp',
+        invalidate_hard_deletes=True
+    )
+}}
+select *
+from {{ ref('model') }} {% endsnapshot %}
+```
+
+check strategy - model_snapshot_3
+
+```sql
+{% snapshot model_snapshot_3 %}
+
+{{
+    config(
+        unique_key='id',
+        strategy='check',
+        check_cols=['series_reference', 'data_value']
+    )
+}}
+select *
+from {{ ref('model') }} {% endsnapshot %}
+```
+
+### Snapshots known issues
+
+- Incremental Iceberg models - Sync all columns on schema change can't remove columns used for partitioning.
+ The only way, from a dbt perspective, is to do a full-refresh of the incremental model.
+
+- Table, schema and database names should only be lowercase
+
+- In order to avoid potential conflicts, make sure [`dbt-athena-adapter`](https://github.com/Tomme/dbt-athena) is not
+ installed in the target environment.
+ See for more details.
+
+- Snapshot does not support dropping columns from the source table. If you drop a column, make sure to drop the column
+  from the snapshot as well. Another workaround is to NULL the column in the snapshot definition to preserve history.
+
+## AWS Lake Formation integration
+
+The adapter implements AWS Lake Formation tags management in the following way:
+
+- You can enable or disable lf-tags management via [config](#table-configuration) (disabled by default); see the
+  sketch after this list
+- Once you enable the feature, lf-tags will be updated on every dbt run
+- First, all lf-tags for columns are removed to avoid inheritance issues
+- Then, all redundant lf-tags are removed from tables and actual tags from table configs are applied
+- Finally, lf-tags for columns are applied
+
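+As a sketch, assuming the `lf_tags_config` model config referenced in the [table configuration](#table-configuration)
+section (the tag names and values below are placeholders):
+
+```sql
+{{ config(
+    materialized='table',
+    lf_tags_config={
+        'enabled': true,
+        'tags': {'domain': 'sales'},
+        'tags_columns': {'sensitivity': {'high': ['user_id']}}
+    }
+) }}
+
+select 'a' as user_id,
+       'active' as status
+```
+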
+It's important to understand the following points:
+
+- dbt does not manage lf-tags for databases
+- dbt does not manage Lake Formation permissions
+
+That's why you should handle this yourself, either manually or with an automation tool such as Terraform or the AWS CDK.
+You may find the following links useful to manage that:
+
+- [terraform aws_lakeformation_permissions](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_permissions)
+- [terraform aws_lakeformation_resource_lf_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_resource_lf_tags)
+
+## Python models
+
+The adapter supports Python models using [`spark`](https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark.html).
+
+### Setup
+
+- A Spark-enabled workgroup created in Athena
+- Spark execution role granted access to Athena, Glue and S3
+- The Spark workgroup is added to the `~/.dbt/profiles.yml` file and the profile to be used
+  is referenced in `dbt_project.yml` (see the profile sketch below)
+
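+A hypothetical profile sketch (the bucket, region and workgroup values are placeholders, and the `spark_work_group`
+key is assumed to match the profile configuration described earlier in this README):
+
+```yaml
+athena:
+  target: dev
+  outputs:
+    dev:
+      type: athena
+      s3_staging_dir: s3://my-athena-results/   # placeholder bucket
+      region_name: eu-west-1                    # placeholder region
+      schema: dbt
+      database: awsdatacatalog
+      threads: 4
+      spark_work_group: my-spark-workgroup      # placeholder Spark-enabled workgroup
+```
+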
+### Spark-specific table configuration
+
+- `timeout` (`default=43200`)
+  - Timeout in seconds for each Python model execution. Defaults to 12 hours (43200 seconds).
+- `spark_encryption` (`default=false`)
+ - If this flag is set to true, encrypts data in transit between Spark nodes and also encrypts data at rest stored
+ locally by Spark.
+- `spark_cross_account_catalog` (`default=false`)
+  - By default, when using the Athena Spark workgroup, queries can only be made against catalogs located in the same
+    AWS account. However, sometimes you may want to query a catalog located in an external AWS account. Setting this
+    additional Spark properties parameter to true enables querying external catalogs. You can use the syntax
+    `external_catalog_id/database.table` to access a table in the external catalog
+    (e.g. `999999999999/mydatabase.cloudfront_logs`, where 999999999999 is the external catalog ID).
+- `spark_requester_pays` (`default=false`)
+ - When an Amazon S3 bucket is configured as requester pays, the account of the user running the query is charged for
+ data access and data transfer fees associated with the query.
+ - If this flag is set to true, requester pays S3 buckets are enabled in Athena for Spark.
+
+### Spark notes
+
+- A session is created for each unique engine configuration defined in the models that are part of the invocation.
+- A session's idle timeout is set to 10 minutes. Within the timeout period, if there is a new calculation
+ (Spark Python model) ready for execution and the engine configuration matches, the process will reuse the same session.
+- The number of Python models running at a time depends on the `threads` setting. The number of sessions created for
+  the entire run depends on the number of unique engine configurations and the availability of sessions to maintain
+  thread concurrency.
+- For Iceberg tables, it is recommended to use the `table_properties` configuration to set the `format_version` to 2.
+  This is to maintain compatibility between Iceberg tables created by Trino and those created by Spark.
+
+### Example models
+
+#### Simple pandas model
+
+```python
+import pandas as pd
+
+
+def model(dbt, session):
+ dbt.config(materialized="table")
+
+ model_df = pd.DataFrame({"A": [1, 2, 3, 4]})
+
+ return model_df
+```
+
+#### Simple spark
+
+```python
+def model(dbt, spark_session):
+ dbt.config(materialized="table")
+
+ data = [(1,), (2,), (3,), (4,)]
+
+ df = spark_session.createDataFrame(data, ["A"])
+
+ return df
+```
+
+#### Spark incremental
+
+```python
+def model(dbt, spark_session):
+ dbt.config(materialized="incremental")
+ df = dbt.ref("model")
+
+ if dbt.is_incremental:
+ max_from_this = (
+ f"select max(run_date) from {dbt.this.schema}.{dbt.this.identifier}"
+ )
+ df = df.filter(df.run_date >= spark_session.sql(max_from_this).collect()[0][0])
+
+ return df
+```
+
+#### Config spark model
+
+```python
+def model(dbt, spark_session):
+    dbt.config(
+        materialized="table",
+        engine_config={
+            "CoordinatorDpuSize": 1,
+            "MaxConcurrentDpus": 3,
+            "DefaultExecutorDpuSize": 1
+        },
+        spark_encryption=True,
+        spark_cross_account_catalog=True,
+        spark_requester_pays=True,
+        polling_interval=15,
+        timeout=120,
+    )
+
+ data = [(1,), (2,), (3,), (4,)]
+
+ df = spark_session.createDataFrame(data, ["A"])
+
+ return df
+```
+
+#### Create pySpark udf using imported external python files
+
+```python
+def model(dbt, spark_session):
+ dbt.config(
+ materialized="incremental",
+ incremental_strategy="merge",
+ unique_key="num",
+ )
+ sc = spark_session.sparkContext
+ sc.addPyFile("s3://athena-dbt/test/file1.py")
+ sc.addPyFile("s3://athena-dbt/test/file2.py")
+
+ def func(iterator):
+ from file2 import transform
+
+ return [transform(i) for i in iterator]
+
+ from pyspark.sql.functions import udf
+ from pyspark.sql.functions import col
+
+ udf_with_import = udf(func)
+
+ data = [(1, "a"), (2, "b"), (3, "c")]
+ cols = ["num", "alpha"]
+ df = spark_session.createDataFrame(data, cols)
+
+ return df.withColumn("udf_test_col", udf_with_import(col("alpha")))
+```
+
+### Known issues in Python models
+
+- Python models cannot
+ [reference Athena SQL views](https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark.html).
+- Third-party Python libraries can be used, but they must be [included in the pre-installed list][pre-installed list]
+ or [imported manually][imported manually].
+- Python models can only reference or write to tables with names meeting the
+ regular expression: `^[0-9a-zA-Z_]+$`. Dashes and special characters are not
+ supported by Spark, even though Athena supports them.
+- Incremental models do not fully utilize Spark capabilities. They depend partially on existing SQL-based logic which
+ runs on Trino.
+- Snapshot materializations are not supported.
+- Spark can only reference tables within the same catalog.
+- For tables created outside of the dbt tool, be sure to populate the location field, or dbt will throw an error when
+  trying to create the table.
+
+[pre-installed list]: https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark-preinstalled-python-libraries.html
+[imported manually]: https://docs.aws.amazon.com/athena/latest/ug/notebooks-import-files-libraries.html
+
+## Contracts
+
+The adapter partly supports contract definitions:
+
+- `data_type` is supported but needs to be adjusted for complex types. Types must be specified
+  entirely (for instance `array<int>`) even though they won't be checked. Indeed, as dbt recommends, we only compare
+  the broader type (array, map, int, varchar). The complete definition is used in order to check that the data types
+  defined in Athena are correct (pre-flight check). A sketch of a contract definition follows this list.
+- The adapter does not support constraints, since there is no constraint concept in Athena.
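+
+For example, a sketch of a contract definition (model and column names are placeholders):
+
+```yaml
+models:
+  - name: my_model
+    config:
+      contract:
+        enforced: true
+    columns:
+      - name: id
+        data_type: int
+      - name: tags
+        data_type: array<int>
+```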
+
+## Contributing
+
+See [CONTRIBUTING](CONTRIBUTING.md) for more information on how to contribute to this project.
+
+## Contributors ✨
+
+Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
+
+
+Contributions of any kind welcome!
diff --git a/dbt-athena/pyproject.toml b/dbt-athena/pyproject.toml
index 4832e320..a56d057c 100644
--- a/dbt-athena/pyproject.toml
+++ b/dbt-athena/pyproject.toml
@@ -2,7 +2,7 @@
dynamic = ["version"]
name = "dbt-athena"
description = "The athena adapter plugin for dbt (data build tool)"
-readme = "../README.md"
+readme = "README.md"
keywords = ["dbt", "adapter", "adapters", "database", "elt", "dbt-core", "dbt Core", "dbt Cloud", "dbt Labs", "athena"]
requires-python = ">=3.9.0"
authors = [
@@ -87,12 +87,12 @@ check-all = [
"- check-sdist",
]
check-wheel = [
- "twine check dist/*",
+ "check-wheel-contents dist/*.whl --ignore W007,W008",
"find ./dist/dbt_athena-*.whl -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/",
"pip freeze | grep dbt-athena",
]
check-sdist = [
- "check-wheel-contents dist/*.whl --ignore W007,W008",
+ "twine check dist/*",
"find ./dist/dbt_athena-*.gz -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/",
"pip freeze | grep dbt-athena",
]