diff --git a/.changes/unreleased/Under the Hood-20230929-161218.yaml b/.changes/unreleased/Under the Hood-20230929-161218.yaml
new file mode 100644
index 000000000..9b5c6818b
--- /dev/null
+++ b/.changes/unreleased/Under the Hood-20230929-161218.yaml
@@ -0,0 +1,6 @@
+kind: Under the Hood
+body: Add GitHub action for integration testing and use dagger-io to run tests. Remove CircleCI workflow.
+time: 2023-09-29T16:12:18.968755+02:00
+custom:
+  Author: JCZuurmond, colin-rogers-dbt
+  Issue: "719"
diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index f2a3b6357..000000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,136 +0,0 @@
-version: 2.1
-
-jobs:
-  unit:
-    environment:
-      DBT_INVOCATION_ENV: circle
-    docker:
-      - image: fishtownanalytics/test-container:10
-    steps:
-      - checkout
-      - run: tox -e flake8,unit
-
-# Turning off for now due to flaky runs of tests will turn back on at later date.
-  integration-spark-session:
-    environment:
-      DBT_INVOCATION_ENV: circle
-    docker:
-      - image: godatadriven/pyspark:3.1
-    steps:
-      - checkout
-      - run: apt-get update
-      - run: conda install python=3.10
-      - run: python3 -m pip install --upgrade pip
-      - run: apt-get install -y git gcc g++ unixodbc-dev libsasl2-dev libxml2-dev libxslt-dev
-      - run: python3 -m pip install tox
-      - run:
-          name: Run integration tests
-          command: tox -e integration-spark-session
-          no_output_timeout: 1h
-      - store_artifacts:
-          path: ./logs
-
-  integration-spark-thrift:
-    environment:
-      DBT_INVOCATION_ENV: circle
-    docker:
-      - image: fishtownanalytics/test-container:10
-      - image: godatadriven/spark:3.1.1
-        environment:
-          WAIT_FOR: localhost:5432
-        command: >
-          --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
-          --name Thrift JDBC/ODBC Server
-      - image: postgres:9.6.17-alpine
-        environment:
-          POSTGRES_USER: dbt
-          POSTGRES_PASSWORD: dbt
-          POSTGRES_DB: metastore
-
-    steps:
-      - checkout
-
-      - run:
-          name: Wait for Spark-Thrift
-          command: dockerize -wait tcp://localhost:10000 -timeout 15m -wait-retry-interval 5s
-
-      - run:
-          name: Run integration tests
-          command: tox -e integration-spark-thrift
-          no_output_timeout: 1h
-      - store_artifacts:
-          path: ./logs
-
-  integration-spark-databricks-http:
-    environment:
-      DBT_INVOCATION_ENV: circle
-      DBT_DATABRICKS_RETRY_ALL: True
-      DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com"
-      DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com"
-      DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com"
-    docker:
-      - image: fishtownanalytics/test-container:10
-    steps:
-      - checkout
-      - run:
-          name: Run integration tests
-          command: tox -e integration-spark-databricks-http
-          no_output_timeout: 1h
-      - store_artifacts:
-          path: ./logs
-
-  integration-spark-databricks-odbc-cluster: &databricks-odbc
-    environment:
-      DBT_INVOCATION_ENV: circle
-      ODBC_DRIVER: Simba # TODO: move env var to Docker image
-      DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com"
-      DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com"
-      DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com"
-    docker:
-      # image based on `fishtownanalytics/test-container` w/ Simba ODBC Spark driver installed
-      - image: 828731156495.dkr.ecr.us-east-1.amazonaws.com/dbt-spark-odbc-test-container:latest
-        aws_auth:
-          aws_access_key_id: $AWS_ACCESS_KEY_ID_STAGING
-          aws_secret_access_key: $AWS_SECRET_ACCESS_KEY_STAGING
-    steps:
-      - checkout
-      - run:
-          name: Run integration tests
-          command: tox -e integration-spark-databricks-odbc-cluster
-          no_output_timeout: 1h
-      - store_artifacts:
-          path: ./logs
-
-  integration-spark-databricks-odbc-endpoint:
-    <<: *databricks-odbc
-    steps:
-      - checkout
-      - run:
-          name: Run integration tests
-          command: tox -e integration-spark-databricks-odbc-sql-endpoint
-          no_output_timeout: 1h
-      - store_artifacts:
-          path: ./logs
-
-workflows:
-  version: 2
-  test-everything:
-    jobs:
-      - unit
-      - integration-spark-session:
-          requires:
-            - unit
-      - integration-spark-thrift:
-          requires:
-            - unit
-      - integration-spark-databricks-http:
-          requires:
-            - integration-spark-thrift
-      - integration-spark-databricks-odbc-cluster:
-          context: aws-credentials
-          requires:
-            - integration-spark-thrift
-      - integration-spark-databricks-odbc-endpoint:
-          context: aws-credentials
-          requires:
-            - integration-spark-thrift
diff --git a/.github/scripts/update_dbt_core_branch.sh b/.github/scripts/update_dbt_core_branch.sh
new file mode 100755
index 000000000..1a5a5c2d7
--- /dev/null
+++ b/.github/scripts/update_dbt_core_branch.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -e
+set -e
+
+git_branch=$1
+target_req_file="dev-requirements.txt"
+core_req_sed_pattern="s|dbt-core.git.*#egg=dbt-core|dbt-core.git@${git_branch}#egg=dbt-core|g"
+tests_req_sed_pattern="s|dbt-core.git.*#egg=dbt-tests|dbt-core.git@${git_branch}#egg=dbt-tests|g"
+if [[ "$OSTYPE" == darwin* ]]; then
+  # mac ships with a different version of sed that requires a delimiter arg
+  sed -i "" "$core_req_sed_pattern" $target_req_file
+  sed -i "" "$tests_req_sed_pattern" $target_req_file
+else
+  sed -i "$core_req_sed_pattern" $target_req_file
+  sed -i "$tests_req_sed_pattern" $target_req_file
+fi
+core_version=$(curl "https://raw.githubusercontent.com/dbt-labs/dbt-core/${git_branch}/core/dbt/version.py" | grep "__version__ = *"|cut -d'=' -f2)
+bumpversion --allow-dirty --new-version "$core_version" major
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
new file mode 100644
index 000000000..94dece350
--- /dev/null
+++ b/.github/workflows/integration.yml
@@ -0,0 +1,112 @@
+# **what?**
+# Runs integration tests.
+
+# **why?**
+# Ensure code runs as expected.
+
+# **when?**
+# This will run for all PRs, when code is pushed to a release
+# branch, and when manually triggered.
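+#
+# **security note**
+# `pull_request_target` runs with access to repository secrets, so the
+# workflow restricts `GITHUB_TOKEN` to read-all and checks out the PR's
+# head SHA explicitly (see the checkout steps below).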
+
+name: Adapter Integration Tests
+
+on:
+  push:
+    branches:
+      - "main"
+      - "*.latest"
+
+  pull_request_target:
+    paths-ignore:
+      - ".changes/**"
+      - ".flake8"
+      - ".gitignore"
+      - "**.md"
+
+  workflow_dispatch:
+    inputs:
+      dbt-core-branch:
+        description: "branch of dbt-core to use in dev-requirements.txt"
+        required: false
+        type: string
+
+# explicitly turn off permissions for `GITHUB_TOKEN`
+permissions: read-all
+
+# will cancel previous workflows triggered by the same event and for the same ref for PRs or same SHA otherwise
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ contains(github.event_name, 'pull_request_target') && github.event.pull_request.head.ref || github.sha }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+
+  test:
+    name: ${{ matrix.test }}
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        test:
+          - "apache_spark"
+          - "spark_session"
+          - "databricks_sql_endpoint"
+          - "databricks_cluster"
+          - "databricks_http_cluster"
+
+    env:
+      DBT_INVOCATION_ENV: github-actions
+      DD_CIVISIBILITY_AGENTLESS_ENABLED: true
+      DD_API_KEY: ${{ secrets.DATADOG_API_KEY }}
+      DD_SITE: datadoghq.com
+      DD_ENV: ci
+      DD_SERVICE: ${{ github.event.repository.name }}
+      DBT_DATABRICKS_CLUSTER_NAME: ${{ secrets.DBT_DATABRICKS_CLUSTER_NAME }}
+      DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST_NAME }}
+      DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }}
+      DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }}
+      DBT_DATABRICKS_USER: ${{ secrets.DBT_DATABRICKS_USERNAME }}
+      DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com"
+      DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com"
+      DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com"
+
+    steps:
+      - name: Check out the repository
+        if: github.event_name != 'pull_request_target'
+        uses: actions/checkout@v3
+        with:
+          persist-credentials: false
+
+      # explicitly checkout the branch for the PR,
+      # this is necessary for the `pull_request` event
+      - name: Check out the repository (PR)
+        if: github.event_name == 'pull_request_target'
+        uses: actions/checkout@v3
+        with:
+          persist-credentials: false
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      # the python version used here is not what is used in the tests themselves
+      - name: Set up Python for dagger
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install python dependencies
+        run: |
+          python -m pip install --user --upgrade pip
+          python -m pip --version
+          python -m pip install -r dagger/requirements.txt
+
+      - name: Update dev_requirements.txt
+        if: inputs.dbt-core-branch != ''
+        run: |
+          pip install bumpversion
+          ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }}
+
+      - name: Run tests for ${{ matrix.test }}
+        run: python dagger/run_dbt_spark_tests.py --profile ${{ matrix.test }}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 30126325e..20f3f88f4 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -19,7 +19,6 @@ on:
     branches:
       - "main"
       - "*.latest"
-      - "releases/*"
   pull_request:
   workflow_dispatch:
 
@@ -81,10 +80,6 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
 
-    env:
-      TOXENV: "unit"
-      PYTEST_ADDOPTS: "-v --color=yes --csv unit_results.csv"
-
     steps:
       - name: Check out the repository
        uses: actions/checkout@v3
@@ -100,10 +95,12 @@ jobs:
           sudo apt-get install libsasl2-dev
           python -m pip install --user --upgrade pip
           python -m pip --version
-          python -m pip install tox
-          tox --version
-      - name: Run tox
-        run: tox
+          python -m pip install -r requirements.txt
+          python -m pip install -r dev-requirements.txt
+          python -m pip install -e .
+
+      - name: Run unit tests
+        run: python -m pytest --color=yes --csv unit_results.csv -v tests/unit
 
       - name: Get current date
         if: always()
diff --git a/.gitignore b/.gitignore
index 33a83848c..1e8ff7411 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,5 @@ test.env
 .hive-metastore/
 .spark-warehouse/
 dbt-integration-tests
+/.tool-versions
+/.hypothesis/*
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a61306ea5..6fcaacea8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,11 +65,27 @@ $EDITOR test.env
 
 ### Test commands
 There are a few methods for running tests locally.
-#### `tox`
-`tox` takes care of managing Python virtualenvs and installing dependencies in order to run tests. You can also run tests in parallel, for example you can run unit tests for Python 3.8, Python 3.9, and `flake8` checks in parallel with `tox -p`. Also, you can run unit tests for specific python versions with `tox -e py38`. The configuration of these tests are located in `tox.ini`.
+#### dagger
+To run functional tests we rely on [dagger](https://dagger.io/). This launches a virtual container or containers to test against.
 
-#### `pytest`
-Finally, you can also run a specific test or group of tests using `pytest` directly. With a Python virtualenv active and dev dependencies installed you can do things like:
+```sh
+pip install -r dagger/requirements.txt
+python dagger/run_dbt_spark_tests.py --profile databricks_sql_endpoint --test-path tests/functional/adapter/test_basic.py::TestSimpleMaterializationsSpark::test_base
+```
+
+`--profile`: required, this is the kind of spark connection to test against
+
+_options_:
+  - "apache_spark"
+  - "spark_session"
+  - "databricks_sql_endpoint"
+  - "databricks_cluster"
+  - "databricks_http_cluster"
+
+`--test-path`: optional, this is the path to the test file you want to run. If not specified, all tests will be run.
+
+#### pytest
+Finally, you can also run a specific test or group of tests using `pytest` directly (if you have all the dependencies set up on your machine). With a Python virtualenv active and dev dependencies installed you can do things like:
 
 ```sh
 # run all functional tests
diff --git a/Makefile b/Makefile
index cc1d9f75d..2bd1055fa 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
 .PHONY: dev
 dev: ## Installs adapter in develop mode along with development dependencies
 	@\
-	pip install -e . -r requirements.txt -r dev-requirements.txt && pre-commit install
+	pip install -e . -r requirements.txt -r dev-requirements.txt -r dagger/requirements.txt && pre-commit install
 
 .PHONY: dev-uninstall
 dev-uninstall: ## Uninstalls all packages while maintaining the virtual environment
@@ -40,12 +40,13 @@ linecheck: ## Checks for all Python lines 100 characters or more
 .PHONY: unit
 unit: ## Runs unit tests with py38.
 	@\
-	tox -e py38
+	python -m pytest tests/unit
 
 .PHONY: test
 test: ## Runs unit tests with py38 and code checks against staged changes.
 	@\
-	tox -p -e py38; \
+	python -m pytest tests/unit; \
+	python dagger/run_dbt_spark_tests.py --profile spark_session \
 	pre-commit run black-check --hook-stage manual | grep -v "INFO"; \
 	pre-commit run flake8-check --hook-stage manual | grep -v "INFO"; \
 	pre-commit run mypy-check --hook-stage manual | grep -v "INFO"
diff --git a/README.md b/README.md
index 2d2586795..7e95b1fc3 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,6 @@
-
-
-
 **[dbt](https://www.getdbt.com/)** enables data analysts and engineers to transform their data using the same practices that software engineers use to build applications.
diff --git a/dagger/requirements.txt b/dagger/requirements.txt
new file mode 100644
index 000000000..df36543c2
--- /dev/null
+++ b/dagger/requirements.txt
@@ -0,0 +1,2 @@
+dagger-io~=0.8.0
+python-dotenv
diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py
new file mode 100644
index 000000000..718519909
--- /dev/null
+++ b/dagger/run_dbt_spark_tests.py
@@ -0,0 +1,130 @@
+import os
+
+import argparse
+import sys
+
+import anyio as anyio
+import dagger as dagger
+from dotenv import find_dotenv, load_dotenv
+
+PG_PORT = 5432
+load_dotenv(find_dotenv("test.env"))
+# if env vars aren't specified in test.env (i.e. in github actions worker), use the ones from the host
+TESTING_ENV_VARS = {
+    env_name: os.environ[env_name]
+    for env_name in os.environ
+    if env_name.startswith(("DD_", "DBT_"))
+}
+
+TESTING_ENV_VARS.update({"ODBC_DRIVER": "Simba"})
+
+
+def env_variables(envs: dict[str, str]):
+    def env_variables_inner(ctr: dagger.Container):
+        for key, value in envs.items():
+            ctr = ctr.with_env_variable(key, value)
+        return ctr
+
+    return env_variables_inner
+
+
+async def get_postgres_container(client: dagger.Client) -> (dagger.Container, str):
+    ctr = await (
+        client.container()
+        .from_("postgres:13")
+        .with_env_variable("POSTGRES_PASSWORD", "postgres")
+        .with_exposed_port(PG_PORT)
+    )
+
+    return ctr, "postgres_db"
+
+
+async def get_spark_container(client: dagger.Client) -> (dagger.Container, str):
+    spark_dir = client.host().directory("./dagger/spark-container")
+    spark_ctr_base = (
+        client.container()
+        .from_("eclipse-temurin:8-jre")
+        .with_directory("/spark_setup", spark_dir)
+        .with_env_variable("SPARK_HOME", "/usr/spark")
+        .with_env_variable("PATH", "/usr/spark/bin:/usr/spark/sbin:$PATH", expand=True)
+        .with_file(
+            "/scripts/entrypoint.sh",
+            client.host().file("./dagger/spark-container/entrypoint.sh"),
+            permissions=755,
+        )
+        .with_file(
+            "/scripts/install_spark.sh",
+            client.host().file("./dagger/spark-container/install_spark.sh"),
+            permissions=755,
+        )
+        .with_exec(["./spark_setup/install_spark.sh"])
+        .with_file("/usr/spark/conf/hive-site.xml", spark_dir.file("/hive-site.xml"))
+        .with_file("/usr/spark/conf/spark-defaults.conf", spark_dir.file("spark-defaults.conf"))
+    )
+
+    # postgres is the metastore here
+    pg_ctr, pg_host = await get_postgres_container(client)
+
+    spark_ctr = (
+        spark_ctr_base.with_service_binding(alias=pg_host, service=pg_ctr)
+        .with_exec(
+            [
+                "/scripts/entrypoint.sh",
+                "--class",
+                "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2",
+                "--name",
+                "Thrift JDBC/ODBC Server",
+            ]
+        )
+        .with_exposed_port(10000)
+    )
+
+    return spark_ctr, "spark_db"
+
+
+async def test_spark(test_args):
+    async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client:
+        test_profile = test_args.profile
+        req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini"])
+        dbt_spark_dir = client.host().directory("./dbt")
+        test_dir = client.host().directory("./tests")
+        scripts = client.host().directory("./dagger/scripts")
+        platform = dagger.Platform("linux/amd64")
+        tst_container = (
+            client.container(platform=platform)
+            .from_("python:3.8-slim")
+            .with_directory("/.", req_files)
+            .with_directory("/dbt", dbt_spark_dir)
+            .with_directory("/tests", test_dir)
+            .with_directory("/scripts", scripts)
+            .with_exec("./scripts/install_os_reqs.sh")
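+            # install dbt-spark's Python requirements inside the test container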
+ .with_exec(["pip", "install", "-r", "requirements.txt"]) + .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) + ) + + if test_profile == "apache_spark": + spark_ctr, spark_host = await get_spark_container(client) + tst_container = tst_container.with_service_binding(alias=spark_host, service=spark_ctr) + + elif test_profile in ["databricks_cluster", "databricks_sql_endpoint"]: + tst_container = tst_container.with_exec("./scripts/configure_odbc.sh") + + elif test_profile == "spark_session": + tst_container = tst_container.with_exec(["pip", "install", "pyspark"]) + tst_container = tst_container.with_exec(["apt-get", "install", "openjdk-17-jre", "-y"]) + + tst_container = tst_container.with_(env_variables(TESTING_ENV_VARS)) + test_path = test_args.test_path if test_args.test_path else "tests/functional/adapter" + result = await tst_container.with_exec( + ["pytest", "-v", "--profile", test_profile, "-n", "auto", test_path] + ).stdout() + + return result + + +parser = argparse.ArgumentParser() +parser.add_argument("--profile", required=True, type=str) +parser.add_argument("--test-path", required=False, type=str) +args = parser.parse_args() + +anyio.run(test_spark, args) diff --git a/dagger/scripts/configure_odbc.sh b/dagger/scripts/configure_odbc.sh new file mode 100755 index 000000000..ddf020ad2 --- /dev/null +++ b/dagger/scripts/configure_odbc.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -eo +rm -rf /tmp && mkdir /tmp + +curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" +unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ +dpkg -i /tmp/SimbaSparkODBC-2.6.16.1019-Debian-64bit/simbaspark_2.6.16.1019-2_amd64.deb +echo "--------------------------------------------" +echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini +dpkg -l | grep Simba # confirm that the driver is installed +rm -rf /tmp diff --git a/dagger/scripts/install_os_reqs.sh b/dagger/scripts/install_os_reqs.sh new file mode 100755 index 000000000..b50027f52 --- /dev/null +++ b/dagger/scripts/install_os_reqs.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -eo +apt-get update && apt-get install -y --no-install-recommends \ + g++ \ + git \ + curl \ + unixodbc \ + unixodbc-dev \ + libsasl2-modules-gssapi-mit \ + unzip diff --git a/dagger/spark-container/entrypoint.sh b/dagger/spark-container/entrypoint.sh new file mode 100644 index 000000000..4b15cab61 --- /dev/null +++ b/dagger/spark-container/entrypoint.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +if [ -n "$WAIT_FOR" ]; then + IFS=';' read -a HOSTPORT_ARRAY <<< "$WAIT_FOR" + for HOSTPORT in "${HOSTPORT_ARRAY[@]}" + do + WAIT_FOR_HOST=${HOSTPORT%:*} + WAIT_FOR_PORT=${HOSTPORT#*:} + + echo Waiting for $WAIT_FOR_HOST to listen on $WAIT_FOR_PORT... + while ! nc -z $WAIT_FOR_HOST $WAIT_FOR_PORT; do echo sleeping; sleep 2; done + done +fi +echo "$PATH" +exec spark-submit "$@" diff --git a/dagger/spark-container/hive-site.xml b/dagger/spark-container/hive-site.xml new file mode 100644 index 000000000..93e966fb7 --- /dev/null +++ b/dagger/spark-container/hive-site.xml @@ -0,0 +1,46 @@ + + + + +