1.6.3 Release
benc-db committed Sep 8, 2023
2 parents e69f121 + 4e77d3c commit cbd78e9
Showing 11 changed files with 136 additions and 38 deletions.
47 changes: 37 additions & 10 deletions .github/workflows/integration.yml
@@ -1,16 +1,46 @@
name: Integration Tests
on: push
on:
push:
paths-ignore:
- "**.MD"
jobs:
run-tox-tests-uc:
run-tox-tests-uc-cluster:
runs-on: ubuntu-latest
environment: azure-prod
env:
DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
DBT_DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DBT_DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }}
DBT_DATABRICKS_CLIENT_SECRET: ${{ secrets.TEST_PECO_SP_SECRET }}
DBT_DATABRICKS_UC_INITIAL_CATALOG: peco
DBT_DATABRICKS_LOCATION_ROOT: ${{ secrets.TEST_PECO_EXTERNAL_LOCATION }}
TEST_PECO_UC_CLUSTER_ID: ${{ secrets.TEST_PECO_UC_CLUSTER_ID }}
steps:
- name: Check out repository
uses: actions/checkout@v3
- name: Set up python
id: setup-python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Get http path from environment
run: python .github/workflows/build_cluster_http_path.py
shell: sh
- name: Install tox
id: install-dependencies
run: pip install tox
- name: Run integration-uc-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_CLIENT_ID=$DBT_DATABRICKS_CLIENT_ID DBT_DATABRICKS_CLIENT_SECRET=$DBT_DATABRICKS_CLIENT_SECRET tox -e integration-databricks-uc-cluster

run-tox-tests-uc-sql:
runs-on: ubuntu-latest
environment: azure-prod
env:
DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
DBT_DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }}
DBT_DATABRICKS_CLIENT_SECRET: ${{ secrets.TEST_PECO_SP_SECRET }}
DBT_DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }}
DBT_DATABRICKS_UC_INITIAL_CATALOG: peco
DBT_DATABRICKS_LOCATION_ROOT: ${{ secrets.TEST_PECO_EXTERNAL_LOCATION }}
TEST_PECO_CLUSTER_ID: ${{ secrets.TEST_PECO_CLUSTER_ID }}
TEST_PECO_UC_CLUSTER_ID: ${{ secrets.TEST_PECO_UC_CLUSTER_ID }}
steps:
- name: Check out repository
@@ -27,18 +57,15 @@ jobs:
id: install-dependencies
run: pip install tox
- name: Run integration-databricks-uc-sql-endpoint
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_HTTP_PATH DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_TOKEN=$DBT_DATABRICKS_TOKEN tox -e integration-databricks-uc-sql-endpoint
- name: Run integration-uc-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_CLUSTER_HTTP_PATH DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_TOKEN=$DBT_DATABRICKS_TOKEN tox -e integration-databricks-uc-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_CLIENT_ID=$DBT_DATABRICKS_CLIENT_ID DBT_DATABRICKS_CLIENT_SECRET=$DBT_DATABRICKS_CLIENT_SECRET tox -e integration-databricks-uc-sql-endpoint

run-tox-tests-non-uc:
runs-on: ubuntu-latest
environment: azure-prod
env:
DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
DBT_DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DBT_DATABRICKS_UC_INITIAL_CATALOG: peco
TEST_PECO_CLUSTER_ID: ${{ secrets.TEST_PECO_CLUSTER_ID }}
TEST_PECO_UC_CLUSTER_ID: ${{ secrets.TEST_PECO_UC_CLUSTER_ID }}
DBT_DATABRICKS_LOCATION_ROOT: ${{ secrets.TEST_PECO_EXTERNAL_LOCATION }}
steps:
- name: Check out repository
@@ -55,4 +82,4 @@ jobs:
id: install-dependencies
run: pip install tox
- name: Run integration-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_CLUSTER_HTTP_PATH DBT_DATABRICKS_TOKEN=$DBT_DATABRICKS_TOKEN tox -e integration-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_CLUSTER_HTTP_PATH DBT_DATABRICKS_CLIENT_ID=$DBT_DATABRICKS_CLIENT_ID DBT_DATABRICKS_CLIENT_SECRET=$DBT_DATABRICKS_CLIENT_SECRET tox -e integration-databricks-cluster
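Each job above exports cluster IDs as secrets-backed environment variables and calls `.github/workflows/build_cluster_http_path.py` to derive the HTTP path that tox needs. That script is not part of this diff; the snippet below is only a hypothetical sketch of what such a step could look like, assuming the `sql/protocolv1/o/<org-id>/<cluster-id>` path convention shown in the workflow docs later in this changeset (`DATABRICKS_ORG_ID` is an invented placeholder, not a variable defined by this workflow).

```python
# Hypothetical sketch only; the real build_cluster_http_path.py is not shown in this diff.
import os


def build_http_path(org_id: str, cluster_id: str) -> str:
    # Assumes the sql/protocolv1/o/<org-id>/<cluster-id> convention from the docs' profiles.yml example.
    return f"sql/protocolv1/o/{org_id}/{cluster_id}"


if __name__ == "__main__":
    org_id = os.environ.get("DATABRICKS_ORG_ID", "0")  # invented placeholder variable
    cluster_id = os.environ["TEST_PECO_UC_CLUSTER_ID"]
    http_path = build_http_path(org_id, cluster_id)
    # GitHub Actions exposes variables to later steps when they are appended to the file named by GITHUB_ENV.
    with open(os.environ["GITHUB_ENV"], "a") as env_file:
        env_file.write(f"DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH={http_path}\n")
```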
8 changes: 6 additions & 2 deletions .github/workflows/main.yml
@@ -20,7 +20,11 @@ on:
- "main"
- "*.latest"
- "releases/*"
paths-ignore:
- "**.MD"
pull_request:
paths-ignore:
- "**.MD"
workflow_dispatch:

permissions: read-all
@@ -59,7 +63,7 @@ jobs:
- name: Set up Python
uses: actions/[email protected]
with:
python-version: '3.8'
python-version: "3.8"

- name: Install python dependencies
run: |
@@ -132,7 +136,7 @@ jobs:
- name: Set up Python
uses: actions/[email protected]
with:
python-version: '3.8'
python-version: "3.8"

- name: Install python dependencies
run: |
18 changes: 16 additions & 2 deletions CHANGELOG.md
@@ -1,16 +1,30 @@
## dbt-databricks 1.6.x (Release TBD)

## dbt-databricks 1.6.3 (September 8, 2023)

### Fixes

- Improved legibility of python stack traces ([#434](https://github.com/databricks/dbt-databricks/pull/434)).
- Add `fetchmany`, resolves #408 (Thanks @NodeJSmith) ([#409](https://github.com/databricks/dbt-databricks/pull/409))
- Improved legibility of python stack traces ([#434](https://github.com/databricks/dbt-databricks/pull/434))
- Update our Databricks Workflow README to make clear that jobs clusters are not supported targets ([#436](https://github.com/databricks/dbt-databricks/pull/436))
- Relaxed the constraint on databricks-sql-connector to allow newer versions ([#436](https://github.com/databricks/dbt-databricks/pull/436))
- Streamlined sql connector output in dbt.log ([#437](https://github.com/databricks/dbt-databricks/pull/437))

### Under the hood

- Switch to running integration tests with OAuth ([#436](https://github.com/databricks/dbt-databricks/pull/436))

## dbt-databricks 1.6.2 (August 29, 2023)

### Features

- Follow up: re-implement fix for issue where the show tables extended command is limited to 2048 characters. ([#326](https://github.com/databricks/dbt-databricks/pull/326)). Set `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` to `true` to enable this behaviour.
- Add `liquid_clustered_by` config to enable Liquid Clustering for Delta-based dbt models.
- Add `liquid_clustered_by` config to enable Liquid Clustering for Delta-based dbt models (Thanks @ammarchalifah) ([#398](https://github.com/databricks/dbt-databricks/pull/398)).

### Under the hood

- Dropping the databricks_sql_endpoint test profile as not truly testing different behavior than databricks_uc_sql_endpoint profile ([#417](https://github.com/databricks/dbt-databricks/pull/417))

- Improve testing of python model support so that we can package the new config options in this release ([#421](https://github.com/databricks/dbt-databricks/pull/421))

## dbt-databricks 1.6.1 (August 2, 2023)
2 changes: 1 addition & 1 deletion dbt/adapters/databricks/__version__.py
@@ -1 +1 @@
version: str = "1.6.2"
version: str = "1.6.3"
11 changes: 9 additions & 2 deletions dbt/adapters/databricks/connections.py
@@ -78,7 +78,7 @@ def emit(self, record: logging.LogRecord) -> None:
dbt_adapter_logger = AdapterLogger("databricks-sql-connector")

pysql_logger = logging.getLogger("databricks.sql")
pysql_logger_level = os.environ.get("DBT_DATABRICKS_CONNECTOR_LOG_LEVEL", "INFO").upper()
pysql_logger_level = os.environ.get("DBT_DATABRICKS_CONNECTOR_LOG_LEVEL", "WARN").upper()
pysql_logger.setLevel(pysql_logger_level)

pysql_handler = DbtCoreHandler(dbt_logger=dbt_adapter_logger, level=pysql_logger_level)
@@ -496,6 +496,9 @@ def fetchall(self) -> Sequence[Tuple]:
def fetchone(self) -> Optional[Tuple]:
return self._cursor.fetchone()

def fetchmany(self, size: int) -> Sequence[Tuple]:
return self._cursor.fetchmany(size)

def execute(self, sql: str, bindings: Optional[Sequence[Any]] = None) -> None:
# print(f"execute: {sql}")
if sql.strip().endswith(";"):
@@ -779,7 +782,11 @@ def add_query(
cursor.close()

def execute(
self, sql: str, auto_begin: bool = False, fetch: bool = False, limit: Optional[int] = None
self,
sql: str,
auto_begin: bool = False,
fetch: bool = False,
limit: Optional[int] = None,
) -> Tuple[DatabricksAdapterResponse, Table]:
sql = self._add_query_comment(sql)
_, cursor = self.add_query(sql, auto_begin)
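The new `fetchmany` method simply delegates to the underlying connector cursor, mirroring the DB-API so callers can pull rows in fixed-size batches instead of materializing the whole result set. A minimal usage sketch, assuming a cursor object that exposes `execute()` and the new `fetchmany(size)` pass-through (not code from this repository):

```python
# Minimal sketch, not repository code: stream rows in batches via the fetchmany pass-through.
def iter_rows(cursor, sql, batch_size=1000):
    """Yield rows one at a time while fetching them from the cursor in fixed-size batches."""
    cursor.execute(sql)
    while True:
        batch = cursor.fetchmany(batch_size)
        if not batch:  # an empty sequence signals that the result set is exhausted
            break
        for row in batch:
            yield row
```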
5 changes: 3 additions & 2 deletions dbt/adapters/databricks/python_submissions.py
@@ -2,6 +2,7 @@

from dbt.adapters.databricks.__version__ import version
from dbt.adapters.databricks.connections import DatabricksCredentials
from dbt.adapters.databricks import utils

import base64
import time
@@ -146,7 +147,7 @@ def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> No
"Python model failed with traceback as:\n"
"(Note that the line number here does not "
"match the line number in your code due to dbt templating)\n"
f"{json_run_output['error_trace']}"
f"{utils.remove_ansi(json_run_output['error_trace'])}"
)

def submit(self, compiled_code: str) -> None:
@@ -373,7 +374,7 @@ def submit(self, compiled_code: str) -> None:
if response["results"]["resultType"] == "error":
raise dbt.exceptions.DbtRuntimeError(
f"Python model failed with traceback as:\n"
f"{response['results']['cause']}"
f"{utils.remove_ansi(response['results']['cause'])}"
)
finally:
context.destroy(context_id)
5 changes: 5 additions & 0 deletions dbt/adapters/databricks/utils.py
@@ -72,3 +72,8 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
return func(*new_args, **new_kwargs)

return wrapper


def remove_ansi(line: str) -> str:
ansi_escape = re.compile(r"(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]")
return ansi_escape.sub("", line)
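The new `remove_ansi` helper strips ANSI escape sequences so that Python model tracebacks surfaced through dbt stay readable in plain-text logs. An illustrative, self-contained example with a made-up colored string (the regex is copied from the change above):

```python
import re


def remove_ansi(line: str) -> str:
    # Same pattern as the helper added above.
    ansi_escape = re.compile(r"(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]")
    return ansi_escape.sub("", line)


# "\x1b[0;31m" and "\x1b[0m" are the red/reset color codes typical of notebook tracebacks.
colored = "\x1b[0;31mException\x1b[0m: Python model failed"
print(remove_ansi(colored))  # prints: Exception: Python model failed
```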
41 changes: 28 additions & 13 deletions docs/databricks-workflows.md
@@ -5,25 +5,34 @@ Databricks Workflows is a highly-reliable, managed orchestrator that lets you au
In this guide, you will learn how to update an existing dbt project to run as a job, retrieve dbt run artifacts using the Jobs API and debug common issues.

# Overview

When you run a dbt project as a Databricks Job, the dbt CLI runs on a single-node Automated Cluster. The SQL generated by dbt runs on a serverless SQL warehouse.

# Prerequisites

- An existing dbt project version controlled in git
- Access to a Databricks workspace
- Ability to launch job clusters (using a policy or cluster create permissions) or access to an existing interactive cluster with `dbt-core` and `dbt-databricks` libraries installed or `CAN_MANAGE` permissions to install the `dbt-core` and `dbt-databricks` as cluster libraries.
- Access to an existing interactive cluster with `dbt-core` and `dbt-databricks` libraries installed or `CAN_MANAGE` permissions to install the `dbt-core` and `dbt-databricks` as cluster libraries OR
- Access to serverless SQL warehouses. See [documentation](https://docs.databricks.com/serverless-compute/index.html) to learn more about this feature and regional availability.
- [Files in Repos](https://docs.databricks.com/repos/index.html#enable-support-for-arbitrary-files-in-databricks-repos) must be enabled and is only supported on Databricks Runtime (DBR) 8.4+ or DBR 11+ depending on the configuration. Please make sure the cluster has the appropriate DBR version.
- Install and configure the [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html)
- Install [jq](https://stedolan.github.io/jq/download/), a popular open source tool for parsing JSON from the command line

Note: previously, dbt tasks on Databricks Workflows could target job clusters for compute.
That is [no longer supported](https://docs.databricks.com/en/workflows/jobs/how-to/use-dbt-in-workflows.html#advanced-run-dbt-with-a-custom-profile).
Job clusters can now only be used for running the dbt CLI.

# Run dbt as a production job

In this step, you will create a job that will run the dbt project on a schedule.

## Connect Databricks to Git

The dbt task only supports retrieving dbt projects from Git. Please follow [the documentation](https://docs.databricks.com/repos/index.html#configure-your-git-integration-with-databricks) to connect Databricks to Git.

## Create a job
1. Log in to your Databricks workspace

1. Log in to your Databricks workspace
2. Click the _Data Science & Engineering_ persona in the left navigation bar
3. Click _Workflows_
4. Click _Create Job_
@@ -41,12 +50,14 @@ The dbt task only supports retrieve dbt projects from Git. Please follow [the do
13. Click _Save_

# Run the job and view dbt output

You can now run your newly-saved job and see its output.

1. Click _Run Now_ on the notification that shows up when you save the job
2. Click the active run and see dbt output. Note that dbt output is not real-time; it lags behind dbt's progress by several seconds to a minute.

# Retrieve dbt artifacts using the Jobs API

A dbt run generates useful artifacts which you may want to retrieve for analysis and more. Databricks saves the contents of `/logs` and `/target` directories as a compressed archive which you can retrieve using the Jobs API.

> It is currently not possible to refer to a previous run's artifacts e.g. using the `--state` flag. You can, however, include a known good state in your repository.
@@ -77,14 +88,18 @@ $ tar -xvf artifact.tar.gz
```
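If you prefer to script the retrieval end to end, the sketch below downloads and unpacks the archive and prints a summary from `target/run_results.json`. It assumes you have already obtained the artifact download URL from the Jobs API run output and that the extracted layout contains a `target` directory; adjust paths and authentication to your workspace.

```python
# Minimal sketch: download a dbt run artifact archive and summarize run_results.json.
# The artifact URL is assumed to have been obtained from the Jobs API run output beforehand;
# a pre-signed link may not require the Authorization header at all.
import io
import json
import tarfile

import requests


def summarize_run(artifact_url: str, token: str) -> None:
    response = requests.get(
        artifact_url, headers={"Authorization": f"Bearer {token}"}, timeout=60
    )
    response.raise_for_status()
    with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz") as archive:
        archive.extractall("artifact")
    # Adjust the path if the archive is laid out differently in your workspace.
    with open("artifact/target/run_results.json") as f:
        results = json.load(f)
    for result in results["results"]:
        print(result["unique_id"], result["status"])
```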

# Common issues

## Unable to connect to Databricks

- If you do not use the automatically-generated `profiles.yml`, check your Personal Access Token (PAT). It must not be expired.
- Consider adding `dbt debug` as the first command. This may give you a clue about the failure.

## dbt cannot find my `dbt_project.yml` file

If you have checked out the Git repository before enabling the _Files in Repos_ feature, the checkout might be cached invalidly. You need to push a dummy commit to your repository to force a fresh checkout.

# Connecting to different sources (custom profile)

By default the dbt task type will connect to the serverless SQL warehouse specified in the task without any configuration changes or the need to check in any secrets. It does so by generating a default `profiles.yml` and telling dbt to use it. There are no restrictions on connecting to any other dbt target such as Databricks SQL, Amazon Redshift, Google BigQuery, Snowflake, or any other [supported adapter](https://docs.getdbt.com/docs/available-adapters).

The automatically generated profile can be overridden by specifying an alternative profiles directory in the dbt command using `--profiles-dir <dir>`, where the path of the `<dir>` should be a relative path like `.` or `./my-directory`.
@@ -95,15 +110,15 @@ If you'd like to connect to multiple outputs and including the current Interacti
```yaml
databricks_demo:
target: databricks_cluster
outputs:
databricks_cluster:
type: databricks
connect_retries: 5
connect_timeout: 180
schema: "<your-schema>"
threads: 8 # This can be increased or decreased to control the parallism
host: "{{ env_var('DBT_HOST') }}"
http_path: "sql/protocolv1/o/{{ env_var('DBT_ORG_ID') }}/{{ env_var('DBT_CLUSTER_ID') }}"
token: "{{ env_var('DBT_ACCESS_TOKEN') }}"
target: databricks_cluster
outputs:
databricks_cluster:
type: databricks
connect_retries: 5
connect_timeout: 180
schema: "<your-schema>"
threads: 8 # This can be increased or decreased to control the parallism
host: "{{ env_var('DBT_HOST') }}"
http_path: "sql/protocolv1/o/{{ env_var('DBT_ORG_ID') }}/{{ env_var('DBT_CLUSTER_ID') }}"
token: "{{ env_var('DBT_ACCESS_TOKEN') }}"
```
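The profile above resolves all connection details through dbt's `env_var()` function at run time. A minimal sketch of supplying those variables and pointing dbt at the custom profile follows; every value shown is a hypothetical placeholder, and `--profiles-dir` is kept relative as noted above.

```python
# Minimal sketch: supply the env_var() values and run dbt against the custom profile.
# Every value below is a hypothetical placeholder.
import os
import subprocess

env = dict(os.environ)
env.update(
    {
        "DBT_HOST": "adb-1234567890123456.7.azuredatabricks.net",
        "DBT_ORG_ID": "1234567890123456",
        "DBT_CLUSTER_ID": "0908-123456-abcdefgh",
        "DBT_ACCESS_TOKEN": "dapiXXXXXXXXXXXXXXXX",
    }
)
# --profiles-dir must be a relative path such as "." as noted above.
subprocess.run(
    ["dbt", "run", "--profiles-dir", ".", "--target", "databricks_cluster"],
    env=env,
    check=True,
)
```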
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
databricks-sql-connector~=2.7.0
databricks-sql-connector>=2.9.3, <3.0.0
dbt-spark==1.6.0
databricks-sdk==0.1.7
keyring>=23.13.0
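Both `requirements.txt` here and `setup.py` below now accept any `databricks-sql-connector` release in the `>=2.9.3,<3.0.0` range. If you want to confirm that an existing environment satisfies the relaxed constraint, a small optional check using the third-party `packaging` library might look like this:

```python
# Optional sanity check that the installed connector falls inside the relaxed range.
from importlib.metadata import version  # Python 3.8+

from packaging.specifiers import SpecifierSet

installed = version("databricks-sql-connector")
allowed = SpecifierSet(">=2.9.3,<3.0.0")
print(installed, "ok" if installed in allowed else "outside the supported range")
```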
14 changes: 10 additions & 4 deletions setup.py
@@ -17,7 +17,9 @@
except ImportError:
# the user has a downlevel version of setuptools.
print("Error: dbt requires setuptools v40.1.0 or higher.")
print('Please upgrade setuptools with "pip install --upgrade setuptools" and try again')
print(
'Please upgrade setuptools with "pip install --upgrade setuptools" and try again'
)
sys.exit(1)


@@ -29,12 +31,16 @@

# get this package's version from dbt/adapters/<name>/__version__.py
def _get_plugin_version():
_version_path = os.path.join(this_directory, "dbt", "adapters", "databricks", "__version__.py")
_version_path = os.path.join(
this_directory, "dbt", "adapters", "databricks", "__version__.py"
)
try:
exec(open(_version_path).read())
return locals()["version"]
except IOError:
print("Failed to load dbt-databricks version file for packaging.", file=sys.stderr)
print(
"Failed to load dbt-databricks version file for packaging.", file=sys.stderr
)
sys.exit(-1)


@@ -55,7 +61,7 @@ def _get_plugin_version():
include_package_data=True,
install_requires=[
"dbt-spark==1.6.0",
"databricks-sql-connector~=2.7.0",
"databricks-sql-connector>=2.9.3, <3.0.0",
"databricks-sdk>=0.1.7",
"keyring>=23.13.0",
],
21 changes: 20 additions & 1 deletion tests/unit/test_utils.py
@@ -1,6 +1,6 @@
import unittest

from dbt.adapters.databricks.utils import redact_credentials
from dbt.adapters.databricks.utils import redact_credentials, remove_ansi


class TestDatabricksUtils(unittest.TestCase):
@@ -68,3 +68,22 @@ def test_redact_credentials_copy_into(self):
"copy_options ('mergeSchema' = 'True')"
)
self.assertEqual(redact_credentials(sql), expected)

def test_remove_ansi(self):
test_string = """Python model failed with traceback as:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
File ~/.ipykernel/1292/command--1-4090367456:79
 70 # COMMAND ----------
 71
 72 # how to execute python model in notebook
"""
expected_string = """Python model failed with traceback as:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
File ~/.ipykernel/1292/command--1-4090367456:79
70 # COMMAND ----------
71
72 # how to execute python model in notebook
"""
self.assertEqual(remove_ansi(test_string), expected_string)
