1.6.3 Release
benc-db committed Sep 8, 2023
2 parents e69f121 + 4e77d3c commit cbd78e9
Showing 11 changed files with 136 additions and 38 deletions.
47 changes: 37 additions & 10 deletions .github/workflows/integration.yml
@@ -1,16 +1,46 @@
name: Integration Tests
on: push
on:
push:
paths-ignore:
- "**.MD"
jobs:
run-tox-tests-uc:
run-tox-tests-uc-cluster:
runs-on: ubuntu-latest
environment: azure-prod
env:
DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
DBT_DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DBT_DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }}
DBT_DATABRICKS_CLIENT_SECRET: ${{ secrets.TEST_PECO_SP_SECRET }}
DBT_DATABRICKS_UC_INITIAL_CATALOG: peco
DBT_DATABRICKS_LOCATION_ROOT: ${{ secrets.TEST_PECO_EXTERNAL_LOCATION }}
TEST_PECO_UC_CLUSTER_ID: ${{ secrets.TEST_PECO_UC_CLUSTER_ID }}
steps:
- name: Check out repository
uses: actions/checkout@v3
- name: Set up python
id: setup-python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Get http path from environment
run: python .github/workflows/build_cluster_http_path.py
shell: sh
- name: Install tox
id: install-dependencies
run: pip install tox
- name: Run integration-uc-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_CLIENT_ID=$DBT_DATABRICKS_CLIENT_ID DBT_DATABRICKS_CLIENT_SECRET=$DBT_DATABRICKS_CLIENT_SECRET tox -e integration-databricks-uc-cluster

run-tox-tests-uc-sql:
runs-on: ubuntu-latest
environment: azure-prod
env:
DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
DBT_DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }}
DBT_DATABRICKS_CLIENT_SECRET: ${{ secrets.TEST_PECO_SP_SECRET }}
DBT_DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }}
DBT_DATABRICKS_UC_INITIAL_CATALOG: peco
DBT_DATABRICKS_LOCATION_ROOT: ${{ secrets.TEST_PECO_EXTERNAL_LOCATION }}
TEST_PECO_CLUSTER_ID: ${{ secrets.TEST_PECO_CLUSTER_ID }}
TEST_PECO_UC_CLUSTER_ID: ${{ secrets.TEST_PECO_UC_CLUSTER_ID }}
steps:
- name: Check out repository
@@ -27,18 +57,15 @@ jobs:
id: install-dependencies
run: pip install tox
- name: Run integration-databricks-uc-sql-endpoint
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_HTTP_PATH DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_TOKEN=$DBT_DATABRICKS_TOKEN tox -e integration-databricks-uc-sql-endpoint
- name: Run integration-uc-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_CLUSTER_HTTP_PATH DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_TOKEN=$DBT_DATABRICKS_TOKEN tox -e integration-databricks-uc-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH=$DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH DBT_DATABRICKS_CLIENT_ID=$DBT_DATABRICKS_CLIENT_ID DBT_DATABRICKS_CLIENT_SECRET=$DBT_DATABRICKS_CLIENT_SECRET tox -e integration-databricks-uc-sql-endpoint

run-tox-tests-non-uc:
runs-on: ubuntu-latest
environment: azure-prod
env:
DBT_DATABRICKS_HOST_NAME: ${{ secrets.DATABRICKS_HOST }}
DBT_DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DBT_DATABRICKS_UC_INITIAL_CATALOG: peco
TEST_PECO_CLUSTER_ID: ${{ secrets.TEST_PECO_CLUSTER_ID }}
TEST_PECO_UC_CLUSTER_ID: ${{ secrets.TEST_PECO_UC_CLUSTER_ID }}
DBT_DATABRICKS_LOCATION_ROOT: ${{ secrets.TEST_PECO_EXTERNAL_LOCATION }}
steps:
- name: Check out repository
@@ -55,4 +82,4 @@ jobs:
id: install-dependencies
run: pip install tox
- name: Run integration-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_CLUSTER_HTTP_PATH DBT_DATABRICKS_TOKEN=$DBT_DATABRICKS_TOKEN tox -e integration-databricks-cluster
run: [email protected] DBT_DATABRICKS_LOCATION_ROOT=$DBT_DATABRICKS_LOCATION_ROOT DBT_DATABRICKS_HOST_NAME=$DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_HTTP_PATH=$DBT_DATABRICKS_CLUSTER_HTTP_PATH DBT_DATABRICKS_CLIENT_ID=$DBT_DATABRICKS_CLIENT_ID DBT_DATABRICKS_CLIENT_SECRET=$DBT_DATABRICKS_CLIENT_SECRET tox -e integration-databricks-cluster
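Each job above exports cluster IDs as secrets-backed environment variables and calls `.github/workflows/build_cluster_http_path.py` to derive the HTTP path that tox needs. That script is not part of this diff; the snippet below is only a hypothetical sketch of what such a step could look like, assuming the `sql/protocolv1/o/<org-id>/<cluster-id>` path convention shown in the workflow docs later in this changeset (`DATABRICKS_ORG_ID` is an invented placeholder, not a variable defined by this workflow).

```python
# Hypothetical sketch only; the real build_cluster_http_path.py is not shown in this diff.
import os


def build_http_path(org_id: str, cluster_id: str) -> str:
    # Assumes the sql/protocolv1/o/<org-id>/<cluster-id> convention from the docs' profiles.yml example.
    return f"sql/protocolv1/o/{org_id}/{cluster_id}"


if __name__ == "__main__":
    org_id = os.environ.get("DATABRICKS_ORG_ID", "0")  # invented placeholder variable
    cluster_id = os.environ["TEST_PECO_UC_CLUSTER_ID"]
    http_path = build_http_path(org_id, cluster_id)
    # GitHub Actions exposes variables to later steps when they are appended to the file named by GITHUB_ENV.
    with open(os.environ["GITHUB_ENV"], "a") as env_file:
        env_file.write(f"DBT_DATABRICKS_UC_CLUSTER_HTTP_PATH={http_path}\n")
```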
8 changes: 6 additions & 2 deletions .github/workflows/main.yml
@@ -20,7 +20,11 @@ on:
- "main"
- "*.latest"
- "releases/*"
paths-ignore:
- "**.MD"
pull_request:
paths-ignore:
- "**.MD"
workflow_dispatch:

permissions: read-all
@@ -59,7 +63,7 @@ jobs:
- name: Set up Python
uses: actions/[email protected]
with:
python-version: '3.8'
python-version: "3.8"

- name: Install python dependencies
run: |
@@ -132,7 +136,7 @@ jobs:
- name: Set up Python
uses: actions/[email protected]
with:
python-version: '3.8'
python-version: "3.8"

- name: Install python dependencies
run: |
18 changes: 16 additions & 2 deletions CHANGELOG.md
@@ -1,16 +1,30 @@
## dbt-databricks 1.6.x (Release TBD)

## dbt-databricks 1.6.3 (September 8, 2023)

### Fixes

- Improved legibility of python stack traces ([#434](https://github.com/databricks/dbt-databricks/pull/434)).
- Add `fetchmany`, resolves #408 (Thanks @NodeJSmith) ([#409](https://github.com/databricks/dbt-databricks/pull/409))
- Improved legibility of python stack traces ([#434](https://github.com/databricks/dbt-databricks/pull/434))
- Update our Databricks Workflow README to make clear that jobs clusters are not supported targets ([#436](https://github.com/databricks/dbt-databricks/pull/436))
- Relaxed the constraint on databricks-sql-connector to allow newer versions ([#436](https://github.com/databricks/dbt-databricks/pull/436))
- Streamlined sql connector output in dbt.log ([#437](https://github.com/databricks/dbt-databricks/pull/437))

### Under the hood

- Switch to running integration tests with OAuth ([#436](https://github.com/databricks/dbt-databricks/pull/436))

## dbt-databricks 1.6.2 (August 29, 2023)

### Features

- Follow up: re-implement fix for issue where the show tables extended command is limited to 2048 characters. ([#326](https://github.com/databricks/dbt-databricks/pull/326)). Set `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` to `true` to enable this behaviour.
- Add `liquid_clustered_by` config to enable Liquid Clustering for Delta-based dbt models.
- Add `liquid_clustered_by` config to enable Liquid Clustering for Delta-based dbt models (Thanks @ammarchalifah) ([#398](https://github.com/databricks/dbt-databricks/pull/398)).

### Under the hood

- Dropping the databricks_sql_endpoint test profile as not truly testing different behavior than databricks_uc_sql_endpoint profile ([#417](https://github.com/databricks/dbt-databricks/pull/417))

- Improve testing of python model support so that we can package the new config options in this release ([#421](https://github.com/databricks/dbt-databricks/pull/421))

## dbt-databricks 1.6.1 (August 2, 2023)
2 changes: 1 addition & 1 deletion dbt/adapters/databricks/__version__.py
@@ -1 +1 @@
version: str = "1.6.2"
version: str = "1.6.3"
11 changes: 9 additions & 2 deletions dbt/adapters/databricks/connections.py
@@ -78,7 +78,7 @@ def emit(self, record: logging.LogRecord) -> None:
dbt_adapter_logger = AdapterLogger("databricks-sql-connector")

pysql_logger = logging.getLogger("databricks.sql")
pysql_logger_level = os.environ.get("DBT_DATABRICKS_CONNECTOR_LOG_LEVEL", "INFO").upper()
pysql_logger_level = os.environ.get("DBT_DATABRICKS_CONNECTOR_LOG_LEVEL", "WARN").upper()
pysql_logger.setLevel(pysql_logger_level)

pysql_handler = DbtCoreHandler(dbt_logger=dbt_adapter_logger, level=pysql_logger_level)
@@ -496,6 +496,9 @@ def fetchall(self) -> Sequence[Tuple]:
def fetchone(self) -> Optional[Tuple]:
return self._cursor.fetchone()

def fetchmany(self, size: int) -> Sequence[Tuple]:
return self._cursor.fetchmany(size)

def execute(self, sql: str, bindings: Optional[Sequence[Any]] = None) -> None:
# print(f"execute: {sql}")
if sql.strip().endswith(";"):
@@ -779,7 +782,11 @@ def add_query(
cursor.close()

def execute(
self, sql: str, auto_begin: bool = False, fetch: bool = False, limit: Optional[int] = None
self,
sql: str,
auto_begin: bool = False,
fetch: bool = False,
limit: Optional[int] = None,
) -> Tuple[DatabricksAdapterResponse, Table]:
sql = self._add_query_comment(sql)
_, cursor = self.add_query(sql, auto_begin)
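The new `fetchmany` method simply delegates to the underlying connector cursor, mirroring the DB-API so callers can pull rows in fixed-size batches instead of materializing the whole result set. A minimal usage sketch, assuming a cursor object that exposes `execute()` and the new `fetchmany(size)` pass-through (not code from this repository):

```python
# Minimal sketch, not repository code: stream rows in batches via the fetchmany pass-through.
def iter_rows(cursor, sql, batch_size=1000):
    """Yield rows one at a time while fetching them from the cursor in fixed-size batches."""
    cursor.execute(sql)
    while True:
        batch = cursor.fetchmany(batch_size)
        if not batch:  # an empty sequence signals that the result set is exhausted
            break
        for row in batch:
            yield row
```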
5 changes: 3 additions & 2 deletions dbt/adapters/databricks/python_submissions.py
@@ -2,6 +2,7 @@

from dbt.adapters.databricks.__version__ import version
from dbt.adapters.databricks.connections import DatabricksCredentials
from dbt.adapters.databricks import utils

import base64
import time
@@ -146,7 +147,7 @@ def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> No
"Python model failed with traceback as:\n"
"(Note that the line number here does not "
"match the line number in your code due to dbt templating)\n"
f"{json_run_output['error_trace']}"
f"{utils.remove_ansi(json_run_output['error_trace'])}"
)

def submit(self, compiled_code: str) -> None:
@@ -373,7 +374,7 @@ def submit(self, compiled_code: str) -> None:
if response["results"]["resultType"] == "error":
raise dbt.exceptions.DbtRuntimeError(
f"Python model failed with traceback as:\n"
f"{response['results']['cause']}"
f"{utils.remove_ansi(response['results']['cause'])}"
)
finally:
context.destroy(context_id)
5 changes: 5 additions & 0 deletions dbt/adapters/databricks/utils.py
@@ -72,3 +72,8 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
return func(*new_args, **new_kwargs)

return wrapper


def remove_ansi(line: str) -> str:
ansi_escape = re.compile(r"(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]")
return ansi_escape.sub("", line)
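The new `remove_ansi` helper strips ANSI escape sequences so that Python model tracebacks surfaced through dbt stay readable in plain-text logs. An illustrative, self-contained example with a made-up colored string (the regex is copied from the change above):

```python
import re


def remove_ansi(line: str) -> str:
    # Same pattern as the helper added above.
    ansi_escape = re.compile(r"(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]")
    return ansi_escape.sub("", line)


# "\x1b[0;31m" and "\x1b[0m" are the red/reset color codes typical of notebook tracebacks.
colored = "\x1b[0;31mException\x1b[0m: Python model failed"
print(remove_ansi(colored))  # prints: Exception: Python model failed
```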
41 changes: 28 additions & 13 deletions docs/databricks-workflows.md
@@ -5,25 +5,34 @@ Databricks Workflows is a highly-reliable, managed orchestrator that lets you au
In this guide, you will learn how to update an existing dbt project to run as a job, retrieve dbt run artifacts using the Jobs API and debug common issues.

# Overview

When you run a dbt project as a Databricks Job, the dbt CLI runs on a single-node Automated Cluster. The SQL generated by dbt runs on a serverless SQL warehouse.

# Prerequisites

- An existing dbt project version controlled in git
- Access to a Databricks workspace
- Ability to launch job clusters (using a policy or cluster create permissions) or access to an existing interactive cluster with `dbt-core` and `dbt-databricks` libraries installed or `CAN_MANAGE` permissions to install the `dbt-core` and `dbt-databricks` as cluster libraries.
- Access to an existing interactive cluster with `dbt-core` and `dbt-databricks` libraries installed or `CAN_MANAGE` permissions to install the `dbt-core` and `dbt-databricks` as cluster libraries OR
- Access to serverless SQL warehouses. See [documentation](https://docs.databricks.com/serverless-compute/index.html) to learn more about this feature and regional availability.
- [Files in Repos](https://docs.databricks.com/repos/index.html#enable-support-for-arbitrary-files-in-databricks-repos) must be enabled and is only supported on Databricks Runtime (DBR) 8.4+ or DBR 11+ depending on the configuration. Please make sure the cluster has the appropriate DBR version.
- Install and configure the [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html)
- Install [jq](https://stedolan.github.io/jq/download/), a popular open source tool for parsing JSON from the command line

Note: previously, dbt tasks on Databricks Workflows could target job clusters for compute.
That is [no longer supported](https://docs.databricks.com/en/workflows/jobs/how-to/use-dbt-in-workflows.html#advanced-run-dbt-with-a-custom-profile).
Job clusters can now only be used for running the dbt CLI.

# Run dbt as a production job

In this step, you will create a job that will run the dbt project on a schedule.

## Connect Databricks to Git

The dbt task only supports retrieving dbt projects from Git. Please follow [the documentation](https://docs.databricks.com/repos/index.html#configure-your-git-integration-with-databricks) to connect Databricks to Git.

## Create a job
1. Log in to your Databricks workspace

1. Log in to your Databricks workspace
2. Click the _Data Science & Engineering_ persona in the left navigation bar
3. Click _Workflows_
4. Click _Create Job_
@@ -41,12 +50,14 @@ The dbt task only supports retrieve dbt projects from Git. Please follow [the do
13. Click _Save_

# Run the job and view dbt output

You can now run your newly-saved job and see its output.

1. Click _Run Now_ on the notification that shows up when you save the job
2. Click the active run and see dbt output. Note that dbt output is not real-time; it lags behind dbt's progress by several seconds to a minute.

# Retrieve dbt artifacts using the Jobs API

A dbt run generates useful artifacts which you may want to retrieve for analysis and more. Databricks saves the contents of `/logs` and `/target` directories as a compressed archive which you can retrieve using the Jobs API.

> It is currently not possible to refer to a previous run's artifacts e.g. using the `--state` flag. You can, however, include a known good state in your repository.
@@ -77,14 +88,18 @@ $ tar -xvf artifact.tar.gz
```
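If you prefer to script the retrieval end to end, the sketch below downloads and unpacks the archive and prints a summary from `target/run_results.json`. It assumes you have already obtained the artifact download URL from the Jobs API run output and that the extracted layout contains a `target` directory; adjust paths and authentication to your workspace.

```python
# Minimal sketch: download a dbt run artifact archive and summarize run_results.json.
# The artifact URL is assumed to have been obtained from the Jobs API run output beforehand;
# a pre-signed link may not require the Authorization header at all.
import io
import json
import tarfile

import requests


def summarize_run(artifact_url: str, token: str) -> None:
    response = requests.get(
        artifact_url, headers={"Authorization": f"Bearer {token}"}, timeout=60
    )
    response.raise_for_status()
    with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz") as archive:
        archive.extractall("artifact")
    # Adjust the path if the archive is laid out differently in your workspace.
    with open("artifact/target/run_results.json") as f:
        results = json.load(f)
    for result in results["results"]:
        print(result["unique_id"], result["status"])
```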

# Common issues

## Unable to connect to Databricks

- If you do not use the automatically-generated `profiles.yml`, check your Personal Access Token (PAT). It must not be expired.
- Consider adding `dbt debug` as the first command. This may give you a clue about the failure.

## dbt cannot find my `dbt_project.yml` file

If you have checked out the Git repository before enabling the _Files in Repos_ feature, the checkout might be cached invalidly. You need to push a dummy commit to your repository to force a fresh checkout.

# Connecting to different sources (custom profile)

By default the dbt task type will connect to the serverless SQL warehouse specified in the task without any configuration changes or the need to check in any secrets. It does so by generating a default `profiles.yml` and telling dbt to use it. There are no restrictions on connecting to any other dbt target such as Databricks SQL, Amazon Redshift, Google BigQuery, Snowflake, or any other [supported adapter](https://docs.getdbt.com/docs/available-adapters).

The automatically generated profile can be overridden by specifying an alternative profiles directory in the dbt command using `--profiles-dir <dir>`, where the path of the `<dir>` should be a relative path like `.` or `./my-directory`.
@@ -95,15 +110,15 @@ If you'd like to connect to multiple outputs and including the current Interacti
```yaml
databricks_demo:
target: databricks_cluster
outputs:
databricks_cluster:
type: databricks
connect_retries: 5
connect_timeout: 180
schema: "<your-schema>"
threads: 8 # This can be increased or decreased to control the parallism
host: "{{ env_var('DBT_HOST') }}"
http_path: "sql/protocolv1/o/{{ env_var('DBT_ORG_ID') }}/{{ env_var('DBT_CLUSTER_ID') }}"
token: "{{ env_var('DBT_ACCESS_TOKEN') }}"
target: databricks_cluster
outputs:
databricks_cluster:
type: databricks
connect_retries: 5
connect_timeout: 180
schema: "<your-schema>"
threads: 8 # This can be increased or decreased to control the parallism
host: "{{ env_var('DBT_HOST') }}"
http_path: "sql/protocolv1/o/{{ env_var('DBT_ORG_ID') }}/{{ env_var('DBT_CLUSTER_ID') }}"
token: "{{ env_var('DBT_ACCESS_TOKEN') }}"
```
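The profile above resolves all connection details through dbt's `env_var()` function at run time. A minimal sketch of supplying those variables and pointing dbt at the custom profile follows; every value shown is a hypothetical placeholder, and `--profiles-dir` is kept relative as noted above.

```python
# Minimal sketch: supply the env_var() values and run dbt against the custom profile.
# Every value below is a hypothetical placeholder.
import os
import subprocess

env = dict(os.environ)
env.update(
    {
        "DBT_HOST": "adb-1234567890123456.7.azuredatabricks.net",
        "DBT_ORG_ID": "1234567890123456",
        "DBT_CLUSTER_ID": "0908-123456-abcdefgh",
        "DBT_ACCESS_TOKEN": "dapiXXXXXXXXXXXXXXXX",
    }
)
# --profiles-dir must be a relative path such as "." as noted above.
subprocess.run(
    ["dbt", "run", "--profiles-dir", ".", "--target", "databricks_cluster"],
    env=env,
    check=True,
)
```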
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
databricks-sql-connector~=2.7.0
databricks-sql-connector>=2.9.3, <3.0.0
dbt-spark==1.6.0
databricks-sdk==0.1.7
keyring>=23.13.0
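Both `requirements.txt` here and `setup.py` below now accept any `databricks-sql-connector` release in the `>=2.9.3,<3.0.0` range. If you want to confirm that an existing environment satisfies the relaxed constraint, a small optional check using the third-party `packaging` library might look like this:

```python
# Optional sanity check that the installed connector falls inside the relaxed range.
from importlib.metadata import version  # Python 3.8+

from packaging.specifiers import SpecifierSet

installed = version("databricks-sql-connector")
allowed = SpecifierSet(">=2.9.3,<3.0.0")
print(installed, "ok" if installed in allowed else "outside the supported range")
```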
14 changes: 10 additions & 4 deletions setup.py
@@ -17,7 +17,9 @@
except ImportError:
# the user has a downlevel version of setuptools.
print("Error: dbt requires setuptools v40.1.0 or higher.")
print('Please upgrade setuptools with "pip install --upgrade setuptools" and try again')
print(
'Please upgrade setuptools with "pip install --upgrade setuptools" and try again'
)
sys.exit(1)


@@ -29,12 +31,16 @@

# get this package's version from dbt/adapters/<name>/__version__.py
def _get_plugin_version():
_version_path = os.path.join(this_directory, "dbt", "adapters", "databricks", "__version__.py")
_version_path = os.path.join(
this_directory, "dbt", "adapters", "databricks", "__version__.py"
)
try:
exec(open(_version_path).read())
return locals()["version"]
except IOError:
print("Failed to load dbt-databricks version file for packaging.", file=sys.stderr)
print(
"Failed to load dbt-databricks version file for packaging.", file=sys.stderr
)
sys.exit(-1)


@@ -55,7 +61,7 @@ def _get_plugin_version():
include_package_data=True,
install_requires=[
"dbt-spark==1.6.0",
"databricks-sql-connector~=2.7.0",
"databricks-sql-connector>=2.9.3, <3.0.0",
"databricks-sdk>=0.1.7",
"keyring>=23.13.0",
],
21 changes: 20 additions & 1 deletion tests/unit/test_utils.py
@@ -1,6 +1,6 @@
import unittest

from dbt.adapters.databricks.utils import redact_credentials
from dbt.adapters.databricks.utils import redact_credentials, remove_ansi


class TestDatabricksUtils(unittest.TestCase):
@@ -68,3 +68,22 @@ def test_redact_credentials_copy_into(self):
"copy_options ('mergeSchema' = 'True')"
)
self.assertEqual(redact_credentials(sql), expected)

def test_remove_ansi(self):
test_string = """Python model failed with traceback as:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
File ~/.ipykernel/1292/command--1-4090367456:79
 70 # COMMAND ----------
 71
 72 # how to execute python model in notebook
"""
expected_string = """Python model failed with traceback as:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
File ~/.ipykernel/1292/command--1-4090367456:79
70 # COMMAND ----------
71
72 # how to execute python model in notebook
"""
self.assertEqual(remove_ansi(test_string), expected_string)
