Merge pull request #3 from octoenergy/rebase-to-main-package

Rebase to main package

matt-fleming authored Jan 26, 2024
2 parents 8347257 + 377e158 commit 56687e5
Showing 37 changed files with 4,244 additions and 1,424 deletions.
163 changes: 0 additions & 163 deletions .github/workflows/code-quality-checks.yml

This file was deleted.

5 changes: 4 additions & 1 deletion .gitignore
@@ -204,4 +204,7 @@ dist/
build/

# vs code stuff
.vscode

# don't commit authentication info to source control
test.env
55 changes: 54 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,59 @@
# Release History

-## 2.5.x (Unreleased)
+## 2.9.4 (Unreleased)

## 2.9.3 (2023-08-24)

- Fix: Connections failed when urllib3~=1.0.0 is installed (#206)

## 2.9.2 (2023-08-17)

- Other: Add `examples/v3_retries_query_execute.py` (#199)
- Other: suppress log message when `_enable_v3_retries` is not `True` (#199)
- Other: make this connector backwards compatible with `urllib3>=1.0.0` (#197)

## 2.9.1 (2023-08-11)

- Other: Explicitly pin urllib3 to ^2.0.0 (#191)

## 2.9.0 (2023-08-10)

- Replace retry handling with DatabricksRetryPolicy. This is disabled by default. To enable, set `enable_v3_retries=True` when creating `databricks.sql.client` (#182)
- Other: Fix typo in README quick start example (#186)
- Other: Add autospec to Client mocks and tidy up `make_request` (#188)

## 2.8.0 (2023-07-21)

- Add support for Cloud Fetch. Disabled by default. Set `use_cloud_fetch=True` when building `databricks.sql.client` to enable it (#146, #151, #154)
- SQLAlchemy has_table function now honours schema= argument and adds catalog= argument (#174)
- SQLAlchemy set non_native_boolean_check_constraint False as it's not supported by Databricks (#120)
- Fix: Revised SQLAlchemy dialect and examples for compatibility with SQLAlchemy==1.3.x (#173)
- Fix: oauth would fail if expired credentials appeared in ~/.netrc (#122)
- Fix: Python HTTP proxies were broken after switch to urllib3 (#158)
- Other: remove unused import in SQLAlchemy dialect
- Other: Relax pandas dependency constraint to allow ^2.0.0 (#164)
- Other: Connector now logs operation handle guids as hexadecimal instead of bytes (#170)
- Other: test_socket_timeout_user_defined e2e test was broken (#144)

## 2.7.0 (2023-06-26)

- Fix: connector raised exception when calling close() on a closed Thrift session
- Improve e2e test development ergonomics
- Redact logged thrift responses by default
- Add support for OAuth on Databricks Azure

## 2.6.2 (2023-06-14)

- Fix: Retry GetOperationStatus requests for http errors

## 2.6.1 (2023-06-08)

- Fix: http.client would raise a BadStatusLine exception in some cases

## 2.6.0 (2023-06-07)

- Add support for HTTP 1.1 connections (connection pools)
- Add a default socket timeout for thrift RPCs

## 2.5.2 (2023-05-08)

11 changes: 11 additions & 0 deletions CONTRIBUTING.md
@@ -109,6 +109,17 @@ export http_path=""
export access_token=""
```

Or you can write these into a file called `test.env` in the root of the repository:

```
host="****.cloud.databricks.com"
http_path="/sql/1.0/warehouses/***"
access_token="dapi***"
staging_ingestion_user="***@example.com"
```
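If you want pytest to pick these values up automatically, one option is to load `test.env` at the start of the test session. A minimal sketch, assuming the `python-dotenv` package is installed (hypothetical; the repository may wire this up differently):

```
# conftest.py (hypothetical): load test.env so os.getenv() calls in the
# test suite see host, http_path, and access_token
from dotenv import load_dotenv

load_dotenv("test.env")
```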

To see logging output from pytest while running tests, set `log_cli = "true"` under `tool.pytest.ini_options` in `pyproject.toml`. You can also set `log_cli_level` to any of the standard Python log levels: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`.
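For reference, the relevant `pyproject.toml` stanza would look something like this (a sketch; pick whichever level you need):

```
[tool.pytest.ini_options]
log_cli = "true"
log_cli_level = "INFO"
```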

There are several e2e test suites available:
- `PySQLCoreTestSuite`
- `PySQLLargeQueriesSuite`
2 changes: 1 addition & 1 deletion README.md
@@ -39,7 +39,7 @@ from databricks import sql

host = os.getenv("DATABRICKS_HOST")
http_path = os.getenv("DATABRICKS_HTTP_PATH")
access_token = os.getenv("DATABRICKS_ACCESS_TOKEN")
access_token = os.getenv("DATABRICKS_TOKEN")

connection = sql.connect(
    server_hostname=host,
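The README snippet is abridged above; for orientation, a complete minimal round-trip against the connector's cursor API might look like this (a sketch, not the README's exact text):

```
connection = sql.connect(
    server_hostname=host,
    http_path=http_path,
    access_token=access_token,
)

with connection.cursor() as cursor:
    cursor.execute("SELECT 1 AS value")
    for row in cursor.fetchall():
        print(row)

connection.close()
```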
3 changes: 2 additions & 1 deletion examples/README.md
@@ -38,4 +38,5 @@ To run all of these examples you can clone the entire repository to your disk. O
this example the string `ExamplePartnerTag` will be added to the user agent on every request (see the sketch after this list).
- **`staging_ingestion.py`** shows how the connector handles Databricks' experimental staging ingestion commands `GET`, `PUT`, and `REMOVE`.
- **`sqlalchemy.py`** shows a basic example of connecting to Databricks with [SQLAlchemy](https://www.sqlalchemy.org/).
- **`custom_cred_provider.py`** shows how to pass a custom credential provider to bypass connector authentication. Please install databricks-sdk prior to running this example.
- **`v3_retries_query_execute.py`** shows how to enable v3 retries in connector version 2.9.x including how to enable retries for non-default retry cases.
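As an aside, the partner tag mentioned in the bullet above travels through the `_user_agent_entry` connection argument, the same knob `sqlalchemy.py` passes in its `extra_connect_args`. A minimal sketch (assuming `sql.connect` accepts it directly, as the connector's examples suggest):

```
from databricks import sql
import os

# _user_agent_entry appends a custom tag to the User-Agent header sent with each request
connection = sql.connect(
    server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"),
    http_path=os.getenv("DATABRICKS_HTTP_PATH"),
    access_token=os.getenv("DATABRICKS_TOKEN"),
    _user_agent_entry="ExamplePartnerTag",
)
connection.close()
```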
35 changes: 28 additions & 7 deletions examples/sqlalchemy.py
@@ -42,9 +42,15 @@
"""

import os
-from sqlalchemy.orm import declarative_base, Session
+import sqlalchemy
+from sqlalchemy.orm import Session
from sqlalchemy import Column, String, Integer, BOOLEAN, create_engine, select

+try:
+    from sqlalchemy.orm import declarative_base
+except ImportError:
+    from sqlalchemy.ext.declarative import declarative_base
+
host = os.getenv("DATABRICKS_SERVER_HOSTNAME")
http_path = os.getenv("DATABRICKS_HTTP_PATH")
access_token = os.getenv("DATABRICKS_TOKEN")
@@ -59,10 +65,20 @@
    "_user_agent_entry": "PySQL Example Script",
}

-engine = create_engine(
-    f"databricks://token:{access_token}@{host}?http_path={http_path}&catalog={catalog}&schema={schema}",
-    connect_args=extra_connect_args,
-)
+if sqlalchemy.__version__.startswith("1.3"):
+    # SQLAlchemy 1.3.x fails to parse the http_path, catalog, and schema from our connection string
+    # Pass these in as connect_args instead
+
+    conn_string = f"databricks://token:{access_token}@{host}"
+    connect_args = dict(catalog=catalog, schema=schema, http_path=http_path)
+    all_connect_args = {**extra_connect_args, **connect_args}
+    engine = create_engine(conn_string, connect_args=all_connect_args)
+else:
+    engine = create_engine(
+        f"databricks://token:{access_token}@{host}?http_path={http_path}&catalog={catalog}&schema={schema}",
+        connect_args=extra_connect_args,
+    )

session = Session(bind=engine)
base = declarative_base(bind=engine)
@@ -86,9 +102,14 @@ class SampleObject(base):

session.commit()

-stmt = select(SampleObject).where(SampleObject.name.in_(["Bim Adewunmi", "Miki Meek"]))
+# SQLAlchemy 1.3 has slightly different methods
+if sqlalchemy.__version__.startswith("1.3"):
+    stmt = select([SampleObject]).where(SampleObject.name.in_(["Bim Adewunmi", "Miki Meek"]))
+    output = [i for i in session.execute(stmt)]
+else:
+    stmt = select(SampleObject).where(SampleObject.name.in_(["Bim Adewunmi", "Miki Meek"]))
+    output = [i for i in session.scalars(stmt)]

-output = [i for i in session.scalars(stmt)]
assert len(output) == 2

base.metadata.drop_all()
35 changes: 35 additions & 0 deletions examples/v3_retries_query_execute.py
@@ -0,0 +1,35 @@
from databricks import sql
import os

# Users of connector versions >= 2.9.0 and < 3.0.0 can use the v3 retry behaviour by setting _enable_v3_retries=True
# This flag will be deprecated in databricks-sql-connector~=3.0.0 as it will become the default.
#
# The new retry behaviour is defined in src/databricks/sql/auth/retry.py
#
# The new retry behaviour allows users to force the connector to automatically retry requests that fail with codes
# that are not retried by default (in most cases only codes 429 and 503 are retried by default). Additional HTTP
# codes to retry are specified as a list passed to `_retry_dangerous_codes`.
#
# Note that, as implied in the name, doing this is *dangerous* and should not be configured in all usages.
# With the default behaviour, ExecuteStatement Thrift commands are only retried for codes 429 and 503 because
# we can be certain at run-time that the statement never reached Databricks compute. These codes are returned by
# the SQL gateway / load balancer. So there is no risk that retrying the request would result in a doubled
# (or tripled etc) command execution. These codes are always accompanied by a Retry-After header, which we honour.
#
# However, if your use-case emits idempotent queries such as SELECT statements, it can be helpful to retry
# for 502 (Bad Gateway) codes etc. In these cases, there is a possibility that the initial command _did_ reach
# Databricks compute and retrying it could result in additional executions. Retrying under these conditions uses
# an exponential back-off since a Retry-After header is not present.

with sql.connect(server_hostname = os.getenv("DATABRICKS_SERVER_HOSTNAME"),
                 http_path = os.getenv("DATABRICKS_HTTP_PATH"),
                 access_token = os.getenv("DATABRICKS_TOKEN"),
                 _enable_v3_retries = True,
                 _retry_dangerous_codes=[502,400]) as connection:

    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM default.diamonds LIMIT 2")
        result = cursor.fetchall()

        for row in result:
            print(row)