
Complete Fetch Phase (EXTERNAL_LINKS disposition and ARROW format) #598

Draft
wants to merge 66 commits into base: fetch-json-inline
66 commits
6ec265f
[squashed from cloudfetch-sea] introduce external links + arrow funct…
varun-edachali-dbx Jun 16, 2025
b2ad5e6
reduce responsibility of Queue
varun-edachali-dbx Jun 16, 2025
66d0df6
reduce repetition in arrow table creation
varun-edachali-dbx Jun 16, 2025
eb7ec80
reduce redundant code in CloudFetchQueue
varun-edachali-dbx Jun 16, 2025
a3a8a4a
move chunk link progression to separate func
varun-edachali-dbx Jun 16, 2025
ea79bc8
remove redundant log
varun-edachali-dbx Jun 16, 2025
5b49405
improve logging
varun-edachali-dbx Jun 16, 2025
015fb76
remove reliance on schema_bytes in SEA
varun-edachali-dbx Jun 16, 2025
5380c7a
use more fetch methods
varun-edachali-dbx Jun 16, 2025
27b781f
remove redundant schema_bytes from parent constructor
varun-edachali-dbx Jun 16, 2025
238dc0a
only call get_chunk_link with non null chunk index
varun-edachali-dbx Jun 16, 2025
b3bb07e
align SeaResultSet structure with ThriftResultSet
varun-edachali-dbx Jun 16, 2025
13e6346
remove _fill_result_buffer from SeaResultSet
varun-edachali-dbx Jun 16, 2025
f90b4d4
reduce code repetition
varun-edachali-dbx Jun 16, 2025
fb53dd9
pre-fetch next chunk link on processing current
varun-edachali-dbx Jun 17, 2025
d893877
reduce nesting
varun-edachali-dbx Jun 17, 2025
a165f1c
line break after multi line pydoc
varun-edachali-dbx Jun 17, 2025
d68e4ea
re-introduce schema_bytes for better abstraction (likely temporary)
varun-edachali-dbx Jun 17, 2025
be17812
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 17, 2025
d33e5fd
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 17, 2025
e3cef5c
add GetChunksResponse
varun-edachali-dbx Jun 17, 2025
ac50669
remove changes to sea test
varun-edachali-dbx Jun 17, 2025
03cdc4f
re-introduce accidentally removed description extraction method
varun-edachali-dbx Jun 17, 2025
e1842d8
fix type errors (ssl_options, CHUNK_PATH_WITH_ID..., etc.)
varun-edachali-dbx Jun 17, 2025
89a46af
access ssl_options through connection
varun-edachali-dbx Jun 17, 2025
1d0b28b
DEBUG level
varun-edachali-dbx Jun 17, 2025
c8820d4
remove explicit multi chunk test
varun-edachali-dbx Jun 17, 2025
fe47787
move cloud fetch queues back into utils.py
varun-edachali-dbx Jun 17, 2025
74f59b7
remove excess docstrings
varun-edachali-dbx Jun 17, 2025
4b456b2
move ThriftCloudFetchQueue above SeaCloudFetchQueue
varun-edachali-dbx Jun 17, 2025
a4447a1
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 17, 2025
4883aff
correct patch module path in cloud fetch queue tests
varun-edachali-dbx Jun 17, 2025
cd3378c
correct add_link docstring
varun-edachali-dbx Jun 17, 2025
bc467d1
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 17, 2025
dd7dc6a
convert complex types to string if not _use_arrow_native_complex_types
varun-edachali-dbx Jun 23, 2025
dabba55
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 23, 2025
48ad7b3
Revert "Merge branch 'fetch-json-inline' into ext-links-sea"
varun-edachali-dbx Jun 23, 2025
a1f9b9c
reduce verbosity of ResultSetFilter docstring
varun-edachali-dbx Jun 23, 2025
3a999c0
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 23, 2025
c313c2b
Revert "Merge branch 'fetch-json-inline' into ext-links-sea"
varun-edachali-dbx Jun 23, 2025
3bc615e
Revert "reduce verbosity of ResultSetFilter docstring"
varun-edachali-dbx Jun 23, 2025
b6e1a10
Reapply "Merge branch 'fetch-json-inline' into ext-links-sea"
varun-edachali-dbx Jun 23, 2025
2df3d39
Revert "Merge branch 'fetch-json-inline' into ext-links-sea"
varun-edachali-dbx Jun 23, 2025
5e75fb5
remove unnecessary filters changes
varun-edachali-dbx Jun 23, 2025
20822e4
remove unnecessary backend changes
varun-edachali-dbx Jun 23, 2025
802d045
remove constants changes
varun-edachali-dbx Jun 23, 2025
f3f795a
remove changes in filters tests
varun-edachali-dbx Jun 23, 2025
f6c5950
remove unit test backend and JSON queue changes
varun-edachali-dbx Jun 23, 2025
d210ccd
remove changes in sea result set testing
varun-edachali-dbx Jun 23, 2025
22a953e
Revert "remove changes in sea result set testing"
varun-edachali-dbx Jun 23, 2025
3aed144
Revert "remove unit test backend and JSON queue changes"
varun-edachali-dbx Jun 23, 2025
0fe4da4
Revert "remove changes in filters tests"
varun-edachali-dbx Jun 23, 2025
0e3c0a1
Revert "remove constants changes"
varun-edachali-dbx Jun 23, 2025
93edb93
Revert "remove unnecessary backend changes"
varun-edachali-dbx Jun 23, 2025
871a44f
Revert "remove unnecessary filters changes"
varun-edachali-dbx Jun 23, 2025
08ca60d
Merge branch 'fetch-json-inline' into ext-links-sea
varun-edachali-dbx Jun 23, 2025
8c5cc77
working version
varun-edachali-dbx Jun 23, 2025
7f5c715
adopt _wait_until_command_done
varun-edachali-dbx Jun 23, 2025
9ef5fad
introduce metadata commands
varun-edachali-dbx Jun 23, 2025
44183db
use new backend structure
varun-edachali-dbx Jun 23, 2025
d59b351
constrain backend diff
varun-edachali-dbx Jun 23, 2025
1edc80a
remove changes to filters
varun-edachali-dbx Jun 23, 2025
f82658a
make _parse methods in models internal
varun-edachali-dbx Jun 23, 2025
54eb0a4
reduce changes in unit tests
varun-edachali-dbx Jun 23, 2025
8a138e8
allow empty schema bytes for alignment with SEA
varun-edachali-dbx Jun 25, 2025
82f9d6b
pass is_vl_op to Sea backend ExecuteResponse
varun-edachali-dbx Jun 25, 2025
11 changes: 10 additions & 1 deletion src/databricks/sql/backend/databricks_client.py
@@ -11,6 +11,8 @@
from abc import ABC, abstractmethod
from typing import Dict, Tuple, List, Optional, Any, Union, TYPE_CHECKING

from databricks.sql.types import SSLOptions

if TYPE_CHECKING:
from databricks.sql.client import Cursor

@@ -25,6 +27,13 @@


class DatabricksClient(ABC):
def __init__(self, ssl_options: SSLOptions, **kwargs):
self._use_arrow_native_complex_types = kwargs.get(
"_use_arrow_native_complex_types", True
)
self._max_download_threads = kwargs.get("max_download_threads", 10)
self._ssl_options = ssl_options

# == Connection and Session Management ==
@abstractmethod
def open_session(
@@ -82,7 +91,7 @@ def execute_command(
lz4_compression: bool,
cursor: "Cursor",
use_cloud_fetch: bool,
parameters: List,
parameters: List[ttypes.TSparkParameter],
async_op: bool,
enforce_embedded_schema_correctness: bool,
) -> Union["ResultSet", None]:
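The shared constructor above centralizes option handling that each backend previously re-parsed from kwargs. A minimal sketch of the pattern, assuming SSLOptions constructs with defaults; the stub class is hypothetical, since the real DatabricksClient is abstract:

```python
from databricks.sql.types import SSLOptions

# Hypothetical stand-in that mirrors DatabricksClient.__init__; the real
# class is abstract and cannot be instantiated directly.
class _BackendStub:
    def __init__(self, ssl_options: SSLOptions, **kwargs):
        self._use_arrow_native_complex_types = kwargs.get(
            "_use_arrow_native_complex_types", True
        )
        self._max_download_threads = kwargs.get("max_download_threads", 10)
        self._ssl_options = ssl_options

stub = _BackendStub(SSLOptions(), max_download_threads=4)
assert stub._max_download_threads == 4
assert stub._use_arrow_native_complex_types is True  # kwargs default
```

Subclasses such as SeaDatabricksClient and ThriftDatabricksClient now call super().__init__(...) instead of duplicating these kwargs.get defaults, as the later hunks in this diff show.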
41 changes: 36 additions & 5 deletions src/databricks/sql/backend/sea/backend.py
@@ -3,7 +3,7 @@
import re
from typing import Any, Dict, Tuple, List, Optional, Union, TYPE_CHECKING, Set

from databricks.sql.backend.sea.models.base import ResultManifest
from databricks.sql.backend.sea.models.base import ExternalLink, ResultManifest
from databricks.sql.backend.sea.utils.constants import (
ALLOWED_SESSION_CONF_TO_DEFAULT_VALUES_MAP,
ResultFormat,
@@ -41,6 +41,7 @@
GetStatementResponse,
CreateSessionResponse,
)
from databricks.sql.backend.sea.models.responses import GetChunksResponse

logger = logging.getLogger(__name__)

@@ -85,6 +86,7 @@ class SeaDatabricksClient(DatabricksClient):
STATEMENT_PATH = BASE_PATH + "statements"
STATEMENT_PATH_WITH_ID = STATEMENT_PATH + "/{}"
CANCEL_STATEMENT_PATH_WITH_ID = STATEMENT_PATH + "/{}/cancel"
CHUNK_PATH_WITH_ID_AND_INDEX = STATEMENT_PATH + "/{}/result/chunks/{}"

# SEA constants
POLL_INTERVAL_SECONDS = 0.2
@@ -119,7 +121,7 @@ def __init__(
http_path,
)

self._max_download_threads = kwargs.get("max_download_threads", 10)
super().__init__(ssl_options=ssl_options, **kwargs)

# Extract warehouse ID from http_path
self.warehouse_id = self._extract_warehouse_id(http_path)
@@ -131,7 +133,7 @@
http_path=http_path,
http_headers=http_headers,
auth_provider=auth_provider,
ssl_options=ssl_options,
ssl_options=self._ssl_options,
**kwargs,
)

@@ -342,7 +344,7 @@ def _results_message_to_execute_response(

# Check for compression
lz4_compressed = (
response.manifest.result_compression == ResultCompression.LZ4_FRAME
response.manifest.result_compression == ResultCompression.LZ4_FRAME.value
)

execute_response = ExecuteResponse(
@@ -351,7 +353,7 @@
description=description,
has_been_closed_server_side=False,
lz4_compressed=lz4_compressed,
is_staging_operation=False,
is_staging_operation=response.manifest.is_volume_operation,
arrow_schema_bytes=None,
result_format=response.manifest.format,
)
@@ -620,6 +622,35 @@ def get_execution_result(
manifest=response.manifest,
)

def get_chunk_link(self, statement_id: str, chunk_index: int) -> ExternalLink:
"""
Get the external link for the specified chunk index.

Args:
    statement_id: The statement ID
    chunk_index: The index of the chunk whose link is requested

Returns:
ExternalLink: External link for the chunk
"""

response_data = self.http_client._make_request(
method="GET",
path=self.CHUNK_PATH_WITH_ID_AND_INDEX.format(statement_id, chunk_index),
)
response = GetChunksResponse.from_dict(response_data)

links = response.external_links
link = next((l for l in links if l.chunk_index == chunk_index), None)
if not link:
raise ServerOperationError(
f"No link found for chunk index {chunk_index}",
{
"operation-id": statement_id,
"diagnostic-info": None,
},
)

return link

# == Metadata Operations ==

def get_catalogs(
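A sketch of how a caller can walk a result chunk by chunk with get_chunk_link; client and statement_id stand for an open SeaDatabricksClient and a completed statement, and process is a placeholder:

```python
# Follow next_chunk_index from link to link: each ExternalLink advertises
# the next chunk, so one request per chunk suffices.
link = client.get_chunk_link(statement_id, 0)
while True:
    process(link.external_link)  # e.g. hand the presigned URL to a downloader
    if link.next_chunk_index is None:
        break  # no further chunks advertised
    link = client.get_chunk_link(statement_id, link.next_chunk_index)
```

This mirrors the queue behavior described in the commits above: pre-fetching the next chunk link while the current chunk is being processed.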
2 changes: 2 additions & 0 deletions src/databricks/sql/backend/sea/models/__init__.py
@@ -27,6 +27,7 @@
ExecuteStatementResponse,
GetStatementResponse,
CreateSessionResponse,
GetChunksResponse,
)

__all__ = [
@@ -49,4 +50,5 @@
"ExecuteStatementResponse",
"GetStatementResponse",
"CreateSessionResponse",
"GetChunksResponse",
]
37 changes: 36 additions & 1 deletion src/databricks/sql/backend/sea/models/responses.py
@@ -4,7 +4,7 @@
These models define the structures used in SEA API responses.
"""

from typing import Dict, Any
from typing import Dict, Any, List
from dataclasses import dataclass

from databricks.sql.backend.types import CommandState
@@ -154,3 +154,38 @@ class CreateSessionResponse:
def from_dict(cls, data: Dict[str, Any]) -> "CreateSessionResponse":
"""Create a CreateSessionResponse from a dictionary."""
return cls(session_id=data.get("session_id", ""))


@dataclass
class GetChunksResponse:
"""Response from getting chunks for a statement."""

statement_id: str
external_links: List[ExternalLink]

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "GetChunksResponse":
"""Create a GetChunksResponse from a dictionary."""
external_links = []
if "external_links" in data:
for link_data in data["external_links"]:
external_links.append(
ExternalLink(
external_link=link_data.get("external_link", ""),
expiration=link_data.get("expiration", ""),
chunk_index=link_data.get("chunk_index", 0),
byte_count=link_data.get("byte_count", 0),
row_count=link_data.get("row_count", 0),
row_offset=link_data.get("row_offset", 0),
next_chunk_index=link_data.get("next_chunk_index"),
next_chunk_internal_link=link_data.get(
"next_chunk_internal_link"
),
http_headers=link_data.get("http_headers"),
)
)

return cls(
statement_id=data.get("statement_id", ""),
external_links=external_links,
)
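For illustration, here is a payload shaped after the keys from_dict reads; all values are invented:

```python
from databricks.sql.backend.sea.models import GetChunksResponse

payload = {
    "statement_id": "stmt-123",
    "external_links": [
        {
            "external_link": "https://storage.example.com/chunk-0",
            "expiration": "2025-06-17T00:00:00Z",
            "chunk_index": 0,
            "byte_count": 1024,
            "row_count": 100,
            "row_offset": 0,
            "next_chunk_index": 1,
        }
    ],
}

response = GetChunksResponse.from_dict(payload)
assert response.statement_id == "stmt-123"
assert response.external_links[0].next_chunk_index == 1
# Optional keys that are absent fall back to the .get() defaults, so
# next_chunk_internal_link and http_headers are None here.
```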
11 changes: 3 additions & 8 deletions src/databricks/sql/backend/thrift_backend.py
@@ -40,7 +40,6 @@
)

from databricks.sql.utils import (
ThriftResultSetQueueFactory,
_bound,
RequestErrorInfo,
NoRetryReason,
@@ -148,6 +147,8 @@ def __init__(
http_path,
)

super().__init__(ssl_options, **kwargs)

port = port or 443
if kwargs.get("_connection_uri"):
uri = kwargs.get("_connection_uri")
@@ -161,19 +162,13 @@
raise ValueError("No valid connection settings.")

self._initialize_retry_args(kwargs)
self._use_arrow_native_complex_types = kwargs.get(
"_use_arrow_native_complex_types", True
)

self._use_arrow_native_decimals = kwargs.get("_use_arrow_native_decimals", True)
self._use_arrow_native_timestamps = kwargs.get(
"_use_arrow_native_timestamps", True
)

# Cloud fetch
self._max_download_threads = kwargs.get("max_download_threads", 10)

self._ssl_options = ssl_options

self._auth_provider = auth_provider

# Connector version 3 retry approach
18 changes: 18 additions & 0 deletions src/databricks/sql/cloudfetch/download_manager.py
@@ -101,6 +101,24 @@ def _schedule_downloads(self):
task = self._thread_pool.submit(handler.run)
self._download_tasks.append(task)

def add_link(self, link: TSparkArrowResultLink):
"""
Add more links to the download manager.

Args:
link: Link to add
"""

if link.rowCount <= 0:
return

logger.debug(
"ResultFileDownloadManager: adding file link, start offset {}, row count: {}".format(
link.startRowOffset, link.rowCount
)
)
self._pending_links.append(link)

def _shutdown_manager(self):
# Clear download handlers and shutdown the thread pool
self._pending_links = []
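A usage sketch for add_link, assuming the TSparkArrowResultLink fields from the Thrift definitions; manager is an existing download manager instance and the values are illustrative:

```python
from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink

link = TSparkArrowResultLink(
    fileLink="https://storage.example.com/chunk-1",
    startRowOffset=100,
    rowCount=100,
    bytesNum=2048,
    expiryTime=1_750_000_000,
)
manager.add_link(link)  # links with rowCount <= 0 are dropped by the guard
```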
73 changes: 60 additions & 13 deletions src/databricks/sql/result_set.py
@@ -1,11 +1,18 @@
from abc import ABC, abstractmethod
from typing import List, Optional, TYPE_CHECKING
import json
from typing import List, Optional, Any, Union, Tuple, TYPE_CHECKING

import logging
import time
import pandas

from databricks.sql.backend.sea.backend import SeaDatabricksClient
from databricks.sql.backend.sea.models.base import ResultData, ResultManifest
from databricks.sql.backend.sea.models.base import (
ExternalLink,
ResultData,
ResultManifest,
)
from databricks.sql.utils import SeaResultSetQueueFactory

try:
import pyarrow
@@ -16,14 +23,10 @@
from databricks.sql.backend.thrift_backend import ThriftDatabricksClient
from databricks.sql.client import Connection
from databricks.sql.backend.databricks_client import DatabricksClient
from databricks.sql.thrift_api.TCLIService import ttypes
from databricks.sql.types import Row
from databricks.sql.exc import RequestError, CursorAlreadyClosedError
from databricks.sql.utils import (
ColumnTable,
ColumnQueue,
JsonQueue,
SeaResultSetQueueFactory,
)
from databricks.sql.exc import Error, RequestError, CursorAlreadyClosedError
from databricks.sql.utils import ColumnTable, ColumnQueue, JsonQueue
from databricks.sql.backend.types import CommandId, CommandState, ExecuteResponse

logger = logging.getLogger(__name__)
@@ -252,7 +255,7 @@ def __init__(
description=execute_response.description,
is_staging_operation=execute_response.is_staging_operation,
lz4_compressed=execute_response.lz4_compressed,
arrow_schema_bytes=execute_response.arrow_schema_bytes,
arrow_schema_bytes=execute_response.arrow_schema_bytes or b"",
)

# Initialize results queue if not provided
@@ -476,6 +479,7 @@ def __init__(
result_data,
manifest,
str(execute_response.command_id.to_sea_statement_id()),
ssl_options=connection.session.ssl_options,
description=execute_response.description,
max_download_threads=sea_client.max_download_threads,
sea_client=sea_client,
@@ -548,6 +552,43 @@ def fetchall_json(self):

return results

def _convert_complex_types_to_string(
self, rows: "pyarrow.Table"
) -> "pyarrow.Table":
"""
Convert complex types (array, struct, map) to string representation.

Args:
rows: Input PyArrow table

Returns:
PyArrow table with complex types converted to strings
"""

if not pyarrow:
return rows

def convert_complex_column_to_string(col: "pyarrow.Array") -> "pyarrow.Array":
python_values = col.to_pylist()
json_strings = [
(None if val is None else json.dumps(val)) for val in python_values
]
return pyarrow.array(json_strings, type=pyarrow.string())

converted_columns = []
for col in rows.columns:
converted_col = col
if (
pyarrow.types.is_list(col.type)
or pyarrow.types.is_large_list(col.type)
or pyarrow.types.is_struct(col.type)
or pyarrow.types.is_map(col.type)
):
converted_col = convert_complex_column_to_string(col)
converted_columns.append(converted_col)

return pyarrow.Table.from_arrays(converted_columns, names=rows.column_names)
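The conversion serializes each complex value to JSON text, so string-typed complex columns still carry a faithful representation. A self-contained round trip of the same idea:

```python
import json

import pyarrow

table = pyarrow.table({"ids": [[1, 2], None, [3]]})  # list<int64> column
values = table.column("ids").to_pylist()
as_json = pyarrow.array(
    [None if v is None else json.dumps(v) for v in values],
    type=pyarrow.string(),
)
print(as_json.to_pylist())  # ['[1, 2]', None, '[3]']
```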

def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
"""
Fetch the next set of rows as an Arrow table.
@@ -568,6 +609,9 @@ def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
results = self.results.next_n_rows(size)
self._next_row_index += results.num_rows

if not self.backend._use_arrow_native_complex_types:
results = self._convert_complex_types_to_string(results)

return results

def fetchall_arrow(self) -> "pyarrow.Table":
@@ -577,6 +621,9 @@ def fetchall_arrow(self) -> "pyarrow.Table":
results = self.results.remaining_rows()
self._next_row_index += results.num_rows

if not self.backend._use_arrow_native_complex_types:
results = self._convert_complex_types_to_string(results)

return results

def fetchone(self) -> Optional[Row]:
@@ -590,7 +637,7 @@ def fetchone(self) -> Optional[Row]:
if isinstance(self.results, JsonQueue):
res = self._convert_json_table(self.fetchmany_json(1))
else:
raise NotImplementedError("fetchone only supported for JSON data")
res = self._convert_arrow_table(self.fetchmany_arrow(1))

return res[0] if res else None

@@ -610,7 +657,7 @@ def fetchmany(self, size: int) -> List[Row]:
if isinstance(self.results, JsonQueue):
return self._convert_json_table(self.fetchmany_json(size))
else:
raise NotImplementedError("fetchmany only supported for JSON data")
return self._convert_arrow_table(self.fetchmany_arrow(size))

def fetchall(self) -> List[Row]:
"""
Expand All @@ -622,4 +669,4 @@ def fetchall(self) -> List[Row]:
if isinstance(self.results, JsonQueue):
return self._convert_json_table(self.fetchall_json())
else:
raise NotImplementedError("fetchall only supported for JSON data")
return self._convert_arrow_table(self.fetchall_arrow())
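With the NotImplementedError branches replaced, the row-oriented cursor API behaves the same whether the queue holds JSON rows or Arrow batches. A hedged end-to-end sketch; the connection parameters are placeholders, and how the SEA backend is selected is omitted here:

```python
from databricks import sql

with sql.connect(
    server_hostname="<workspace-host>",
    http_path="<warehouse-http-path>",
    access_token="<token>",
) as conn:
    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM range(10)")
        first = cursor.fetchone()   # one Row, converted from Arrow if needed
        some = cursor.fetchmany(4)  # next four rows
        rest = cursor.fetchall()    # remaining rows
```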