Feat: Iceberg cache #355

Open · aaronsteers wants to merge 31 commits into main from feat/iceberg-hack-day

Changes from all commits — 31 commits
73f9289  add back iceberg file (aaronsteers, Jul 21, 2024)
d26f29e  updates! (aaronsteers, Jul 21, 2024)
ca37196  add polars schema utils (aaronsteers, Aug 2, 2024)
07ac6a5  `poetry add polars` (aaronsteers, Aug 2, 2024)
80e0fdb  add polars type mapping (aaronsteers, Aug 2, 2024)
8d9ef08  add as_filelike() for iterator (aaronsteers, Aug 2, 2024)
015faf0  lint fix (aaronsteers, Aug 2, 2024)
4fb9b3b  wip: iceberg updates, w example script (aaronsteers, Aug 2, 2024)
bddbc59  custom pydantic classes with str data (aaronsteers, Aug 3, 2024)
9af597f  refactor to parse stdout in separate method (aaronsteers, Aug 3, 2024)
46ba395  make flush_active_batch public (aaronsteers, Aug 4, 2024)
e99e72f  cleanup (aaronsteers, Aug 4, 2024)
4d163a0  add icerberg processor methods (aaronsteers, Aug 4, 2024)
6337bd2  update parquet writer (aaronsteers, Aug 4, 2024)
852a4b8  remove dupe implementation (aaronsteers, Aug 4, 2024)
049b38a  refactor: remove unnecessary RecordProcessor class (aaronsteers, Aug 4, 2024)
a671c97  refactor: add AirbyteWritersInterface and WriteMethod, plus related r… (aaronsteers, Aug 5, 2024)
b06485a  Merge branch 'refactor/writers-write-not-sources' into feat/iceberg-h… (aaronsteers, Aug 5, 2024)
34acdda  fix circular import loop (aaronsteers, Aug 5, 2024)
41f14e1  Merge branch 'refactor/writers-write-not-sources' into feat/iceberg-h… (aaronsteers, Aug 5, 2024)
ed8edec  fix parquet writer import (aaronsteers, Aug 5, 2024)
a5efbd1  updated polars test script (aaronsteers, Aug 7, 2024)
c3d4e9e  poetry add --dev boto3 (aaronsteers, Aug 7, 2024)
c6969de  gitignore test-artifact files (aaronsteers, Aug 7, 2024)
63b18ef  update transforms (aaronsteers, Aug 7, 2024)
8f82437  Merge remote-tracking branch 'origin/main' into feat/iceberg-hack-day (aaronsteers, Sep 2, 2024)
de810e8  Merge remote-tracking branch 'origin/main' into feat/iceberg-hack-day (aaronsteers, Sep 2, 2024)
9abcea4  remove dupe file (aaronsteers, Sep 2, 2024)
80b7340  remove dupe import (aaronsteers, Sep 2, 2024)
30d1afb  remove import (aaronsteers, Sep 2, 2024)
54c9752  remove extra imports (aaronsteers, Sep 2, 2024)
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# perf-test
*test-artifact*

# temp files
temp
.temp
61 changes: 61 additions & 0 deletions airbyte/_airbyte_message_overrides.py
@@ -0,0 +1,61 @@
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Custom Airbyte message classes.

These classes override the default handling, in order to ensure that the data field is always a
jsonified string, rather than a dict.

To use these classes, import them from this module, and use them in place of the default classes.

Example:
```python
import sys

from airbyte._airbyte_message_overrides import AirbyteMessageWithStrData

for line in sys.stdin:
    message = AirbyteMessageWithStrData.model_validate_json(line)
```
"""

from __future__ import annotations

import copy
import json
from typing import Any

from pydantic import BaseModel, Field, model_validator

from airbyte_protocol.models import (
    AirbyteMessage,
    AirbyteRecordMessage,
    AirbyteStateMessage,
)


AirbyteRecordMessageWithStrData = copy.deepcopy(AirbyteRecordMessage)
AirbyteStateMessageWithStrData = copy.deepcopy(AirbyteStateMessage)
AirbyteMessageWithStrData = copy.deepcopy(AirbyteMessage)

# Modify the data field in the copied class
AirbyteRecordMessageWithStrData.__annotations__["data"] = str
AirbyteStateMessageWithStrData.__annotations__["data"] = str

AirbyteRecordMessageWithStrData.data = Field(..., description="jsonified record data as a str")
AirbyteStateMessageWithStrData.data = Field(..., description="jsonified state data as a str")
Review comment on lines +33 to +42:
Consider using inheritance for custom classes

The current implementation using deep copies works well, but have you considered using inheritance instead? It might lead to a more maintainable and cleaner code structure. Something like:

class AirbyteRecordMessageWithStrData(AirbyteRecordMessage):
    data: str = Field(..., description="jsonified record data as a str")

class AirbyteStateMessageWithStrData(AirbyteStateMessage):
    data: str = Field(..., description="jsonified state data as a str")

class AirbyteMessageWithStrData(AirbyteMessage):
    record: AirbyteRecordMessageWithStrData | None
    state: AirbyteStateMessageWithStrData | None

This approach could potentially simplify the code and make it easier to manage in the future. What do you think? Does this align with your design goals?



# Add a validator to ensure data is a JSON string
@model_validator(mode="before")
def ensure_data_is_string(
    cls: BaseModel,  # type: ignore # noqa: ARG001, PGH003
    values: dict[str, Any],
) -> dict[str, Any]:
    # Serialize dict payloads up front; anything else must already be a JSON string.
    if "data" in values and isinstance(values["data"], dict):
        values["data"] = json.dumps(values["data"])
    if "data" in values and not isinstance(values["data"], str):
        raise ValueError("`data` must be a dict or a JSON string.")
    return values


AirbyteRecordMessageWithStrData.ensure_data_is_string = classmethod(ensure_data_is_string)  # type: ignore [arg-type]
AirbyteStateMessageWithStrData.ensure_data_is_string = classmethod(ensure_data_is_string)  # type: ignore [arg-type]

AirbyteMessageWithStrData.__annotations__["record"] = AirbyteRecordMessageWithStrData | None
AirbyteMessageWithStrData.__annotations__["state"] = AirbyteStateMessageWithStrData | None
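For readers trying the pattern outside this diff, here is a minimal, self-contained sketch of the same str-data idea using a plain pydantic v2 model. The model name and fields are hypothetical stand-ins, not the real `AirbyteRecordMessage`:

```python
import json

from pydantic import BaseModel, field_validator


class RecordWithStrData(BaseModel):
    """Hypothetical stand-in for AirbyteRecordMessageWithStrData."""

    stream: str
    data: str  # always a jsonified string, never a dict

    @field_validator("data", mode="before")
    @classmethod
    def ensure_str(cls, value: object) -> object:
        # Serialize dict payloads; strings pass through unchanged.
        return json.dumps(value) if isinstance(value, dict) else value


msg = RecordWithStrData(stream="users", data={"id": 1})
print(msg.data)  # '{"id": 1}'
```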
35 changes: 31 additions & 4 deletions airbyte/_connector_base.py
@@ -169,7 +169,7 @@ def _get_spec(self, *, force_refresh: bool = False) -> ConnectorSpecification:
        """
        if force_refresh or self._spec is None:
            try:
-                for msg in self._execute(["spec"]):
+                for msg in self._execute_and_parse(["spec"]):
                    if msg.type == Type.SPEC and msg.spec:
                        self._spec = msg.spec
                        break
@@ -275,7 +275,7 @@ def check(self) -> None:
        """
        with as_temp_files([self._config]) as [config_file]:
            try:
-                for msg in self._execute(["check", "--config", config_file]):
+                for msg in self._execute_and_parse(["check", "--config", config_file]):
                    if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
                        if msg.connectionStatus.status != Status.FAILED:
                            rich.print(f"Connection check succeeded for `{self.name}`.")
@@ -349,7 +349,7 @@ def _peek_airbyte_message(
        )
        return

-    def _execute(
+    def _execute_and_parse(
        self,
        args: list[str],
        stdin: IO[str] | AirbyteMessageIterator | None = None,
@@ -371,7 +371,7 @@ def _execute(
        self.executor.ensure_installation(auto_fix=False)

        try:
-            for line in self.executor.execute(args, stdin=stdin):
+            for line in self._execute(args, stdin=stdin):
                try:
                    message: AirbyteMessage = AirbyteMessage.model_validate_json(json_data=line)
                    if progress_tracker and message.record:
@@ -403,6 +403,33 @@
                original_exception=e,
            ) from None

    def _execute(
        self,
        args: list[str],
        stdin: IO[str] | AirbyteMessageIterator | None = None,
    ) -> Generator[str, None, None]:
        """Execute the connector with the given arguments.

        This involves the following steps:
        * Locate the right venv. It is called ".venv-<connector_name>".
        * Spawn a subprocess with .venv-<connector_name>/bin/<connector-name> <args>.
        * Read the subprocess output line by line and yield the (unparsed) strings.

        Raises:
            AirbyteConnectorFailedError: If the process returns a failure status (non-zero).
        """
        # Fail early if the connector is not installed.
        self.executor.ensure_installation(auto_fix=False)

        try:
            yield from self.executor.execute(args, stdin=stdin)

        except Exception as e:
            raise exc.AirbyteConnectorFailedError(
                connector_name=self.name,
                log_text=self._last_log_messages,
            ) from e


__all__ = [
"ConnectorBase",
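The net effect of this refactor is a two-layer pipeline: `_execute` yields raw JSONL lines from the connector subprocess, and `_execute_and_parse` validates each line into an `AirbyteMessage`. A rough, self-contained sketch of that layering, with simplified names and plain dicts standing in for the real protocol models:

```python
import json
from collections.abc import Iterator


def execute(args: list[str]) -> Iterator[str]:
    """Stand-in for Connector._execute: yields raw JSONL lines."""
    # A real implementation would spawn the connector subprocess here.
    yield '{"type": "SPEC", "spec": {}}'


def execute_and_parse(args: list[str]) -> Iterator[dict]:
    """Stand-in for Connector._execute_and_parse: parses each raw line."""
    for line in execute(args):
        # The PR uses AirbyteMessage.model_validate_json(line) instead.
        yield json.loads(line)


for msg in execute_and_parse(["spec"]):
    print(msg["type"])  # SPEC
```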
27 changes: 26 additions & 1 deletion airbyte/_message_iterators.py
@@ -4,12 +4,13 @@
from __future__ import annotations

import datetime
import io
import sys
from collections.abc import Iterator
from typing import IO, TYPE_CHECKING, cast

import pydantic
-from typing_extensions import final
+from typing_extensions import Literal, final

from airbyte_protocol.models import (
    AirbyteMessage,
@@ -57,6 +58,30 @@ def read(self) -> str:
        """Read the next message from the iterator."""
        return next(self).model_dump_json()

    def as_filelike(self) -> io.BytesIO:
        """Return a file-like object that reads from the iterator."""

        class FileLikeReader(io.RawIOBase):
            def __init__(self, iterator: Iterator[AirbyteMessage]) -> None:
                self.iterator = (msg.model_dump_json() for msg in iterator)
                self.buffer = ""

            def readable(self) -> Literal[True]:
                return True

            def readinto(self, b: Any) -> int:
                try:
                    chunk = next(self.iterator)
                except StopIteration:
                    return 0  # EOF

                data = chunk.encode()
                n = len(data)
                b[:n] = data
                return n

        return cast(io.BytesIO, FileLikeReader(self._iterator))
Review comment on lines +61 to +83:
Interesting implementation of as_filelike. A few thoughts:

  1. The FileLikeReader class is a nice implementation of a file-like object. Good job!
  2. In the readinto method, you're using Any for the type of b. Would it be more precise to use bytearray or memoryview? This would also address the Ruff F821 warning.
  3. Have you considered adding a docstring to the FileLikeReader class to explain its purpose?

What are your thoughts on these suggestions? Do you think they would improve the code?

Here's a potential improvement for the readinto method:

-            def readinto(self, b: Any) -> int:
+            def readinto(self, b: bytearray) -> int:

And maybe add a docstring like this:

class FileLikeReader(io.RawIOBase):
    """A file-like object that reads AirbyteMessages from an iterator."""

wdyt?

Ruff F821 (line 72): Undefined name `Any`.
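To see why `readinto` alone is enough here, below is a stripped-down, runnable re-creation of the pattern. A generic iterator of JSON strings stands in for the `AirbyteMessage` iterator, the `bytearray` annotation follows the review suggestion above, and the sketch assumes each encoded chunk fits in the buffer `io` hands us:

```python
import io
from collections.abc import Iterator


class IteratorReader(io.RawIOBase):
    """File-like wrapper over an iterator of JSON strings (sketch)."""

    def __init__(self, it: Iterator[str]) -> None:
        self._it = it

    def readable(self) -> bool:
        return True

    def readinto(self, b: bytearray) -> int:
        try:
            chunk = next(self._it).encode()
        except StopIteration:
            return 0  # EOF
        # Assumes len(chunk) <= len(b); a production version would need
        # to buffer the remainder of oversized chunks.
        b[: len(chunk)] = chunk
        return len(chunk)


raw = IteratorReader(iter(['{"type": "RECORD"}\n', '{"type": "STATE"}\n']))
stream = io.BufferedReader(raw)  # turns the raw reader into a normal binary stream
print(stream.read())  # b'{"type": "RECORD"}\n{"type": "STATE"}\n'
```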


    @classmethod
    def from_read_result(cls, read_result: ReadResult) -> AirbyteMessageIterator:
        """Create an iterator from a `ReadResult` object."""
7 changes: 1 addition & 6 deletions airbyte/_processors/sql/duckdb.py
@@ -87,12 +87,7 @@ def get_sql_engine(self) -> Engine:


class DuckDBSqlProcessor(SqlProcessorBase):
-    """A DuckDB implementation of the cache.
-
-    Jsonl is used for local file storage before bulk loading.
-    Unlike the Snowflake implementation, we can't use the COPY command to load data
-    so we insert as values instead.
-    """
+    """A DuckDB implementation of the cache."""

    supports_merge_insert = False
    file_writer_class = JsonlWriter
138 changes: 138 additions & 0 deletions airbyte/_processors/sql/iceberg.py
@@ -0,0 +1,138 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
from __future__ import annotations

from textwrap import dedent, indent
from typing import TYPE_CHECKING

from airbyte_protocol.models import (
    AirbyteRecordMessage,
    AirbyteStateMessage,
    AirbyteStateType,
)

from airbyte import exceptions as exc
from airbyte._future_cdk.sql_processor import SqlConfig, SqlProcessorBase
from airbyte._future_cdk.state_writers import StateWriterBase
from airbyte._processors.file.parquet import ParquetWriter


if TYPE_CHECKING:
    from pathlib import Path


class IcebergConfig(SqlConfig):
    """An Iceberg configuration."""

    def __init__(self, db_path: str, schema_name: str) -> None:
        """Initialize the Iceberg configuration."""
        self.db_path = db_path
        self.schema_name = schema_name


class IcebergSqlProcessor(SqlProcessorBase):
    """An Iceberg SQL processor."""

    supports_merge_insert = False
    file_writer_class = ParquetWriter
    sql_config: IcebergConfig

    class IcebergStateWriter(StateWriterBase):
        """A state writer for the Iceberg cache."""

        def __init__(self, iceberg_processor: IcebergSqlProcessor) -> None:
            self._iceberg_processor = iceberg_processor
            super().__init__()

        def _write_state(self, state: AirbyteStateMessage) -> None:
            """Write the state to the cache."""
            self._iceberg_processor.write_state(state)
Review comment on lines +39 to +48:
Suggestion for IcebergStateWriter and private method access

The IcebergStateWriter class looks good overall! However, I noticed that in the write_state method of IcebergSqlProcessor, we're accessing the private method _write_state_to_file of the file writer (line 81).

To improve encapsulation, what do you think about adding a public method to the file writer class that internally calls _write_state_to_file? This way, we can avoid directly accessing private methods from outside the class.

For example:

# In the file writer class
def write_state(self, state):
    self._write_state_to_file(state)

# In IcebergSqlProcessor.write_state
self.file_writer.write_state(state)

Does this approach make sense to you?

Also applies to: 81-81


    @property
    def get_state_writer(self) -> StateWriterBase:
        if self._state_writer is None:
            self._state_writer = self.IcebergStateWriter(self)

        return self._state_writer

    def write_state(self, state: AirbyteStateMessage) -> None:
        """Write the state to the cache.

        Args:
            state (AirbyteStateMessage): The state to write.

        Implementation:
        - State messages are written to a separate file.
        - Any pending records are written to the cache file and the cache file is closed.
        - For stream state messages, the matching stream batches are flushed and closed.
        - For global state, all batches are flushed and closed.
        """
        stream_names: list[str] = []
        if state.type == AirbyteStateType.STREAM:
            stream_names = [state.record.stream]
        if state.type == AirbyteStateType.GLOBAL:
            stream_names = list(self._buffered_records.keys())
        else:
            msg = f"Unexpected state type: {state.type}"
            raise exc.PyAirbyteInternalError(msg)

        for stream_name in stream_names:
            state_file_name = self.file_writer.get_active_batch(stream_name)
Review comment on line 79:

Unused variable state_file_name

It looks like the state_file_name variable on line 79 is assigned but never used. Should we remove this assignment, or is it intended for future use? If it's for future use, we might want to add a TODO comment explaining its purpose.

Ruff F841 (line 79): Local variable `state_file_name` is assigned to but never used.

            self.file_writer.flush_active_batch(stream_name)
            self.file_writer._write_state_to_file(state)
            return
Review comment on lines +57 to +82:
Suggestion for write_state method

The implementation looks good, but I have a couple of suggestions:

  1. The if condition for AirbyteStateType.GLOBAL (line 72) should probably be an elif to match the structure of the other conditions.
  2. The else block (lines 74-76) seems to be misplaced. It will only be reached if the state type is neither STREAM nor GLOBAL. Should this be an elif for a specific state type?

What do you think about these changes? They might make the logic a bit clearer.

Also, the return statement on line 82 will exit the function after processing only the first stream. Is this intentional, or should we process all streams in stream_names?

Ruff F841 (line 79): Local variable `state_file_name` is assigned to but never used.

Ruff SLF001 (line 81): Private member accessed: `_write_state_to_file`.
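As a follow-up to the branching question above, here is a small, self-contained sketch of the if/elif/else shape the comment is pointing at. The names are simplified stand-ins for the protocol enums, not the real `AirbyteStateType`:

```python
from enum import Enum


class StateType(Enum):
    """Hypothetical stand-in for AirbyteStateType."""

    STREAM = "STREAM"
    GLOBAL = "GLOBAL"
    LEGACY = "LEGACY"


def streams_to_flush(
    state_type: StateType,
    stream_name: str | None,
    buffered: dict[str, list],
) -> list[str]:
    """Resolve which stream batches a state message should flush."""
    if state_type is StateType.STREAM:
        return [stream_name] if stream_name else []
    elif state_type is StateType.GLOBAL:
        return list(buffered)
    else:
        # Reached only for genuinely unexpected types — unlike the
        # original if/if/else, where STREAM states fall into the else.
        raise ValueError(f"Unexpected state type: {state_type}")


print(streams_to_flush(StateType.GLOBAL, None, {"users": [], "orders": []}))
```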


    def _write_files_to_new_table(
        self,
        files: list[Path],
        stream_name: str,
        batch_id: str,
    ) -> str:
        """Write file(s) to a new table.

        This involves registering the table in the Iceberg catalog, creating a manifest file,
        and registering the manifest file in the catalog.
        """
        temp_table_name = self._create_table_for_loading(
            stream_name=stream_name,
            batch_id=batch_id,
        )
        columns_list = list(self._get_sql_column_definitions(stream_name=stream_name).keys())
        columns_list_str = indent(
            "\n, ".join([self._quote_identifier(col) for col in columns_list]),
            "    ",
        )
        files_list = ", ".join([f"'{f!s}'" for f in files])
        columns_type_map = indent(
            "\n, ".join(
                [
                    self._quote_identifier(self.normalizer.normalize(prop_name))
                    + ': "'
                    + str(
                        self._get_sql_column_definitions(stream_name)[
                            self.normalizer.normalize(prop_name)
                        ]
                    )
                    + '"'
                    for prop_name in columns_list
                ]
            ),
            "    ",
        )
        insert_statement = dedent(
            f"""
            INSERT INTO {self.sql_config.schema_name}.{temp_table_name}
            (
                {columns_list_str}
            )
            SELECT
                {columns_list_str}
            FROM read_json_auto(
                [{files_list}],
                format = 'newline_delimited',
                union_by_name = true,
                columns = {{ { columns_type_map } }}
            )
            """
        )
        self._execute_sql(insert_statement)
Review comment on lines +84 to +137:
Thoughts on _write_files_to_new_table method

The implementation looks solid! A couple of questions/suggestions:

  1. Have you considered using f-strings for the SQL query construction? It might make the code a bit more readable, especially for the complex parts like columns_type_map.

  2. The method seems to be doing quite a lot. Would it make sense to break it down into smaller, more focused methods? For example, we could have separate methods for creating the table, preparing the column definitions, and constructing the INSERT statement.

What are your thoughts on these suggestions?

        return temp_table_name
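To make the string assembly above concrete, this is roughly what the generated statement looks like for a hypothetical two-column `users` stream (the schema, table, and file names are invented for illustration; `read_json_auto` is DuckDB's JSON reader, which this processor appears to build on):

```python
from textwrap import dedent

# Hypothetical inputs, standing in for what the method computes:
schema_name = "airbyte_raw"
temp_table_name = "users_batch_0001"
files_list = "'/tmp/users_0001.jsonl', '/tmp/users_0002.jsonl'"

print(dedent(f"""
    INSERT INTO {schema_name}.{temp_table_name}
    (
        "id"
    , "name"
    )
    SELECT
        "id"
    , "name"
    FROM read_json_auto(
        [{files_list}],
        format = 'newline_delimited',
        union_by_name = true,
        columns = {{ "id": "BIGINT", "name": "VARCHAR" }}
    )
    """))
```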