iceberg table format support for filesystem destination #2067

Merged 79 commits on Dec 11, 2024
Changes from 12 commits

Commits (79)
79c018c
add pyiceberg dependency and upgrade mypy
jorritsandbrink Nov 14, 2024
5014f88
extend pyiceberg dependencies
jorritsandbrink Nov 15, 2024
c632dd7
remove redundant delta annotation
jorritsandbrink Nov 15, 2024
a3f6587
add basic local filesystem iceberg support
jorritsandbrink Nov 15, 2024
87553a6
add active table format setting
jorritsandbrink Nov 15, 2024
513662e
Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996…
jorritsandbrink Nov 15, 2024
10121be
disable merge tests for iceberg table format
jorritsandbrink Nov 15, 2024
23c4db3
restore non-redundant extra info
jorritsandbrink Nov 15, 2024
195ee4c
refactor to in-memory iceberg catalog
jorritsandbrink Nov 15, 2024
ee6e22e
add s3 support for iceberg table format
jorritsandbrink Nov 15, 2024
bc51008
add schema evolution support for iceberg table format
jorritsandbrink Nov 16, 2024
2de58a2
extract _register_table function
jorritsandbrink Nov 16, 2024
dd4ad0f
add partition support for iceberg table format
jorritsandbrink Nov 20, 2024
04be59b
update docstring
jorritsandbrink Nov 20, 2024
42f59c7
enable child table test for iceberg table format
jorritsandbrink Nov 21, 2024
a540135
enable empty source test for iceberg table format
jorritsandbrink Nov 21, 2024
3d1dc63
make iceberg catalog namespace configurable and default to dataset name
jorritsandbrink Nov 22, 2024
59e6d08
add optional typing
jorritsandbrink Nov 22, 2024
71e436d
fix typo
jorritsandbrink Nov 24, 2024
2effa8f
improve typing
jorritsandbrink Nov 24, 2024
8979ee1
extract logic into dedicated function
jorritsandbrink Nov 24, 2024
e956b09
add iceberg read support to filesystem sql client
jorritsandbrink Nov 24, 2024
571bf0c
remove unused import
jorritsandbrink Nov 24, 2024
0ec5fcb
add todo
jorritsandbrink Nov 24, 2024
ab0b9a0
extract logic into separate functions
jorritsandbrink Nov 24, 2024
e149ba6
add azure support for iceberg table format
jorritsandbrink Nov 24, 2024
27b8659
generalize delta table format tests
jorritsandbrink Nov 24, 2024
d39d58d
enable get tables function test for iceberg table format
jorritsandbrink Nov 24, 2024
2f910c2
Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996…
jorritsandbrink Nov 24, 2024
547a37a
remove ignores
jorritsandbrink Nov 25, 2024
53b2d56
undo table directory management change
jorritsandbrink Nov 25, 2024
3ff9fb7
enable test_read_interfaces tests for iceberg
jorritsandbrink Nov 25, 2024
5b0cd17
fix active table format filter
jorritsandbrink Nov 25, 2024
8798d79
use mixin for object store rs credentials
jorritsandbrink Nov 25, 2024
c1cc068
generalize catalog typing
jorritsandbrink Nov 25, 2024
35f590b
extract pyiceberg scheme mapping into separate function
jorritsandbrink Nov 26, 2024
e0d6a1b
generalize credentials mixin test setup
jorritsandbrink Nov 26, 2024
85a10a2
remove unused import
jorritsandbrink Nov 26, 2024
54cd0bc
add centralized fallback to append when merge is not supported
jorritsandbrink Nov 26, 2024
4e979f0
Revert "add centralized fallback to append when merge is not supported"
jorritsandbrink Nov 27, 2024
54f1353
fall back to append if merge is not supported on filesystem
jorritsandbrink Nov 27, 2024
28d0fd2
fix test for s3-compatible storage
jorritsandbrink Nov 27, 2024
90b1729
remove obsolete code path
jorritsandbrink Nov 27, 2024
d0f7c88
exclude gcs read interface tests for iceberg
jorritsandbrink Nov 27, 2024
050bea7
add gcs support for iceberg table format
jorritsandbrink Nov 28, 2024
ff48ca9
switch to UnsupportedAuthenticationMethodException
jorritsandbrink Nov 28, 2024
01e8d26
add iceberg table format docs
jorritsandbrink Nov 28, 2024
ef29aa7
use shorter pipeline name to prevent too long sql identifiers
jorritsandbrink Nov 29, 2024
f463d06
add iceberg catalog note to docs
jorritsandbrink Nov 29, 2024
fcc05ee
Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996…
jorritsandbrink Nov 29, 2024
d50aaa1
black format
jorritsandbrink Nov 29, 2024
6cce03b
use shorter pipeline name to prevent too long sql identifiers
jorritsandbrink Nov 29, 2024
fc61663
correct max id length for sqlalchemy mysql dialect
jorritsandbrink Nov 29, 2024
b011907
Revert "use shorter pipeline name to prevent too long sql identifiers"
jorritsandbrink Nov 29, 2024
e748dcf
Revert "use shorter pipeline name to prevent too long sql identifiers"
jorritsandbrink Nov 29, 2024
1b47893
replace show with execute to prevent useless print output
jorritsandbrink Nov 29, 2024
133f1ce
add abfss scheme to test
jorritsandbrink Nov 30, 2024
eceb19f
remove az support for iceberg table format
jorritsandbrink Nov 30, 2024
e75114d
remove iceberg bucket test exclusion
jorritsandbrink Nov 30, 2024
049c008
add note to docs on azure scheme support for iceberg table format
jorritsandbrink Nov 30, 2024
a0fc017
exclude iceberg from duckdb s3-compatibility test
jorritsandbrink Dec 1, 2024
ba75445
Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996…
jorritsandbrink Dec 1, 2024
de0086e
disable pyiceberg info logs for tests
jorritsandbrink Dec 1, 2024
ca7f655
extend table format docs and move into own page
jorritsandbrink Dec 1, 2024
2ba8fcb
upgrade adlfs to enable account_host attribute
jorritsandbrink Dec 1, 2024
0517a95
Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996…
jorritsandbrink Dec 2, 2024
1c2b9b4
Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996…
jorritsandbrink Dec 2, 2024
872432e
fix lint errors
jorritsandbrink Dec 3, 2024
9c44290
re-add pyiceberg dependency
jorritsandbrink Dec 3, 2024
c129b9e
enabled iceberg in dbt-duckdb
rudolfix Dec 6, 2024
6992d56
upgrade pyiceberg version
jorritsandbrink Dec 10, 2024
156d518
Merge branch 'feat/1996-iceberg-filesystem' of https://github.com/dlt…
jorritsandbrink Dec 10, 2024
aa19f13
remove pyiceberg mypy errors across python version
jorritsandbrink Dec 10, 2024
c07c9f6
Merge branch 'devel' into feat/1996-iceberg-filesystem
rudolfix Dec 10, 2024
b7f6dbf
does not install airflow group for dev
rudolfix Dec 10, 2024
9cad3ec
fixes gcp oauth iceberg credentials handling
rudolfix Dec 10, 2024
346b270
fixes ca cert bundle duckdb azure on ci
rudolfix Dec 10, 2024
08f8ee3
Merge branch 'devel' into feat/1996-iceberg-filesystem
rudolfix Dec 11, 2024
accb62d
allow for airflow dep to be present during type check
rudolfix Dec 11, 2024
5 changes: 4 additions & 1 deletion .github/workflows/test_destinations.yml
@@ -78,7 +78,10 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake
run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake -E pyiceberg

- name: Upgrade sqlalchemy
run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
5 changes: 4 additions & 1 deletion .github/workflows/test_local_destinations.yml
@@ -95,7 +95,10 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations

- name: Install dependencies
run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake
run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake -E pyiceberg

- name: Upgrade sqlalchemy
run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`

- name: Start SFTP server
run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d
2 changes: 1 addition & 1 deletion dlt/cli/source_detection.py
@@ -30,7 +30,7 @@ def find_call_arguments_to_replace(
if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str):
raise CliCommandInnerException(
"init",
f"The pipeline script {init_script_name} must pass the {t_arg_name} as"
f"The pipeline script {init_script_name} must pass the {t_arg_name} as" # type: ignore[attr-defined]
f" string to '{arg_name}' function in line {dn_node.lineno}",
)
else:
13 changes: 12 additions & 1 deletion dlt/common/configuration/specs/aws_credentials.py
@@ -8,6 +8,7 @@
CredentialsWithDefault,
configspec,
)
from dlt.common.configuration.specs.mixins import WithPyicebergConfig
from dlt.common.configuration.specs.exceptions import (
InvalidBoto3Session,
ObjectStoreRsCredentialsException,
@@ -16,7 +17,7 @@


@configspec
class AwsCredentialsWithoutDefaults(CredentialsConfiguration):
class AwsCredentialsWithoutDefaults(CredentialsConfiguration, WithPyicebergConfig):
# credentials without boto implementation
aws_access_key_id: str = None
aws_secret_access_key: TSecretStrValue = None
@@ -77,6 +78,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]:

return creds

def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
return {
"s3.access-key-id": self.aws_access_key_id,
"s3.secret-access-key": self.aws_secret_access_key,
"s3.session-token": self.aws_session_token,
"s3.region": self.region_name,
"s3.endpoint": self.endpoint_url,
"s3.connect-timeout": 300,
}


@configspec
class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault):
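
For context, a minimal sketch (not part of this diff) of how an explicitly configured AWS credentials spec maps onto pyiceberg FileIO properties through the new `to_pyiceberg_fileio_config` method; the key and region values are placeholders:

from dlt.common.configuration.specs.aws_credentials import AwsCredentialsWithoutDefaults

creds = AwsCredentialsWithoutDefaults()
creds.aws_access_key_id = "AKIA..."            # placeholder
creds.aws_secret_access_key = "dummy-secret"   # placeholder
creds.region_name = "eu-central-1"             # placeholder

# maps onto the "s3.*" FileIO properties pyiceberg expects,
# e.g. "s3.access-key-id", "s3.secret-access-key", "s3.region"
fileio_props = creds.to_pyiceberg_fileio_config()
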
2 changes: 1 addition & 1 deletion dlt/common/configuration/specs/base_configuration.py
@@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]:
def get_resolvable_fields(cls) -> Dict[str, type]:
"""Returns a mapping of fields to their type hints. Dunders should not be resolved and are not returned"""
return {
f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type]
f.name: eval(f.type) if isinstance(f.type, str) else f.type
for f in cls._get_resolvable_dataclass_fields()
}

12 changes: 12 additions & 0 deletions dlt/common/configuration/specs/mixins.py
@@ -0,0 +1,12 @@
from typing import Dict, Any
from abc import abstractmethod, ABC


class WithPyicebergConfig(ABC):
@abstractmethod
def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
"""Returns `pyiceberg` FileIO configuration dictionary.

https://py.iceberg.apache.org/configuration/#fileio
"""
pass
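
As an illustration of the contract (not part of this diff), any credentials spec can opt into the `iceberg` table format by implementing this mixin. The class below is hypothetical; the property names follow the pyiceberg FileIO configuration page referenced in the docstring:

from typing import Any, Dict

from dlt.common.configuration.specs.mixins import WithPyicebergConfig


class ExampleAzureCredentials(WithPyicebergConfig):
    """Hypothetical spec used only to illustrate the mixin contract."""

    azure_storage_account_name: str = None
    azure_storage_account_key: str = None

    def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
        # property names per https://py.iceberg.apache.org/configuration/#fileio
        return {
            "adls.account-name": self.azure_storage_account_name,
            "adls.account-key": self.azure_storage_account_key,
        }
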
2 changes: 1 addition & 1 deletion dlt/common/data_writers/buffered.py
@@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None:
if self.writer_spec.is_binary_format:
self._file = self.open(self._file_name, "wb") # type: ignore
else:
self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore
self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore[unused-ignore]
self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment]
self._writer.write_header(self._current_columns)
# write buffer
2 changes: 1 addition & 1 deletion dlt/common/destination/utils.py
@@ -38,7 +38,7 @@ def verify_schema_capabilities(
exception_log: List[Exception] = []
# combined casing function
case_identifier = lambda ident: capabilities.casefold_identifier(
(str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore
(str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore[unused-ignore]
)
table_name_lookup: DictStrStr = {}
# name collision explanation
145 changes: 145 additions & 0 deletions dlt/common/libs/pyiceberg.py
@@ -0,0 +1,145 @@
from typing import Dict, Any
import os

from dlt import version, Pipeline
from dlt.common.libs.pyarrow import cast_arrow_schema_types, columns_to_arrow
from dlt.common.schema.typing import TWriteDisposition
from dlt.common.utils import assert_min_pkg_version
from dlt.common.exceptions import MissingDependencyException
from dlt.common.configuration.specs import CredentialsConfiguration
from dlt.common.configuration.specs.mixins import WithPyicebergConfig
from dlt.destinations.impl.filesystem.filesystem import FilesystemClient

assert_min_pkg_version(
pkg_name="sqlalchemy",
version="2.0.18",
msg="`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination.",
)

try:
from pyiceberg.table import Table as IcebergTable
from pyiceberg.catalog.sql import SqlCatalog
import pyarrow as pa
except ModuleNotFoundError:
raise MissingDependencyException(
"dlt pyiceberg helpers",
[f"{version.DLT_PKG_NAME}[pyiceberg]"],
"Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.",
)


DLT_ICEBERG_NAMESPACE = "dlt"


def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = {
pa.types.is_time: pa.string(),
pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128
}
return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP)


def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table:
schema = ensure_iceberg_compatible_arrow_schema(data.schema)
return data.cast(schema)


def write_iceberg_table(
table: IcebergTable,
data: pa.Table,
write_disposition: TWriteDisposition,
) -> None:
if write_disposition == "append":
table.append(ensure_iceberg_compatible_arrow_data(data))
elif write_disposition == "replace":
table.overwrite(ensure_iceberg_compatible_arrow_data(data))


def get_catalog(
client: FilesystemClient,
table_name: str,
schema: pa.Schema = None,
) -> SqlCatalog:
"""Returns single-table, ephemeral, in-memory Iceberg catalog."""

# create in-memory catalog
catalog = SqlCatalog(
[Collaborator review comment] NOTE: how we get a catalog should be some kind of plugin. so we can easily plug glue or rest to filesystem
"default",
uri="sqlite:///:memory:",
**_get_fileio_config(client.config.credentials),
)
catalog.create_namespace(DLT_ICEBERG_NAMESPACE)

# add table to catalog
table_id = f"{DLT_ICEBERG_NAMESPACE}.{table_name}"
table_path = f"{client.dataset_path}/{table_name}"
metadata_path = f"{table_path}/metadata"
if client.fs_client.exists(metadata_path):
# found metadata; register existing table
table = _register_table(table_id, metadata_path, catalog, client)

# evolve schema
if schema is not None:
with table.update_schema() as update:
update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema))
else:
# found no metadata; create new table
assert schema is not None
catalog.create_table(
table_id,
schema=ensure_iceberg_compatible_arrow_schema(schema),
location=client.make_remote_url(table_path),
)

return catalog


def get_iceberg_tables(
pipeline: Pipeline, *tables: str, schema_name: str = None
) -> Dict[str, IcebergTable]:
from dlt.common.schema.utils import get_table_format

with pipeline.destination_client(schema_name=schema_name) as client:
assert isinstance(
client, FilesystemClient
), "The `get_iceberg_tables` function requires a `filesystem` destination."

schema_iceberg_tables = [
t["name"]
for t in client.schema.tables.values()
if get_table_format(client.schema.tables, t["name"]) == "iceberg"
]
if len(tables) > 0:
invalid_tables = set(tables) - set(schema_iceberg_tables)
if len(invalid_tables) > 0:
available_schemas = ""
if len(pipeline.schema_names) > 1:
available_schemas = f" Available schemas are {pipeline.schema_names}"
raise ValueError(
f"Schema {client.schema.name} does not contain Iceberg tables with these names:"
f" {', '.join(invalid_tables)}.{available_schemas}"
)
schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables]

return {
name: get_catalog(client, name).load_table(f"{DLT_ICEBERG_NAMESPACE}.{name}")
for name in schema_iceberg_tables
}


def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]:
if isinstance(credentials, WithPyicebergConfig):
return credentials.to_pyiceberg_fileio_config()
return {}


def _register_table(
identifier: str,
metadata_path: str,
catalog: SqlCatalog,
client: FilesystemClient,
) -> IcebergTable:
# TODO: implement faster way to obtain `last_metadata_file` (listing is slow)
metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")]
last_metadata_file = client.make_remote_url(sorted(metadata_files)[-1])
return catalog.register_table(identifier, last_metadata_file)
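
A usage sketch for the helpers above, assuming a pipeline that has already loaded an `events` table in the `iceberg` table format to a `filesystem` destination (pipeline, dataset, and table names are illustrative):

import dlt
from dlt.common.libs.pyiceberg import get_iceberg_tables

pipeline = dlt.pipeline("iceberg_demo", destination="filesystem", dataset_name="my_data")

# loads the table through the single-table, ephemeral, in-memory catalog
tables = get_iceberg_tables(pipeline, "events")
arrow_table = tables["events"].scan().to_arrow()  # regular pyiceberg read path
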
2 changes: 1 addition & 1 deletion dlt/common/logger.py
@@ -47,7 +47,7 @@ def is_logging() -> bool:
def log_level() -> str:
if not LOGGER:
raise RuntimeError("Logger not initialized")
return logging.getLevelName(LOGGER.level) # type: ignore
return logging.getLevelName(LOGGER.level)


def is_json_logging(log_format: str) -> bool:
2 changes: 1 addition & 1 deletion dlt/common/metrics.py
@@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple):
created: float
last_modified: float

def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]:
def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override]
if isinstance(other, DataWriterMetrics):
return DataWriterMetrics(
self.file_path if self.file_path == other.file_path else "",
14 changes: 7 additions & 7 deletions dlt/common/reflection/utils.py
@@ -84,24 +84,24 @@ def rewrite_python_script(
last_line = -1
last_offset = -1
# sort transformed nodes by line and offset
for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)):
for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined]
# do we have a line changed
if last_line != node.lineno - 1:
if last_line != node.lineno - 1: # type: ignore[attr-defined]
# add remainder from the previous line
if last_offset >= 0:
script_lines.append(source_script_lines[last_line][last_offset:])
# add all new lines from previous line to current
script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1])
script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined]
# add trailing characters until node in current line starts
script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset])
script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined]
elif last_offset >= 0:
# no line change, add the characters from the end of previous node to the current
script_lines.append(source_script_lines[last_line][last_offset : node.col_offset])
script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined]

# replace node value
script_lines.append(astunparse.unparse(t_value).strip())
last_line = node.end_lineno - 1
last_offset = node.end_col_offset
last_line = node.end_lineno - 1 # type: ignore[attr-defined]
last_offset = node.end_col_offset # type: ignore[attr-defined]

# add all that was missing
if last_offset >= 0:
2 changes: 1 addition & 1 deletion dlt/common/schema/schema.py
@@ -524,7 +524,7 @@ def get_new_table_columns(
Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored.

Optionally includes incomplete columns (without data type)"""
casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment]
casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str
casefold_existing = {
casefold_f(col_name): col for col_name, col in existing_columns.items()
}
2 changes: 1 addition & 1 deletion dlt/common/typing.py
@@ -439,7 +439,7 @@ def get_generic_type_argument_from_instance(
if cls_:
orig_param_type = get_args(cls_)[0]
if orig_param_type in (Any, CallableAny) and sample_value is not None:
orig_param_type = type(sample_value)
orig_param_type = type(sample_value) # type: ignore[assignment]
return orig_param_type # type: ignore


4 changes: 2 additions & 2 deletions dlt/destinations/impl/filesystem/factory.py
@@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector(
*,
table_schema: TTableSchema,
) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]:
if table_schema.get("table_format") == "delta":
if table_schema.get("table_format") in ("delta", "iceberg"):
return ("parquet", ["parquet"])
return (preferred_loader_file_format, supported_loader_file_formats)

@@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext.generic_capabilities(
preferred_loader_file_format="jsonl",
loader_file_format_selector=filesystem_loader_file_format_selector,
supported_table_formats=["delta"],
supported_table_formats=["delta", "iceberg"],
supported_merge_strategies=["upsert"],
merge_strategies_selector=filesystem_merge_strategies_selector,
)
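
To exercise the new table format end to end, a resource can request `table_format="iceberg"`; the selector above then forces parquet files, which are committed to an Iceberg table on load. A minimal sketch, with illustrative bucket, pipeline, and dataset names (credentials are expected to come from config/secrets):

import dlt


@dlt.resource(table_format="iceberg")
def events():
    yield [{"id": 1, "ts": "2024-12-01"}, {"id": 2, "ts": "2024-12-02"}]


pipeline = dlt.pipeline(
    "iceberg_demo",
    destination=dlt.destinations.filesystem("s3://my-bucket/data"),
    dataset_name="my_data",
)
pipeline.run(events())  # parquet files are written, then committed as an Iceberg table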