From 8b4fc8c25d2a46f86432d496b413983da2bd87eb Mon Sep 17 00:00:00 2001 From: Willi Date: Thu, 8 Aug 2024 19:27:34 +0530 Subject: [PATCH 01/34] RangePaginator: Stops pagination in case of page without data items --- dlt/sources/helpers/rest_client/client.py | 2 +- dlt/sources/helpers/rest_client/paginators.py | 62 +++++++++++-------- .../helpers/rest_client/test_client.py | 2 +- .../helpers/rest_client/test_paginators.py | 29 +++++++++ 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index 73ae064299..c05dabc30c 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -225,7 +225,7 @@ def raise_for_status(response: Response, *args: Any, **kwargs: Any) -> None: if paginator is None: paginator = self.detect_paginator(response, data) - paginator.update_state(response) + paginator.update_state(response, data) paginator.update_request(request) # yield data with context diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 4c8ce70bb2..078d4b0a87 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -1,6 +1,6 @@ import warnings from abc import ABC, abstractmethod -from typing import Optional, Dict, Any +from typing import Any, Dict, List, Optional from urllib.parse import urlparse, urljoin from requests import Response, Request @@ -39,7 +39,7 @@ def init_request(self, request: Request) -> None: # noqa: B027, optional overri pass @abstractmethod - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Updates the paginator's state based on the response from the API. 
This method should extract necessary pagination details (like next page @@ -73,7 +73,7 @@ def __str__(self) -> str: class SinglePagePaginator(BasePaginator): """A paginator for single-page API responses.""" - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: @@ -96,6 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", + stop_after_empty_page: bool = False, ): """ Args: @@ -127,6 +128,7 @@ def __init__( self.maximum_value = maximum_value self.total_path = jsonpath.compile_path(total_path) if total_path else None self.error_message_items = error_message_items + self.stop_after_empty_page = stop_after_empty_page def init_request(self, request: Request) -> None: if request.params is None: @@ -134,26 +136,32 @@ def init_request(self, request: Request) -> None: request.params[self.param_name] = self.current_value - def update_state(self, response: Response) -> None: - total = None - if self.total_path: - response_json = response.json() - values = jsonpath.find_values(self.total_path, response_json) - total = values[0] if values else None - if total is None: - self._handle_missing_total(response_json) - - try: - total = int(total) - except ValueError: - self._handle_invalid_total(total) - - self.current_value += self.value_step - - if (total is not None and self.current_value >= total + self.base_index) or ( - self.maximum_value is not None and self.current_value >= self.maximum_value - ): + def update_state(self, response: Response, data: List[Any] = None) -> None: + if self._stop_after_this_page(data): self._has_next_page = False + else: + total = None + if self.total_path: + response_json = response.json() + values = jsonpath.find_values(self.total_path, response_json) + total = values[0] if values else None + if total is None: + self._handle_missing_total(response_json) + + try: + total = int(total) + except ValueError: + self._handle_invalid_total(total) + + self.current_value += self.value_step + + if (total is not None and self.current_value >= total + self.base_index) or ( + self.maximum_value is not None and self.current_value >= self.maximum_value + ): + self._has_next_page = False + + def _stop_after_this_page(self, data: List[Any]) -> bool: + return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: raise ValueError( @@ -229,6 +237,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, + stop_after_empty_page: bool = False, ): """ Args: @@ -260,6 +269,7 @@ def __init__( value_step=1, maximum_value=maximum_page, error_message_items="pages", + stop_after_empty_page=stop_after_empty_page, ) def __str__(self) -> str: @@ -330,6 +340,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, + stop_after_empty_page: bool = False, ) -> None: """ Args: @@ -356,6 +367,7 @@ def __init__( total_path=total_path, value_step=limit, maximum_value=maximum_offset, + stop_after_empty_page=stop_after_empty_page, ) self.limit_param = limit_param self.limit = limit @@ -484,7 +496,7 @@ def __init__(self, links_next_key: str = "next") -> None: super().__init__() self.links_next_key = links_next_key - def update_state(self, response: Response) -> None: + def 
update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") @@ -539,7 +551,7 @@ def __init__( super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None @@ -618,7 +630,7 @@ def __init__( self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index f5de1ec5da..af914bf89d 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -400,7 +400,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response): + def update_state(self, response, data): self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 8a3c136e09..9e4ccada72 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -1,3 +1,4 @@ +from typing import Any, List from unittest.mock import Mock import pytest @@ -312,6 +313,19 @@ def test_client_pagination(self, rest_client): assert_pagination(pages) + def test_stop_after_empty_page(self): + paginator = OffsetPaginator( + offset=0, + limit=50, + maximum_offset=100, + total_path=None, + stop_after_empty_page=True, + ) + response = Mock(Response, json=lambda: {"items": []}) + no_data_found: List[Any] = [] + paginator.update_state(response, no_data_found) # Page 1 + assert paginator.has_next_page is False + @pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: @@ -372,6 +386,21 @@ def test_maximum_page(self): assert paginator.current_value == 3 assert paginator.has_next_page is False + def test_stop_after_empty_page(self): + paginator = PageNumberPaginator( + base_page=1, + page=1, + maximum_page=5, + stop_after_empty_page=True, + total_path=None, + ) + response = Mock(Response, json=lambda: {"items": []}) + no_data_found: List[Any] = [] + assert paginator.has_next_page is True + paginator.update_state(response, no_data_found) + assert paginator.current_value == 1 + assert paginator.has_next_page is False + def test_client_pagination_one_based(self, rest_client): pages_iter = rest_client.paginate( "/posts", From 8d4ffa9e49f083866c507de68c57196332e0493c Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 9 Aug 2024 15:58:55 +0530 Subject: [PATCH 02/34] Defaults RangePaginator to stop after having received an empty page --- dlt/sources/helpers/rest_client/paginators.py | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 078d4b0a87..a96413d84e 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -96,7 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ): """ Args: @@ -237,7 +237,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ): """ Args: @@ -340,7 +340,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ) -> None: """ Args: From e9ecf88a741033034b7beff4fcb0c3e8a12d12e9 Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 15:31:04 +0530 Subject: [PATCH 03/34] Documents how to stop paginator, updates docs on json_link --- dlt/sources/helpers/rest_client/paginators.py | 12 +++-- .../verified-sources/rest_api.md | 8 +-- .../docs/general-usage/http/rest-client.md | 51 +++++++++++++++---- 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index a96413d84e..083b95da18 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -96,7 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ): """ Args: @@ -117,6 +117,8 @@ def __init__( If not provided, `maximum_value` must be specified. error_message_items (str): The name of the items in the error message. Defaults to 'items'. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ super().__init__() if total_path is None and maximum_value is None: @@ -237,7 +239,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ): """ Args: @@ -255,6 +257,8 @@ def __init__( will stop once this page is reached or exceeded, even if more data is available. This allows you to limit the maximum number of pages for pagination. Defaults to None. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ if total_path is None and maximum_page is None: raise ValueError("Either `total_path` or `maximum_page` must be provided.") @@ -340,7 +344,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ) -> None: """ Args: @@ -358,6 +362,8 @@ def __init__( pagination will stop once this offset is reached or exceeded, even if more data is available. This allows you to limit the maximum range for pagination. Defaults to None. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. 
""" if total_path is None and maximum_offset is None: raise ValueError("Either `total_path` or `maximum_offset` must be provided.") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 4b72b3276e..b4d2d08daa 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -371,7 +371,7 @@ You can configure the pagination for the `posts` resource like this: { "path": "posts", "paginator": { - "type": "json_response", + "type": "json_link", "next_url_path": "pagination.next", } } @@ -380,7 +380,7 @@ You can configure the pagination for the `posts` resource like this: Alternatively, you can use the paginator instance directly: ```py -from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator +from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator # ... @@ -402,8 +402,8 @@ These are the available paginators: | ------------ | -------------- | ----------- | | `json_link` | [JSONLinkPaginator](../../general-usage/http/rest-client.md#jsonresponsepaginator) | The link to the next page is in the body (JSON) of the response.
*Parameters:*
  • `next_url_path` (str) - the JSONPath to the next page URL
| | `header_link` | [HeaderLinkPaginator](../../general-usage/http/rest-client.md#headerlinkpaginator) | The links to the next page are in the response headers.
*Parameters:*
  • `links_next_key` (str) - the name of the header containing the links. Default is "next".
| -| `offset` | [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
| -| `page_number` | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
| +| `offset` | [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| +| `page_number` | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| | `cursor` | [JSONResponseCursorPaginator](../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON).
*Parameters:*
  • `cursor_path` (str) - the JSONPath to the cursor value. Defaults to "cursors.next"
  • `cursor_param` (str) - the query parameter name for the cursor. Defaults to "after"
|
| `single_page` | SinglePagePaginator | The response will be interpreted as a single-page response, ignoring possible pagination metadata. |
| `auto` | `None` | Explicitly specify that the source should automatically detect the pagination method. |

diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md
index ddd66a233b..9451ca689d 100644
--- a/docs/website/docs/general-usage/http/rest-client.md
+++ b/docs/website/docs/general-usage/http/rest-client.md
@@ -183,8 +183,9 @@ need to specify the paginator when the API uses a different relation type.
 - `offset`: The initial offset for the first request. Defaults to `0`.
 - `offset_param`: The name of the query parameter used to specify the offset. Defaults to `"offset"`.
 - `limit_param`: The name of the query parameter used to specify the limit. Defaults to `"limit"`.
-- `total_path`: A JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset`.
+- `total_path`: A JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`.
 - `maximum_offset`: Optional maximum offset value. Limits pagination even without total count.
+- `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`.
 
 **Example:**
 
@@ -198,7 +199,7 @@ E.g. `https://api.example.com/items?offset=0&limit=100`, `https://api.example.co
 }
 ```
 
-You can paginate through responses from this API using `OffsetPaginator`:
+You can paginate through responses from this API using the `OffsetPaginator`:
 
 ```py
 client = RESTClient(
@@ -210,20 +211,34 @@ client = RESTClient(
     )
 )
 ```
 
-In a different scenario where the API does not provide the total count, you can use `maximum_offset` to limit the pagination:
+Pagination stops by default when a page contains no records. This is especially useful when the API does not provide the total item count.
+Here, the `total_path` parameter is set to `None` because the API does not provide the total count.
 
 ```py
 client = RESTClient(
     base_url="https://api.example.com",
     paginator=OffsetPaginator(
         limit=100,
-        maximum_offset=1000,
-        total_path=None
+        total_path=None,
     )
 )
 ```
 
-Note, that in this case, the `total_path` parameter is set explicitly to `None` to indicate that the API does not provide the total count.
+Additionally, you can limit pagination with `maximum_offset`, for example during development. If `maximum_offset` is reached before the first empty page, pagination stops:
+
+```py
+client = RESTClient(
+    base_url="https://api.example.com",
+    paginator=OffsetPaginator(
+        limit=10,
+        maximum_offset=20, # limits response to 20 records
+        total_path=None,
+    )
+)
+```
+
+You can disable automatic stopping of pagination by setting `stop_after_empty_page=False`. In this case, you must provide either `total_path` or `maximum_offset` to guarantee that the paginator terminates.
+

#### PageNumberPaginator

- `base_page`: The index of the initial page from the API perspective. Normally, it's 0-based or 1-based (e.g., 1, 2, 3, ...) indexing for the pages. Defaults to 0.
- `page`: The page number for the first request. If not provided, the initial value will be set to `base_page`.
- `page_param`: The query parameter name for the page number. Defaults to `"page"`.
-- `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`.
+- `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`.
 - `maximum_page`: Optional maximum page number. Stops pagination once this page is reached.
+- `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`.
 
 **Example:**
 
@@ -248,7 +264,7 @@ Assuming an API endpoint `https://api.example.com/items` paginates by page numbe
 }
 ```
 
-You can paginate through responses from this API using `PageNumberPaginator`:
+You can paginate through responses from this API using the `PageNumberPaginator`:
 
 ```py
 client = RESTClient(
@@ -259,19 +275,32 @@ client = RESTClient(
     )
 )
 ```
 
-If the API does not provide the total number of pages:
+Pagination stops by default when a page contains no records. This is especially useful when the API does not provide the total item count.
+Here, the `total_path` parameter is set to `None` because the API does not provide the total count.
 
 ```py
 client = RESTClient(
     base_url="https://api.example.com",
     paginator=PageNumberPaginator(
-        maximum_page=5, # Stops after fetching 5 pages
         total_path=None
     )
 )
 ```
 
-Note, that in the case above, the `total_path` parameter is set explicitly to `None` to indicate that the API does not provide the total count.
+Additionally, you can limit pagination with `maximum_page`, for example during development. If `maximum_page` is reached before the first empty page, pagination stops:
+
+```py
+client = RESTClient(
+    base_url="https://api.example.com",
+    paginator=PageNumberPaginator(
+        maximum_page=2, # limits response to 2 pages
+        total_path=None,
+    )
+)
+```
+
+You can disable automatic stopping of pagination by setting `stop_after_empty_page=False`. In this case, you must provide either `total_path` or `maximum_page` to guarantee that the paginator terminates.
+

#### JSONResponseCursorPaginator

From 5e78dcc45efc3400811c00dc1ce1bb7564ba3f6c Mon Sep 17 00:00:00 2001
From: Willi
Date: Mon, 12 Aug 2024 17:05:28 +0530
Subject: [PATCH 04/34] Either total_path or maximum_value or stop_after_empty_pages is required

---
 dlt/sources/helpers/rest_client/paginators.py | 18 ++++--
 .../helpers/rest_client/test_paginators.py | 56 +++++++++++++++++++
 2 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py
index 083b95da18..888539a64d 100644
--- a/dlt/sources/helpers/rest_client/paginators.py
+++ b/dlt/sources/helpers/rest_client/paginators.py
@@ -121,8 +121,10 @@ def __init__(
             a page contains no result items. Defaults to `True`.
         """
         super().__init__()
-        if total_path is None and maximum_value is None:
-            raise ValueError("Either `total_path` or `maximum_value` must be provided.")
+        if total_path is None and maximum_value is None and not stop_after_empty_page:
+            raise ValueError(
+                "Either `total_path` or `maximum_value` or stop_after_empty_page must be provided."
+            )
         self.param_name = param_name
         self.current_value = initial_value
         self.value_step = value_step
@@ -260,8 +262,10 @@ def __init__(
             stop_after_empty_page (bool): Whether pagination should stop when
             a page contains no result items. Defaults to `True`.
""" - if total_path is None and maximum_page is None: - raise ValueError("Either `total_path` or `maximum_page` must be provided.") + if total_path is None and maximum_page is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_page` or `stop_after_empty_page` must be provided." + ) page = page if page is not None else base_page @@ -365,8 +369,10 @@ def __init__( stop_after_empty_page (bool): Whether pagination should stop when a page contains no result items. Defaults to `True`. """ - if total_path is None and maximum_offset is None: - raise ValueError("Either `total_path` or `maximum_offset` must be provided.") + if total_path is None and maximum_offset is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be provided." + ) super().__init__( param_name=offset_param, initial_value=offset, diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 9e4ccada72..7357169101 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -326,6 +326,36 @@ def test_stop_after_empty_page(self): paginator.update_state(response, no_data_found) # Page 1 assert paginator.has_next_page is False + def test_guarantee_termination(self): + OffsetPaginator( + limit=10, + total_path=None, + ) + + OffsetPaginator( + limit=10, + total_path=None, + maximum_offset=1, + stop_after_empty_page=False, + ) + + with pytest.raises(ValueError) as e: + OffsetPaginator( + limit=10, + total_path=None, + stop_after_empty_page=False, + ) + assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + + with pytest.raises(ValueError) as e: + OffsetPaginator( + limit=10, + total_path=None, + stop_after_empty_page=False, + maximum_offset=None, + ) + assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + @pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: @@ -431,6 +461,32 @@ def test_client_pagination_zero_based(self, rest_client): assert_pagination(pages) + def test_guarantee_termination(self): + PageNumberPaginator( + total_path=None, + ) + + PageNumberPaginator( + total_path=None, + maximum_page=1, + stop_after_empty_page=False, + ) + + with pytest.raises(ValueError) as e: + PageNumberPaginator( + total_path=None, + stop_after_empty_page=False, + ) + assert e.match("`total_path` or `maximum_page` or `stop_after_empty_page` must be provided") + + with pytest.raises(ValueError) as e: + PageNumberPaginator( + total_path=None, + stop_after_empty_page=False, + maximum_page=None, + ) + assert e.match("`total_path` or `maximum_page` or `stop_after_empty_page` must be provided") + @pytest.mark.usefixtures("mock_api_server") class TestJSONResponseCursorPaginator: From 44b82749365592de0b12879fb564d56c05120c72 Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 17:55:20 +0530 Subject: [PATCH 05/34] updates docs to new type signature --- dlt/sources/helpers/rest_client/paginators.py | 14 +++++++------- .../website/docs/general-usage/http/rest-client.md | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 888539a64d..993cbf7f26 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -39,7 +39,7 @@ def 
init_request(self, request: Request) -> None: # noqa: B027, optional overri pass @abstractmethod - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Updates the paginator's state based on the response from the API. This method should extract necessary pagination details (like next page @@ -73,7 +73,7 @@ def __str__(self) -> str: class SinglePagePaginator(BasePaginator): """A paginator for single-page API responses.""" - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: @@ -140,7 +140,7 @@ def init_request(self, request: Request) -> None: request.params[self.param_name] = self.current_value - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: if self._stop_after_this_page(data): self._has_next_page = False else: @@ -164,7 +164,7 @@ def update_state(self, response: Response, data: List[Any] = None) -> None: ): self._has_next_page = False - def _stop_after_this_page(self, data: List[Any]) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]]) -> bool: return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: @@ -508,7 +508,7 @@ def __init__(self, links_next_key: str = "next") -> None: super().__init__() self.links_next_key = links_next_key - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") @@ -563,7 +563,7 @@ def __init__( super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None @@ -642,7 +642,7 @@ def __init__( self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 9451ca689d..40c83f8c5b 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -339,7 +339,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee - `init_request(request: Request) -> None`: This method is called before making the first API call in the `RESTClient.paginate` method. You can use this method to set up the initial request query parameters, headers, etc. For example, you can set the initial page number or cursor value. 
-- `update_state(response: Response) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. +- `update_state(response: Response, data: Optional[List[Any]]) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. - `update_request(request: Request) -> None`: Before making the next API call in `RESTClient.paginate` method, `update_request` is used to modify the request with the necessary parameters to fetch the next page (based on the current state of the paginator). For example, you can add query parameters to the request, or modify the URL. @@ -348,6 +348,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee Suppose an API uses query parameters for pagination, incrementing an page parameter for each subsequent page, without providing direct links to next pages in its responses. E.g. `https://api.example.com/posts?page=1`, `https://api.example.com/posts?page=2`, etc. Here's how you could implement a paginator for this scheme: ```py +from typing import Any, List, Optional from dlt.sources.helpers.rest_client.paginators import BasePaginator from dlt.sources.helpers.requests import Response, Request @@ -361,7 +362,7 @@ class QueryParamPaginator(BasePaginator): # This will set the initial page number (e.g. page=1) self.update_request(request) - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: # Assuming the API returns an empty list when no more data is available if not response.json(): self._has_next_page = False @@ -399,6 +400,7 @@ def get_data(): Some APIs use POST requests for pagination, where the next page is fetched by sending a POST request with a cursor or other parameters in the request body. This is frequently used in "search" API endpoints or other endpoints with big payloads. 
Here's how you could implement a paginator for a case like this: ```py +from typing import Any, List, Optional from dlt.sources.helpers.rest_client.paginators import BasePaginator from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.requests import Response, Request @@ -408,7 +410,7 @@ class PostBodyPaginator(BasePaginator): super().__init__() self.cursor = None - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: # Assuming the API returns an empty list when no more data is available if not response.json(): self._has_next_page = False From e42f4d729f9f65e049ba87fb72f2ae7652867264 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:18:06 +0530 Subject: [PATCH 06/34] Updated the docs: Using pipeline.default_schema.toprettyyaml() (#1660) --- docs/website/docs/general-usage/schema.md | 24 ++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 0e3e3bba1f..df405de1af 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -352,8 +352,30 @@ load_info = pipeline.run(source_data) ``` This example iterates through MongoDB collections, applying the complex [data type](schema#data-types) to a specified column, and then processes the data with `pipeline.run`. -## Export and import schema files +## View and print the schema +To view and print the default schema in a clear YAML format use the command: + +```py +pipeline.default_schema.to_pretty_yaml() +``` +This can be used in a pipeline as: +```py +# Create a pipeline +pipeline = dlt.pipeline( + pipeline_name="chess_pipeline", + destination='duckdb', + dataset_name="games_data") + +# Run the pipeline +load_info = pipeline.run(source) + +# Print the default schema in a pretty YAML format +print(pipeline.default_schema.to_pretty_yaml()) +``` +This will display a structured YAML representation of your schema, showing details like tables, columns, data types, and metadata, including version, version_hash, and engine_version. + +## Export and import schema files Please follow the guide on [how to adjust a schema](../walkthroughs/adjust-a-schema.md) to export and import `yaml` schema files in your pipeline. From a9c29586ff108de6ccf0bb5aace945df7ba94765 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:51:38 +0400 Subject: [PATCH 07/34] Add `storage_options` to `DeltaTable.create` (#1686) * add storage_options to delta table create statement --- dlt/common/schema/exceptions.py | 9 ++++++--- dlt/common/schema/utils.py | 4 +++- dlt/destinations/impl/filesystem/filesystem.py | 1 + dlt/normalize/schema.py | 5 ++++- tests/load/pipeline/test_filesystem_pipeline.py | 2 +- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 1055163942..2e75b4b3a1 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -246,12 +246,15 @@ def __init__(self, schema_name: str, table_name: str, column: TColumnSchemaBase) elif column.get("primary_key"): key_type = "primary key" - msg = f"The column {column['name']} in table {table_name} did not receive any data during this load. 
" + msg = ( + f"The column {column['name']} in table {table_name} did not receive any data during" + " this load. " + ) if key_type or not nullable: msg += f"It is marked as non-nullable{' '+key_type} and it must have values. " msg += ( - "This can happen if you specify the column manually, for example using the 'merge_key', 'primary_key' or 'columns' argument " - "but it does not exist in the data." + "This can happen if you specify the column manually, for example using the 'merge_key'," + " 'primary_key' or 'columns' argument but it does not exist in the data." ) super().__init__(schema_name, msg) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index d879c21b3c..8b87a7e5fe 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -357,7 +357,9 @@ def is_nullable_column(col: TColumnSchemaBase) -> bool: return col.get("nullable", True) -def find_incomplete_columns(tables: List[TTableSchema]) -> Iterable[Tuple[str, TColumnSchemaBase, bool]]: +def find_incomplete_columns( + tables: List[TTableSchema], +) -> Iterable[Tuple[str, TColumnSchemaBase, bool]]: """Yields (table_name, column, nullable) for all incomplete columns in `tables`""" for table in tables: for col in table["columns"].values(): diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 449d5c1862..7009ad95ac 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -123,6 +123,7 @@ def run(self) -> None: table_uri=dt_path, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", + storage_options=storage_options, ) return diff --git a/dlt/normalize/schema.py b/dlt/normalize/schema.py index 4967fab18f..c01d184c92 100644 --- a/dlt/normalize/schema.py +++ b/dlt/normalize/schema.py @@ -3,13 +3,16 @@ from dlt.common.schema.exceptions import UnboundColumnException from dlt.common import logger + def verify_normalized_schema(schema: Schema) -> None: """Verify the schema is valid for next stage after normalization. 1. Log warning if any incomplete nullable columns are in any data tables 2. Raise `UnboundColumnException` on incomplete non-nullable columns (e.g. 
missing merge/primary key) """ - for table_name, column, nullable in find_incomplete_columns(schema.data_tables(seen_data_only=True)): + for table_name, column, nullable in find_incomplete_columns( + schema.data_tables(seen_data_only=True) + ): exc = UnboundColumnException(schema.name, table_name, column) if nullable: logger.warning(str(exc)) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 0554b1ef3c..71620e889d 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -442,7 +442,7 @@ def complex_table(): destinations_configs( table_format_filesystem_configs=True, table_format="delta", - bucket_subset=(FILE_BUCKET), + bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) From 61fc190fa580d0efc32ce6a074a088e674554040 Mon Sep 17 00:00:00 2001 From: Willi Date: Wed, 14 Aug 2024 17:42:49 +0530 Subject: [PATCH 08/34] documents pluggable custom auth --- .../docs/dlt-ecosystem/verified-sources/rest_api.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 4b72b3276e..ca9a9360c5 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -553,6 +553,19 @@ Available authentication types: For more complex authentication methods, you can implement a [custom authentication class](../../general-usage/http/rest-client.md#implementing-custom-authentication) and use it in the configuration. +You can use the dictionary configuration syntax also for custom authentication classes after registering them as follows: + +```py +rest_api.config_setup.register_auth("custom_auth", CustomAuth) + +{ + # ... + "auth": { + "type": "custom_auth", + "api_key": dlt.secrets["sources.my_source.my_api_key"], + } +} +``` ### Define resource relationships From 9bd0b2e258cc8abd3745598e0be9339e15b59847 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 14 Aug 2024 14:30:02 +0200 Subject: [PATCH 09/34] bumps to pre release 0.5.4a0 (#1689) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fcf508f95b..f33bbbefcf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.3" +version = "0.5.4a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] From 122fc7f7ae3d7f00a899cedcb7bb1d77a15accb7 Mon Sep 17 00:00:00 2001 From: VioletM Date: Wed, 14 Aug 2024 08:46:09 -0400 Subject: [PATCH 10/34] Allow different from credentials project_id for BigQuery (#1680) --- dlt/destinations/impl/bigquery/bigquery.py | 1 + .../impl/bigquery/configuration.py | 4 ++- dlt/destinations/impl/bigquery/sql_client.py | 8 ++++-- .../dlt-ecosystem/destinations/bigquery.md | 12 ++++++++ tests/load/bigquery/test_bigquery_client.py | 28 +++++++++++++++++++ 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index ef4e31acd1..c6bf2e7654 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -225,6 +225,7 @@ def __init__( config.credentials, capabilities, config.get_location(), + config.project_id, config.http_timeout, config.retry_deadline, ) diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 47cc997a4a..3d71b0c8ea 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -1,6 +1,6 @@ import dataclasses import warnings -from typing import ClassVar, List, Final +from typing import ClassVar, List, Final, Optional from dlt.common.configuration import configspec from dlt.common.configuration.specs import GcpServiceAccountCredentials @@ -14,6 +14,8 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = dataclasses.field(default="bigquery", init=False, repr=False, compare=False) # type: ignore credentials: GcpServiceAccountCredentials = None location: str = "US" + project_id: Optional[str] = None + """Note, that this is BigQuery project_id which could be different from credentials.project_id""" has_case_sensitive_identifiers: bool = True """If True then dlt expects to load data into case sensitive dataset""" should_set_case_sensitivity_on_new_dataset: bool = False diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index dfc4094e7b..c56742f1ff 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -82,14 +82,16 @@ def __init__( credentials: GcpServiceAccountCredentialsWithoutDefaults, capabilities: DestinationCapabilitiesContext, location: str = "US", + project_id: Optional[str] = None, http_timeout: float = 15.0, retry_deadline: float = 60.0, ) -> None: self._client: bigquery.Client = None self.credentials: GcpServiceAccountCredentialsWithoutDefaults = credentials self.location = location + self.project_id = project_id or self.credentials.project_id self.http_timeout = http_timeout - super().__init__(credentials.project_id, dataset_name, staging_dataset_name, capabilities) + super().__init__(self.project_id, dataset_name, staging_dataset_name, capabilities) self._default_retry = bigquery.DEFAULT_RETRY.with_deadline(retry_deadline) self._default_query = bigquery.QueryJobConfig( @@ -100,7 +102,7 @@ def __init__( @raise_open_connection_error def open_connection(self) -> bigquery.Client: self._client = bigquery.Client( - self.credentials.project_id, + self.project_id, credentials=self.credentials.to_native_credentials(), location=self.location, ) @@ -240,7 +242,7 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> 
Iterator[DB conn.close() def catalog_name(self, escape: bool = True) -> Optional[str]: - project_id = self.capabilities.casefold_identifier(self.credentials.project_id) + project_id = self.capabilities.casefold_identifier(self.project_id) if escape: project_id = self.capabilities.escape_identifier(project_id) return project_id diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 51d124251a..334e08c4a7 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -112,6 +112,18 @@ VMs available on GCP (cloud functions, Composer runners, Colab notebooks) have a location = "US" ``` +### Using Different `project_id` + +You can set the `project_id` in your configuration to be different from the one in your credentials, provided your account has access to it: +```toml +[destination.bigquery] +project_id = "project_id_destination" + +[destination.bigquery.credentials] +project_id = "project_id_credentials" +``` +In this scenario, `project_id_credentials` will be used for authentication, while `project_id_destination` will be used as the data destination. + ## Write Disposition All write dispositions are supported. diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index 80bd008730..c92f18e159 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -32,6 +32,7 @@ prepare_table, yield_client_with_storage, cm_yield_client_with_storage, + cm_yield_client, ) # mark all tests as essential, do not remove @@ -53,6 +54,18 @@ def auto_delete_storage() -> None: delete_test_storage() +@pytest.fixture +def bigquery_project_id() -> Iterator[str]: + project_id = "different_project_id" + project_id_key = "DESTINATION__BIGQUERY__PROJECT_ID" + saved_project_id = os.environ.get(project_id_key) + os.environ[project_id_key] = project_id + yield project_id + del os.environ[project_id_key] + if saved_project_id: + os.environ[project_id_key] = saved_project_id + + def test_service_credentials_with_default(environment: Any) -> None: gcpc = GcpServiceAccountCredentials() # resolve will miss values and try to find default credentials on the machine @@ -247,6 +260,21 @@ def test_bigquery_configuration() -> None: ) +def test_bigquery_different_project_id(bigquery_project_id) -> None: + """Test scenario when bigquery project_id different from gcp credentials project_id.""" + config = resolve_configuration( + BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "bigquery"), + ) + assert config.project_id == bigquery_project_id + with cm_yield_client( + "bigquery", + dataset_name="dataset", + default_config_values={"project_id": bigquery_project_id}, + ) as client: + assert bigquery_project_id in client.sql_client.catalog_name() + + def test_bigquery_autodetect_configuration(client: BigQueryClient) -> None: # no schema autodetect assert client._should_autodetect_schema("event_slot") is False From 982b448d533303cef803b681a2150d3f3a531f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Thu, 15 Aug 2024 13:59:12 +0200 Subject: [PATCH 11/34] improves formatting in error message Co-authored-by: Anton Burnashev --- dlt/sources/helpers/rest_client/paginators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py 
b/dlt/sources/helpers/rest_client/paginators.py index 993cbf7f26..f87eaea873 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -123,7 +123,7 @@ def __init__( super().__init__() if total_path is None and maximum_value is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_value` or stop_after_empty_page must be provided." + "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be provided." ) self.param_name = param_name self.current_value = initial_value From a4dbd5d479659820c41a42f4bdf255836e96f7af Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 15 Aug 2024 21:29:11 +0400 Subject: [PATCH 12/34] fix delta table dangling parquet file bug (#1695) --- .../impl/filesystem/filesystem.py | 3 +- .../load/pipeline/test_filesystem_pipeline.py | 45 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 7009ad95ac..9683617db8 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -36,6 +36,7 @@ from dlt.destinations.job_impl import ( ReferenceFollowupJob, FinalizedLoadJob, + FinalizedLoadJobWithFollowupJobs, ) from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations import path_utils @@ -366,7 +367,7 @@ def create_load_job( if ReferenceFollowupJob.is_reference_job(file_path): return DeltaLoadFilesystemJob(file_path) # otherwise just continue - return FilesystemLoadJobWithFollowup(file_path) + return FinalizedLoadJobWithFollowupJobs(file_path) cls = FilesystemLoadJobWithFollowup if self.config.as_staging else FilesystemLoadJob return cls(file_path) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 71620e889d..f9196cc909 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -314,6 +314,51 @@ def data_types(): assert len(rows) == 10 +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +def test_delta_table_does_not_contain_job_files( + destination_config: DestinationTestConfiguration, +) -> None: + """Asserts Parquet job files do not end up in Delta table.""" + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + @dlt.resource(table_format="delta") + def delta_table(): + yield [{"foo": 1}] + + # create Delta table + info = pipeline.run(delta_table()) + assert_load_info(info) + + # get Parquet jobs + completed_jobs = info.load_packages[0].jobs["completed_jobs"] + parquet_jobs = [ + job + for job in completed_jobs + if job.job_file_info.table_name == "delta_table" and job.file_path.endswith(".parquet") + ] + assert len(parquet_jobs) == 1 + + # get Parquet files in Delta table folder + with pipeline.destination_client() as client: + assert isinstance(client, FilesystemClient) + table_dir = client.get_table_dir("delta_table") + parquet_files = [f for f in client.fs_client.ls(table_dir) if f.endswith(".parquet")] + assert len(parquet_files) == 1 + + # Parquet file should not be the job file + file_id = parquet_jobs[0].job_file_info.file_id + assert file_id not in parquet_files[0] + 
+ @pytest.mark.parametrize( "destination_config", destinations_configs( From 01423f7892a1dc8d50447f5f27c9e8573e5e254a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 15 Aug 2024 22:46:27 +0400 Subject: [PATCH 13/34] Add `delta` table partitioning support (#1696) * add delta table partitioning support * document delta table partitioning support * Update docs/website/docs/dlt-ecosystem/destinations/filesystem.md --------- Co-authored-by: Anton Burnashev --- dlt/common/libs/deltalake.py | 9 +- .../impl/filesystem/filesystem.py | 5 ++ .../dlt-ecosystem/destinations/filesystem.md | 17 ++++ .../load/pipeline/test_filesystem_pipeline.py | 85 +++++++++++++++++++ 4 files changed, 114 insertions(+), 2 deletions(-) diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index e6cd91bd0a..d98795d07c 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Union +from typing import Optional, Dict, Union, List from pathlib import Path from dlt import version, Pipeline @@ -71,9 +71,13 @@ def write_delta_table( table_or_uri: Union[str, Path, DeltaTable], data: Union[pa.Table, pa.RecordBatchReader], write_disposition: TWriteDisposition, + partition_by: Optional[Union[List[str], str]] = None, storage_options: Optional[Dict[str, str]] = None, ) -> None: - """Writes in-memory Arrow table to on-disk Delta table.""" + """Writes in-memory Arrow table to on-disk Delta table. + + Thin wrapper around `deltalake.write_deltalake`. + """ # throws warning for `s3` protocol: https://github.com/delta-io/delta-rs/issues/2460 # TODO: upgrade `deltalake` lib after https://github.com/delta-io/delta-rs/pull/2500 @@ -81,6 +85,7 @@ def write_delta_table( write_deltalake( # type: ignore[call-overload] table_or_uri=table_or_uri, data=ensure_delta_compatible_arrow_data(data), + partition_by=partition_by, mode=get_delta_write_mode(write_disposition), schema_mode="merge", # enable schema evolution (adding new columns) storage_options=storage_options, diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 9683617db8..f2466f25a2 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -115,6 +115,9 @@ def run(self) -> None: storage_options = _deltalake_storage_options(self._job_client.config) dt = try_get_deltatable(dt_path, storage_options=storage_options) + # get partition columns + part_cols = get_columns_names_with_prop(self._load_table, "partition") + # explicitly check if there is data # (https://github.com/delta-io/delta-rs/issues/2686) if arrow_ds.head(1).num_rows == 0: @@ -124,6 +127,7 @@ def run(self) -> None: table_uri=dt_path, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", + partition_by=part_cols, storage_options=storage_options, ) return @@ -159,6 +163,7 @@ def run(self) -> None: table_or_uri=dt_path if dt is None else dt, data=arrow_rbr, write_disposition=self._load_table["write_disposition"], + partition_by=part_cols, storage_options=storage_options, ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index bba0ff3df3..018b838363 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -530,6 +530,23 @@ def my_delta_resource(): > `dlt` always uses `parquet` 
as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. +#### Delta table partitioning +A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column: + +```py +@dlt.resource( + table_format="delta", + columns={"foo": {"partition": True}} +) +def my_delta_resource(): + ... +``` + +:::caution +It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. +::: + + #### Storage options You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index f9196cc909..759f443546 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -482,6 +482,91 @@ def complex_table(): assert len(rows_dict["complex_table__child__grandchild"]) == 5 +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +def test_delta_table_partitioning( + destination_config: DestinationTestConfiguration, +) -> None: + """Tests partitioning for `delta` table format.""" + + from dlt.common.libs.deltalake import get_delta_tables + from tests.pipeline.utils import users_materialize_table_schema + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + # zero partition columns + @dlt.resource(table_format="delta") + def zero_part(): + yield {"foo": 1, "bar": 1} + + info = pipeline.run(zero_part()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "zero_part")["zero_part"] + assert dt.metadata().partition_columns == [] + assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 + + # one partition column + @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + def one_part(): + yield [ + {"c1": "foo", "c2": 1}, + {"c1": "foo", "c2": 2}, + {"c1": "bar", "c2": 3}, + {"c1": "baz", "c2": 4}, + ] + + info = pipeline.run(one_part()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "one_part")["one_part"] + assert dt.metadata().partition_columns == ["c1"] + assert load_table_counts(pipeline, "one_part")["one_part"] == 4 + + # two partition columns + @dlt.resource( + table_format="delta", columns={"c1": {"partition": True}, "c2": {"partition": True}} + ) + def two_part(): + yield [ + {"c1": "foo", "c2": 1, "c3": True}, + {"c1": "foo", "c2": 2, "c3": True}, + {"c1": "bar", "c2": 1, "c3": True}, + {"c1": "baz", "c2": 1, "c3": True}, + ] + + info = pipeline.run(two_part()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "two_part")["two_part"] + assert dt.metadata().partition_columns == ["c1", "c2"] + assert load_table_counts(pipeline, "two_part")["two_part"] == 4 + + # test partitioning with empty source + users_materialize_table_schema.apply_hints( + table_format="delta", + columns={"id": {"partition": True}}, + ) + info = pipeline.run(users_materialize_table_schema()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "users")["users"] + assert dt.metadata().partition_columns == ["id"] + assert load_table_counts(pipeline, "users")["users"] == 0 + + # changing partitioning after 
initial table creation is not supported + zero_part.apply_hints(columns={"foo": {"partition": True}}) + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + dt = get_delta_tables(pipeline, "zero_part")["zero_part"] + assert dt.metadata().partition_columns == [] + + @pytest.mark.parametrize( "destination_config", destinations_configs( From 49b45fb4592e53e2d0d7eaf09c1c4279927b7853 Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 16 Aug 2024 17:12:21 +0530 Subject: [PATCH 14/34] sets default argument to None --- dlt/sources/helpers/rest_client/paginators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index f87eaea873..91b364c395 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -164,7 +164,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> ): self._has_next_page = False - def _stop_after_this_page(self, data: Optional[List[Any]]) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: From 1f26fe74587fb13046ce0646fe97426150283b65 Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 16 Aug 2024 17:13:39 +0530 Subject: [PATCH 15/34] passes non-empty list to paginator.update_state() and interprets both None and [] as "no data" --- dlt/sources/helpers/rest_client/paginators.py | 2 +- .../helpers/rest_client/test_paginators.py | 34 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 91b364c395..632c93d0c7 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -165,7 +165,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> self._has_next_page = False def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: - return self.stop_after_empty_page and data == [] + return self.stop_after_empty_page and not data def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: raise ValueError( diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 7357169101..7ae6aa10dc 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -242,7 +242,7 @@ class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) response = Mock(Response, json=lambda: {"total": 20}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -253,7 +253,7 @@ def test_update_state(self): def test_update_state_with_string_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "20"}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -261,13 +261,13 @@ def test_update_state_with_invalid_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, 
json=lambda: {"total": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_init_request(self): paginator = OffsetPaginator(offset=123, limit=42) @@ -281,7 +281,7 @@ def test_init_request(self): response = Mock(Response, json=lambda: {"total": 200}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) # Test for the next request next_request = Mock(spec=Request) @@ -295,11 +295,11 @@ def test_init_request(self): def test_maximum_offset(self): paginator = OffsetPaginator(offset=0, limit=50, maximum_offset=100, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response) # Offset 0 to 50 + paginator.update_state(response, data=[{}]) # Offset 0 to 50 assert paginator.current_value == 50 assert paginator.has_next_page is True - paginator.update_state(response) # Offset 50 to 100 + paginator.update_state(response, data=[{}]) # Offset 50 to 100 assert paginator.current_value == 100 assert paginator.has_next_page is False @@ -362,22 +362,22 @@ class TestPageNumberPaginator: def test_update_state(self): paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 3 assert paginator.has_next_page is True # Test for reaching the end - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 2 assert paginator.has_next_page is True @@ -385,34 +385,34 @@ def test_update_state_with_invalid_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_state_without_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_request(self): paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) request.params = {} paginator.update_request(request) assert request.params["page"] == 2 - paginator.update_state(response) + paginator.update_state(response, data=[{}]) paginator.update_request(request) assert request.params["page"] == 3 def test_maximum_page(self): paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - 
paginator.update_state(response) # Page 1 + paginator.update_state(response, data=[{}]) # Page 1 assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response) # Page 2 + paginator.update_state(response, data=[{}]) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False From 5bf78ae3f04dfbfe26f4c6b737756f2e46b970f8 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 16 Aug 2024 16:40:09 +0200 Subject: [PATCH 16/34] fixes load job counter (#1702) * displays log counter when new counter is created * initializes load job counters only when package starts --- dlt/common/runtime/collector.py | 1 + dlt/load/load.py | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dlt/common/runtime/collector.py b/dlt/common/runtime/collector.py index 95117b70cc..be5453cdd3 100644 --- a/dlt/common/runtime/collector.py +++ b/dlt/common/runtime/collector.py @@ -170,6 +170,7 @@ def update( total=total, ) self.messages[counter_key] = None + self.last_log_time = None self.counters[counter_key] += inc if message is not None: diff --git a/dlt/load/load.py b/dlt/load/load.py index 34b7e2b5b7..99a12d69ee 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -471,7 +471,7 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) f"All jobs completed, archiving package {load_id} with aborted set to {aborted}" ) - def update_loadpackage_info(self, load_id: str) -> None: + def update_load_package_info(self, load_id: str) -> None: # update counter we only care about the jobs that are scheduled to be loaded package_jobs = self.load_storage.normalized_packages.get_load_package_jobs(load_id) total_jobs = reduce(lambda p, c: p + len(c), package_jobs.values(), 0) @@ -492,6 +492,8 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: dropped_tables = current_load_package()["state"].get("dropped_tables", []) truncated_tables = current_load_package()["state"].get("truncated_tables", []) + self.update_load_package_info(load_id) + # initialize analytical storage ie. 
create dataset required by passed schema with self.get_destination_client(schema) as job_client: if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: @@ -539,14 +541,10 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: pending_exception: Optional[LoadClientJobException] = None while True: try: - # we continously spool new jobs and complete finished ones + # we continuously spool new jobs and complete finished ones running_jobs, finalized_jobs, new_pending_exception = self.complete_jobs( load_id, running_jobs, schema ) - # update load package info if any jobs where finalized - if finalized_jobs: - self.update_loadpackage_info(load_id) - pending_exception = pending_exception or new_pending_exception # do not spool new jobs if there was a signal or an exception was encountered From 83bab151a81ad3e3beaad8b4486741bd3e28d2fa Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 19 Aug 2024 17:55:44 +0530 Subject: [PATCH 17/34] refactors magic to telling name --- .../helpers/rest_client/test_paginators.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 7ae6aa10dc..5c9f484bbc 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -17,6 +17,8 @@ from .conftest import assert_pagination +NON_EMPTY_PAGE = [{"some": "data"}] + @pytest.mark.usefixtures("mock_api_server") class TestHeaderLinkPaginator: @@ -242,7 +244,7 @@ class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) response = Mock(Response, json=lambda: {"total": 20}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -253,7 +255,7 @@ def test_update_state(self): def test_update_state_with_string_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "20"}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -261,13 +263,13 @@ def test_update_state_with_invalid_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_init_request(self): paginator = OffsetPaginator(offset=123, limit=42) @@ -281,7 +283,7 @@ def test_init_request(self): response = Mock(Response, json=lambda: {"total": 200}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) # Test for the next request next_request = Mock(spec=Request) @@ -295,11 +297,11 @@ def test_init_request(self): def test_maximum_offset(self): paginator = OffsetPaginator(offset=0, limit=50, maximum_offset=100, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response, data=[{}]) # Offset 0 to 50 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Offset 
0 to 50 assert paginator.current_value == 50 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) # Offset 50 to 100 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Offset 50 to 100 assert paginator.current_value == 100 assert paginator.has_next_page is False @@ -362,22 +364,22 @@ class TestPageNumberPaginator: def test_update_state(self): paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 3 assert paginator.has_next_page is True # Test for reaching the end - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 2 assert paginator.has_next_page is True @@ -385,34 +387,34 @@ def test_update_state_with_invalid_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_state_without_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_request(self): paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) request.params = {} paginator.update_request(request) assert request.params["page"] == 2 - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) paginator.update_request(request) assert request.params["page"] == 3 def test_maximum_page(self): paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response, data=[{}]) # Page 1 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Page 1 assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) # Page 2 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False From 843b658fdec9e71fd5129fd3a726878a0b29d83f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:24:32 +0400 Subject: [PATCH 18/34] Enable `scd2` record reinsert (#1707) * make sorting optional * enable scd2 record reinsertion by dropping unique constraint * document scd2 row id uniqueness characteristics * assert unique constraint is not dropped when users bring their own hash --- 
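In practice, the reinsertion scenario enabled here looks as follows. A minimal sketch, assuming a local `duckdb` destination; the resource name, table name and records are illustrative and mirror the test added in this patch:

```py
import dlt


@dlt.resource(
    table_name="dim_test",
    write_disposition={"disposition": "merge", "strategy": "scd2"},
)
def dim_test(data):
    yield data


pipeline = dlt.pipeline(pipeline_name="scd2_reinsert_sketch", destination="duckdb")

r1 = {"nk": 1, "c1": "foo"}
r2 = {"nk": 2, "c1": "bar"}

pipeline.run(dim_test([r1, r2]))  # load 1: both records get an open validity window
pipeline.run(dim_test([r2]))      # load 2: r1 is absent, so its validity window is closed
pipeline.run(dim_test([r1, r2]))  # load 3: r1 reappears and is inserted again
```

After the third load the root table holds two rows with the same row hash in `_dlt_id` (one retired, one active), which is why this change drops the unique constraint on that column for `scd2` root tables.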
dlt/destinations/sql_jobs.py | 8 +- dlt/extract/hints.py | 5 + .../docs/general-usage/incremental-loading.md | 13 +++ tests/load/pipeline/test_scd2.py | 108 +++++++++++++++--- 4 files changed, 115 insertions(+), 19 deletions(-) diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index cddae52bb7..51e5b95a0e 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -749,20 +749,20 @@ def gen_scd2_sql( INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) SELECT {col_str}, {boundary_ts} AS {from_}, {active_record_literal} AS {to} FROM {staging_root_table_name} AS s - WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name}); + WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name} WHERE {is_active_clause}); """) # insert list elements for new active records in child tables child_tables = table_chain[1:] if child_tables: - unique_column = escape_column_id( - cls._get_unique_col(table_chain, sql_client, root_table) - ) # TODO: - based on deterministic child hashes (OK) # - if row hash changes all is right # - if it does not we only capture new records, while we should replace existing with those in stage # - this write disposition is way more similar to regular merge (how root tables are handled is different, other tables handled same) for table in child_tables: + unique_column = escape_column_id( + cls._get_unique_col(table_chain, sql_client, table) + ) table_name, staging_table_name = sql_client.get_qualified_table_names(table["name"]) sql.append(f""" INSERT INTO {table_name} diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index dce375afb0..123a8455e1 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -465,11 +465,16 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: "x-valid-to": True, "x-active-record-timestamp": mddict.get("active_record_timestamp"), } + # unique constraint is dropped for C_DLT_ID when used to store + # SCD2 row hash (only applies to root table) hash_ = mddict.get("row_version_column_name", DataItemNormalizer.C_DLT_ID) dict_["columns"][hash_] = { "name": hash_, "nullable": False, "x-row-version": True, + # duplicate value in row hash column is possible in case + # of insert-delete-reinsert pattern + "unique": False, } @staticmethod diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index b130f7a4f5..8eb1002dcf 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -251,6 +251,19 @@ executed. You can achieve the same in the decorator `@dlt.source(root_key=True)` ### `scd2` strategy `dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. The resource is expected to provide a full extract of the source table each run. A row hash is stored in `_dlt_id` and used as surrogate key to identify source records that have been inserted, updated, or deleted. A `NULL` value is used by default to indicate an active record, but it's possible to use a configurable high timestamp (e.g. 9999-12-31 00:00:00.000000) instead. +:::note +The `unique` hint for `_dlt_id` in the root table is set to `false` when using `scd2`. This differs from [default behavior](./destination-tables.md#child-and-parent-tables). 
The reason is that the surrogate key stored in `_dlt_id` contains duplicates after an _insert-delete-reinsert_ pattern: +1. record with surrogate key X is inserted in a load at `t1` +2. record with surrogate key X is deleted in a later load at `t2` +3. record with surrogate key X is reinserted in an even later load at `t3` + +After this pattern, the `scd2` table in the destination has two records for surrogate key X: one for validity window `[t1, t2]`, and one for `[t3, NULL]`. A duplicate value exists in `_dlt_id` because both records have the same surrogate key. + +Note that: +- the composite key `(_dlt_id, _dlt_valid_from)` is unique +- `_dlt_id` remains unique for child tables—`scd2` does not affect this +::: + #### Example: `scd2` merge strategy ```py @dlt.resource( diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 8b41c354b2..8f2c0c2486 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -46,7 +46,7 @@ def get_load_package_created_at(pipeline: dlt.Pipeline, load_info: LoadInfo) -> def get_table( - pipeline: dlt.Pipeline, table_name: str, sort_column: str, include_root_id: bool = True + pipeline: dlt.Pipeline, table_name: str, sort_column: str = None, include_root_id: bool = True ) -> List[Dict[str, Any]]: """Returns destination table contents as list of dictionaries.""" @@ -54,6 +54,21 @@ def strip_timezone(ts: datetime) -> datetime: """Converts timezone of datetime object to UTC and removes timezone awareness.""" return ensure_pendulum_datetime(ts).astimezone(tz=timezone.utc).replace(tzinfo=None) + table = [ + { + k: strip_timezone(v) if isinstance(v, datetime) else v + for k, v in r.items() + if not k.startswith("_dlt") + or k in DEFAULT_VALIDITY_COLUMN_NAMES + or (k == "_dlt_root_id" if include_root_id else False) + } + for r in load_tables_to_dicts(pipeline, table_name)[table_name] + ] + + if sort_column is None: + return table + return sorted(table, key=lambda d: d[sort_column]) + return sorted( [ { @@ -139,8 +154,8 @@ def r(data): assert table["columns"][from_]["x-valid-from"] # type: ignore[typeddict-item] assert table["columns"][to]["x-valid-to"] # type: ignore[typeddict-item] assert table["columns"]["_dlt_id"]["x-row-version"] # type: ignore[typeddict-item] - # _dlt_id is still unique - assert table["columns"]["_dlt_id"]["unique"] + # root table _dlt_id is not unique with `scd2` merge strategy + assert not table["columns"]["_dlt_id"]["unique"] # assert load results ts_1 = get_load_package_created_at(p, info) @@ -288,7 +303,7 @@ def r(data): {from_: ts_2, to: None, "nk": 1, "c1": "foo_updated"}, # new ] assert_records_as_set( - get_table(p, "dim_test__c2", cname), + get_table(p, "dim_test__c2"), [ {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, # new @@ -310,7 +325,7 @@ def r(data): ts_3 = get_load_package_created_at(p, info) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test", "c1"), + get_table(p, "dim_test"), [ {from_: ts_1, to: None, "nk": 2, "c1": "bar"}, {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, @@ -326,7 +341,7 @@ def r(data): {"_dlt_root_id": get_row_hash(l3_1), cname: 2}, # new {"_dlt_root_id": get_row_hash(l1_2), cname: 3}, ] - assert_records_as_set(get_table(p, "dim_test__c2", cname), exp_3) + assert_records_as_set(get_table(p, "dim_test__c2"), exp_3) # load 4 — delete a record dim_snap = [ @@ -336,7 +351,7 @@ def r(data): ts_4 = get_load_package_created_at(p, info) assert_load_info(info) assert_records_as_set( - 
get_table(p, "dim_test", "c1"), + get_table(p, "dim_test"), [ {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, # updated {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, @@ -345,7 +360,7 @@ def r(data): ], ) assert_records_as_set( - get_table(p, "dim_test__c2", cname), exp_3 + get_table(p, "dim_test__c2"), exp_3 ) # deletes should not alter child tables # load 5 — insert a record @@ -357,7 +372,7 @@ def r(data): ts_5 = get_load_package_created_at(p, info) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test", "c1"), + get_table(p, "dim_test"), [ {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, {from_: ts_5, to: None, "nk": 3, "c1": "baz"}, # new @@ -367,7 +382,7 @@ def r(data): ], ) assert_records_as_set( - get_table(p, "dim_test__c2", cname), + get_table(p, "dim_test__c2"), [ {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, @@ -403,7 +418,7 @@ def r(data): info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test__c2__cc1", "value"), + get_table(p, "dim_test__c2__cc1"), [ {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, @@ -419,7 +434,7 @@ def r(data): info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) assert_records_as_set( - (get_table(p, "dim_test__c2__cc1", "value")), + (get_table(p, "dim_test__c2__cc1")), [ {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, @@ -443,7 +458,7 @@ def r(data): {"_dlt_root_id": get_row_hash(l1_2), "value": 2}, {"_dlt_root_id": get_row_hash(l3_1), "value": 2}, # new ] - assert_records_as_set(get_table(p, "dim_test__c2__cc1", "value"), exp_3) + assert_records_as_set(get_table(p, "dim_test__c2__cc1"), exp_3) # load 4 — delete a record dim_snap = [ @@ -451,7 +466,7 @@ def r(data): ] info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) - assert_records_as_set(get_table(p, "dim_test__c2__cc1", "value"), exp_3) + assert_records_as_set(get_table(p, "dim_test__c2__cc1"), exp_3) # load 5 — insert a record dim_snap = [ @@ -461,7 +476,7 @@ def r(data): info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test__c2__cc1", "value"), + get_table(p, "dim_test__c2__cc1"), [ {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, @@ -474,6 +489,67 @@ def r(data): ) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +def test_record_reinsert(destination_config: DestinationTestConfiguration) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + + @dlt.resource( + table_name="dim_test", write_disposition={"disposition": "merge", "strategy": "scd2"} + ) + def r(data): + yield data + + # load 1 — initial load + dim_snap = [ + r1 := {"nk": 1, "c1": "foo", "c2": "foo", "child": [1]}, + r2 := {"nk": 2, "c1": "bar", "c2": "bar", "child": [2, 3]}, + ] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + assert load_table_counts(p, "dim_test__child")["dim_test__child"] == 3 + ts_1 = get_load_package_created_at(p, info) + + # load 2 — delete natural key 1 + dim_snap = [r2] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert 
load_table_counts(p, "dim_test")["dim_test"] == 2 + assert load_table_counts(p, "dim_test__child")["dim_test__child"] == 3 + ts_2 = get_load_package_created_at(p, info) + + # load 3 — reinsert natural key 1 + dim_snap = [r1, r2] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 3 + assert load_table_counts(p, "dim_test__child")["dim_test__child"] == 3 # no new record + ts_3 = get_load_package_created_at(p, info) + + # assert parent records + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + r1_no_child = {k: v for k, v in r1.items() if k != "child"} + r2_no_child = {k: v for k, v in r2.items() if k != "child"} + expected = [ + {**{from_: ts_1, to: ts_2}, **r1_no_child}, + {**{from_: ts_3, to: None}, **r1_no_child}, + {**{from_: ts_1, to: None}, **r2_no_child}, + ] + assert_records_as_set(get_table(p, "dim_test"), expected) + + # assert child records + expected = [ + {"_dlt_root_id": get_row_hash(r1), "value": 1}, # links to two records in parent + {"_dlt_root_id": get_row_hash(r2), "value": 2}, + {"_dlt_root_id": get_row_hash(r2), "value": 3}, + ] + assert_records_as_set(get_table(p, "dim_test__child"), expected) + + @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), @@ -633,6 +709,8 @@ def r(data): table = p.default_schema.get_table("dim_test") assert table["columns"]["row_hash"]["x-row-version"] # type: ignore[typeddict-item] assert "x-row-version" not in table["columns"]["_dlt_id"] + # _dlt_id unique constraint should not be dropped when users bring their own hash + assert table["columns"]["_dlt_id"]["unique"] # load 2 — update and delete a record dim_snap = [ From 6f778ebf047967c358e5bb0d20e99efb12063261 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 22 Aug 2024 19:04:44 +0400 Subject: [PATCH 19/34] `scd2` custom "valid from" / "valid to" value feature (#1709) * black format * remove code remnant * add scd2 custom boundary timestamp feature * add invalid scd2 active record timestamp test * document scd2 boundary timestamp argument --- dlt/common/schema/typing.py | 1 + dlt/destinations/sql_jobs.py | 18 ++- dlt/extract/hints.py | 14 ++ dlt/sources/helpers/rest_client/paginators.py | 8 +- .../docs/general-usage/incremental-loading.md | 18 ++- tests/load/pipeline/test_scd2.py | 143 +++++++++++++++--- .../helpers/rest_client/test_paginators.py | 8 +- 7 files changed, 174 insertions(+), 36 deletions(-) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 9a4dd51d4b..284c55caac 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -187,6 +187,7 @@ class TMergeDispositionDict(TWriteDispositionDict, total=False): strategy: Optional[TLoaderMergeStrategy] validity_column_names: Optional[List[str]] active_record_timestamp: Optional[TAnyDateTime] + boundary_timestamp: Optional[TAnyDateTime] row_version_column_name: Optional[str] diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 51e5b95a0e..a1e38a2c20 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Sequence, Tuple, cast, TypedDict, Optional, Callable, Union import yaml -from dlt.common.logger import pretty_format_exception +from dlt.common.time import ensure_pendulum_datetime from dlt.common.schema.typing import ( TTableSchema, @@ -721,10 +721,18 @@ def gen_scd2_sql( format_datetime_literal = ( 
DestinationCapabilitiesContext.generic_capabilities().format_datetime_literal ) - boundary_ts = format_datetime_literal( - current_load_package()["state"]["created_at"], + + boundary_ts = ensure_pendulum_datetime( + root_table.get( # type: ignore[arg-type] + "x-boundary-timestamp", + current_load_package()["state"]["created_at"], + ) + ) + boundary_literal = format_datetime_literal( + boundary_ts, caps.timestamp_precision, ) + active_record_timestamp = get_active_record_timestamp(root_table) if active_record_timestamp is None: active_record_literal = "NULL" @@ -737,7 +745,7 @@ def gen_scd2_sql( # retire updated and deleted records sql.append(f""" - {cls.gen_update_table_prefix(root_table_name)} {to} = {boundary_ts} + {cls.gen_update_table_prefix(root_table_name)} {to} = {boundary_literal} WHERE {is_active_clause} AND {hash_} NOT IN (SELECT {hash_} FROM {staging_root_table_name}); """) @@ -747,7 +755,7 @@ def gen_scd2_sql( col_str = ", ".join([c for c in columns if c not in (from_, to)]) sql.append(f""" INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) - SELECT {col_str}, {boundary_ts} AS {from_}, {active_record_literal} AS {to} + SELECT {col_str}, {boundary_literal} AS {from_}, {active_record_literal} AS {to} FROM {staging_root_table_name} AS s WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name} WHERE {is_active_clause}); """) diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 123a8455e1..67a6b3e83a 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -26,6 +26,7 @@ new_table, ) from dlt.common.typing import TDataItem +from dlt.common.time import ensure_pendulum_datetime from dlt.common.utils import clone_dict_nested from dlt.common.normalizers.json.relational import DataItemNormalizer from dlt.common.validation import validate_dict_ignoring_xkeys @@ -444,6 +445,8 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: mddict: TMergeDispositionDict = deepcopy(dict_["write_disposition"]) if mddict is not None: dict_["x-merge-strategy"] = mddict.get("strategy", DEFAULT_MERGE_STRATEGY) + if "boundary_timestamp" in mddict: + dict_["x-boundary-timestamp"] = mddict["boundary_timestamp"] # add columns for `scd2` merge strategy if dict_.get("x-merge-strategy") == "scd2": if mddict.get("validity_column_names") is None: @@ -512,3 +515,14 @@ def validate_write_disposition_hint(wd: TTableHintTemplate[TWriteDispositionConf f'`{wd["strategy"]}` is not a valid merge strategy. ' f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""" ) + + for ts in ("active_record_timestamp", "boundary_timestamp"): + if ts == "active_record_timestamp" and wd.get("active_record_timestamp") is None: + continue # None is allowed for active_record_timestamp + if ts in wd: + try: + ensure_pendulum_datetime(wd[ts]) # type: ignore[literal-required] + except Exception: + raise ValueError( + f'could not parse `{ts}` value "{wd[ts]}"' # type: ignore[literal-required] + ) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 632c93d0c7..872d4f34e8 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -123,7 +123,8 @@ def __init__( super().__init__() if total_path is None and maximum_value is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be provided." + "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be" + " provided." 
) self.param_name = param_name self.current_value = initial_value @@ -164,7 +165,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> ): self._has_next_page = False - def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]] = None) -> bool: return self.stop_after_empty_page and not data def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: @@ -371,7 +372,8 @@ def __init__( """ if total_path is None and maximum_offset is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be provided." + "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be" + " provided." ) super().__init__( param_name=offset_param, diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 8eb1002dcf..68fc46e6dc 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -348,7 +348,23 @@ You can configure the literal used to indicate an active record with `active_rec write_disposition={ "disposition": "merge", "strategy": "scd2", - "active_record_timestamp": "9999-12-31", # e.g. datetime.datetime(9999, 12, 31) is also accepted + # accepts various types of date/datetime objects + "active_record_timestamp": "9999-12-31", + } +) +def dim_customer(): + ... +``` + +#### Example: configure boundary timestamp +You can configure the "boundary timestamp" used for record validity windows with `boundary_timestamp`. The provided date(time) value is used as "valid from" for new records and as "valid to" for retired records. The timestamp at which a load package is created is used if `boundary_timestamp` is omitted. 
+```py +@dlt.resource( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + # accepts various types of date/datetime objects + "boundary_timestamp": "2024-08-21T12:15:00+00:00", } ) def dim_customer(): diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 8f2c0c2486..065da5ce94 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -3,6 +3,7 @@ import pytest from typing import List, Dict, Any, Optional from datetime import date, datetime, timezone # noqa: I251 +from contextlib import nullcontext as does_not_raise import dlt from dlt.common.typing import TAnyDateTime @@ -45,15 +46,16 @@ def get_load_package_created_at(pipeline: dlt.Pipeline, load_info: LoadInfo) -> return reduce_pendulum_datetime_precision(created_at, caps.timestamp_precision) +def strip_timezone(ts: TAnyDateTime) -> pendulum.DateTime: + """Converts timezone of datetime object to UTC and removes timezone awareness.""" + return ensure_pendulum_datetime(ts).astimezone(tz=timezone.utc).replace(tzinfo=None) + + def get_table( pipeline: dlt.Pipeline, table_name: str, sort_column: str = None, include_root_id: bool = True ) -> List[Dict[str, Any]]: """Returns destination table contents as list of dictionaries.""" - def strip_timezone(ts: datetime) -> datetime: - """Converts timezone of datetime object to UTC and removes timezone awareness.""" - return ensure_pendulum_datetime(ts).astimezone(tz=timezone.utc).replace(tzinfo=None) - table = [ { k: strip_timezone(v) if isinstance(v, datetime) else v @@ -69,20 +71,6 @@ def strip_timezone(ts: datetime) -> datetime: return table return sorted(table, key=lambda d: d[sort_column]) - return sorted( - [ - { - k: strip_timezone(v) if isinstance(v, datetime) else v - for k, v in r.items() - if not k.startswith("_dlt") - or k in DEFAULT_VALIDITY_COLUMN_NAMES - or (k == "_dlt_root_id" if include_root_id else False) - } - for r in load_tables_to_dicts(pipeline, table_name)[table_name] - ], - key=lambda d: d[sort_column], - ) - @pytest.mark.essential @pytest.mark.parametrize( @@ -596,6 +584,7 @@ def r(data): "9999-12-31T00:00:00", "9999-12-31T00:00:00+00:00", "9999-12-31T00:00:00+01:00", + "i_am_not_a_timestamp", ], ) def test_active_record_timestamp( @@ -604,22 +593,126 @@ def test_active_record_timestamp( ) -> None: p = destination_config.setup_pipeline("abstract", dev_mode=True) + context = does_not_raise() + if active_record_timestamp == "i_am_not_a_timestamp": + context = pytest.raises(ValueError) # type: ignore[assignment] + + with context: + + @dlt.resource( + table_name="dim_test", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "active_record_timestamp": active_record_timestamp, + }, + ) + def r(): + yield {"foo": "bar"} + + p.run(r()) + actual_active_record_timestamp = ensure_pendulum_datetime( + load_tables_to_dicts(p, "dim_test")["dim_test"][0]["_dlt_valid_to"] + ) + assert actual_active_record_timestamp == ensure_pendulum_datetime(active_record_timestamp) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +def test_boundary_timestamp( + destination_config: DestinationTestConfiguration, +) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + + ts1 = "2024-08-21T12:15:00+00:00" + ts2 = "2024-08-22" + ts3 = date(2024, 8, 20) # earlier than ts1 and ts2 + ts4 = "i_am_not_a_timestamp" + @dlt.resource( table_name="dim_test", write_disposition={ "disposition": 
"merge", "strategy": "scd2", - "active_record_timestamp": active_record_timestamp, + "boundary_timestamp": ts1, }, ) - def r(): - yield {"foo": "bar"} + def r(data): + yield data - p.run(r()) - actual_active_record_timestamp = ensure_pendulum_datetime( - load_tables_to_dicts(p, "dim_test")["dim_test"][0]["_dlt_valid_to"] + # load 1 — initial load + dim_snap = [ + l1_1 := {"nk": 1, "foo": "foo"}, + l1_2 := {"nk": 2, "foo": "foo"}, + ] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + expected = [ + {**{from_: strip_timezone(ts1), to: None}, **l1_1}, + {**{from_: strip_timezone(ts1), to: None}, **l1_2}, + ] + assert get_table(p, "dim_test", "nk") == expected + + # load 2 — different source records, different boundary timestamp + r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "boundary_timestamp": ts2, + } ) - assert actual_active_record_timestamp == ensure_pendulum_datetime(active_record_timestamp) + dim_snap = [ + l2_1 := {"nk": 1, "foo": "bar"}, # natural key 1 updated + # l1_2, # natural key 2 no longer present + l2_3 := {"nk": 3, "foo": "foo"}, # new natural key + ] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 4 + expected = [ + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_1}, # retired + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_2}, # retired + {**{from_: strip_timezone(ts2), to: None}, **l2_1}, # new + {**{from_: strip_timezone(ts2), to: None}, **l2_3}, # new + ] + assert_records_as_set(get_table(p, "dim_test"), expected) + + # load 3 — earlier boundary timestamp + # we naively apply any valid timestamp + # may lead to "valid from" > "valid to", as in this test case + r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "boundary_timestamp": ts3, + } + ) + dim_snap = [l2_1] # natural key 3 no longer present + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 4 + expected = [ + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_1}, # unchanged + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_2}, # unchanged + {**{from_: strip_timezone(ts2), to: None}, **l2_1}, # unchanged + {**{from_: strip_timezone(ts2), to: strip_timezone(ts3)}, **l2_3}, # retired + ] + assert_records_as_set(get_table(p, "dim_test"), expected) + + # invalid boundary timestamp should raise error + with pytest.raises(ValueError): + r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "boundary_timestamp": ts4, + } + ) @pytest.mark.parametrize( diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 5c9f484bbc..39e3d767a0 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -347,7 +347,9 @@ def test_guarantee_termination(self): total_path=None, stop_after_empty_page=False, ) - assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + assert e.match( + "`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided" + ) with pytest.raises(ValueError) as e: OffsetPaginator( @@ -356,7 +358,9 @@ def test_guarantee_termination(self): stop_after_empty_page=False, maximum_offset=None, ) - assert e.match("`total_path` or 
`maximum_offset` or `stop_after_empty_page` must be provided") + assert e.match( + "`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided" + ) @pytest.mark.usefixtures("mock_api_server") From 49dabb87b92ad1ba916348b38031f7b4fd6d7b7c Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 22 Aug 2024 19:47:34 +0400 Subject: [PATCH 20/34] Make `make lint` fail on `black` format diff (#1716) * make lint fail on black format diff and add diff coloring * format with black --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 15fb895a9f..f47047a3fe 100644 --- a/Makefile +++ b/Makefile @@ -52,7 +52,7 @@ lint: poetry run mypy --config-file mypy.ini dlt tests poetry run flake8 --max-line-length=200 dlt poetry run flake8 --max-line-length=200 tests --exclude tests/reflection/module_cases - poetry run black dlt docs tests --diff --extend-exclude=".*syntax_error.py" + poetry run black dlt docs tests --check --diff --color --extend-exclude=".*syntax_error.py" # poetry run isort ./ --diff # $(MAKE) lint-security From c51445c007bf7167d2e52facf2df0b2be00cb08a Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 23 Aug 2024 19:11:56 +0530 Subject: [PATCH 21/34] Docs/issue 1661 add tip to source docs and update weaviate docs (#1662) --- .../dlt-ecosystem/destinations/lancedb.md | 17 +++++++++++++++- .../docs/dlt-ecosystem/destinations/qdrant.md | 19 ++++++++++++++++-- .../dlt-ecosystem/destinations/weaviate.md | 16 +++++++++++++++ docs/website/docs/general-usage/source.md | 20 +++++++++++++++++++ 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md index dbf90da4b9..8b7f3854ee 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md @@ -144,7 +144,22 @@ lancedb_adapter( ) ``` -Bear in mind that you can't use an adapter on a [dlt source](../../general-usage/source.md), only a [dlt resource](../../general-usage/resource.md). +When using the `lancedb_adapter`, it's important to apply it directly to resources, not to the whole source. Here's an example: + +```py +products_tables = sql_database().with_resources("products", "customers") + +pipeline = dlt.pipeline( + pipeline_name="postgres_to_lancedb_pipeline", + destination="lancedb", + ) + +# apply adapter to the needed resources +lancedb_adapter(products_tables.products, embed="description") +lancedb_adapter(products_tables.customers, embed="bio") + +info = pipeline.run(products_tables) +``` ## Write disposition diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 9f19007227..5fc8097440 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -106,10 +106,25 @@ qdrant_adapter( ) ``` -:::tip +When using the `qdrant_adapter`, it's important to apply it directly to resources, not to the whole source. Here's an example: -A more comprehensive pipeline would load data from some API or use one of dlt's [verified sources](../verified-sources/). 
+```py +products_tables = sql_database().with_resources("products", "customers") + +pipeline = dlt.pipeline( + pipeline_name="postgres_to_qdrant_pipeline", + destination="qdrant", + ) +# apply adapter to the needed resources +qdrant_adapter(products_tables.products, embed="description") +qdrant_adapter(products_tables.customers, embed="bio") + +info = pipeline.run(products_tables) +``` + +:::tip +A more comprehensive pipeline would load data from some API or use one of dlt's [verified sources](../verified-sources/). ::: ## Write disposition diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index c6597fadce..43bd85ce41 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -116,6 +116,22 @@ weaviate_adapter( tokenization={"title": "word", "description": "whitespace"}, ) ``` +When using the `weaviate_adapter`, it's important to apply it directly to resources, not to the whole source. Here's an example: + +```py +products_tables = sql_database().with_resources("products", "customers") + +pipeline = dlt.pipeline( + pipeline_name="postgres_to_weaviate_pipeline", + destination="weaviate", + ) + +# apply adapter to the needed resources +weaviate_adapter(products_tables.products, vectorize="description") +weaviate_adapter(products_tables.customers, vectorize="bio") + +info = pipeline.run(products_tables) +``` :::tip diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index 936a3160f0..98c7a13b81 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -187,6 +187,26 @@ Several data sources are prone to contain semi-structured documents with very de MongoDB databases. Our practical experience is that setting the `max_nesting_level` to 2 or 3 produces the clearest and human-readable schemas. +:::tip +The `max_table_nesting` parameter at the source level doesn't automatically apply to individual +resources when accessed directly (e.g., using `source.resources["resource_1"])`. To make sure it +works, either use `source.with_resources("resource_1")` or set the parameter directly on the resource. +::: + + +You can directly configure the `max_table_nesting` parameter on the resource level as: + +```py +@dlt.resource(max_table_nesting=0) +def my_resource(): + ... +``` +or +```py +my_source = source() +my_source.my_resource.max_table_nesting = 0 +``` + ### Modify schema The schema is available via `schema` property of the source. 
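Taken together, the tips added in this patch come down to one rule: adapters and table hints bind to individual resources, not to the source object. A minimal sketch combining both, assuming the `sql_database` verified source used in the snippets above and a configured Qdrant destination; resource and column names are illustrative:

```py
import dlt
from dlt.destinations.adapters import qdrant_adapter

# assumes the `sql_database` verified source scaffolded into the project with `dlt init`
from sql_database import sql_database

# select the resources first so that hints set afterwards apply as expected
products_tables = sql_database().with_resources("products", "customers")

# per-resource hint: keep nested product attributes as JSON instead of child tables
products_tables.products.max_table_nesting = 0

# per-resource adapter: only the listed columns are embedded
qdrant_adapter(products_tables.products, embed="description")
qdrant_adapter(products_tables.customers, embed="bio")

pipeline = dlt.pipeline(
    pipeline_name="postgres_to_qdrant_pipeline",
    destination="qdrant",
)
info = pipeline.run(products_tables)
print(info)
```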
From 6f7591e2d79c544e82accb205780171d3962863f Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 23 Aug 2024 20:03:48 +0530 Subject: [PATCH 22/34] Add custom parent-child relationships example (#1678) --- .../parent_child_relationship/__init__.py | 0 .../parent_child_relationship.py | 69 ++++++++++++++++ .../test_parent_child_relationship.py | 78 +++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 docs/examples/parent_child_relationship/__init__.py create mode 100644 docs/examples/parent_child_relationship/parent_child_relationship.py create mode 100644 docs/examples/parent_child_relationship/test_parent_child_relationship.py diff --git a/docs/examples/parent_child_relationship/__init__.py b/docs/examples/parent_child_relationship/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py new file mode 100644 index 0000000000..39c9f577cc --- /dev/null +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -0,0 +1,69 @@ +""" +--- +title: Load parent table records into child table +description: Learn how to integrate custom parent keys into child records +keywords: [parent child relationship, parent key] +--- + +This example demonstrates handling data with parent-child relationships using the `dlt` library. +You learn how to integrate specific fields (e.g., primary, foreign keys) from a parent record into each child record. + +In this example, we'll explore how to: + +- Add `parent_id` into each child record using `add_parent_id` function +- Use the [`add_map` function](https://dlthub.com/docs/api_reference/extract/resource#add_map) to apply this +custom logic to every record in the dataset + +:::note important +Please note that dlt metadata, including `_dlt_id` and `_dlt_load_id`, will still be loaded into the tables. 
+::: +""" + +from typing import List, Dict, Any, Generator +import dlt + +# Define a dlt resource with write disposition to 'merge' +@dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) +def data_source() -> Generator[List[Dict[str, Any]], None, None]: + # Example data + data = [ + { + "parent_id": 1, + "parent_name": "Alice", + "children": [ + {"child_id": 1, "child_name": "Child 1"}, + {"child_id": 2, "child_name": "Child 2"}, + ], + }, + { + "parent_id": 2, + "parent_name": "Bob", + "children": [{"child_id": 3, "child_name": "Child 3"}], + }, + ] + + yield data + +# Function to add parent_id to each child record within a parent record +def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: + parent_id_key = "parent_id" + for child in record["children"]: + child[parent_id_key] = record[parent_id_key] + return record + +if __name__ == "__main__": + # Create and configure the dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="generic_pipeline", + destination="duckdb", + dataset_name="dataset", + ) + + # Run the pipeline + load_info = pipeline.run( + data_source() + .add_map(add_parent_id), + primary_key="parent_id" + ) + # Output the load information after pipeline execution + print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py new file mode 100644 index 0000000000..f671040823 --- /dev/null +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -0,0 +1,78 @@ + +import pytest + +from tests.utils import skipifgithubfork + + +""" +--- +title: Load parent table records into child table +description: Learn how to integrate custom parent keys into child records +keywords: [parent child relationship, parent key] +--- + +This example demonstrates handling data with parent-child relationships using +the `dlt` library. You learn how to integrate specific fields (e.g., primary, +foreign keys) from a parent record into each child record. + +In this example, we'll explore how to: + +- Add `parent_id` into each child record using `add_parent_id` function +- Use the [`add_map` function](https://dlthub.com/docs/api_reference/extract/resource#add_map) to apply this +custom logic to every record in the dataset + +:::note important +Please note that dlt metadata, including `_dlt_id` and `_dlt_load_id`, will still be loaded into the tables. 
+::: +""" + +from typing import List, Dict, Any, Generator +import dlt + +# Define a dlt resource with write disposition to 'merge' +@dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) +def data_source() -> Generator[List[Dict[str, Any]], None, None]: + # Example data + data = [ + { + "parent_id": 1, + "parent_name": "Alice", + "children": [ + {"child_id": 1, "child_name": "Child 1"}, + {"child_id": 2, "child_name": "Child 2"}, + ], + }, + { + "parent_id": 2, + "parent_name": "Bob", + "children": [{"child_id": 3, "child_name": "Child 3"}], + }, + ] + + yield data + +# Function to add parent_id to each child record within a parent record +def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: + parent_id_key = "parent_id" + for child in record["children"]: + child[parent_id_key] = record[parent_id_key] + return record + +@skipifgithubfork +@pytest.mark.forked +def test_parent_child_relationship(): + # Create and configure the dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="generic_pipeline", + destination="duckdb", + dataset_name="dataset", + ) + + # Run the pipeline + load_info = pipeline.run( + data_source() + .add_map(add_parent_id), + primary_key="parent_id" + ) + # Output the load information after pipeline execution + print(load_info) From d9a7b93ca74d237ea6d92a774a017eef1013f3f5 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sun, 25 Aug 2024 14:50:49 -0600 Subject: [PATCH 23/34] Correct the library name for mem stats to `psutil` (#1733) --- docs/website/docs/reference/performance.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 075d351553..0ee62acec7 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -62,7 +62,7 @@ Several [text file formats](../dlt-ecosystem/file-formats/) have `gzip` compress Keep in mind load packages are buffered to disk and are left for any troubleshooting, so you can [clear disk space by setting the `delete_completed_jobs` option](../running-in-production/running.md#data-left-behind). ### Observing cpu and memory usage -Please make sure that you have the `psutils` package installed (note that Airflow installs it by default). Then you can dump the stats periodically by setting the [progress](../general-usage/pipeline.md#display-the-loading-progress) to `log` in `config.toml`: +Please make sure that you have the `psutil` package installed (note that Airflow installs it by default). Then you can dump the stats periodically by setting the [progress](../general-usage/pipeline.md#display-the-loading-progress) to `log` in `config.toml`: ```toml progress="log" ``` @@ -258,4 +258,4 @@ DLT_USE_JSON=simplejson ## Using the built in requests wrapper or RESTClient for API calls -Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. \ No newline at end of file +Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. 
From 7d7c14f71d14612f0de873110eaa6d300a4544c2 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 26 Aug 2024 02:23:43 +0530 Subject: [PATCH 24/34] Replaced "full_refresh" with "dev_mode" (#1735) --- docs/technical/general_usage.md | 10 +++++----- .../dlt-ecosystem/verified-sources/sql_database.md | 2 +- .../docs/dlt-ecosystem/verified-sources/stripe.md | 2 +- .../docs/dlt-ecosystem/verified-sources/workable.md | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/technical/general_usage.md b/docs/technical/general_usage.md index 336c892c66..2df903b062 100644 --- a/docs/technical/general_usage.md +++ b/docs/technical/general_usage.md @@ -47,7 +47,7 @@ Pipeline can be explicitly created and configured via `dlt.pipeline()` that retu 4. dataset_name - name of the dataset where the data goes (see later the default names) 5. import_schema_path - default is None 6. export_schema_path - default is None -7. full_refresh - if set to True the pipeline working dir will be erased and the dataset name will get the unique suffix (current timestamp). ie the `my_data` becomes `my_data_20221107164856`. +7. dev_mode - if set to True the pipeline working dir will be erased and the dataset name will get the unique suffix (current timestamp). ie the `my_data` becomes `my_data_20221107164856`. > **Achtung** as per `secrets_and_config.md` the arguments passed to `dlt.pipeline` are configurable and if skipped will be injected by the config providers. **the values provided explicitly in the code have a full precedence over all config providers** @@ -101,7 +101,7 @@ In case **there are more schemas in the pipeline**, the data will be loaded into 1. `spotify` tables and `labels` will load into `spotify_data_1` 2. `mel` resource will load into `spotify_data_1_echonest` -The `full_refresh` option: dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. This allows a non destructive full refresh. Nothing is being deleted/dropped from the destination. +The `dev_mode` option: dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. This allows a non destructive full refresh. Nothing is being deleted/dropped from the destination. ## pipeline working directory and state Another fundamental concept is the pipeline working directory. This directory keeps the following information: @@ -117,7 +117,7 @@ The `restore_from_destination` argument to `dlt.pipeline` let's the user restore The state is being stored in the destination together with other data. So only when all pipeline stages are completed the state is available for restoration. -The pipeline cannot be restored if `full_refresh` flag is set. +The pipeline cannot be restored if `dev_mode` flag is set. The other way to trigger full refresh is to drop destination dataset. `dlt` detects that and resets the pipeline local working folder. @@ -155,8 +155,8 @@ The default json normalizer will convert json documents into tables. All the key ❗ [more here](working_with_schemas.md) -### Full refresh mode -If `full_refresh` flag is passed to `dlt.pipeline` then +### Dev mode mode +If `dev_mode` flag is passed to `dlt.pipeline` then 1. the pipeline working dir is fully wiped out (state, schemas, temp files) 2. dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. 3. 
pipeline will not be restored from the destination diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index eeb717515a..c89a63a524 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -652,6 +652,6 @@ resource. Below we show you an example on how to pseudonymize the data before it print(info) ``` -1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. Altering these names could initiate a "[full_refresh](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). +1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. Altering these names could initiate a "[dev_mode](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 8c39a5090e..fdbefeddf1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -232,6 +232,6 @@ verified source. load_info = pipeline.run(data=[source_single, source_incremental]) print(load_info) ``` - > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“full_refresh”](../../general-usage/pipeline#do-experiments-with-full-refresh), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). + > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 472f48a28f..9229ddca7e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -272,7 +272,7 @@ To create your data pipeline using single loading and destination dataset names. The pipeline name helps retrieve the [state](https://dlthub.com/docs/general-usage/state) of the last run, essential for incremental data loading. 
Changing these names might trigger a - [“full_refresh”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh), + [“dev_mode”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode), disrupting metadata tracking for [incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading). From 011d7ff508f3d5a2da666e418a7137fb79acab49 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sun, 25 Aug 2024 23:07:02 +0200 Subject: [PATCH 25/34] feat/1681 collects load job metrics and adds remote uri (#1708) * collects basic load job metrics in LoadJob * adds remote uri to filesystem copy jobs metrics * adds job id to load package info * adds table name to job metrics * skips run step when serializing trace * adds trace shape test with trace schema * tests job file name too long * docs running pipelines with the same name for different envs * extracts step metrics in common, renames followup jobs * fixes tests * fixes tests * tests delta filesystem for remote_uri * adds exec_info to trace contract test * tests remote_uri for filesystem copy * fixes platform test --- dlt/common/data_writers/__init__.py | 2 - dlt/common/data_writers/buffered.py | 3 +- dlt/common/data_writers/writers.py | 20 +- dlt/common/destination/reference.py | 27 +- dlt/common/metrics.py | 71 ++ dlt/common/pipeline.py | 103 +-- dlt/common/storages/__init__.py | 4 +- dlt/common/storages/data_item_storage.py | 7 +- dlt/common/storages/load_package.py | 51 +- dlt/common/storages/load_storage.py | 8 +- dlt/destinations/impl/athena/athena.py | 10 +- dlt/destinations/impl/bigquery/bigquery.py | 12 +- .../impl/clickhouse/clickhouse.py | 12 +- .../impl/databricks/databricks.py | 12 +- dlt/destinations/impl/dremio/dremio.py | 12 +- dlt/destinations/impl/dummy/configuration.py | 2 +- dlt/destinations/impl/dummy/dummy.py | 33 +- .../impl/filesystem/filesystem.py | 43 +- dlt/destinations/impl/mssql/mssql.py | 8 +- dlt/destinations/impl/postgres/postgres.py | 4 +- dlt/destinations/impl/redshift/redshift.py | 10 +- dlt/destinations/impl/snowflake/snowflake.py | 6 +- dlt/destinations/impl/synapse/synapse.py | 8 +- dlt/destinations/job_client_impl.py | 18 +- dlt/destinations/job_impl.py | 11 +- dlt/destinations/sql_jobs.py | 6 +- dlt/extract/extractors.py | 2 +- dlt/extract/storage.py | 3 +- dlt/load/load.py | 41 +- dlt/load/utils.py | 4 +- dlt/normalize/items_normalizers.py | 2 +- dlt/normalize/normalize.py | 2 +- dlt/normalize/worker.py | 2 +- dlt/pipeline/trace.py | 2 +- docs/website/docs/general-usage/pipeline.md | 13 + .../common/data_writers/test_data_writers.py | 7 +- tests/common/storages/utils.py | 4 +- .../data_writers/test_buffered_writer.py | 2 +- .../data_writers/test_data_item_storage.py | 3 +- .../load/pipeline/test_filesystem_pipeline.py | 58 ++ tests/load/pipeline/test_postgres.py | 15 + tests/load/pipeline/test_stage_loading.py | 16 + tests/load/test_dummy_client.py | 110 ++- tests/load/utils.py | 5 +- .../cases/contracts/trace.schema.yaml | 772 ++++++++++++++++++ tests/pipeline/test_pipeline.py | 53 +- tests/pipeline/test_pipeline_trace.py | 169 +++- tests/pipeline/test_platform_connection.py | 3 +- tests/pipeline/utils.py | 3 + tests/utils.py | 5 +- 50 files changed, 1552 insertions(+), 247 deletions(-) create mode 100644 dlt/common/metrics.py create mode 100644 tests/pipeline/cases/contracts/trace.schema.yaml diff --git a/dlt/common/data_writers/__init__.py b/dlt/common/data_writers/__init__.py index 945e74a37b..9966590c06 100644 --- 
a/dlt/common/data_writers/__init__.py +++ b/dlt/common/data_writers/__init__.py @@ -1,6 +1,5 @@ from dlt.common.data_writers.writers import ( DataWriter, - DataWriterMetrics, TDataItemFormat, FileWriterSpec, create_import_spec, @@ -22,7 +21,6 @@ "resolve_best_writer_spec", "get_best_writer_spec", "is_native_writer", - "DataWriterMetrics", "TDataItemFormat", "BufferedDataWriter", "new_file_id", diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 8077007edb..945fca6580 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -3,6 +3,7 @@ import contextlib from typing import ClassVar, Iterator, List, IO, Any, Optional, Type, Generic +from dlt.common.metrics import DataWriterMetrics from dlt.common.typing import TDataItem, TDataItems from dlt.common.data_writers.exceptions import ( BufferedDataWriterClosed, @@ -10,7 +11,7 @@ FileImportNotFound, InvalidFileNameTemplateException, ) -from dlt.common.data_writers.writers import TWriter, DataWriter, DataWriterMetrics, FileWriterSpec +from dlt.common.data_writers.writers import TWriter, DataWriter, FileWriterSpec from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.configuration import with_config, known_sections, configspec from dlt.common.configuration.specs import BaseConfiguration diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index d324792a83..abd3343ea1 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -34,6 +34,7 @@ TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS, ) +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.typing import StrAny @@ -59,25 +60,6 @@ class FileWriterSpec(NamedTuple): supports_compression: bool = False -class DataWriterMetrics(NamedTuple): - file_path: str - items_count: int - file_size: int - created: float - last_modified: float - - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: - if isinstance(other, DataWriterMetrics): - return DataWriterMetrics( - "", # path is not known - self.items_count + other.items_count, - self.file_size + other.file_size, - min(self.created, other.created), - max(self.last_modified, other.last_modified), - ) - return NotImplemented - - EMPTY_DATA_WRITER_METRICS = DataWriterMetrics("", 0, 0, 2**32, 0.0) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 3af7dcff13..b6c7041592 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -24,10 +24,11 @@ from copy import deepcopy import inspect -from dlt.common import logger +from dlt.common import logger, pendulum from dlt.common.configuration.specs.base_configuration import extract_inner_hint from dlt.common.destination.utils import verify_schema_capabilities from dlt.common.exceptions import TerminalValueError +from dlt.common.metrics import LoadJobMetrics from dlt.common.normalizers.naming import NamingConvention from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.utils import ( @@ -284,6 +285,8 @@ def __init__(self, file_path: str) -> None: # NOTE: we only accept a full filepath in the constructor assert self._file_name != self._file_path self._parsed_file_name = ParsedLoadJobFileName.parse(self._file_name) + self._started_at: pendulum.DateTime = None + self._finished_at: pendulum.DateTime = None def job_id(self) -> str: """The job id that is derived from the file 
name and does not changes during job lifecycle""" @@ -306,6 +309,18 @@ def exception(self) -> str: """The exception associated with failed or retry states""" pass + def metrics(self) -> Optional[LoadJobMetrics]: + """Returns job execution metrics""" + return LoadJobMetrics( + self._parsed_file_name.job_id(), + self._file_path, + self._parsed_file_name.table_name, + self._started_at, + self._finished_at, + self.state(), + None, + ) + class RunnableLoadJob(LoadJob, ABC): """Represents a runnable job that loads a single file @@ -361,6 +376,7 @@ def run_managed( # filepath is now moved to running try: self._state = "running" + self._started_at = pendulum.now() self._job_client.prepare_load_job_execution(self) self.run() self._state = "completed" @@ -371,6 +387,7 @@ def run_managed( self._state = "retry" self._exception = e finally: + self._finished_at = pendulum.now() # sanity check assert self._state in ("completed", "retry", "failed") @@ -391,7 +408,7 @@ def exception(self) -> str: return str(self._exception) -class FollowupJob: +class FollowupJobRequest: """Base class for follow up jobs that should be created""" @abstractmethod @@ -403,8 +420,8 @@ def new_file_path(self) -> str: class HasFollowupJobs: """Adds a trait that allows to create single or table chain followup jobs""" - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: - """Return list of new jobs. `final_state` is state to which this job transits""" + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: + """Return list of jobs requests for jobs that should be created. `final_state` is state to which this job transits""" return [] @@ -479,7 +496,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py new file mode 100644 index 0000000000..5cccee4045 --- /dev/null +++ b/dlt/common/metrics.py @@ -0,0 +1,71 @@ +import datetime # noqa: I251 +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict # noqa: 251 + + +class DataWriterMetrics(NamedTuple): + file_path: str + items_count: int + file_size: int + created: float + last_modified: float + + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + if isinstance(other, DataWriterMetrics): + return DataWriterMetrics( + self.file_path if self.file_path == other.file_path else "", + # self.table_name if self.table_name == other.table_name else "", + self.items_count + other.items_count, + self.file_size + other.file_size, + min(self.created, other.created), + max(self.last_modified, other.last_modified), + ) + return NotImplemented + + +class StepMetrics(TypedDict): + """Metrics for particular package processed in particular pipeline step""" + + started_at: datetime.datetime + """Start of package processing""" + finished_at: datetime.datetime + """End of package processing""" + + +class ExtractDataInfo(TypedDict): + name: str + data_type: str + + +class ExtractMetrics(StepMetrics): + schema_name: str + job_metrics: Dict[str, DataWriterMetrics] + """Metrics collected per job id during writing of job file""" + table_metrics: Dict[str, DataWriterMetrics] + """Job metrics aggregated by table""" + resource_metrics: Dict[str, DataWriterMetrics] + """Job metrics 
aggregated by resource""" + dag: List[Tuple[str, str]] + """A resource dag where elements of the list are graph edges""" + hints: Dict[str, Dict[str, Any]] + """Hints passed to the resources""" + + +class NormalizeMetrics(StepMetrics): + job_metrics: Dict[str, DataWriterMetrics] + """Metrics collected per job id during writing of job file""" + table_metrics: Dict[str, DataWriterMetrics] + """Job metrics aggregated by table""" + + +class LoadJobMetrics(NamedTuple): + job_id: str + file_path: str + table_name: str + started_at: datetime.datetime + finished_at: datetime.datetime + state: Optional[str] + remote_uri: Optional[str] + + +class LoadMetrics(StepMetrics): + job_metrics: Dict[str, LoadJobMetrics] diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 1e1416eb53..8a07ddbd33 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -16,7 +16,6 @@ Optional, Protocol, Sequence, - TYPE_CHECKING, Tuple, TypeVar, TypedDict, @@ -36,6 +35,14 @@ from dlt.common.destination import TDestinationReferenceArg, TDestination from dlt.common.destination.exceptions import DestinationHasFailedJobs from dlt.common.exceptions import PipelineStateNotAvailable, SourceSectionNotAvailable +from dlt.common.metrics import ( + DataWriterMetrics, + ExtractDataInfo, + ExtractMetrics, + LoadMetrics, + NormalizeMetrics, + StepMetrics, +) from dlt.common.schema import Schema from dlt.common.schema.typing import ( TColumnNames, @@ -44,11 +51,12 @@ TSchemaContract, ) from dlt.common.source import get_current_pipe_name +from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath -from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat +from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts from dlt.common.versioned_state import TVersionedState @@ -68,15 +76,6 @@ class _StepInfo(NamedTuple): finished_at: datetime.datetime -class StepMetrics(TypedDict): - """Metrics for particular package processed in particular pipeline step""" - - started_at: datetime.datetime - """Start of package processing""" - finished_at: datetime.datetime - """End of package processing""" - - TStepMetricsCo = TypeVar("TStepMetricsCo", bound=StepMetrics, covariant=True) @@ -154,17 +153,20 @@ def _load_packages_asstr(load_packages: List[LoadPackageInfo], verbosity: int) - return msg @staticmethod - def job_metrics_asdict( + def writer_metrics_asdict( job_metrics: Dict[str, DataWriterMetrics], key_name: str = "job_id", extend: StrAny = None ) -> List[DictStrAny]: - jobs = [] - for job_id, metrics in job_metrics.items(): + entities = [] + for entity_id, metrics in job_metrics.items(): d = metrics._asdict() if extend: d.update(extend) - d[key_name] = job_id - jobs.append(d) - return jobs + d[key_name] = entity_id + # add job-level info if known + if metrics.file_path: + d["table_name"] = ParsedLoadJobFileName.parse(metrics.file_path).table_name + entities.append(d) + return entities def _astuple(self) -> _StepInfo: return _StepInfo( @@ -177,25 +179,6 @@ def _astuple(self) -> _StepInfo: ) -class ExtractDataInfo(TypedDict): - name: str - data_type: str - - -class ExtractMetrics(StepMetrics): - schema_name: str - job_metrics: Dict[str, DataWriterMetrics] - """Metrics collected 
per job id during writing of job file""" - table_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by table""" - resource_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by resource""" - dag: List[Tuple[str, str]] - """A resource dag where elements of the list are graph edges""" - hints: Dict[str, Dict[str, Any]] - """Hints passed to the resources""" - - class _ExtractInfo(NamedTuple): """NamedTuple cannot be part of the derivation chain so we must re-declare all fields to use it as mixin later""" @@ -228,16 +211,8 @@ def asdict(self) -> DictStrAny: for load_id, metrics_list in self.metrics.items(): for idx, metrics in enumerate(metrics_list): extend = {"load_id": load_id, "extract_idx": idx} - load_metrics["job_metrics"].extend( - self.job_metrics_asdict(metrics["job_metrics"], extend=extend) - ) - load_metrics["table_metrics"].extend( - self.job_metrics_asdict( - metrics["table_metrics"], key_name="table_name", extend=extend - ) - ) load_metrics["resource_metrics"].extend( - self.job_metrics_asdict( + self.writer_metrics_asdict( metrics["resource_metrics"], key_name="resource_name", extend=extend ) ) @@ -253,6 +228,15 @@ def asdict(self) -> DictStrAny: for name, hints in metrics["hints"].items() ] ) + load_metrics["job_metrics"].extend( + self.writer_metrics_asdict(metrics["job_metrics"], extend=extend) + ) + load_metrics["table_metrics"].extend( + self.writer_metrics_asdict( + metrics["table_metrics"], key_name="table_name", extend=extend + ) + ) + d.update(load_metrics) return d @@ -260,13 +244,6 @@ def asstr(self, verbosity: int = 0) -> str: return self._load_packages_asstr(self.load_packages, verbosity) -class NormalizeMetrics(StepMetrics): - job_metrics: Dict[str, DataWriterMetrics] - """Metrics collected per job id during writing of job file""" - table_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by table""" - - class _NormalizeInfo(NamedTuple): pipeline: "SupportsPipeline" metrics: Dict[str, List[NormalizeMetrics]] @@ -305,10 +282,10 @@ def asdict(self) -> DictStrAny: for idx, metrics in enumerate(metrics_list): extend = {"load_id": load_id, "extract_idx": idx} load_metrics["job_metrics"].extend( - self.job_metrics_asdict(metrics["job_metrics"], extend=extend) + self.writer_metrics_asdict(metrics["job_metrics"], extend=extend) ) load_metrics["table_metrics"].extend( - self.job_metrics_asdict( + self.writer_metrics_asdict( metrics["table_metrics"], key_name="table_name", extend=extend ) ) @@ -326,10 +303,6 @@ def asstr(self, verbosity: int = 0) -> str: return msg -class LoadMetrics(StepMetrics): - pass - - class _LoadInfo(NamedTuple): pipeline: "SupportsPipeline" metrics: Dict[str, List[LoadMetrics]] @@ -354,7 +327,19 @@ class LoadInfo(StepInfo[LoadMetrics], _LoadInfo): # type: ignore[misc] def asdict(self) -> DictStrAny: """A dictionary representation of LoadInfo that can be loaded with `dlt`""" - return super().asdict() + d = super().asdict() + # transform metrics + d.pop("metrics") + load_metrics: Dict[str, List[Any]] = {"job_metrics": []} + for load_id, metrics_list in self.metrics.items(): + # one set of metrics per package id + assert len(metrics_list) == 1 + metrics = metrics_list[0] + for job_metrics in metrics["job_metrics"].values(): + load_metrics["job_metrics"].append({"load_id": load_id, **job_metrics._asdict()}) + + d.update(load_metrics) + return d def asstr(self, verbosity: int = 0) -> str: msg = f"Pipeline {self.pipeline.pipeline_name} load step completed in " diff --git a/dlt/common/storages/__init__.py 
b/dlt/common/storages/__init__.py index 7bb3c0cf97..50876a01cd 100644 --- a/dlt/common/storages/__init__.py +++ b/dlt/common/storages/__init__.py @@ -8,7 +8,7 @@ LoadJobInfo, LoadPackageInfo, PackageStorage, - TJobState, + TPackageJobState, create_load_id, ) from .data_item_storage import DataItemStorage @@ -40,7 +40,7 @@ "LoadJobInfo", "LoadPackageInfo", "PackageStorage", - "TJobState", + "TPackageJobState", "create_load_id", "fsspec_from_config", "fsspec_filesystem", diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 29a9da8acf..0f70c04bc5 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -1,14 +1,13 @@ -from pathlib import Path -from typing import Dict, Any, List, Sequence +from typing import Dict, Any, List from abc import ABC, abstractmethod from dlt.common import logger +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema import TTableSchemaColumns -from dlt.common.typing import StrAny, TDataItems +from dlt.common.typing import TDataItems from dlt.common.data_writers import ( BufferedDataWriter, DataWriter, - DataWriterMetrics, FileWriterSpec, ) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index b0ed93f734..d569fbe662 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -143,8 +143,8 @@ def create_load_id() -> str: # folders to manage load jobs in a single load package -TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] -WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) +TPackageJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] +WORKING_FOLDERS: Set[TPackageJobState] = set(get_args(TPackageJobState)) TLoadPackageStatus = Literal["new", "extracted", "normalized", "loaded", "aborted"] @@ -191,7 +191,7 @@ def __str__(self) -> str: class LoadJobInfo(NamedTuple): - state: TJobState + state: TPackageJobState file_path: str file_size: int created_at: datetime.datetime @@ -204,6 +204,7 @@ def asdict(self) -> DictStrAny: # flatten del d["job_file_info"] d.update(self.job_file_info._asdict()) + d["job_id"] = self.job_file_info.job_id() return d def asstr(self, verbosity: int = 0) -> str: @@ -241,7 +242,7 @@ class _LoadPackageInfo(NamedTuple): schema: Schema schema_update: TSchemaTables completed_at: datetime.datetime - jobs: Dict[TJobState, List[LoadJobInfo]] + jobs: Dict[TPackageJobState, List[LoadJobInfo]] class LoadPackageInfo(SupportsHumanize, _LoadPackageInfo): @@ -298,10 +299,10 @@ def __str__(self) -> str: class PackageStorage: - NEW_JOBS_FOLDER: ClassVar[TJobState] = "new_jobs" - FAILED_JOBS_FOLDER: ClassVar[TJobState] = "failed_jobs" - STARTED_JOBS_FOLDER: ClassVar[TJobState] = "started_jobs" - COMPLETED_JOBS_FOLDER: ClassVar[TJobState] = "completed_jobs" + NEW_JOBS_FOLDER: ClassVar[TPackageJobState] = "new_jobs" + FAILED_JOBS_FOLDER: ClassVar[TPackageJobState] = "failed_jobs" + STARTED_JOBS_FOLDER: ClassVar[TPackageJobState] = "started_jobs" + COMPLETED_JOBS_FOLDER: ClassVar[TPackageJobState] = "completed_jobs" SCHEMA_FILE_NAME: ClassVar[str] = "schema.json" SCHEMA_UPDATES_FILE_NAME = ( # updates to the tables in schema created by normalizer @@ -330,11 +331,11 @@ def get_package_path(self, load_id: str) -> str: """Gets path of the package relative to storage root""" return load_id - def get_job_state_folder_path(self, load_id: str, state: TJobState) -> str: + def get_job_state_folder_path(self, load_id: str, 
state: TPackageJobState) -> str: """Gets path to the jobs in `state` in package `load_id`, relative to the storage root""" return os.path.join(self.get_package_path(load_id), state) - def get_job_file_path(self, load_id: str, state: TJobState, file_name: str) -> str: + def get_job_file_path(self, load_id: str, state: TPackageJobState, file_name: str) -> str: """Get path to job with `file_name` in `state` in package `load_id`, relative to the storage root""" return os.path.join(self.get_job_state_folder_path(load_id, state), file_name) @@ -369,12 +370,12 @@ def list_failed_jobs(self, load_id: str) -> Sequence[str]: def list_job_with_states_for_table( self, load_id: str, table_name: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: return self.filter_jobs_for_table(self.list_all_jobs_with_states(load_id), table_name) def list_all_jobs_with_states( self, load_id: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: info = self.get_load_package_jobs(load_id) state_jobs = [] for state, jobs in info.items(): @@ -413,7 +414,7 @@ def is_package_completed(self, load_id: str) -> bool: # def import_job( - self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + self, load_id: str, job_file_path: str, job_state: TPackageJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" self.storage.atomic_import( @@ -568,12 +569,14 @@ def get_load_package_state_path(self, load_id: str) -> str: # Get package info # - def get_load_package_jobs(self, load_id: str) -> Dict[TJobState, List[ParsedLoadJobFileName]]: + def get_load_package_jobs( + self, load_id: str + ) -> Dict[TPackageJobState, List[ParsedLoadJobFileName]]: """Gets all jobs in a package and returns them as lists assigned to a particular state.""" package_path = self.get_package_path(load_id) if not self.storage.has_folder(package_path): raise LoadPackageNotFound(load_id) - all_jobs: Dict[TJobState, List[ParsedLoadJobFileName]] = {} + all_jobs: Dict[TPackageJobState, List[ParsedLoadJobFileName]] = {} for state in WORKING_FOLDERS: jobs: List[ParsedLoadJobFileName] = [] with contextlib.suppress(FileNotFoundError): @@ -616,7 +619,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: schema = Schema.from_dict(self._load_schema(load_id)) # read jobs with all statuses - all_job_infos: Dict[TJobState, List[LoadJobInfo]] = {} + all_job_infos: Dict[TPackageJobState, List[LoadJobInfo]] = {} for state, jobs in package_jobs.items(): all_job_infos[state] = [ self._read_job_file_info(load_id, state, job, package_created_at) for job in jobs @@ -643,7 +646,7 @@ def get_job_failed_message(self, load_id: str, job: ParsedLoadJobFileName) -> st return failed_message def job_to_job_info( - self, load_id: str, state: TJobState, job: ParsedLoadJobFileName + self, load_id: str, state: TPackageJobState, job: ParsedLoadJobFileName ) -> LoadJobInfo: """Creates partial job info by converting job object. 
size, mtime and failed message will not be populated""" full_path = os.path.join( @@ -660,7 +663,11 @@ def job_to_job_info( ) def _read_job_file_info( - self, load_id: str, state: TJobState, job: ParsedLoadJobFileName, now: DateTime = None + self, + load_id: str, + state: TPackageJobState, + job: ParsedLoadJobFileName, + now: DateTime = None, ) -> LoadJobInfo: """Creates job info by reading additional props from storage""" failed_message = None @@ -687,8 +694,8 @@ def _read_job_file_info( def _move_job( self, load_id: str, - source_folder: TJobState, - dest_folder: TJobState, + source_folder: TPackageJobState, + dest_folder: TPackageJobState, file_name: str, new_file_name: str = None, ) -> str: @@ -736,8 +743,8 @@ def _job_elapsed_time_seconds(file_path: str, now_ts: float = None) -> float: @staticmethod def filter_jobs_for_table( - all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], table_name: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + all_jobs: Iterable[Tuple[TPackageJobState, ParsedLoadJobFileName]], table_name: str + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: return [job for job in all_jobs if job[1].table_name == table_name] diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 00e95fbad9..8ac1d74e9a 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -17,7 +17,7 @@ LoadPackageInfo, PackageStorage, ParsedLoadJobFileName, - TJobState, + TPackageJobState, TLoadPackageState, TJobFileFormat, ) @@ -141,16 +141,16 @@ def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> N """Marks schema update as processed and stores the update that was applied at the destination""" load_path = self.get_normalized_package_path(load_id) schema_update_file = join(load_path, PackageStorage.SCHEMA_UPDATES_FILE_NAME) - processed_schema_update_file = join( + applied_schema_update_file = join( load_path, PackageStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME ) # delete initial schema update self.storage.delete(schema_update_file) # save applied update - self.storage.save(processed_schema_update_file, json.dumps(applied_update)) + self.storage.save(applied_schema_update_file, json.dumps(applied_update)) def import_new_job( - self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + self, load_id: str, job_file_path: str, job_state: TPackageJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" # TODO: use normalize storage and add file type checks diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 371c1bae22..1429b28240 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -46,7 +46,7 @@ from dlt.common.schema.utils import table_schema_has_type from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob -from dlt.common.destination.reference import FollowupJob, SupportsStagingDestination +from dlt.common.destination.reference import FollowupJobRequest, SupportsStagingDestination from dlt.common.data_writers.escape import escape_hive_identifier from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlMergeFollowupJob @@ -490,7 +490,7 @@ def create_load_job( def _create_append_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if 
self._is_iceberg_table(self.prepare_load_table(table_chain[0]["name"])): return [ SqlStagingCopyFollowupJob.from_table_chain( @@ -501,7 +501,7 @@ def _create_append_followup_jobs( def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self._is_iceberg_table(self.prepare_load_table(table_chain[0]["name"])): return [ SqlStagingCopyFollowupJob.from_table_chain( @@ -510,7 +510,9 @@ def _create_replace_followup_jobs( ] return super()._create_replace_followup_jobs(table_chain) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [AthenaMergeJob.from_table_chain(table_chain, self.sql_client)] def _is_iceberg_table(self, table: TTableSchema) -> bool: diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index c6bf2e7654..8291415434 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -16,7 +16,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, SupportsStagingDestination, @@ -51,7 +51,7 @@ from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.impl.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper from dlt.destinations.utils import parse_db_data_type_str_with_precision @@ -234,7 +234,9 @@ def __init__( self.sql_client: BigQuerySqlClient = sql_client # type: ignore self.type_mapper = BigQueryTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)] def create_load_job( @@ -433,8 +435,8 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # determine whether we load from local or uri bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] - if ReferenceFollowupJob.is_reference_job(file_path): - bucket_path = ReferenceFollowupJob.resolve_reference(file_path) + if ReferenceFollowupJobRequest.is_reference_job(file_path): + bucket_path = ReferenceFollowupJobRequest.resolve_reference(file_path) ext = os.path.splitext(bucket_path)[1][1:] # Select a correct source format diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 5bd34e0e0d..5f17a5a18c 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -20,7 +20,7 @@ TLoadJobState, HasFollowupJobs, RunnableLoadJob, - FollowupJob, + FollowupJobRequest, LoadJob, ) from dlt.common.schema import Schema, TColumnSchema @@ -52,7 +52,7 @@ SqlJobClientBase, SqlJobClientWithStaging, ) -from dlt.destinations.job_impl import ReferenceFollowupJob, FinalizedLoadJobWithFollowupJobs +from 
dlt.destinations.job_impl import ReferenceFollowupJobRequest, FinalizedLoadJobWithFollowupJobs from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper @@ -141,8 +141,8 @@ def run(self) -> None: bucket_path = None file_name = self._file_name - if ReferenceFollowupJob.is_reference_job(self._file_path): - bucket_path = ReferenceFollowupJob.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path): + bucket_path = ReferenceFollowupJobRequest.resolve_reference(self._file_path) file_name = FileStorage.get_file_name_from_file_path(bucket_path) bucket_url = urlparse(bucket_path) bucket_scheme = bucket_url.scheme @@ -288,7 +288,9 @@ def __init__( self.active_hints = deepcopy(HINT_TO_CLICKHOUSE_ATTR) self.type_mapper = ClickHouseTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [ClickHouseMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 0a203c21b6..2f23e88ea0 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -5,7 +5,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, CredentialsConfiguration, @@ -31,7 +31,7 @@ from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient from dlt.destinations.sql_jobs import SqlMergeFollowupJob -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.type_mapping import TypeMapper @@ -121,8 +121,8 @@ def run(self) -> None: staging_credentials = self._staging_config.credentials # extract and prepare some vars bucket_path = orig_bucket_path = ( - ReferenceFollowupJob.resolve_reference(self._file_path) - if ReferenceFollowupJob.is_reference_job(self._file_path) + ReferenceFollowupJobRequest.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path) else "" ) file_name = ( @@ -279,7 +279,9 @@ def create_load_job( ) return job - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [DatabricksMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 3611665f6c..68a3fedc31 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -7,7 +7,7 @@ TLoadJobState, RunnableLoadJob, SupportsStagingDestination, - FollowupJob, + FollowupJobRequest, LoadJob, ) from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -19,7 +19,7 @@ from dlt.destinations.impl.dremio.sql_client import DremioSqlClient from dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.job_impl 
import FinalizedLoadJobWithFollowupJobs -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper from dlt.destinations.sql_client import SqlClientBase @@ -101,8 +101,8 @@ def run(self) -> None: # extract and prepare some vars bucket_path = ( - ReferenceFollowupJob.resolve_reference(self._file_path) - if ReferenceFollowupJob.is_reference_job(self._file_path) + ReferenceFollowupJobRequest.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path) else "" ) @@ -201,7 +201,9 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [DremioMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 7bc1d9e943..023b88e51a 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -25,7 +25,7 @@ class DummyClientConfiguration(DestinationClientConfiguration): retry_prob: float = 0.0 """probability of job retry""" completed_prob: float = 0.0 - """probablibitly of successful job completion""" + """probability of successful job completion""" exception_prob: float = 0.0 """probability of exception transient exception when running job""" timeout: float = 10.0 diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 7d406c969f..49b55ec65d 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -14,6 +14,7 @@ ) import os import time +from dlt.common.metrics import LoadJobMetrics from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.storages import FileStorage @@ -25,7 +26,7 @@ ) from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, SupportsStagingDestination, TLoadJobState, RunnableLoadJob, @@ -37,10 +38,9 @@ from dlt.destinations.exceptions import ( LoadJobNotExistsException, - LoadJobInvalidStateTransitionException, ) from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest class LoadDummyBaseJob(RunnableLoadJob): @@ -78,18 +78,25 @@ def run(self) -> None: c_r = random.random() if self.config.retry_prob >= c_r: # this will make the job go to a retry state - raise DestinationTransientException("a random retry occured") + raise DestinationTransientException("a random retry occurred") # fail prob c_r = random.random() if self.config.fail_prob >= c_r: # this will make the the job go to a failed state - raise DestinationTerminalException("a random fail occured") + raise DestinationTerminalException("a random fail occurred") time.sleep(0.1) + def metrics(self) -> Optional[LoadJobMetrics]: + m = super().metrics() + # add remote uri if there's followup job + if self.config.create_followup_jobs: + m = m._replace(remote_uri=self._file_name) + 
return m -class DummyFollowupJob(ReferenceFollowupJob): + +class DummyFollowupJobRequest(ReferenceFollowupJobRequest): def __init__( self, original_file_name: str, remote_paths: List[str], config: DummyClientConfiguration ) -> None: @@ -100,9 +107,9 @@ def __init__( class LoadDummyJob(LoadDummyBaseJob, HasFollowupJobs): - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: if self.config.create_followup_jobs and final_state == "completed": - new_job = DummyFollowupJob( + new_job = DummyFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._file_name], config=self.config, @@ -113,8 +120,8 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: JOBS: Dict[str, LoadDummyBaseJob] = {} -CREATED_FOLLOWUP_JOBS: Dict[str, FollowupJob] = {} -CREATED_TABLE_CHAIN_FOLLOWUP_JOBS: Dict[str, FollowupJob] = {} +CREATED_FOLLOWUP_JOBS: Dict[str, FollowupJobRequest] = {} +CREATED_TABLE_CHAIN_FOLLOWUP_JOBS: Dict[str, FollowupJobRequest] = {} RETRIED_JOBS: Dict[str, LoadDummyBaseJob] = {} @@ -173,7 +180,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs that should be executed after a table chain is completed""" # if sql job follow up is configure we schedule a merge job that will always fail @@ -184,7 +191,7 @@ def create_table_chain_completed_followup_jobs( if self.config.create_followup_table_chain_reference_jobs: table_job_paths = [job.file_path for job in completed_table_chain_jobs] file_name = FileStorage.get_file_name_from_file_path(table_job_paths[0]) - job = ReferenceFollowupJob(file_name, table_job_paths) + job = ReferenceFollowupJobRequest(file_name, table_job_paths) CREATED_TABLE_CHAIN_FOLLOWUP_JOBS[job.job_id()] = job return [job] return [] @@ -212,7 +219,7 @@ def __exit__( pass def _create_job(self, job_id: str) -> LoadDummyBaseJob: - if ReferenceFollowupJob.is_reference_job(job_id): + if ReferenceFollowupJobRequest.is_reference_job(job_id): return LoadDummyBaseJob(job_id, config=self.config) else: return LoadDummyJob(job_id, config=self.config) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index f2466f25a2..2e09871ba9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -9,6 +9,7 @@ import dlt from dlt.common import logger, time, json, pendulum +from dlt.common.metrics import LoadJobMetrics from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema @@ -21,7 +22,7 @@ ) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, JobClientBase, @@ -34,7 +35,7 @@ ) from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.destinations.job_impl import ( - ReferenceFollowupJob, + ReferenceFollowupJobRequest, FinalizedLoadJob, FinalizedLoadJobWithFollowupJobs, ) @@ -87,6 +88,13 @@ def make_remote_path(self) -> str: path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), ) + def make_remote_uri(self) -> str: + return 
self._job_client.make_remote_uri(self.make_remote_path()) + + def metrics(self) -> Optional[LoadJobMetrics]: + m = super().metrics() + return m._replace(remote_uri=self.make_remote_uri()) + class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: @@ -95,6 +103,15 @@ def __init__(self, file_path: str) -> None: ) def run(self) -> None: + # pick local filesystem pathlib or posix for buckets + # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard + # to write a handler with those two line below only in FilesystemLoadJob + self.is_local_filesystem = self._job_client.config.protocol == "file" + self.pathlib = os.path if self.is_local_filesystem else posixpath + self.destination_file_name = self._job_client.make_remote_uri( + self._job_client.get_table_dir(self.load_table_name) + ) + from dlt.common.libs.pyarrow import pyarrow as pa from dlt.common.libs.deltalake import ( DeltaTable, @@ -105,15 +122,13 @@ def run(self) -> None: ) # create Arrow dataset from Parquet files - file_paths = ReferenceFollowupJob.resolve_references(self._file_path) + file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) arrow_ds = pa.dataset.dataset(file_paths) # create Delta table object - dt_path = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(dt_path, storage_options=storage_options) + dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) # get partition columns part_cols = get_columns_names_with_prop(self._load_table, "partition") @@ -124,7 +139,7 @@ def run(self) -> None: if dt is None: # create new empty Delta table with schema from Arrow table DeltaTable.create( - table_uri=dt_path, + table_uri=self.destination_file_name, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", partition_by=part_cols, @@ -160,7 +175,7 @@ def run(self) -> None: else: write_delta_table( - table_or_uri=dt_path if dt is None else dt, + table_or_uri=self.destination_file_name if dt is None else dt, data=arrow_rbr, write_disposition=self._load_table["write_disposition"], partition_by=part_cols, @@ -169,13 +184,13 @@ def run(self) -> None: class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) if self._load_table.get("table_format") == "delta": # delta table jobs only require table chain followup jobs pass elif final_state == "completed": - ref_job = ReferenceFollowupJob( + ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_uri(self.make_remote_path())], ) @@ -369,7 +384,7 @@ def create_load_job( import dlt.common.libs.deltalake # assert dependencies are installed # a reference job for a delta table indicates a table chain followup job - if ReferenceFollowupJob.is_reference_job(file_path): + if ReferenceFollowupJobRequest.is_reference_job(file_path): return DeltaLoadFilesystemJob(file_path) # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -578,7 +593,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> 
List[FollowupJob]: + ) -> List[FollowupJobRequest]: assert completed_table_chain_jobs is not None jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs @@ -591,5 +606,5 @@ def create_table_chain_completed_followup_jobs( if job.job_file_info.table_name == table["name"] ] file_name = FileStorage.get_file_name_from_file_path(table_job_paths[0]) - jobs.append(ReferenceFollowupJob(file_name, table_job_paths)) + jobs.append(ReferenceFollowupJobRequest(file_name, table_job_paths)) return jobs diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index a67423a873..750dc93a10 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -1,7 +1,7 @@ from typing import Dict, Optional, Sequence, List, Any from dlt.common.exceptions import TerminalValueError -from dlt.common.destination.reference import FollowupJob +from dlt.common.destination.reference import FollowupJobRequest from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat @@ -160,7 +160,9 @@ def __init__( self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {} self.type_mapper = MsSqlTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( @@ -189,7 +191,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self.config.replace_strategy == "staging-optimized": return [MsSqlStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 5ae5f27a6e..a832bfe07f 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -9,7 +9,7 @@ from dlt.common.destination.reference import ( HasFollowupJobs, RunnableLoadJob, - FollowupJob, + FollowupJobRequest, LoadJob, TLoadJobState, ) @@ -246,7 +246,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self.config.replace_strategy == "staging-optimized": return [PostgresStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 81abd57803..93827c8163 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -14,7 +14,7 @@ from dlt.common.destination.reference import ( - FollowupJob, + FollowupJobRequest, CredentialsConfiguration, SupportsStagingDestination, LoadJob, @@ -33,7 +33,7 @@ from dlt.destinations.job_client_impl import CopyRemoteFileLoadJob from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.destinations.impl.redshift.configuration import 
RedshiftClientConfiguration -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper @@ -238,7 +238,9 @@ def __init__( self.config: RedshiftClientConfiguration = config self.type_mapper = RedshiftTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: @@ -258,7 +260,7 @@ def create_load_job( """Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs""" job = super().create_load_job(table, file_path, load_id, restore) if not job: - assert ReferenceFollowupJob.is_reference_job( + assert ReferenceFollowupJobRequest.is_reference_job( file_path ), "Redshift must use staging to load files" job = RedshiftCopyFileLoadJob( diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 904b524791..8b4eabc961 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -29,7 +29,7 @@ from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.type_mapping import TypeMapper @@ -98,11 +98,11 @@ def run(self) -> None: self._sql_client = self._job_client.sql_client # resolve reference - is_local_file = not ReferenceFollowupJob.is_reference_job(self._file_path) + is_local_file = not ReferenceFollowupJobRequest.is_reference_job(self._file_path) file_url = ( self._file_path if is_local_file - else ReferenceFollowupJob.resolve_reference(self._file_path) + else ReferenceFollowupJobRequest.resolve_reference(self._file_path) ) # take file name file_name = FileStorage.get_file_name_from_file_path(file_url) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index d1b38f73bd..e43e2a6dfa 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -5,7 +5,7 @@ from urllib.parse import urlparse, urlunparse from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import SupportsStagingDestination, FollowupJob, LoadJob +from dlt.common.destination.reference import SupportsStagingDestination, FollowupJobRequest, LoadJob from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint from dlt.common.schema.utils import ( @@ -19,7 +19,7 @@ AzureServicePrincipalCredentialsWithoutDefaults, ) -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import ( SqlJobClientBase, @@ -131,7 +131,7 @@ def _get_columstore_valid_column(self, c: TColumnSchema) -> TColumnSchema: def _create_replace_followup_jobs( self, table_chain: 
Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: return SqlJobClientBase._create_replace_followup_jobs(self, table_chain) def prepare_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: @@ -163,7 +163,7 @@ def create_load_job( ) -> LoadJob: job = super().create_load_job(table, file_path, load_id, restore) if not job: - assert ReferenceFollowupJob.is_reference_job( + assert ReferenceFollowupJobRequest.is_reference_job( file_path ), "Synapse must use staging to load files" job = SynapseCopyFileLoadJob( diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 7fdd979c5d..92132dd751 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -42,7 +42,7 @@ WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, - FollowupJob, + FollowupJobRequest, WithStagingDataset, RunnableLoadJob, LoadJob, @@ -53,7 +53,7 @@ from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations.job_impl import ( - ReferenceFollowupJob, + ReferenceFollowupJobRequest, ) from dlt.destinations.sql_jobs import SqlMergeFollowupJob, SqlStagingCopyFollowupJob from dlt.destinations.typing import TNativeConn @@ -118,7 +118,7 @@ def __init__( super().__init__(file_path) self._job_client: "SqlJobClientBase" = None self._staging_credentials = staging_credentials - self._bucket_path = ReferenceFollowupJob.resolve_reference(file_path) + self._bucket_path = ReferenceFollowupJobRequest.resolve_reference(file_path) class SqlJobClientBase(JobClientBase, WithStateSync): @@ -216,16 +216,18 @@ def should_truncate_table_before_load(self, table: TTableSchema) -> bool: def _create_append_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: return [] - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [SqlMergeFollowupJob.from_table_chain(table_chain, self.sql_client)] def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: - jobs: List[FollowupJob] = [] + ) -> List[FollowupJobRequest]: + jobs: List[FollowupJobRequest] = [] if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: jobs.append( SqlStagingCopyFollowupJob.from_table_chain( @@ -238,7 +240,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs diff --git a/dlt/destinations/job_impl.py b/dlt/destinations/job_impl.py index 41c939f482..1f54913064 100644 --- a/dlt/destinations/job_impl.py +++ b/dlt/destinations/job_impl.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod import os import tempfile # noqa: 251 -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Optional from dlt.common.json import json from dlt.common.destination.reference import ( @@ -9,9 +9,10 @@ TLoadJobState, RunnableLoadJob, JobClientBase, - FollowupJob, + FollowupJobRequest, LoadJob, ) +from dlt.common.metrics import LoadJobMetrics from 
dlt.common.storages.load_package import commit_load_package_state from dlt.common.schema import Schema, TTableSchema from dlt.common.storages import FileStorage @@ -56,7 +57,7 @@ class FinalizedLoadJobWithFollowupJobs(FinalizedLoadJob, HasFollowupJobs): pass -class FollowupJobImpl(FollowupJob): +class FollowupJobRequestImpl(FollowupJobRequest): """ Class to create a new loadjob, not stateful and not runnable """ @@ -79,7 +80,7 @@ def job_id(self) -> str: return self._parsed_file_name.job_id() -class ReferenceFollowupJob(FollowupJobImpl): +class ReferenceFollowupJobRequest(FollowupJobRequestImpl): def __init__(self, original_file_name: str, remote_paths: List[str]) -> None: file_name = os.path.splitext(original_file_name)[0] + "." + "reference" self._remote_paths = remote_paths @@ -98,7 +99,7 @@ def resolve_references(file_path: str) -> List[str]: @staticmethod def resolve_reference(file_path: str) -> str: - refs = ReferenceFollowupJob.resolve_references(file_path) + refs = ReferenceFollowupJobRequest.resolve_references(file_path) assert len(refs) == 1 return refs[0] diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index a1e38a2c20..d5f005ee9a 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -21,7 +21,7 @@ from dlt.common.utils import uniq_id from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.destinations.exceptions import MergeDispositionException -from dlt.destinations.job_impl import FollowupJobImpl +from dlt.destinations.job_impl import FollowupJobRequestImpl from dlt.destinations.sql_client import SqlClientBase from dlt.common.destination.exceptions import DestinationTransientException @@ -45,7 +45,7 @@ def __init__(self, original_exception: Exception, table_chain: Sequence[TTableSc ) -class SqlFollowupJob(FollowupJobImpl): +class SqlFollowupJob(FollowupJobRequestImpl): """Sql base job for jobs that rely on the whole tablechain""" @classmethod @@ -54,7 +54,7 @@ def from_table_chain( table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None, - ) -> FollowupJobImpl: + ) -> FollowupJobRequestImpl: """Generates a list of sql statements, that will be executed by the sql client when the job is executed in the loader. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 4a1de2517d..8a91dd7477 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -4,9 +4,9 @@ from dlt.common.configuration import known_sections, resolve_configuration, with_config from dlt.common import logger from dlt.common.configuration.specs import BaseConfiguration, configspec -from dlt.common.data_writers import DataWriterMetrics from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.exceptions import MissingDependencyException +from dlt.common.metrics import DataWriterMetrics from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.typing import TDataItems, TDataItem, TLoaderFileFormat from dlt.common.schema import Schema, utils diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py index de777ad60e..395366b09e 100644 --- a/dlt/extract/storage.py +++ b/dlt/extract/storage.py @@ -1,7 +1,8 @@ import os from typing import Dict, List -from dlt.common.data_writers import TDataItemFormat, DataWriterMetrics, DataWriter, FileWriterSpec +from dlt.common.data_writers import TDataItemFormat, DataWriter, FileWriterSpec +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema import Schema from dlt.common.storages import ( NormalizeStorageConfiguration, diff --git a/dlt/load/load.py b/dlt/load/load.py index 99a12d69ee..f084c9d3d9 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -5,12 +5,17 @@ import os from dlt.common import logger +from dlt.common.metrics import LoadJobMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.pipeline import LoadInfo, LoadMetrics, SupportsPipeline, WithStepInfo from dlt.common.schema.utils import get_top_level_table -from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState +from dlt.common.storages.load_storage import ( + LoadPackageInfo, + ParsedLoadJobFileName, + TPackageJobState, +) from dlt.common.storages.load_package import ( LoadPackageStateInjectableContext, load_package as current_load_package, @@ -29,7 +34,7 @@ Destination, RunnableLoadJob, LoadJob, - FollowupJob, + FollowupJobRequest, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination, @@ -84,6 +89,7 @@ def __init__( self.pool = NullExecutor() self.load_storage: LoadStorage = self.create_storage(is_storage_owner) self._loaded_packages: List[LoadPackageInfo] = [] + self._job_metrics: Dict[str, LoadJobMetrics] = {} self._run_loop_sleep_duration: float = ( 1.0 # amount of time to sleep between querying completed jobs ) @@ -308,7 +314,7 @@ def create_followup_jobs( where they will be picked up for execution """ - jobs: List[FollowupJob] = [] + jobs: List[FollowupJobRequest] = [] if isinstance(starting_job, HasFollowupJobs): # check for merge jobs only for jobs executing on the destination, the staging destination jobs must be excluded # NOTE: we may move that logic to the interface @@ -392,6 +398,11 @@ def complete_jobs( # create followup jobs self.create_followup_jobs(load_id, state, job, schema) + # preserve metrics + metrics = job.metrics() + if metrics: + self._job_metrics[job.job_id()] = metrics + # try to get exception message from job failed_message = job.exception() self.load_storage.normalized_packages.fail_job( @@ -423,7 +434,7 @@ def complete_jobs( if r_c > 0 and r_c % self.config.raise_on_max_retries == 0: 
pending_exception = LoadClientJobRetry( load_id, - job.job_file_info().job_id(), + job.job_id(), r_c, self.config.raise_on_max_retries, retry_message=retry_message, @@ -431,6 +442,15 @@ def complete_jobs( elif state == "completed": # create followup jobs self.create_followup_jobs(load_id, state, job, schema) + + # preserve metrics + # TODO: metrics should be persisted. this is different vs. all other steps because load step + # may be restarted in the middle of execution + # NOTE: we could use package state but cases with 100k jobs must be tested + metrics = job.metrics() + if metrics: + self._job_metrics[job.job_id()] = metrics + # move to completed folder after followup jobs are created # in case of exception when creating followup job, the loader will retry operation and try to complete again self.load_storage.normalized_packages.complete_job(load_id, job.file_name()) @@ -464,14 +484,18 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) self.load_storage.complete_load_package(load_id, aborted) # collect package info self._loaded_packages.append(self.load_storage.get_load_package_info(load_id)) - self._step_info_complete_load_id(load_id, metrics={"started_at": None, "finished_at": None}) + # TODO: job metrics must be persisted + self._step_info_complete_load_id( + load_id, + metrics={"started_at": None, "finished_at": None, "job_metrics": self._job_metrics}, + ) # delete jobs only now self.load_storage.maybe_remove_completed_jobs(load_id) logger.info( f"All jobs completed, archiving package {load_id} with aborted set to {aborted}" ) - def update_load_package_info(self, load_id: str) -> None: + def init_jobs_counter(self, load_id: str) -> None: # update counter we only care about the jobs that are scheduled to be loaded package_jobs = self.load_storage.normalized_packages.get_load_package_jobs(load_id) total_jobs = reduce(lambda p, c: p + len(c), package_jobs.values(), 0) @@ -492,7 +516,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: dropped_tables = current_load_package()["state"].get("dropped_tables", []) truncated_tables = current_load_package()["state"].get("truncated_tables", []) - self.update_load_package_info(load_id) + self.init_jobs_counter(load_id) # initialize analytical storage ie. 
create dataset required by passed schema with self.get_destination_client(schema) as job_client: @@ -606,7 +630,8 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: ) ): # the same load id may be processed across multiple runs - if not self.current_load_id: + if self.current_load_id is None: + self._job_metrics = {} self._step_info_start_load_id(load_id) self.load_single_package(load_id, schema) diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 9750f89d4b..741c01f249 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -2,7 +2,7 @@ from itertools import groupby from dlt.common import logger -from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TJobState +from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TPackageJobState from dlt.common.schema.utils import ( fill_hints_from_parent_and_clone_table, get_child_tables, @@ -19,7 +19,7 @@ def get_completed_table_chain( schema: Schema, - all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], + all_jobs: Iterable[Tuple[TPackageJobState, ParsedLoadJobFileName]], top_merged_table: TTableSchema, being_completed_job_id: str = None, ) -> List[TTableSchema]: diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 5f84d57d7a..650d10c268 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -3,9 +3,9 @@ from dlt.common import logger from dlt.common.json import json -from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import ArrowToObjectAdapter from dlt.common.json import custom_pua_decode, may_have_pua +from dlt.common.metrics import DataWriterMetrics from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.runtime import signals from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index e80931605c..3df060b141 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -4,10 +4,10 @@ from concurrent.futures import Future, Executor from dlt.common import logger +from dlt.common.metrics import DataWriterMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config -from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import EMPTY_DATA_WRITER_METRICS from dlt.common.runners import TRunMetrics, Runnable, NullExecutor from dlt.common.runtime import signals diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index 10d0a00eb1..b8969f64a3 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -4,12 +4,12 @@ from dlt.common.configuration.container import Container from dlt.common.data_writers import ( DataWriter, - DataWriterMetrics, create_import_spec, resolve_best_writer_spec, get_best_writer_spec, is_native_writer, ) +from dlt.common.metrics import DataWriterMetrics from dlt.common.utils import chunks from dlt.common.schema.typing import TStoredSchema, TTableSchema from dlt.common.storages import ( diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 29770966a6..2f857e5fd5 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -168,7 +168,7 @@ def asdict(self) -> DictStrAny: """A dictionary representation of PipelineTrace that can be loaded with `dlt`""" d = self._asdict() # run step is the same as load 
step
-        d["steps"] = [step.asdict() for step in self.steps]  # if step.step != "run"
+        d["steps"] = [step.asdict() for step in self.steps if step.step != "run"]
         return d
 
     @property
diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md
index f21d6f0686..40f9419bc2 100644
--- a/docs/website/docs/general-usage/pipeline.md
+++ b/docs/website/docs/general-usage/pipeline.md
@@ -85,6 +85,20 @@ You can inspect stored artifacts using the command
 > 💡 You can attach `Pipeline` instance to an existing working folder, without creating a new
 > pipeline with `dlt.attach`.
 
+### Separate working environments with `pipelines_dir`
+You can run several pipelines with the same name but with different configurations, i.e. to target development / staging / production environments.
+Set the `pipelines_dir` argument to store all the working folders in a specific place. For example:
+```py
+import os
+import dlt
+from dlt.common.pipeline import get_dlt_pipelines_dir
+
+dev_pipelines_dir = os.path.join(get_dlt_pipelines_dir(), "dev")
+pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence", pipelines_dir=dev_pipelines_dir)
+```
+stores the pipeline working folder in `~/.dlt/pipelines/dev/`. Mind that you need to pass this path
+to all CLI commands (the `--pipelines-dir` option) to get info/trace for that pipeline.
+
 ## Do experiments with dev mode
 
 If you [create a new pipeline script](../walkthroughs/create-a-pipeline.md) you will be
diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py
index 9b4e61a2f7..03723b7b55 100644
--- a/tests/common/data_writers/test_data_writers.py
+++ b/tests/common/data_writers/test_data_writers.py
@@ -5,6 +5,7 @@
 from dlt.common import pendulum, json
 from dlt.common.data_writers.exceptions import DataWriterNotFound, SpecLookupFailed
+from dlt.common.metrics import DataWriterMetrics
 from dlt.common.typing import AnyFun
 
 from dlt.common.data_writers.escape import (
@@ -25,7 +26,6 @@
     ArrowToTypedJsonlListWriter,
     CsvWriter,
     DataWriter,
-    DataWriterMetrics,
     EMPTY_DATA_WRITER_METRICS,
     ImportFileWriter,
     InsertValuesWriter,
@@ -180,12 +180,13 @@ def test_data_writer_metrics_add() -> None:
     metrics = DataWriterMetrics("file", 10, 100, now, now + 10)
     add_m: DataWriterMetrics = metrics + EMPTY_DATA_WRITER_METRICS  # type: ignore[assignment]
     assert add_m == DataWriterMetrics("", 10, 100, now, now + 10)
-    assert metrics + metrics == DataWriterMetrics("", 20, 200, now, now + 10)
+    # will keep "file" because it is in both
+    assert metrics + metrics == DataWriterMetrics("file", 20, 200, now, now + 10)
     assert sum((metrics, metrics, metrics), EMPTY_DATA_WRITER_METRICS) == DataWriterMetrics(
         "", 30, 300, now, now + 10
     )
     # time range extends when added
-    add_m = metrics + DataWriterMetrics("file", 99, 120, now - 10, now + 20)  # type: ignore[assignment]
+    add_m = metrics + DataWriterMetrics("fileX", 99, 120, now - 10, now + 20)  # type: ignore[assignment]
     assert add_m == DataWriterMetrics("", 109, 220, now - 10, now + 20)
 
 
diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py
index baac3b7af5..a1334ba1da 100644
--- a/tests/common/storages/utils.py
+++ b/tests/common/storages/utils.py
@@ -16,7 +16,7 @@
     LoadStorageConfiguration,
     FilesystemConfiguration,
     LoadPackageInfo,
-    TJobState,
+    TPackageJobState,
     LoadStorage,
 )
 from dlt.common.storages import DataItemStorage, FileStorage
@@ -195,7 +195,7 @@ def assert_package_info(
     storage: LoadStorage,
     load_id: str,
     package_state: str,
-    job_state: TJobState,
+
job_state: TPackageJobState, jobs_count: int = 1, ) -> LoadPackageInfo: package_info = storage.get_load_package_info(load_id) diff --git a/tests/extract/data_writers/test_buffered_writer.py b/tests/extract/data_writers/test_buffered_writer.py index 5cad5a35b9..205e3f83dc 100644 --- a/tests/extract/data_writers/test_buffered_writer.py +++ b/tests/extract/data_writers/test_buffered_writer.py @@ -7,12 +7,12 @@ from dlt.common.data_writers.exceptions import BufferedDataWriterClosed from dlt.common.data_writers.writers import ( DataWriter, - DataWriterMetrics, InsertValuesWriter, JsonlWriter, ALL_WRITERS, ) from dlt.common.destination.capabilities import TLoaderFileFormat, DestinationCapabilitiesContext +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage diff --git a/tests/extract/data_writers/test_data_item_storage.py b/tests/extract/data_writers/test_data_item_storage.py index feda51c229..558eeec79e 100644 --- a/tests/extract/data_writers/test_data_item_storage.py +++ b/tests/extract/data_writers/test_data_item_storage.py @@ -3,8 +3,9 @@ import pytest from dlt.common.configuration.container import Container -from dlt.common.data_writers.writers import DataWriterMetrics, DataWriter +from dlt.common.data_writers.writers import DataWriter from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.utils import new_column from dlt.common.storages.data_item_storage import DataItemStorage diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 759f443546..4b8707e989 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -12,6 +12,7 @@ from dlt.common import json from dlt.common import pendulum +from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id from dlt.common.exceptions import DependencyVersionException @@ -299,6 +300,17 @@ def data_types(): assert len(rows) == 10 assert_all_data_types_row(rows[0], schema=column_schemas) + # make sure remote_uri is in metrics + metrics = info.metrics[info.loads_ids[0]][0] + # TODO: only final copy job has remote_uri. 
not the initial (empty) job for particular files + # we could implement an empty job for delta that generates correct remote_uri + remote_uri = list(metrics["job_metrics"].values())[-1].remote_uri + assert remote_uri.endswith("data_types") + bucket_uri = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_uri): + bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) + assert remote_uri.startswith(bucket_uri) + # another run should append rows to the table info = pipeline.run(data_types()) assert_load_info(info) @@ -567,6 +579,7 @@ def two_part(): assert dt.metadata().partition_columns == [] +@pytest.mark.essential @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -798,6 +811,51 @@ def parent_delta(): get_delta_tables(pipeline, "non_existing_table") +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET,), + ), + ids=lambda x: x.name, +) +def test_parquet_to_delta_upgrade(destination_config: DestinationTestConfiguration): + # change the resource to start creating delta tables + from dlt.common.libs.deltalake import get_delta_tables + + @dlt.resource() + def foo(): + yield [{"foo": 1}, {"foo": 2}] + + pipeline = destination_config.setup_pipeline("fs_pipe") + + info = pipeline.run(foo()) + assert_load_info(info) + delta_tables = get_delta_tables(pipeline) + assert set(delta_tables.keys()) == set() + + # drop the pipeline + pipeline.deactivate() + + # redefine the resource + + @dlt.resource(table_format="delta") # type: ignore + def foo(): + yield [{"foo": 1}, {"foo": 2}] + + pipeline = destination_config.setup_pipeline("fs_pipe") + + info = pipeline.run(foo()) + assert_load_info(info) + delta_tables = get_delta_tables(pipeline) + assert set(delta_tables.keys()) == {"foo"} + + # optimize all delta tables to make sure storage is there + for table in delta_tables.values(): + table.vacuum() + + TEST_LAYOUTS = ( "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", "{schema_name}.{table_name}.{load_id}.{file_id}.{ext}", diff --git a/tests/load/pipeline/test_postgres.py b/tests/load/pipeline/test_postgres.py index a4001b7faa..5cadf701a2 100644 --- a/tests/load/pipeline/test_postgres.py +++ b/tests/load/pipeline/test_postgres.py @@ -42,3 +42,18 @@ def test_postgres_encoded_binary( # print(bytes(data["table"][0]["hash"])) # data in postgres equals unencoded blob assert data["table"][0]["hash"].tobytes() == blob + + +# TODO: uncomment and finalize when we implement encoding for psycopg2 +# @pytest.mark.parametrize( +# "destination_config", +# destinations_configs(default_sql_configs=True, subset=["postgres"]), +# ids=lambda x: x.name, +# ) +# def test_postgres_encoding(destination_config: DestinationTestConfiguration): +# from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient +# pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) +# client: Psycopg2SqlClient = pipeline.sql_client() +# # client.credentials.query["encoding"] = "ru" +# with client: +# print(client.native_connection.encoding) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 7f1427f20f..a760c86526 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -4,6 +4,7 @@ import dlt, os from dlt.common import json, sleep from copy import deepcopy +from dlt.common.storages.configuration import FilesystemConfiguration from 
dlt.common.utils import uniq_id from dlt.common.schema.typing import TDataType @@ -16,6 +17,9 @@ ) from tests.cases import table_update_and_row +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + @dlt.resource( table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url") @@ -46,6 +50,18 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) + # checks if remote_uri is set correctly on copy jobs + metrics = info.metrics[info.loads_ids[0]][0] + for job_metrics in metrics["job_metrics"].values(): + remote_uri = job_metrics.remote_uri + job_ext = os.path.splitext(job_metrics.job_id)[1] + if job_ext not in (".reference", ".sql"): + assert remote_uri.endswith(job_ext) + bucket_uri = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_uri): + bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) + assert remote_uri.startswith(bucket_uri) + package_info = pipeline.get_load_package_info(info.loads_ids[0]) assert package_info.state == "loaded" diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index b55f4ceece..9f0bca6ac5 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -8,7 +8,8 @@ from dlt.common.exceptions import TerminalException, TerminalValueError from dlt.common.storages import FileStorage, PackageStorage, ParsedLoadJobFileName -from dlt.common.storages.load_package import LoadJobInfo, TJobState +from dlt.common.storages.configuration import FilesystemConfiguration +from dlt.common.storages.load_package import LoadJobInfo, TPackageJobState from dlt.common.storages.load_storage import JobFileFormatUnsupported from dlt.common.destination.reference import RunnableLoadJob, TDestination from dlt.common.schema.utils import ( @@ -32,6 +33,7 @@ from dlt.load.utils import get_completed_table_chain, init_client, _extend_tables_with_table_chain from tests.utils import ( + MockPipeline, clean_test_storage, init_test_logging, TEST_DICT_CONFIG_PROVIDER, @@ -78,10 +80,14 @@ def test_spool_job_started() -> None: load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name() ) ) + assert_job_metrics(job, "completed") jobs.append(job) remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 assert len(finalized_jobs) == 2 + assert len(load._job_metrics) == 2 + for job in jobs: + assert load._job_metrics[job.job_id()] == job.metrics() def test_unsupported_writer_type() -> None: @@ -199,7 +205,9 @@ def test_spool_job_failed() -> None: load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name() ) ) + assert_job_metrics(job, "failed") jobs.append(job) + assert len(jobs) == 2 # complete files remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 @@ -215,6 +223,8 @@ def test_spool_job_failed() -> None: load_id, PackageStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception" ) ) + # load should collect two jobs + assert load._job_metrics[job.job_id()] == job.metrics() started_files = load.load_storage.normalized_packages.list_started_jobs(load_id) assert len(started_files) == 0 @@ -226,6 +236,13 @@ def test_spool_job_failed() -> None: assert package_info.state == "loaded" # all jobs failed assert len(package_info.jobs["failed_jobs"]) == 2 + # check metrics + load_info = load.get_step_info(MockPipeline("pipe", 
True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 + for job in jobs: + assert job.job_id() in metrics + assert metrics[job.job_id()].state == "failed" def test_spool_job_failed_terminally_exception_init() -> None: @@ -244,6 +261,11 @@ def test_spool_job_failed_terminally_exception_init() -> None: assert len(package_info.jobs["started_jobs"]) == 0 # load id was never committed complete_load.assert_not_called() + # metrics can be gathered + assert len(load._job_metrics) == 2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 def test_spool_job_failed_transiently_exception_init() -> None: @@ -264,6 +286,10 @@ def test_spool_job_failed_transiently_exception_init() -> None: # load id was never committed complete_load.assert_not_called() + # no metrics were gathered + assert len(load._job_metrics) == 0 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + assert len(load_info.metrics) == 0 def test_spool_job_failed_exception_complete() -> None: @@ -279,6 +305,11 @@ def test_spool_job_failed_exception_complete() -> None: # both failed - we wait till the current loop is completed and then raise assert len(package_info.jobs["failed_jobs"]) == 2 assert len(package_info.jobs["started_jobs"]) == 0 + # metrics can be gathered + assert len(load._job_metrics) == 2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 def test_spool_job_retry_new() -> None: @@ -328,6 +359,7 @@ def test_spool_job_retry_started() -> None: remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 assert len(finalized_jobs) == 0 + assert len(load._job_metrics) == 0 # clear retry flag dummy_impl.JOBS = {} files = load.load_storage.normalized_packages.list_new_jobs(load_id) @@ -407,6 +439,8 @@ def test_failing_followup_jobs() -> None: assert len(dummy_impl.JOBS) == 2 assert len(dummy_impl.RETRIED_JOBS) == 0 assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 + # no metrics were collected + assert len(load._job_metrics) == 0 # now we can retry the same load, it will restart the two jobs and successfully create the followup jobs load.initial_client_config.fail_followup_job_creation = False # type: ignore @@ -436,6 +470,8 @@ def test_failing_table_chain_followup_jobs() -> None: assert len(dummy_impl.JOBS) == 2 assert len(dummy_impl.RETRIED_JOBS) == 0 assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 + # no metrics were collected + assert len(load._job_metrics) == 0 # now we can retry the same load, it will restart the two jobs and successfully create the table chain followup jobs load.initial_client_config.fail_table_chain_followup_job_creation = False # type: ignore @@ -662,11 +698,11 @@ def test_get_completed_table_chain_cases() -> None: # child completed, parent not event_user = schema.get_table("event_user") event_user_entities = schema.get_table("event_user__parse_data__entities") - event_user_job: Tuple[TJobState, ParsedLoadJobFileName] = ( + event_user_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = ( "started_jobs", ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl"), ) - event_user_entities_job: Tuple[TJobState, ParsedLoadJobFileName] = ( + event_user_entities_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = ( 
"completed_jobs", ParsedLoadJobFileName( "event_user__parse_data__entities", "event_user__parse_data__entities_id", 0, "jsonl" @@ -857,6 +893,33 @@ def test_dummy_staging_filesystem() -> None: assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 +def test_load_multiple_packages() -> None: + load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0)) + load.config.pool_type = "none" + load_id_1, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) + sleep(0.1) + load_id_2, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) + run_metrics = load.run(None) + assert run_metrics.pending_items == 1 + # assert load._current_load_id is None + metrics_id_1 = load._job_metrics + assert len(metrics_id_1) == 2 + assert load._step_info_metrics(load_id_1)[0]["job_metrics"] == metrics_id_1 + run_metrics = load.run(None) + assert run_metrics.pending_items == 0 + metrics_id_2 = load._job_metrics + assert len(metrics_id_2) == 2 + assert load._step_info_metrics(load_id_2)[0]["job_metrics"] == metrics_id_2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + assert load_id_1 in load_info.metrics + assert load_id_2 in load_info.metrics + assert load_info.metrics[load_id_1][0]["job_metrics"] == metrics_id_1 + assert load_info.metrics[load_id_2][0]["job_metrics"] == metrics_id_2 + # execute empty run + load.run(None) + assert len(load_info.metrics) == 2 + + def test_terminal_exceptions() -> None: try: raise TerminalValueError("a") @@ -866,6 +929,15 @@ def test_terminal_exceptions() -> None: raise AssertionError() +def assert_job_metrics(job: RunnableLoadJob, expected_state: str) -> None: + metrics = job.metrics() + assert metrics.state == expected_state + assert metrics.started_at <= metrics.finished_at + assert metrics.job_id == job.job_id() + assert metrics.table_name == job._parsed_file_name.table_name + assert metrics.file_path == job._file_path + + def assert_complete_job( load: Load, should_delete_completed: bool = False, load_id: str = None, jobs_per_case: int = 1 ) -> None: @@ -910,6 +982,32 @@ def assert_complete_job( assert load.load_storage.loaded_packages.storage.has_folder(completed_path) # complete load on client was called complete_load.assert_called_once_with(load_id) + # assert if all jobs in final state have metrics + metrics = load.get_step_info(MockPipeline("pipe", True)).metrics[load_id][0] # type: ignore[abstract] + package_info = load.load_storage.loaded_packages.get_load_package_jobs(load_id) + for state, jobs in package_info.items(): + for job in jobs: + job_metrics = metrics["job_metrics"].get(job.job_id()) + if state in ("failed_jobs", "completed_jobs"): + assert job_metrics is not None + assert ( + metrics["job_metrics"][job.job_id()].state == "failed" + if state == "failed_jobs" + else "completed" + ) + remote_uri = job_metrics.remote_uri + if load.initial_client_config.create_followup_jobs: # type: ignore + assert remote_uri.endswith(job.file_name()) + elif load.is_staging_destination_job(job.file_name()): + # staging destination should contain reference to remote filesystem + assert ( + FilesystemConfiguration.make_file_uri(REMOTE_FILESYSTEM) + in remote_uri + ) + else: + assert remote_uri is None + else: + assert job_metrics is None def run_all(load: Load) -> None: @@ -941,9 +1039,9 @@ def setup_loader( staging = None if filesystem_staging: # do not accept jsonl to not conflict with filesystem destination - client_config = client_config or DummyClientConfiguration( - loader_file_format="reference", 
completed_prob=1 - ) + # client_config = client_config or DummyClientConfiguration( + # loader_file_format="reference", completed_prob=1 + # ) staging_system_config = FilesystemDestinationClientConfiguration()._bind_dataset_name( dataset_name="dummy" ) diff --git a/tests/load/utils.py b/tests/load/utils.py index d649343c63..086109de8b 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -45,6 +45,7 @@ from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration from dlt.common.schema.utils import new_table, normalize_table_identifiers from dlt.common.storages import ParsedLoadJobFileName, LoadStorage, PackageStorage +from dlt.common.storages.load_package import create_load_id from dlt.common.typing import StrAny from dlt.common.utils import uniq_id @@ -712,7 +713,7 @@ def expect_load_file( query = query.encode("utf-8") # type: ignore[assignment] file_storage.save(file_name, query) table = client.prepare_load_table(table_name) - load_id = uniq_id() + load_id = create_load_id() job = client.create_load_job(table, file_storage.make_full_path(file_name), load_id) if isinstance(job, RunnableLoadJob): @@ -873,7 +874,7 @@ def prepare_load_package( Create a load package with explicitely provided files job_per_case multiplies the amount of load jobs, for big packages use small files """ - load_id = uniq_id() + load_id = create_load_id() load_storage.new_packages.create_package(load_id) for case in cases: path = f"./tests/load/cases/loading/{case}" diff --git a/tests/pipeline/cases/contracts/trace.schema.yaml b/tests/pipeline/cases/contracts/trace.schema.yaml new file mode 100644 index 0000000000..89831977c0 --- /dev/null +++ b/tests/pipeline/cases/contracts/trace.schema.yaml @@ -0,0 +1,772 @@ +version: 4 +version_hash: JE62zVwqT2T/qHTi2Qdnn2d1A/JzCzyGtDwc+qUmbTs= +engine_version: 9 +name: trace +tables: + _dlt_version: + columns: + version: + data_type: bigint + nullable: false + engine_version: + data_type: bigint + nullable: false + inserted_at: + data_type: timestamp + nullable: false + schema_name: + data_type: text + nullable: false + version_hash: + data_type: text + nullable: false + schema: + data_type: text + nullable: false + write_disposition: skip + description: Created by DLT. Tracks schema updates + _dlt_loads: + columns: + load_id: + data_type: text + nullable: false + schema_name: + data_type: text + nullable: true + status: + data_type: bigint + nullable: false + inserted_at: + data_type: timestamp + nullable: false + schema_version_hash: + data_type: text + nullable: true + write_disposition: skip + description: Created by DLT. 
Tracks completed loads + trace: + columns: + transaction_id: + data_type: text + nullable: true + pipeline_name: + data_type: text + nullable: true + execution_context__ci_run: + data_type: bool + nullable: true + execution_context__python: + data_type: text + nullable: true + execution_context__cpu: + data_type: bigint + nullable: true + execution_context__os__name: + data_type: text + nullable: true + execution_context__os__version: + data_type: text + nullable: true + execution_context__library__name: + data_type: text + nullable: true + execution_context__library__version: + data_type: text + nullable: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + engine_version: + data_type: bigint + nullable: true + _dlt_load_id: + data_type: text + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + write_disposition: append + trace__execution_context__exec_info: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace + trace__steps: + columns: + span_id: + data_type: text + nullable: true + step: + data_type: text + nullable: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + step_info__pipeline__pipeline_name: + data_type: text + nullable: true + step_info__first_run: + data_type: bool + nullable: true + step_info__started_at: + data_type: timestamp + nullable: true + step_info__finished_at: + data_type: timestamp + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + load_info__destination_type: + data_type: text + nullable: true + load_info__destination_displayable_credentials: + data_type: text + nullable: true + load_info__destination_name: + data_type: text + nullable: true + load_info__staging_type: + data_type: text + nullable: true + load_info__staging_name: + data_type: text + nullable: true + load_info__staging_displayable_credentials: + data_type: text + nullable: true + load_info__destination_fingerprint: + data_type: text + nullable: true + step_exception: + data_type: text + nullable: true + parent: trace + trace__steps__extract_info__job_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + job_id: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__table_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + 
data_type: bigint + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__resource_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + resource_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__dag: + columns: + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + parent_name: + data_type: text + nullable: true + resource_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__hints: + columns: + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + resource_name: + data_type: text + nullable: true + columns: + data_type: text + nullable: true + write_disposition: + data_type: text + nullable: true + schema_contract: + data_type: text + nullable: true + table_format: + data_type: text + nullable: true + file_format: + data_type: text + nullable: true + original_columns: + data_type: text + nullable: true + primary_key: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__step_info__loads_ids: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__steps + trace__steps__step_info__load_packages: + columns: + load_id: + data_type: text + nullable: true + package_path: + data_type: text + nullable: true + state: + data_type: text + nullable: true + schema_hash: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + completed_at: + data_type: timestamp + nullable: true + parent: trace__steps + trace__steps__step_info__load_packages__jobs: + columns: + state: + data_type: text + nullable: true + file_path: + data_type: text + nullable: true + file_size: + data_type: bigint + nullable: true + created_at: + data_type: timestamp + nullable: true + elapsed: + data_type: double + nullable: true + table_name: + data_type: text + nullable: true + file_id: + data_type: text + nullable: true + retry_count: + data_type: bigint + nullable: true + file_format: + 
data_type: text + nullable: true + job_id: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps__step_info__load_packages + trace__steps__normalize_info__job_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + job_id: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__normalize_info__table_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__load_info__job_metrics: + columns: + load_id: + data_type: text + nullable: true + job_id: + data_type: text + nullable: true + file_path: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + state: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + remote_uri: + data_type: text + nullable: true + parent: trace__steps + trace__steps__step_info__load_packages__tables: + columns: + write_disposition: + data_type: text + nullable: true + schema_contract: + data_type: text + nullable: true + table_format: + data_type: text + nullable: true + file_format: + data_type: text + nullable: true + name: + data_type: text + nullable: true + resource: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + load_id: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: + data_type: text + nullable: true + x_normalizer__seen_data: + data_type: bool + nullable: true + parent: trace__steps__step_info__load_packages + trace__steps__step_info__load_packages__tables__columns: + columns: + name: + data_type: text + nullable: true + data_type: + data_type: text + nullable: true + nullable: + data_type: bool + nullable: true + primary_key: + data_type: bool + nullable: true + table_name: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + load_id: + data_type: text + 
nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + unique: + data_type: bool + nullable: true + foreign_key: + data_type: bool + nullable: true + parent: trace__steps__step_info__load_packages__tables + trace__resolved_config_values: + columns: + key: + data_type: text + nullable: true + is_secret_hint: + data_type: bool + nullable: true + provider_name: + data_type: text + nullable: true + config_type_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace + trace__resolved_config_values__sections: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__resolved_config_values + trace__steps__exception_traces: + columns: + message: + data_type: text + nullable: true + exception_type: + data_type: text + nullable: true + is_terminal: + data_type: bool + nullable: true + docstring: + data_type: text + nullable: true + load_id: + data_type: text + nullable: true + pipeline_name: + data_type: text + nullable: true + exception_attrs: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__exception_traces__stack_trace: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__steps__exception_traces +settings: + detections: + - iso_timestamp + default_hints: + not_null: + - _dlt_id + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + - _dlt_load_id + foreign_key: + - _dlt_parent_id + root_key: + - _dlt_root_id + unique: + - _dlt_id +normalizers: + names: snake_case + json: + module: dlt.common.normalizers.json.relational +previous_hashes: +- 9Ysjq/W0xpxkI/vBiYm8Qbr2nDP3JMt6KvGKUS/FCyI= +- NYeAxJ2r+T+dKFnXFhBEPzBP6SO+ORdhOfgQRo/XqBU= +- RV9jvZSD5dM+ZGjEL3HqokLvtf22K4zMNc3zWRahEw4= diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 0ab1f61d72..b6a7feffc1 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -39,7 +39,7 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations import filesystem, redshift, dummy +from dlt.destinations import filesystem, redshift, dummy, duckdb from dlt.destinations.impl.filesystem.filesystem import INIT_FILE_NAME from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractStorage @@ -2637,6 +2637,57 @@ def comments(user_id: str): assert pipeline.last_trace.last_normalize_info.row_counts["user_comments"] == 3 +def test_exceed_job_file_name_length() -> None: + # use very long table name both for parent and for a child + data = { + "id": 1, + "child use very long table name both for parent and for a child use very long table name 
both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": [ + 1, + 2, + 3, + ], + "col use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": ( + "data" + ), + } + + table_name = ( + "parent use very long table name both for parent and for a child use very long table name" + " both for parent and for a child use very long table name both for parent and for a child" + " use very long table name both for parent and for a child use very long table name both" + " for parent and for a child use very long table name both for parent and for a child " + ) + + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination="duckdb", + ) + # path too long + with pytest.raises(PipelineStepFailed) as os_err: + pipeline.run([data], table_name=table_name) + assert isinstance(os_err.value.__cause__, OSError) + + # fit into 255 + 1 + suffix_len = len(".b61d3af76c.0.insert-values") + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination=duckdb( + max_identifier_length=255 - suffix_len + 1, + ), + ) + # path too long + with pytest.raises(PipelineStepFailed): + pipeline.run([data], table_name=table_name) + + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination=duckdb( + max_identifier_length=255 - suffix_len, + ), + ) + pipeline.run([data], table_name=table_name) + + def assert_imported_file( pipeline: Pipeline, table_name: str, diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 3239e01bab..69c0f01b8b 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pytest import requests_mock +import yaml import dlt @@ -19,6 +20,8 @@ from dlt.common.typing import DictStrAny, StrStr, DictStrStr, TSecretValue from dlt.common.utils import digest128 +from dlt.destinations import dummy, filesystem + from dlt.pipeline.exceptions import PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.trace import ( @@ -31,7 +34,8 @@ from dlt.extract.extract import describe_extract_data from dlt.extract.pipe import Pipe -from tests.utils import start_test_telemetry +from tests.pipeline.utils import PIPELINE_TEST_CASES_PATH +from tests.utils import TEST_STORAGE_ROOT, start_test_telemetry from tests.common.configuration.utils import toml_providers, environment @@ -122,7 +126,7 @@ def data(): resolved = _find_resolved_value(trace.resolved_config_values, "credentials", ["databricks"]) assert resolved.is_secret_hint is True assert resolved.value == databricks_creds - assert_trace_printable(trace) + assert_trace_serializable(trace) # activate pipeline because other was running in assert trace p.activate() @@ -153,7 +157,7 @@ def data(): assert isinstance(step.step_info, ExtractInfo) assert len(step.exception_traces) > 0 assert step.step_info.extract_data_info == [{"name": "async_exception", "data_type": "source"}] - assert_trace_printable(trace) + assert_trace_serializable(trace) extract_info = step.step_info # only new (unprocessed) package is present, all other metrics are empty, state won't be extracted 
@@ -174,7 +178,7 @@ def data(): step = trace.steps[2] assert step.step == "normalize" assert step.step_info is norm_info - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) assert p.last_trace.last_normalize_info.row_counts == {"_dlt_pipeline_state": 1, "data": 3} @@ -216,7 +220,7 @@ def data(): assert resolved.is_secret_hint is False assert resolved.value == "1.0" assert resolved.config_type_name == "DummyClientConfiguration" - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_load_info, LoadInfo) p.activate() @@ -234,12 +238,157 @@ def data(): assert step.step == "load" assert step.step_info is load_info # same load info assert trace.steps[0].step_info is not extract_info - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_load_info, LoadInfo) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) assert isinstance(p.last_trace.last_extract_info, ExtractInfo) +def test_trace_schema() -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" + os.environ["RESTORE_FROM_DESTINATION"] = "False" + + # mock runtime env + os.environ["CIRCLECI"] = "1" + os.environ["AWS_LAMBDA_FUNCTION_NAME"] = "lambda" + + @dlt.source(section="many_hints") + def many_hints( + api_type=dlt.config.value, + credentials: str = dlt.secrets.value, + secret_value: TSecretValue = TSecretValue("123"), # noqa: B008 + ): + # TODO: create table / column schema from typed dicts, not explicitly + @dlt.resource( + write_disposition="replace", + primary_key="id", + table_format="delta", + file_format="jsonl", + schema_contract="evolve", + columns=[ + { + "name": "multi", + "data_type": "decimal", + "nullable": True, + "cluster": True, + "description": "unknown", + "merge_key": True, + "precision": 9, + "scale": 3, + "sort": True, + "variant": True, + "partition": True, + } + ], + ) + def data(): + yield [{"id": 1, "multi": "1.2"}, {"id": 2}, {"id": 3}] + + return data() + + @dlt.source + def github(): + @dlt.resource + def get_shuffled_events(): + for _ in range(1): + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", + "r", + encoding="utf-8", + ) as f: + issues = json.load(f) + yield issues + + return get_shuffled_events() + + @dlt.source + def async_exception(max_range=1): + async def get_val(v): + await asyncio.sleep(0.1) + if v % 3 == 0: + raise ValueError(v) + return v + + @dlt.resource + def data(): + yield from [get_val(v) for v in range(1, max_range)] + + return data() + + # create pipeline with staging to get remote_uri in load step job_metrics + dummy_dest = dummy(completed_prob=1.0) + pipeline = dlt.pipeline( + pipeline_name="test_trace_schema", + destination=dummy_dest, + staging=filesystem(os.path.abspath(os.path.join(TEST_STORAGE_ROOT, "_remote_filesystem"))), + dataset_name="various", + ) + + # mock config + os.environ["API_TYPE"] = "REST" + os.environ["SOURCES__MANY_HINTS__CREDENTIALS"] = "CREDS" + + info = pipeline.run([many_hints(), github()]) + info.raise_on_failed_jobs() + + trace = pipeline.last_trace + pipeline._schema_storage.storage.save("trace.json", json.dumps(trace, pretty=True)) + + schema = dlt.Schema("trace") + trace_pipeline = dlt.pipeline( + pipeline_name="test_trace_schema_traces", destination=dummy(completed_prob=1.0) + ) + info = trace_pipeline.run([trace], table_name="trace", schema=schema) + info.raise_on_failed_jobs() + + # add exception trace + with 
pytest.raises(PipelineStepFailed): + pipeline.extract(async_exception(max_range=4)) + + trace_exception = pipeline.last_trace + pipeline._schema_storage.storage.save( + "trace_exception.json", json.dumps(trace_exception, pretty=True) + ) + + info = trace_pipeline.run([trace_exception], table_name="trace") + info.raise_on_failed_jobs() + inferred_trace_contract = trace_pipeline.schemas["trace"] + inferred_contract_str = inferred_trace_contract.to_pretty_yaml(remove_processing_hints=True) + + # NOTE: this saves actual inferred contract (schema) to schema storage, move it to test cases if you update + # trace shapes + # TODO: create a proper schema for dlt trace and tables/columns + pipeline._schema_storage.storage.save("trace.schema.yaml", inferred_contract_str) + # print(pipeline._schema_storage.storage.storage_path) + + # load the schema and use it as contract + with open(f"{PIPELINE_TEST_CASES_PATH}/contracts/trace.schema.yaml", encoding="utf-8") as f: + imported_schema = yaml.safe_load(f) + trace_contract = Schema.from_dict(imported_schema, remove_processing_hints=True) + # compare pretty forms of the schemas, they must be identical + # NOTE: if this fails you can comment this out and use contract run below to find first offending difference + # assert trace_contract.to_pretty_yaml() == inferred_contract_str + + # use trace contract to load data again + contract_trace_pipeline = dlt.pipeline( + pipeline_name="test_trace_schema_traces_contract", destination=dummy(completed_prob=1.0) + ) + info = contract_trace_pipeline.run( + [trace_exception, trace], + table_name="trace", + schema=trace_contract, + schema_contract="freeze", + ) + + # assert inferred_trace_contract.version_hash == trace_contract.version_hash + + # print(trace_pipeline.schemas["trace"].to_pretty_yaml()) + # print(pipeline._schema_storage.storage.storage_path) + + +# def test_trace_schema_contract() -> None: + + def test_save_load_trace() -> None: os.environ["COMPLETED_PROB"] = "1.0" info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") @@ -255,7 +404,7 @@ def test_save_load_trace() -> None: assert resolved.is_secret_hint is False assert resolved.value == "1.0" assert resolved.config_type_name == "DummyClientConfiguration" - assert_trace_printable(trace) + assert_trace_serializable(trace) # check row counts assert pipeline.last_trace.last_normalize_info.row_counts == { "_dlt_pipeline_state": 1, @@ -296,7 +445,7 @@ def data(): assert run_step.step == "run" assert run_step.step_exception is not None assert step.step_exception == run_step.step_exception - assert_trace_printable(trace) + assert_trace_serializable(trace) assert pipeline.last_trace.last_normalize_info is None @@ -306,7 +455,7 @@ def test_save_load_empty_trace() -> None: pipeline = dlt.pipeline() pipeline.run([], table_name="data", destination="dummy") trace = pipeline.last_trace - assert_trace_printable(trace) + assert_trace_serializable(trace) assert len(trace.steps) == 4 pipeline.activate() @@ -529,7 +678,7 @@ def _mock_sentry_before_send(event: DictStrAny, _unused_hint: Any = None) -> Dic return event -def assert_trace_printable(trace: PipelineTrace) -> None: +def assert_trace_serializable(trace: PipelineTrace) -> None: str(trace) trace.asstr(0) trace.asstr(1) diff --git a/tests/pipeline/test_platform_connection.py b/tests/pipeline/test_platform_connection.py index fa5b143ff5..aa46019382 100644 --- a/tests/pipeline/test_platform_connection.py +++ b/tests/pipeline/test_platform_connection.py @@ -65,7 +65,8 @@ def data(): # basic check of 
trace result assert trace_result, "no trace" assert trace_result["pipeline_name"] == "platform_test_pipeline" - assert len(trace_result["steps"]) == 4 + # just extract, normalize and load steps. run step is not serialized to trace (it was just a copy of load) + assert len(trace_result["steps"]) == 3 assert trace_result["execution_context"]["library"]["name"] == "dlt" # basic check of state result diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index dfdb9c8e40..d3d87f0e0b 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -98,6 +98,9 @@ def users_materialize_table_schema(): def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: """Asserts that expected number of packages was loaded and there are no failed jobs""" + # make sure we can serialize + info.asstr(verbosity=2) + info.asdict() assert len(info.loads_ids) == expected_load_packages # all packages loaded assert all(p.completed_at is not None for p in info.load_packages) is True diff --git a/tests/utils.py b/tests/utils.py index 976a623c0b..1b81881470 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -189,8 +189,9 @@ def wipe_pipeline(preserve_environ) -> Iterator[None]: yield if container[PipelineContext].is_active(): # take existing pipeline - p = dlt.pipeline() - p._wipe_working_folder() + # NOTE: no more needed. test storage is wiped fully when test starts + # p = dlt.pipeline() + # p._wipe_working_folder() # deactivate context container[PipelineContext].deactivate() From 2788235572de105ff01aaf5c1ebcbe4ea40b249b Mon Sep 17 00:00:00 2001 From: Akela Drissner-Schmid <32450038+akelad@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:32:22 +0200 Subject: [PATCH 26/34] Update snowflake.md --- docs/website/docs/dlt-ecosystem/destinations/snowflake.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 181d024a2f..d08578c5a2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -136,7 +136,12 @@ If you set the [`replace` strategy](../../general-usage/full-loading.md) to `sta recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. ## Data loading -The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). +The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. 
Stage files are kept by default, unless specified otherwise via the `keep_staged_files` parameter: + +```toml +[destination.snowflake] +keep_staged_files = false +``` ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default From 935dc09efd067549fbcb87b906ccb560d945bd26 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 00:20:06 +0200 Subject: [PATCH 27/34] Feat/1711 create with not exists dlt tables (#1740) * uses normalized column names when linking tables in relational * destination cap if create table if not exits supported * generates IF NOT EXISTS for dlt tables * adds logging for terminal and retry exception in run_managed of load job * passes schema update to be collected in trace in filesystem * fixes job log exception message --- dlt/common/destination/capabilities.py | 1 + dlt/common/destination/reference.py | 4 ++++ dlt/common/normalizers/json/relational.py | 12 +++++------- dlt/destinations/impl/athena/athena.py | 2 +- .../impl/filesystem/filesystem.py | 5 ++++- dlt/destinations/impl/mssql/factory.py | 1 + dlt/destinations/impl/synapse/factory.py | 4 ++++ dlt/destinations/job_client_impl.py | 19 ++++++++++++++----- .../parent_child_relationship.py | 9 ++++----- .../test_parent_child_relationship.py | 10 ++++------ tests/load/mssql/test_mssql_table_builder.py | 12 ++++++++++-- .../postgres/test_postgres_table_builder.py | 11 ++++++++++- tests/pipeline/test_pipeline_trace.py | 2 +- 13 files changed, 63 insertions(+), 29 deletions(-) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index be71cb50e9..52e7d74833 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -76,6 +76,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): # use naming convention in the schema naming_convention: TNamingConventionReferenceArg = None alter_add_multi_column: bool = True + supports_create_table_if_not_exists: bool = True supports_truncate_command: bool = True schema_supports_numeric_precision: bool = True timestamp_precision: int = 6 diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index b6c7041592..744cbbd1f5 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -383,9 +383,13 @@ def run_managed( except (DestinationTerminalException, TerminalValueError) as e: self._state = "failed" self._exception = e + logger.exception(f"Terminal exception in job {self.job_id()} in file {self._file_path}") except (DestinationTransientException, Exception) as e: self._state = "retry" self._exception = e + logger.exception( + f"Transient exception in job {self.job_id()} in file {self._file_path}" + ) finally: self._finished_at = pendulum.now() # sanity check diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 8e296445eb..1dbcec4bff 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -184,11 +184,10 @@ def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> # and all child tables must be lists return digest128(f"{parent_row_id}_{child_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - @staticmethod - def _link_row(row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: + def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id - row["_dlt_parent_id"] = parent_row_id - row["_dlt_list_idx"] = list_idx + 
row[self.c_dlt_parent_id] = parent_row_id + row[self.c_dlt_list_idx] = list_idx return row @@ -227,7 +226,7 @@ def _add_row_id( if row_id_type == "row_hash": row_id = DataItemNormalizer._get_child_row_hash(parent_row_id, table, pos) # link to parent table - DataItemNormalizer._link_row(flattened_row, parent_row_id, pos) + self._link_row(flattened_row, parent_row_id, pos) flattened_row[self.c_dlt_id] = row_id return row_id @@ -260,7 +259,6 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - v: DictStrAny = None table = self.schema.naming.shorten_fragments(*parent_path, *ident_path) for idx, v in enumerate(seq): @@ -285,7 +283,7 @@ def _normalize_list( child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) wrap_v = wrap_in_dict(v) wrap_v[self.c_dlt_id] = child_row_hash - e = DataItemNormalizer._link_row(wrap_v, parent_row_id, idx) + e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) yield (table, self.schema.naming.shorten_fragments(*parent_path)), e diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 1429b28240..0c90d171a3 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -452,7 +452,7 @@ def _get_table_update_sql( partition_clause = self._iceberg_partition_clause( cast(Optional[Dict[str, str]], table.get(PARTITION_HINT)) ) - sql.append(f"""CREATE TABLE {qualified_table_name} + sql.append(f"""{self._make_create_table(qualified_table_name, table)} ({columns}) {partition_clause} LOCATION '{location.rstrip('/')}' diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 2e09871ba9..5445fd2ae9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -303,6 +303,7 @@ def update_stored_schema( only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, ) -> TSchemaTables: + applied_update = super().update_stored_schema(only_tables, expected_update) # create destination dirs for all tables table_names = only_tables or self.schema.tables.keys() dirs_to_create = self.get_table_dirs(table_names) @@ -316,7 +317,9 @@ def update_stored_schema( if not self.config.as_staging: self._store_current_schema() - return expected_update + # we assume that expected_update == applied_update so table schemas in dest were not + # externally changed + return applied_update def get_table_dir(self, table_name: str, remote: bool = False) -> str: # dlt tables do not respect layout (for now) diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index 85c94c21b7..f1a8bb136a 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -37,6 +37,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False caps.supports_ddl_transactions = True + caps.supports_create_table_if_not_exists = False # IF NOT EXISTS not supported caps.max_rows_per_insert = 1000 caps.timestamp_precision = 7 caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index bb117e48d2..d5a0281bec 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -63,6 +63,10 @@ def 
_raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_transactions = True caps.supports_ddl_transactions = False + caps.supports_create_table_if_not_exists = ( + False # IF NOT EXISTS on CREATE TABLE not supported + ) + # Synapse throws "Some part of your SQL statement is nested too deeply. Rewrite the query or break it up into smaller queries." # if number of records exceeds a certain number. Which exact number that is seems not deterministic: # in tests, I've seen a query with 12230 records run succesfully on one run, but fail on a subsequent run, while the query remained exactly the same. diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 92132dd751..1d6403a2c8 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -522,22 +522,31 @@ def _make_add_column_sql( """Make one or more ADD COLUMN sql clauses to be joined in ALTER TABLE statement(s)""" return [f"ADD COLUMN {self._get_column_def_sql(c, table_format)}" for c in new_columns] + def _make_create_table(self, qualified_name: str, table: TTableSchema) -> str: + not_exists_clause = " " + if ( + table["name"] in self.schema.dlt_table_names() + and self.capabilities.supports_create_table_if_not_exists + ): + not_exists_clause = " IF NOT EXISTS " + return f"CREATE TABLE{not_exists_clause}{qualified_name}" + def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: # build sql - canonical_name = self.sql_client.make_qualified_table_name(table_name) + qualified_name = self.sql_client.make_qualified_table_name(table_name) table = self.prepare_load_table(table_name) table_format = table.get("table_format") sql_result: List[str] = [] if not generate_alter: # build CREATE - sql = f"CREATE TABLE {canonical_name} (\n" + sql = self._make_create_table(qualified_name, table) + " (\n" sql += ",\n".join([self._get_column_def_sql(c, table_format) for c in new_columns]) sql += ")" sql_result.append(sql) else: - sql_base = f"ALTER TABLE {canonical_name}\n" + sql_base = f"ALTER TABLE {qualified_name}\n" add_column_statements = self._make_add_column_sql(new_columns, table_format) if self.capabilities.alter_add_multi_column: column_sql = ",\n" @@ -561,13 +570,13 @@ def _get_table_update_sql( if hint == "not_null": logger.warning( f"Column(s) {hint_columns} with NOT NULL are being added to existing" - f" table {canonical_name}. If there's data in the table the operation" + f" table {qualified_name}. If there's data in the table the operation" " will fail." ) else: logger.warning( f"Column(s) {hint_columns} with hint {hint} are being added to existing" - f" table {canonical_name}. Several hint types may not be added to" + f" table {qualified_name}. Several hint types may not be added to" " existing tables." 
) return sql_result diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py index 39c9f577cc..6de00ffb28 100644 --- a/docs/examples/parent_child_relationship/parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -22,6 +22,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -44,6 +45,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -51,6 +53,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + if __name__ == "__main__": # Create and configure the dlt pipeline pipeline = dlt.pipeline( @@ -60,10 +63,6 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py index f671040823..95d1bade97 100644 --- a/docs/examples/parent_child_relationship/test_parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -1,4 +1,3 @@ - import pytest from tests.utils import skipifgithubfork @@ -29,6 +28,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -51,6 +51,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -58,6 +59,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + @skipifgithubfork @pytest.mark.forked def test_parent_child_relationship(): @@ -69,10 +71,6 @@ def test_parent_child_relationship(): ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index d6cf3ec3e8..3f3896de6c 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -55,8 +55,8 @@ def test_alter_table(client: MsSqlJobClient) -> None: # existing table has no columns sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)[0] sqlfluff.parse(sql, dialect="tsql") - canonical_name = 
client.sql_client.make_qualified_table_name("event_test_table") - assert sql.count(f"ALTER TABLE {canonical_name}\nADD") == 1 + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert sql.count(f"ALTER TABLE {qualified_name}\nADD") == 1 assert "event_test_table" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" float NOT NULL' in sql @@ -75,3 +75,11 @@ def test_alter_table(client: MsSqlJobClient) -> None: assert '"col6_precision" decimal(6,2) NOT NULL' in sql assert '"col7_precision" varbinary(19)' in sql assert '"col11_precision" time(3) NOT NULL' in sql + + +def test_create_dlt_table(client: MsSqlJobClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="tsql") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE {qualified_name}" in sql diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 86bd67db9a..28fd4eec9d 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -57,7 +57,8 @@ def test_create_table(client: PostgresClient) -> None: # non existing table sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] sqlfluff.parse(sql, dialect="postgres") - assert "event_test_table" in sql + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert f"CREATE TABLE {qualified_name}" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" double precision NOT NULL' in sql assert '"col3" boolean NOT NULL' in sql @@ -173,3 +174,11 @@ def test_create_table_case_sensitive(cs_client: PostgresClient) -> None: # every line starts with "Col" for line in sql.split("\n")[1:]: assert line.startswith('"Col') + + +def test_create_dlt_table(client: PostgresClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="postgres") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE IF NOT EXISTS {qualified_name}" in sql diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 69c0f01b8b..4e52d2aa29 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -551,7 +551,7 @@ def test_trace_telemetry() -> None: for item in SENTRY_SENT_ITEMS: # print(item) print(item["logentry"]["message"]) - assert len(SENTRY_SENT_ITEMS) == 2 + assert len(SENTRY_SENT_ITEMS) == 4 # trace with exception @dlt.resource From 08e5e7afca0f328da107d6e8eda7ca3c01366d33 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:59:43 +0400 Subject: [PATCH 28/34] Enable schema evolution for `merge` write disposition with `delta` table format (#1742) * black format * increase minimum deltalake version dependency * enable schema evolution for delta table merge * extract delta table merge logic into separate function * remove big decimal exclusion due to upstream bugfix * evolve delta table schema in empty source case * refactor DeltaLoadFilesystemJob * uses right table path format in delta lake load job * allows to pass schema name when getting delta tables and computing table counts * cleansup usage of remote paths and uris in filesystem load jobs * removes tempfile from file_storage 
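
For illustration, a minimal sketch of the evolve-then-upsert flow this patch describes, assuming the `deltalake` Python package and using only calls that appear in the diff below (`Field.from_pyarrow`, `alter.add_columns`, `DeltaTable.merge`); the function and parameter names (`upsert_with_schema_evolution`, `table_uri`, `data`, `key`) are simplified placeholders, not part of the dlt API:

```python
import deltalake
import pyarrow as pa
from deltalake import DeltaTable


def upsert_with_schema_evolution(table_uri: str, data: pa.Table, key: str) -> None:
    """Sketch: add columns missing from the Delta table, then upsert on `key`."""
    dt = DeltaTable(table_uri)
    # schema evolution: DeltaTable.merge does not evolve the table schema automatically,
    # so columns present in the incoming data but absent from the table are added first
    new_fields = [
        deltalake.Field.from_pyarrow(field)
        for field in data.schema
        if field not in dt.to_pyarrow_dataset().schema
    ]
    if new_fields:
        dt.alter.add_columns(new_fields)
    # upsert: update rows matching on the key, insert the rest
    (
        dt.merge(
            source=data,
            predicate=f"target.{key} = source.{key}",
            source_alias="source",
            target_alias="target",
        )
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .execute()
    )
```

The patch applies the same idea in `merge_delta_table` and `_evolve_delta_table_schema`, keyed on the primary key columns, or on the unique column for child tables.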
--------- Co-authored-by: Marcin Rudolf --- dlt/common/libs/deltalake.py | 77 ++++++-- dlt/common/storages/file_storage.py | 17 +- dlt/destinations/fs_client.py | 3 + .../impl/filesystem/filesystem.py | 166 +++++++++--------- poetry.lock | 162 ++++++++--------- pyproject.toml | 2 +- tests/libs/test_deltalake.py | 14 +- .../load/pipeline/test_filesystem_pipeline.py | 133 ++++++++++++-- tests/pipeline/utils.py | 17 +- 9 files changed, 358 insertions(+), 233 deletions(-) diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index d98795d07c..d4cb46c600 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -5,13 +5,15 @@ from dlt.common import logger from dlt.common.libs.pyarrow import pyarrow as pa from dlt.common.libs.pyarrow import cast_arrow_schema_types -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableSchema +from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: + import deltalake from deltalake import write_deltalake, DeltaTable from deltalake.writer import try_get_deltatable except ModuleNotFoundError: @@ -74,7 +76,7 @@ def write_delta_table( partition_by: Optional[Union[List[str], str]] = None, storage_options: Optional[Dict[str, str]] = None, ) -> None: - """Writes in-memory Arrow table to on-disk Delta table. + """Writes in-memory Arrow data to on-disk Delta table. Thin wrapper around `deltalake.write_deltalake`. """ @@ -93,31 +95,73 @@ def write_delta_table( ) -def get_delta_tables(pipeline: Pipeline, *tables: str) -> Dict[str, DeltaTable]: - """Returns Delta tables in `pipeline.default_schema` as `deltalake.DeltaTable` objects. +def merge_delta_table( + table: DeltaTable, + data: Union[pa.Table, pa.RecordBatchReader], + schema: TTableSchema, +) -> None: + """Merges in-memory Arrow data into on-disk Delta table.""" + + strategy = schema["x-merge-strategy"] # type: ignore[typeddict-item] + if strategy == "upsert": + # `DeltaTable.merge` does not support automatic schema evolution + # https://github.com/delta-io/delta-rs/issues/2282 + _evolve_delta_table_schema(table, data.schema) + + if "parent" in schema: + unique_column = get_first_column_name_with_prop(schema, "unique") + predicate = f"target.{unique_column} = source.{unique_column}" + else: + primary_keys = get_columns_names_with_prop(schema, "primary_key") + predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) + + qry = ( + table.merge( + source=ensure_delta_compatible_arrow_data(data), + predicate=predicate, + source_alias="source", + target_alias="target", + ) + .when_matched_update_all() + .when_not_matched_insert_all() + ) + + qry.execute() + else: + ValueError(f'Merge strategy "{strategy}" not supported.') + + +def get_delta_tables( + pipeline: Pipeline, *tables: str, schema_name: str = None +) -> Dict[str, DeltaTable]: + """Returns Delta tables in `pipeline.default_schema (default)` as `deltalake.DeltaTable` objects. Returned object is a dictionary with table names as keys and `DeltaTable` objects as values. Optionally filters dictionary by table names specified as `*tables*`. - Raises ValueError if table name specified as `*tables` is not found. 
+ Raises ValueError if table name specified as `*tables` is not found. You may try to switch to other + schemas via `schema_name` argument. """ from dlt.common.schema.utils import get_table_format - with pipeline.destination_client() as client: + with pipeline.destination_client(schema_name=schema_name) as client: assert isinstance( client, FilesystemClient ), "The `get_delta_tables` function requires a `filesystem` destination." schema_delta_tables = [ t["name"] - for t in pipeline.default_schema.tables.values() - if get_table_format(pipeline.default_schema.tables, t["name"]) == "delta" + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "delta" ] if len(tables) > 0: invalid_tables = set(tables) - set(schema_delta_tables) if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" raise ValueError( - "Schema does not contain Delta tables with these names: " - f"{', '.join(invalid_tables)}." + f"Schema {client.schema.name} does not contain Delta tables with these names: " + f"{', '.join(invalid_tables)}.{available_schemas}" ) schema_delta_tables = [t for t in schema_delta_tables if t in tables] table_dirs = client.get_table_dirs(schema_delta_tables, remote=True) @@ -145,3 +189,16 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str + ". dlt will use the values in `deltalake_storage_options`." ) return {**creds, **extra_options} + + +def _evolve_delta_table_schema(delta_table: DeltaTable, arrow_schema: pa.Schema) -> None: + """Evolves `delta_table` schema if different from `arrow_schema`. + + Adds column(s) to `delta_table` present in `arrow_schema` but not in `delta_table`. 
+ """ + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(arrow_schema) + if field not in delta_table.to_pyarrow_dataset().schema + ] + delta_table.alter.add_columns(new_fields) diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 7d14b8f7f7..f26cc060a3 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -3,7 +3,6 @@ import re import stat import errno -import tempfile import shutil import pathvalidate from typing import IO, Any, Optional, List, cast @@ -29,10 +28,8 @@ def save(self, relative_path: str, data: Any) -> str: @staticmethod def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str: mode = "w" + file_type - with tempfile.NamedTemporaryFile( - dir=storage_path, mode=mode, delete=False, encoding=encoding_for_mode(mode) - ) as f: - tmp_path = f.name + tmp_path = os.path.join(storage_path, uniq_id(8)) + with open(tmp_path, mode=mode, encoding=encoding_for_mode(mode)) as f: f.write(data) try: dest_path = os.path.join(storage_path, relative_path) @@ -116,11 +113,11 @@ def open_file(self, relative_path: str, mode: str = "r") -> IO[Any]: return FileStorage.open_zipsafe_ro(self.make_full_path(relative_path), mode) return open(self.make_full_path(relative_path), mode, encoding=encoding_for_mode(mode)) - def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: - mode = mode + file_type or self.file_type - return tempfile.NamedTemporaryFile( - dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) - ) + # def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: + # mode = mode + file_type or self.file_type + # return tempfile.NamedTemporaryFile( + # dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) + # ) def has_file(self, relative_path: str) -> bool: return os.path.isfile(self.make_full_path(relative_path)) diff --git a/dlt/destinations/fs_client.py b/dlt/destinations/fs_client.py index 3233446594..14e77b6b4e 100644 --- a/dlt/destinations/fs_client.py +++ b/dlt/destinations/fs_client.py @@ -3,9 +3,12 @@ from abc import ABC, abstractmethod from fsspec import AbstractFileSystem +from dlt.common.schema import Schema + class FSClientBase(ABC): fs_client: AbstractFileSystem + schema: Schema @property @abstractmethod diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 5445fd2ae9..05261ccb1b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -3,7 +3,7 @@ import base64 from types import TracebackType -from typing import ClassVar, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast +from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast from fsspec import AbstractFileSystem from contextlib import contextmanager @@ -13,7 +13,7 @@ from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema -from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop +from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.storages import FileStorage, fsspec_from_config from dlt.common.storages.load_package import ( LoadJobInfo, @@ -56,36 +56,36 @@ def __init__( self._job_client: 
FilesystemClient = None def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - - self.destination_file_name = path_utils.create_path( - self._job_client.config.layout, - self._file_name, - self._job_client.schema.name, - self._load_id, - current_datetime=self._job_client.config.current_datetime, - load_package_timestamp=dlt.current.load_package()["state"]["created_at"], - extra_placeholders=self._job_client.config.extra_placeholders, - ) + self.__is_local_filesystem = self._job_client.config.protocol == "file" # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. # It `auto_mkdir` is disabled by default in fsspec so we made some # trade offs between different options and decided on this. # remote_path = f"{client.config.protocol}://{posixpath.join(dataset_path, destination_file_name)}" remote_path = self.make_remote_path() - if self.is_local_filesystem: - self._job_client.fs_client.makedirs(self.pathlib.dirname(remote_path), exist_ok=True) + if self.__is_local_filesystem: + # use os.path for local file name + self._job_client.fs_client.makedirs(os.path.dirname(remote_path), exist_ok=True) self._job_client.fs_client.put_file(self._file_path, remote_path) def make_remote_path(self) -> str: """Returns path on the remote filesystem to which copy the file, without scheme. For local filesystem a native path is used""" + destination_file_name = path_utils.create_path( + self._job_client.config.layout, + self._file_name, + self._job_client.schema.name, + self._load_id, + current_datetime=self._job_client.config.current_datetime, + load_package_timestamp=dlt.current.load_package()["state"]["created_at"], + extra_placeholders=self._job_client.config.extra_placeholders, + ) + # pick local filesystem pathlib or posix for buckets + pathlib = os.path if self.__is_local_filesystem else posixpath # path.join does not normalize separators and available # normalization functions are very invasive and may string the trailing separator - return self.pathlib.join( # type: ignore[no-any-return] + return pathlib.join( # type: ignore[no-any-return] self._job_client.dataset_path, - path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), + path_utils.normalize_path_sep(pathlib, destination_file_name), ) def make_remote_uri(self) -> str: @@ -98,89 +98,81 @@ def metrics(self) -> Optional[LoadJobMetrics]: class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: - super().__init__( - file_path=file_path, - ) - - def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard - # to write a handler with those two line below only in FilesystemLoadJob - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - self.destination_file_name = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + super().__init__(file_path=file_path) + # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa - from dlt.common.libs.deltalake import ( - DeltaTable, - write_delta_table, - ensure_delta_compatible_arrow_schema, - _deltalake_storage_options, - try_get_deltatable, - ) - # create Arrow dataset from 
Parquet files - file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) - arrow_ds = pa.dataset.dataset(file_paths) + self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) + self.arrow_ds = pa.dataset.dataset(self.file_paths) - # create Delta table object + def make_remote_path(self) -> str: + # remote path is table dir - delta will create its file structure inside it + return self._job_client.get_table_dir(self.load_table_name) - storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) + def run(self) -> None: + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") - # get partition columns - part_cols = get_columns_names_with_prop(self._load_table, "partition") + from dlt.common.libs.deltalake import write_delta_table, merge_delta_table # explicitly check if there is data # (https://github.com/delta-io/delta-rs/issues/2686) - if arrow_ds.head(1).num_rows == 0: - if dt is None: - # create new empty Delta table with schema from Arrow table - DeltaTable.create( - table_uri=self.destination_file_name, - schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), - mode="overwrite", - partition_by=part_cols, - storage_options=storage_options, - ) + if self.arrow_ds.head(1).num_rows == 0: + self._create_or_evolve_delta_table() return - arrow_rbr = arrow_ds.scanner().to_reader() # RecordBatchReader - - if self._load_table["write_disposition"] == "merge" and dt is not None: - assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] - - if self._load_table["x-merge-strategy"] == "upsert": # type: ignore[typeddict-item] - if "parent" in self._load_table: - unique_column = get_first_column_name_with_prop(self._load_table, "unique") - predicate = f"target.{unique_column} = source.{unique_column}" - else: - primary_keys = get_columns_names_with_prop(self._load_table, "primary_key") - predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) - - qry = ( - dt.merge( - source=arrow_rbr, - predicate=predicate, - source_alias="source", - target_alias="target", - ) - .when_matched_update_all() - .when_not_matched_insert_all() + with self.arrow_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader + if self._load_table["write_disposition"] == "merge" and self._delta_table is not None: + assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] + merge_delta_table( + table=self._delta_table, + data=arrow_rbr, + schema=self._load_table, + ) + else: + write_delta_table( + table_or_uri=( + self.make_remote_uri() if self._delta_table is None else self._delta_table + ), + data=arrow_rbr, + write_disposition=self._load_table["write_disposition"], + partition_by=self._partition_columns, + storage_options=self._storage_options, ) - qry.execute() + @property + def _storage_options(self) -> Dict[str, str]: + from dlt.common.libs.deltalake import _deltalake_storage_options + + return _deltalake_storage_options(self._job_client.config) - else: - write_delta_table( - table_or_uri=self.destination_file_name if dt is None else dt, - data=arrow_rbr, - write_disposition=self._load_table["write_disposition"], - partition_by=part_cols, - storage_options=storage_options, + @property + def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # 
noqa: F821 + from dlt.common.libs.deltalake import try_get_deltatable + + return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + def _create_or_evolve_delta_table(self) -> None: + from dlt.common.libs.deltalake import ( + DeltaTable, + ensure_delta_compatible_arrow_schema, + _evolve_delta_table_schema, + ) + + if self._delta_table is None: + DeltaTable.create( + table_uri=self.make_remote_uri(), + schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), + mode="overwrite", + partition_by=self._partition_columns, + storage_options=self._storage_options, ) + else: + _evolve_delta_table_schema(self._delta_table, self.arrow_ds.schema) class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): diff --git a/poetry.lock b/poetry.lock index d54a73a2ef..230b354b97 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -2102,27 +2102,27 @@ typing-extensions = ">=3.10.0" [[package]] name = "databricks-sql-connector" -version = "3.1.2" +version = "3.3.0" description = "Databricks SQL Connector for Python" optional = true python-versions = "<4.0.0,>=3.8.0" files = [ - {file = "databricks_sql_connector-3.1.2-py3-none-any.whl", hash = "sha256:5292bc25b4d8d58d301079b55086331764f067e24862c9365698b2eeddedb737"}, - {file = "databricks_sql_connector-3.1.2.tar.gz", hash = "sha256:da0df114e0824d49ccfea36c4679c95689fe359191b056ad516446a058307c37"}, + {file = "databricks_sql_connector-3.3.0-py3-none-any.whl", hash = "sha256:55ee5a4a11291bf91a235ac76e41b419ddd66a9a321065a8bfaf119acbb26d6b"}, + {file = "databricks_sql_connector-3.3.0.tar.gz", hash = "sha256:19e82965da4c86574adfe9f788c17b4494d98eb8075ba4fd4306573d2edbf194"}, ] [package.dependencies] lz4 = ">=4.0.2,<5.0.0" numpy = [ - {version = ">=1.16.6", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, - {version = ">=1.23.4", markers = "python_version >= \"3.11\""}, + {version = ">=1.16.6,<2.0.0", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, + {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.11\""}, ] oauthlib = ">=3.1.0,<4.0.0" openpyxl = ">=3.0.10,<4.0.0" pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""} -pyarrow = ">=14.0.1,<15.0.0" +pyarrow = ">=14.0.1,<17" requests = ">=2.18.1,<3.0.0" -thrift = ">=0.16.0,<0.17.0" +thrift = ">=0.16.0,<0.21.0" urllib3 = ">=1.26" [package.extras] @@ -2377,25 +2377,24 @@ files = [ [[package]] name = "deltalake" -version = "0.17.4" +version = "0.19.1" description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" optional = true python-versions = ">=3.8" files = [ - {file = "deltalake-0.17.4-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3f048bd4cdd3500fbb0d1b34046966ca4b7cefd1e9df71460b881ee8ad7f844a"}, - {file = "deltalake-0.17.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:b539265d8293794872e1dc3b2daad50abe05ab425e961824b3ac1155bb294604"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55e6be5f5ab8d5d34d2ea58d86e93eec2da5d2476e3c15e9520239457618bca4"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:94dde6c2d0a07e9ce47be367d016541d3a499839350852205819353441e1a9c1"}, - {file = "deltalake-0.17.4-cp38-abi3-win_amd64.whl", hash = "sha256:f51f499d50dad88bdc18c5ed7c2319114759f3220f83aa2d32166c19accee4ce"}, - {file = "deltalake-0.17.4.tar.gz", hash = "sha256:c3c10577afc46d4b10ed16246d814a8c40b3663099066681eeba89f908373814"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ddaaaa9c85a17791c3997cf320ac11dc1725d16cf4b6f0ff1b130853e7b56cd0"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0184d5a3f0d4f4f1fb992c3bdc8736329b78b6a4faf1a278109ec35d9945c1d"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec9d117fcf6c198f3d554be2f3a6291ca3838530650db236741ff48d4d47abb4"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:447ef721319ed15f7b5f6da507efd5fed0e6172e5ae55ac044d5b8fc9b812e47"}, + {file = "deltalake-0.19.1-cp38-abi3-win_amd64.whl", hash = "sha256:b15bc343a9f8f3de80fbedcebd5d9472b539eb0f538a71739c7fcf699089127e"}, + {file = "deltalake-0.19.1.tar.gz", hash = "sha256:5e09fabb221fb81e989c283c16278eaffb6e85706d98364abcda5c0c6ca73598"}, ] [package.dependencies] -pyarrow = ">=8" -pyarrow-hotfix = "*" +pyarrow = ">=16" [package.extras] -devel = ["mypy (>=1.8.0,<1.9.0)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (>=0.3.0,<0.4.0)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] +devel = ["azure-storage-blob (==12.20.0)", "mypy (==1.10.1)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (==0.5.2)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] pandas = ["pandas"] pyspark = ["delta-spark", "numpy (==1.22.2)", "pyspark"] @@ -4567,17 +4566,17 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", [[package]] name = "lancedb" -version = "0.9.0" +version = "0.13.0b1" description = "lancedb" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "lancedb-0.9.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:b1ca08797c72c93ae512aa1078f1891756da157d910fbae8e194fac3528fc1ac"}, - {file = "lancedb-0.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:15129791f03c2c04b95f914ced2c1556b43d73a24710207b9af77b6e4008bdeb"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f093d89447a2039b820d2540a0b64df3024e4549b6808ebd26b44fbe0345cc6"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:a8c1f6777e217d2277451038866d280fa5fb38bd161795e51703b043c26dd345"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:78dd5800a1148f89d33b7e98d1c8b1c42dee146f03580abc1ca83cb05273ff7f"}, - {file = "lancedb-0.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:ba5bdc727d3bc131f17414f42372acde5817073feeb553793a3d20003caa1658"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:687b9a08be55e6fa9520255b1b06dcd2e6ba6c64c947410821e9a3a52b2f48ec"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ac00684f7e90ffc1b386298670e2c4ddaea8c0b61b6eb1b51dbd4e74feb87a86"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbe8fc15bfeec89b6b2a4a42b4b919b6d3e138cf8684af35f77f361d73fe90cd"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_24_aarch64.whl", hash = 
"sha256:231e1f00d724c468922f7951d902622d4ccb21c2db2a148b845beaebee5d35b3"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:fecdd71f137e52193bfb5843610f32fe025a60a1edf5f80530704de879706c6b"}, + {file = "lancedb-0.13.0b1-cp38-abi3-win_amd64.whl", hash = "sha256:7852d9c04a4402407af06bbbf78bf339a169f1df2bf5c70da586ca733ec40a68"}, ] [package.dependencies] @@ -4587,7 +4586,7 @@ deprecation = "*" overrides = ">=0.7" packaging = "*" pydantic = ">=1.10" -pylance = "0.13.0" +pylance = "0.16.1" ratelimiter = ">=1.0,<2.0" requests = ">=2.31.0" retry = ">=0.9.2" @@ -4598,8 +4597,8 @@ azure = ["adlfs (>=2024.2.0)"] clip = ["open-clip", "pillow", "torch"] dev = ["pre-commit", "ruff"] docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] -embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] -tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "ibm-watsonx-ai (>=1.1.2)", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19,<=1.3.0)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] [[package]] name = "lazy-object-proxy" @@ -6660,63 +6659,52 @@ files = [ [[package]] name = "pyarrow" -version = "14.0.2" +version = "16.1.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, - {file = "pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, - {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, - 
{file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, - {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, - {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, - {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, - {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, - {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, + {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, + {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, + {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, + 
{file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, + {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, + {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, + {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, ] [package.dependencies] numpy = ">=1.16.6" -[[package]] -name = "pyarrow-hotfix" -version = "0.6" -description = "" -optional = true -python-versions = ">=3.5" -files = [ - {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, - {file = "pyarrow_hotfix-0.6.tar.gz", hash = "sha256:79d3e030f7ff890d408a100ac16d6f00b14d44a502d7897cd9fc3e3a534e9945"}, -] - [[package]] name = "pyasn1" version = "0.5.0" @@ -6993,22 +6981,22 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pylance" -version = "0.13.0" +version = "0.16.1" description = "python wrapper for Lance columnar format" optional = false python-versions = ">=3.9" files = [ - {file = "pylance-0.13.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2f3d6f9eec1f59f45dccb01075ba79868b8d37c8371d6210bcf6418217a0dd8b"}, - {file = "pylance-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f4861ab466c94b0f9a4b4e6de6e1dfa02f40e7242d8db87447bc7bb7d89606ac"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3cb92547e145f5bfb0ea7d6f483953913b9bdd44c45bea84fc95a18da9f5853"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d1ddd7700924bc6b6b0774ea63d2aa23f9210a86cd6d6af0cdfa987df776d50d"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:c51d4b6e59cf4dc97c11a35b299f11e80dbdf392e2d8dc498573c26474a3c19e"}, - {file = "pylance-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:4018ba016f1445874960a4ba2ad5c80cb380f3116683282ee8beabd38fa8989d"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:7092303ae21bc162edd98e20fc39785fa1ec6b67f04132977ac0fd63110ba16f"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7c2ebdf89928c68f053ab9e369a5477da0a2ba70d47c00075dc10a37039d9e90"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4525c2fd8095830b753a3efb7285f358b016836086683fe977f9f1de8e6866c"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:645f0ab338bc4bd42bf3321bbb4053261979117aefd8477c2192ba624de27778"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3a7464d60aca51e89196a79c638bcbff0bddb77158946e2ea6b5fcbc6cfc63e1"}, + {file = "pylance-0.16.1-cp39-abi3-win_amd64.whl", hash = "sha256:d12c628dfbd49efde15a5512247065341f3efb29989dd08fb5a7023f013471ee"}, ] [package.dependencies] -numpy = ">=1.22" -pyarrow = ">=12,<15.0.1" +numpy = ">=1.22,<2" +pyarrow = ">=12" [package.extras] benchmarks = ["pytest-benchmark"] @@ -9696,4 +9684,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a64fdd2845d27c9abc344809be68cba08f46641aabdc07416c37c802450fe4f3" +content-hash = "2b8d00f91f33a380b2399989dcac0d1d106d0bd2cd8865c5b7e27a19885753b5" diff --git a/pyproject.toml b/pyproject.toml index f33bbbefcf..74161f5ccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'", allow-prereleases = true } -deltalake = { version = ">=0.17.4", optional = true } +deltalake = { version = ">=0.19.0", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] diff --git a/tests/libs/test_deltalake.py b/tests/libs/test_deltalake.py index 3e2d7cc3f6..dc5586eb32 100644 --- a/tests/libs/test_deltalake.py +++ b/tests/libs/test_deltalake.py @@ -95,21 +95,9 @@ def arrow_data( # type: ignore[return] client = cast(FilesystemClient, client) storage_options = _deltalake_storage_options(client.config) - with pytest.raises(Exception): - # bug in `delta-rs` causes error when writing big decimal values - # https://github.com/delta-io/delta-rs/issues/2510 - # if this test fails, the bug has been fixed and we should remove this - # note from the docs: - write_delta_table( - remote_dir + "/corrupt_delta_table", - arrow_table_all_data_types("arrow-table", include_decimal_default_precision=True)[0], - write_disposition="append", - storage_options=storage_options, - ) - arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, num_rows=2, )[0] diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 4b8707e989..d88eba7c06 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from 
dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.exceptions import DependencyVersionException +from dlt.common.schema.typing import TWriteDisposition from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders @@ -580,6 +580,103 @@ def two_part(): @pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "write_disposition", + ( + "append", + "replace", + pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), + ), +) +def test_delta_table_schema_evolution( + destination_config: DestinationTestConfiguration, + write_disposition: TWriteDisposition, +) -> None: + """Tests schema evolution (adding new columns) for `delta` table format.""" + from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + from dlt.common.libs.pyarrow import pyarrow + + @dlt.resource( + write_disposition=write_disposition, + primary_key="pk", + table_format="delta", + ) + def delta_table(data): + yield data + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + # create Arrow table with one column, one row + pk_field = pyarrow.field("pk", pyarrow.int64(), nullable=False) + schema = pyarrow.schema([pk_field]) + arrow_table = pyarrow.Table.from_pydict({"pk": [1]}, schema=schema) + assert arrow_table.shape == (1, 1) + + # initial load + info = pipeline.run(delta_table(arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + assert actual.equals(expected) + + # create Arrow table with many columns, two rows + arrow_table = arrow_table_all_data_types( + "arrow-table", + include_decimal_default_precision=True, + include_decimal_arrow_max_precision=True, + include_not_normalized_name=False, + include_null=False, + num_rows=2, + )[0] + arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) + + # second load — this should evolve the schema (i.e. 
add the new columns) + info = pipeline.run(delta_table(arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + actual = dt.to_pyarrow_table() + expected = ensure_delta_compatible_arrow_data(arrow_table) + if write_disposition == "append": + # just check shape and schema for `append`, because table comparison is + # more involved than with the other dispositions + assert actual.num_rows == 3 + assert actual.schema.equals(expected.schema) + else: + assert actual.sort_by("pk").equals(expected.sort_by("pk")) + + # create empty Arrow table with additional column + arrow_table = arrow_table.append_column( + pyarrow.field("another_new_column", pyarrow.string()), + [["foo", "foo"]], + ) + empty_arrow_table = arrow_table.schema.empty_table() + + # load 3 — this should evolve the schema without changing data + info = pipeline.run(delta_table(empty_arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + actual = dt.to_pyarrow_table() + expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema + assert actual.schema.equals(expected_schema) + expected_num_rows = 3 if write_disposition == "append" else 2 + assert actual.num_rows == expected_num_rows + # new column should have NULLs only + assert ( + actual.column("another_new_column").combine_chunks().to_pylist() + == [None] * expected_num_rows + ) + + @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -607,7 +704,7 @@ def delta_table(data): # create empty Arrow table with schema arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, include_not_normalized_name=False, include_null=False, @@ -643,22 +740,6 @@ def delta_table(data): ensure_delta_compatible_arrow_data(empty_arrow_table).schema ) - # run 3: empty Arrow table with different schema - # this should not alter the Delta table - empty_arrow_table_2 = pa.schema( - [pa.field("foo", pa.int64()), pa.field("bar", pa.string())] - ).empty_table() - - info = pipeline.run(delta_table(empty_arrow_table_2)) - assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 # still 1, no new commit was done - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) # shape did not change - assert dt_arrow_table.schema.equals( # schema did not change - ensure_delta_compatible_arrow_data(empty_arrow_table).schema - ) - # test `dlt.mark.materialize_table_schema()` users_materialize_table_schema.apply_hints(table_format="delta") info = pipeline.run(users_materialize_table_schema()) @@ -810,6 +891,22 @@ def parent_delta(): with pytest.raises(ValueError): get_delta_tables(pipeline, "non_existing_table") + # test unknown schema + with pytest.raises(FileNotFoundError): + get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + + # load to a new schema and under new name + aux_schema = dlt.Schema("aux_2") + # NOTE: you cannot have a file with name + info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + # also state in separate package + assert_load_info(info, expected_load_packages=2) + delta_tables = get_delta_tables(pipeline, schema_name="aux_2") + assert "aux_delta__child" in delta_tables.keys() + get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + with pytest.raises(ValueError): + get_delta_tables(pipeline, 
"aux_delta") + @pytest.mark.parametrize( "destination_config", diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index d3d87f0e0b..dfb5f3f82d 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -177,24 +177,27 @@ def _load_file(client: FSClientBase, filepath) -> List[Dict[str, Any]]: # -def _load_tables_to_dicts_fs(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: +def _load_tables_to_dicts_fs( + p: dlt.Pipeline, *table_names: str, schema_name: str = None +) -> Dict[str, List[Dict[str, Any]]]: """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" - client = p._fs_client() + client = p._fs_client(schema_name=schema_name) + assert isinstance(client, FilesystemClient) + result: Dict[str, Any] = {} delta_table_names = [ table_name for table_name in table_names - if get_table_format(p.default_schema.tables, table_name) == "delta" + if get_table_format(client.schema.tables, table_name) == "delta" ] if len(delta_table_names) > 0: from dlt.common.libs.deltalake import get_delta_tables - delta_tables = get_delta_tables(p, *table_names) + delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) for table_name in table_names: - if table_name in p.default_schema.data_table_names() and table_name in delta_table_names: - assert isinstance(client, FilesystemClient) + if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() else: @@ -244,7 +247,7 @@ def _sort_list_of_dicts(list_: List[Dict[str, Any]], sortkey: str) -> List[Dict[ return sorted(list_, key=lambda d: d[sortkey]) if _is_filesystem(p): - result = _load_tables_to_dicts_fs(p, *table_names) + result = _load_tables_to_dicts_fs(p, *table_names, schema_name=schema_name) else: result = _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name) From e337cca079ab21742339e097eb381635eafc5de5 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 18:32:07 +0200 Subject: [PATCH 29/34] provides detail exception messages when cursor stored value cannot be coerced to data in incremental (#1748) --- .../impl/filesystem/filesystem.py | 1 + dlt/extract/incremental/exceptions.py | 26 ++++++++ dlt/extract/incremental/transform.py | 63 ++++++++++++++++--- tests/extract/test_incremental.py | 21 ++++++- 4 files changed, 101 insertions(+), 10 deletions(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 05261ccb1b..62263a10b9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -89,6 +89,7 @@ def make_remote_path(self) -> str: ) def make_remote_uri(self) -> str: + """Returns path on a remote filesystem as a full uri including scheme.""" return self._job_client.make_remote_uri(self.make_remote_path()) def metrics(self) -> Optional[LoadJobMetrics]: diff --git a/dlt/extract/incremental/exceptions.py b/dlt/extract/incremental/exceptions.py index e318a028dc..a5f94c2974 100644 --- a/dlt/extract/incremental/exceptions.py +++ b/dlt/extract/incremental/exceptions.py @@ -1,3 +1,5 @@ +from typing import Any + from dlt.extract.exceptions import PipeException from dlt.common.typing import TDataItem @@ -13,6 +15,30 @@ def __init__(self, pipe_name: str, json_path: str, item: TDataItem, msg: str = N super().__init__(pipe_name, msg) +class IncrementalCursorInvalidCoercion(PipeException): + 
def __init__( + self, + pipe_name: str, + cursor_path: str, + cursor_value: TDataItem, + cursor_value_type: str, + item: TDataItem, + item_type: Any, + details: str, + ) -> None: + self.cursor_path = cursor_path + self.cursor_value = cursor_value + self.cursor_value_type = cursor_value_type + self.item = item + msg = ( + f"Could not coerce {cursor_value_type} with value {cursor_value} and type" + f" {type(cursor_value)} to actual data item {item} at path {cursor_path} with type" + f" {item_type}: {details}. You need to use different data type for" + f" {cursor_value_type} or cast your data ie. by using `add_map` on this resource." + ) + super().__init__(pipe_name, msg) + + class IncrementalPrimaryKeyMissing(PipeException): def __init__(self, pipe_name: str, primary_key_column: str, item: TDataItem) -> None: self.primary_key_column = primary_key_column diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 947e21f7b8..0ac9fdf520 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -8,6 +8,7 @@ from dlt.common.typing import TDataItem from dlt.common.jsonpath import find_values, JSONPathFields, compile_path from dlt.extract.incremental.exceptions import ( + IncrementalCursorInvalidCoercion, IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) @@ -158,14 +159,36 @@ def __call__( # Check whether end_value has been reached # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value - if self.end_value is not None and ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): - return None, False, True - + if self.end_value is not None: + try: + if ( + last_value_func((row_value, self.end_value)) != self.end_value + or last_value_func((row_value,)) == self.end_value + ): + return None, False, True + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + self.cursor_path, + self.end_value, + "end_value", + row_value, + type(row_value).__name__, + str(ex), + ) from ex check_values = (row_value,) + ((last_value,) if last_value is not None else ()) - new_value = last_value_func(check_values) + try: + new_value = last_value_func(check_values) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + self.cursor_path, + last_value, + "start_value/initial_value", + row_value, + type(row_value).__name__, + str(ex), + ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: # use func to compute row_value into last_value compatible @@ -294,14 +317,36 @@ def __call__( # If end_value is provided, filter to include table rows that are "less" than end_value if self.end_value is not None: - end_value_scalar = to_arrow_scalar(self.end_value, cursor_data_type) + try: + end_value_scalar = to_arrow_scalar(self.end_value, cursor_data_type) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + cursor_path, + self.end_value, + "end_value", + "", + cursor_data_type, + str(ex), + ) from ex tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: - start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + try: + start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + cursor_path, + self.start_value, + "start_value/initial_value", + "", + cursor_data_type, + str(ex), + ) from ex # Remove rows lower or equal than the last start value keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index f4082a7d86..c401552fb2 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -30,6 +30,7 @@ from dlt.sources.helpers.transform import take_first from dlt.extract.incremental import IncrementalResourceWrapper, Incremental from dlt.extract.incremental.exceptions import ( + IncrementalCursorInvalidCoercion, IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) @@ -1303,7 +1304,7 @@ def some_data( ) # will cause invalid comparison if item_type == "object": - with pytest.raises(InvalidStepFunctionArguments): + with pytest.raises(IncrementalCursorInvalidCoercion): list(resource) else: data = data_item_to_list(item_type, list(resource)) @@ -2065,3 +2066,21 @@ def test_source(): incremental_steps = test_source_incremental().table_name._pipe._steps assert isinstance(incremental_steps[-2], ValidateItem) assert isinstance(incremental_steps[-1], IncrementalResourceWrapper) + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_cursor_date_coercion(item_type: TestDataItemFormat) -> None: + today = datetime.today().date() + + @dlt.resource() + def updated_is_int(updated_at=dlt.sources.incremental("updated_at", initial_value=today)): + data = [{"updated_at": d} for d in [1, 2, 3]] + yield data_to_item_format(item_type, data) + + pip_1_name = "test_pydantic_columns_validator_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pip_1_name, destination="duckdb") + + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(updated_is_int()) + assert isinstance(pip_ex.value.__cause__, IncrementalCursorInvalidCoercion) + assert pip_ex.value.__cause__.cursor_path == "updated_at" From 98ca505fd06b8146a4355c6355174abe8b45ef66 Mon Sep 17 00:00:00 2001 From: VioletM Date: Wed, 28 Aug 2024 06:28:50 -0400 Subject: [PATCH 30/34] Expose staging tables truncation to config (#1717) * Expose staging tables truncation to config * Fix comments, add tests * Fix tests * Move implementation from mixing, add tests * Fix docs grammar --- dlt/common/destination/reference.py | 8 ++- dlt/destinations/impl/athena/athena.py | 2 +- dlt/destinations/impl/bigquery/bigquery.py | 3 + .../impl/clickhouse/clickhouse.py | 3 + .../impl/databricks/databricks.py | 3 + dlt/destinations/impl/dremio/dremio.py | 3 + dlt/destinations/impl/dummy/configuration.py | 2 + dlt/destinations/impl/dummy/dummy.py | 3 + dlt/destinations/impl/redshift/redshift.py | 3 + dlt/destinations/impl/snowflake/snowflake.py | 3 + dlt/destinations/impl/synapse/synapse.py | 3 + dlt/load/utils.py | 7 +- docs/website/docs/dlt-ecosystem/staging.md | 72 ++++++++++++------- tests/load/pipeline/test_stage_loading.py | 57 ++++++++++++++- tests/load/test_dummy_client.py | 17 +++++ 15 files changed, 152 insertions(+), 37 
deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 744cbbd1f5..0944b03bea 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -269,6 +269,8 @@ class DestinationClientDwhWithStagingConfiguration(DestinationClientDwhConfigura staging_config: Optional[DestinationClientStagingConfiguration] = None """configuration of the staging, if present, injected at runtime""" + truncate_tables_on_staging_destination_before_load: bool = True + """If dlt should truncate the tables on staging destination before loading data.""" TLoadJobState = Literal["ready", "running", "failed", "retry", "completed"] @@ -578,7 +580,7 @@ def with_staging_dataset(self) -> ContextManager["JobClientBase"]: return self # type: ignore -class SupportsStagingDestination: +class SupportsStagingDestination(ABC): """Adds capability to support a staging destination for the load""" def should_load_data_to_staging_dataset_on_staging_destination( @@ -586,9 +588,9 @@ def should_load_data_to_staging_dataset_on_staging_destination( ) -> bool: return False + @abstractmethod def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: - # the default is to truncate the tables on the staging destination... - return True + pass # TODO: type Destination properly diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 0c90d171a3..b28309b930 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -531,7 +531,7 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable if table["write_disposition"] == "replace" and not self._is_iceberg_table( self.prepare_load_table(table["name"]) ): - return True + return self.config.truncate_tables_on_staging_destination_before_load return False def should_load_data_to_staging_dataset_on_staging_destination( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 8291415434..11326cf3ed 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -503,6 +503,9 @@ def _should_autodetect_schema(self, table_name: str) -> bool: self.schema._schema_tables, table_name, AUTODETECT_SCHEMA_HINT, allow_none=True ) or (self.config.autodetect_schema and table_name not in self.schema.dlt_table_names()) + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + def _streaming_load( items: List[Dict[Any, Any]], table: Dict[str, Any], job_client: BigQueryClient diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 5f17a5a18c..282fbaf338 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -372,3 +372,6 @@ def _from_db_type( self, ch_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: return self.type_mapper.from_db_type(ch_t, precision, scale) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 2f23e88ea0..38412b2608 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ 
b/dlt/destinations/impl/databricks/databricks.py @@ -325,3 +325,6 @@ def _get_storage_table_query_columns(self) -> List[str]: "full_data_type" ) return fields + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 68a3fedc31..149d106dcd 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -210,3 +210,6 @@ def _make_add_column_sql( self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None ) -> List[str]: return ["ADD COLUMNS (" + ", ".join(self._get_column_def_sql(c) for c in new_columns) + ")"] + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 023b88e51a..a066479294 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -34,6 +34,8 @@ class DummyClientConfiguration(DestinationClientConfiguration): """raise terminal exception in job init""" fail_transiently_in_init: bool = False """raise transient exception in job init""" + truncate_tables_on_staging_destination_before_load: bool = True + """truncate tables on staging destination""" # new jobs workflows create_followup_jobs: bool = False diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 49b55ec65d..feb09369dc 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -202,6 +202,9 @@ def complete_load(self, load_id: str) -> None: def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: return super().should_load_data_to_staging_dataset(table) + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + @contextmanager def with_staging_dataset(self) -> Iterator[JobClientBase]: try: diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 93827c8163..0e201dc4e0 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -274,3 +274,6 @@ def _from_db_type( self, pq_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 8b4eabc961..6688b5bc17 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -325,3 +325,6 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non return ( f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 
e43e2a6dfa..750a4895f0 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -173,6 +173,9 @@ def create_load_job( ) return job + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + class SynapseCopyFileLoadJob(CopyRemoteFileLoadJob): def __init__( diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 741c01f249..e3a2ebcd79 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -179,9 +179,10 @@ def _init_dataset_and_update_schema( applied_update = job_client.update_stored_schema( only_tables=update_tables, expected_update=expected_update ) - logger.info( - f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" - ) + if truncate_tables: + logger.info( + f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" + ) job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index 05e31a574b..789189b7dd 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -1,36 +1,33 @@ --- title: Staging -description: Configure an s3 or gcs bucket for staging before copying into the destination +description: Configure an S3 or GCS bucket for staging before copying into the destination keywords: [staging, destination] --- # Staging -The goal of staging is to bring the data closer to the database engine so the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two -staging areas: +The goal of staging is to bring the data closer to the database engine so that the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two staging areas: 1. A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental_loading) to deduplicate and merge data with the destination. -2. A **staging storage** which is typically a s3/gcp bucket where [loader files](file-formats/) are copied before they are loaded by the destination. +2. A **staging storage** which is typically an S3/GCP bucket where [loader files](file-formats/) are copied before they are loaded by the destination. ## Staging dataset -`dlt` creates a staging dataset when write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the -main dataset. Data in staging tables is truncated when load step begins and only for tables that will participate in it. -Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the -configured datasets. +`dlt` creates a staging dataset when the write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the main dataset. Data in staging tables is truncated when the load step begins and only for tables that will participate in it. +Such a staging dataset has the same name as the dataset passed to `dlt.pipeline` but with a `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the configured datasets. 
```toml [destination.postgres] staging_dataset_name_layout="staging_%s" ``` -Entry above switches the pattern to `staging_` prefix and for example for dataset with name **github_data** `dlt` will create **staging_github_data**. +The entry above switches the pattern to `staging_` prefix and for example, for a dataset with the name **github_data**, `dlt` will create **staging_github_data**. -To configure static staging dataset name, you can do the following (we use destination factory) +To configure a static staging dataset name, you can do the following (we use the destination factory) ```py import dlt dest_ = dlt.destinations.postgres(staging_dataset_name_layout="_dlt_staging") ``` -All pipelines using `dest_` as destination will use **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. +All pipelines using `dest_` as the destination will use the **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. -### Cleanup up staging dataset automatically -`dlt` does not truncate tables in staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. +### Cleanup staging dataset automatically +`dlt` does not truncate tables in the staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. If you prefer to truncate it, put the following line in `config.toml`: ```toml @@ -39,19 +36,23 @@ truncate_staging_dataset=true ``` ## Staging storage -`dlt` allows to chain destinations where the first one (`staging`) is responsible for uploading the files from local filesystem to the remote storage. It then generates followup jobs for the second destination that (typically) copy the files from remote storage into destination. +`dlt` allows chaining destinations where the first one (`staging`) is responsible for uploading the files from the local filesystem to the remote storage. It then generates follow-up jobs for the second destination that (typically) copy the files from remote storage into the destination. -Currently, only one destination the [filesystem](destinations/filesystem.md) can be used as a staging. Following destinations can copy remote files: -1. [Redshift.](destinations/redshift.md#staging-support) -2. [Bigquery.](destinations/bigquery.md#staging-support) -3. [Snowflake.](destinations/snowflake.md#staging-support) +Currently, only one destination, the [filesystem](destinations/filesystem.md), can be used as staging. The following destinations can copy remote files: + +1. [Azure Synapse](destinations/synapse#staging-support) +1. [Athena](destinations/athena#staging-support) +1. [Bigquery](destinations/bigquery.md#staging-support) +1. [Dremio](destinations/dremio#staging-support) +1. [Redshift](destinations/redshift.md#staging-support) +1. [Snowflake](destinations/snowflake.md#staging-support) ### How to use -In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into `Redshift` destination. +In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into the `Redshift` destination. -1. **Set up the s3 bucket and filesystem staging.** +1. 
**Set up the S3 bucket and filesystem staging.** - Please follow our guide in [filesystem destination documentation](destinations/filesystem.md). Test the staging as standalone destination to make sure that files go where you want them. In your `secrets.toml` you should now have a working `filesystem` configuration: + Please follow our guide in the [filesystem destination documentation](destinations/filesystem.md). Test the staging as a standalone destination to make sure that files go where you want them. In your `secrets.toml`, you should now have a working `filesystem` configuration: ```toml [destination.filesystem] bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, @@ -63,15 +64,15 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel 2. **Set up the Redshift destination.** - Please follow our guide in [redshift destination documentation](destinations/redshift.md). In your `secrets.toml` you added: + Please follow our guide in the [redshift destination documentation](destinations/redshift.md). In your `secrets.toml`, you added: ```toml # keep it at the top of your toml file! before any section starts destination.redshift.credentials="redshift://loader:@localhost/dlt_data?connect_timeout=15" ``` -3. **Authorize Redshift cluster to access the staging bucket.** +3. **Authorize the Redshift cluster to access the staging bucket.** - By default `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step. + By default, `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step. 4. **Chain staging to destination and request `parquet` file format.** @@ -79,7 +80,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel ```py # Create a dlt pipeline that will load # chess player data to the redshift destination - # via staging on s3 + # via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='redshift', @@ -87,7 +88,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel dataset_name='player_data' ) ``` - `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify `parquet` file format (just to demonstrate how to do it): + `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify the `parquet` file format (just to demonstrate how to do it): ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` @@ -96,4 +97,21 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel Run the pipeline script as usual. -> 💡 Please note that `dlt` does not delete loaded files from the staging storage after the load is complete. +:::tip +Please note that `dlt` does not delete loaded files from the staging storage after the load is complete, but it truncates previously loaded files. +::: + +### How to prevent staging files truncation + +Before `dlt` loads data to the staging storage, it truncates previously loaded files. To prevent it and keep the whole history +of loaded files, you can use the following parameter: + +```toml +[destination.redshift] +truncate_tables_on_staging_destination_before_load=false +``` + +:::caution +The [Athena](destinations/athena#staging-support) destination only truncates non-iceberg tables with the `replace` write disposition. 
+Therefore, the parameter `truncate_tables_on_staging_destination_before_load` only controls the truncation of corresponding files for these tables. +::: diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index a760c86526..f216fa3c05 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -1,12 +1,12 @@ import pytest -from typing import Dict, Any, List +from typing import List import dlt, os -from dlt.common import json, sleep -from copy import deepcopy +from dlt.common import json from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.utils import uniq_id from dlt.common.schema.typing import TDataType +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from tests.load.pipeline.test_merge_disposition import github from tests.pipeline.utils import load_table_counts, assert_load_info @@ -40,6 +40,13 @@ def load_modified_issues(): yield from issues +@dlt.resource(table_name="events", write_disposition="append", primary_key="timestamp") +def event_many_load_2(): + with open("tests/normalize/cases/event.event.many_load_2.json", "r", encoding="utf-8") as f: + events = json.load(f) + yield from events + + @pytest.mark.parametrize( "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name ) @@ -183,6 +190,50 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: assert replace_counts == initial_counts +@pytest.mark.parametrize( + "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name +) +def test_truncate_staging_dataset(destination_config: DestinationTestConfiguration) -> None: + """This test checks if tables truncation on the staging destination is done according to the configuration. 
+ + Test loads data to the destination three times: + * with truncation + * without truncation (after this 2 staging files should be left) + * with truncation (after this 1 staging file should be left) + """ + pipeline = destination_config.setup_pipeline( + pipeline_name="test_stage_loading", dataset_name="test_staging_load" + uniq_id() + ) + resource = event_many_load_2() + table_name: str = resource.table_name # type: ignore[assignment] + + # load the data, files stay on the stage after the load + info = pipeline.run(resource) + assert_load_info(info) + + # load the data without truncating of the staging, should see two files on staging + pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = False + info = pipeline.run(resource) + assert_load_info(info) + # check there are two staging files + _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) + with staging_client: + assert len(staging_client.list_table_files(table_name)) == 2 # type: ignore[attr-defined] + + # load the data with truncating, so only new file is on the staging + pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = True + info = pipeline.run(resource) + assert_load_info(info) + # check that table exists in the destination + with pipeline.sql_client() as sql_client: + qual_name = sql_client.make_qualified_table_name + assert len(sql_client.execute_sql(f"SELECT * from {qual_name(table_name)}")) > 4 + # check there is only one staging file + _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) + with staging_client: + assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + + @pytest.mark.parametrize( "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name ) diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 9f0bca6ac5..59b7acac15 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -548,6 +548,23 @@ def test_completed_loop_with_delete_completed() -> None: assert_complete_job(load, should_delete_completed=True) +@pytest.mark.parametrize("to_truncate", [True, False]) +def test_truncate_table_before_load_on_stanging(to_truncate) -> None: + load = setup_loader( + client_config=DummyClientConfiguration( + truncate_tables_on_staging_destination_before_load=to_truncate + ) + ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) + destination_client = load.get_destination_client(schema) + assert ( + destination_client.should_truncate_table_before_load_on_staging_destination( # type: ignore + schema.tables["_dlt_version"] + ) + == to_truncate + ) + + def test_retry_on_new_loop() -> None: # test job that retries sitting in new jobs load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) From 4e1c6077c7ed4bbaf127e34a2cbc7d87fe48d924 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 28 Aug 2024 13:17:11 +0200 Subject: [PATCH 31/34] enables external location and named credential in databricks (#1755) * allows to configure external location and named credential for databricks * fixes #1703 * normalizes 'value' when wrapping simple objects in relational, fixes #1754 * simplifies fsspec globbing and allows various url formats that are preserved when reconstituting full url, allows abfss databricks format * adds info on partially loaded packages to docs * renames remote_uri to remote_url in traces * fixes delta for abfss * adds nested 
tables dlt columns collision test --- .github/workflows/test_destinations.yml | 1 + .../configuration/specs/azure_credentials.py | 2 + dlt/common/libs/deltalake.py | 3 +- dlt/common/metrics.py | 2 +- dlt/common/normalizers/json/__init__.py | 4 +- dlt/common/normalizers/json/relational.py | 4 +- dlt/common/storages/configuration.py | 119 +++++++++++++----- dlt/common/storages/fsspec_filesystem.py | 58 +++++---- dlt/destinations/impl/athena/athena.py | 1 - dlt/destinations/impl/bigquery/bigquery.py | 2 +- .../impl/databricks/configuration.py | 4 + .../impl/databricks/databricks.py | 108 ++++++++++------ dlt/destinations/impl/databricks/factory.py | 6 + dlt/destinations/impl/dummy/dummy.py | 4 +- .../impl/filesystem/filesystem.py | 32 ++--- .../dlt-ecosystem/destinations/databricks.md | 33 ++++- .../dlt-ecosystem/destinations/snowflake.md | 2 +- .../docs/running-in-production/running.md | 16 ++- tests/.dlt/config.toml | 3 +- tests/common/cases/normalizers/sql_upper.py | 2 - .../common/storages/test_local_filesystem.py | 10 +- .../test_destination_name_and_config.py | 4 +- .../test_databricks_configuration.py | 50 +++++++- .../load/filesystem/test_filesystem_common.py | 54 +++++--- .../load/pipeline/test_databricks_pipeline.py | 85 +++++++++++++ .../load/pipeline/test_filesystem_pipeline.py | 18 +-- tests/load/pipeline/test_stage_loading.py | 10 +- tests/load/test_dummy_client.py | 10 +- tests/load/utils.py | 12 +- .../cases/contracts/trace.schema.yaml | 2 +- tests/pipeline/test_pipeline.py | 14 +++ tests/pipeline/test_pipeline_trace.py | 2 +- 32 files changed, 510 insertions(+), 167 deletions(-) create mode 100644 tests/load/pipeline/test_databricks_pipeline.py diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index a034ac7eb0..7fae69ff9e 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -29,6 +29,7 @@ env: # Test redshift and filesystem with all buckets # postgres runs again here so we can test on mac/windows ACTIVE_DESTINATIONS: "[\"redshift\", \"postgres\", \"duckdb\", \"filesystem\", \"dummy\"]" + # note that all buckets are enabled for testing jobs: get_docs_changes: diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 7fa34fa00f..6794b581ce 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -32,6 +32,8 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: creds = self.to_adlfs_credentials() if creds["sas_token"] is None: creds.pop("sas_token") + if creds["account_key"] is None: + creds.pop("account_key") return creds def create_sas_token(self) -> None: diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index d4cb46c600..38b23ea27a 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -176,7 +176,8 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str """Returns dict that can be passed as `storage_options` in `deltalake` library.""" creds = {} extra_options = {} - if config.protocol in ("az", "gs", "s3"): + # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery + if hasattr(config.credentials, "to_object_store_rs_credentials"): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/metrics.py 
b/dlt/common/metrics.py index 5cccee4045..d6acf19d0d 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -64,7 +64,7 @@ class LoadJobMetrics(NamedTuple): started_at: datetime.datetime finished_at: datetime.datetime state: Optional[str] - remote_uri: Optional[str] + remote_url: Optional[str] class LoadMetrics(StepMetrics): diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index a13bab15f4..725f6a8355 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -54,9 +54,9 @@ class SupportsDataItemNormalizer(Protocol): """A class with a name DataItemNormalizer deriving from normalizers.json.DataItemNormalizer""" -def wrap_in_dict(item: Any) -> DictStrAny: +def wrap_in_dict(label: str, item: Any) -> DictStrAny: """Wraps `item` that is not a dictionary into dictionary that can be json normalized""" - return {"value": item} + return {label: item} __all__ = [ diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 1dbcec4bff..33184640f0 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -281,7 +281,7 @@ def _normalize_list( else: # list of simple types child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) - wrap_v = wrap_in_dict(v) + wrap_v = wrap_in_dict(self.c_value, v) wrap_v[self.c_dlt_id] = child_row_hash e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) @@ -387,7 +387,7 @@ def normalize_data_item( ) -> TNormalizedRowIterator: # wrap items that are not dictionaries in dictionary, otherwise they cannot be processed by the JSON normalizer if not isinstance(item, dict): - item = wrap_in_dict(item) + item = wrap_in_dict(self.c_value, item) # we will extend event with all the fields necessary to load it as root row row = cast(DictStrAny, item) # identify load id if loaded data must be processed after loading incrementally diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index b2bdb3a7b6..04780528c4 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -1,7 +1,7 @@ import os import pathlib from typing import Any, Literal, Optional, Type, get_args, ClassVar, Dict, Union -from urllib.parse import urlparse, unquote +from urllib.parse import urlparse, unquote, urlunparse from dlt.common.configuration import configspec, resolve_type from dlt.common.configuration.exceptions import ConfigurationValueError @@ -52,6 +52,53 @@ class LoadStorageConfiguration(BaseConfiguration): ] +def _make_az_url(scheme: str, fs_path: str, bucket_url: str) -> str: + parsed_bucket_url = urlparse(bucket_url) + if parsed_bucket_url.username: + # az://@.dfs.core.windows.net/ + # fs_path always starts with container + split_path = fs_path.split("/", maxsplit=1) + if len(split_path) == 1: + split_path.append("") + container, path = split_path + netloc = f"{container}@{parsed_bucket_url.hostname}" + return urlunparse(parsed_bucket_url._replace(path=path, scheme=scheme, netloc=netloc)) + return f"{scheme}://{fs_path}" + + +def _make_file_url(scheme: str, fs_path: str, bucket_url: str) -> str: + """Creates a normalized file:// url from a local path + + netloc is never set. 
UNC paths are represented as file://host/path + """ + p_ = pathlib.Path(fs_path) + p_ = p_.expanduser().resolve() + return p_.as_uri() + + +MAKE_URI_DISPATCH = {"az": _make_az_url, "file": _make_file_url} + +MAKE_URI_DISPATCH["adl"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["abfs"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["azure"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["abfss"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["local"] = MAKE_URI_DISPATCH["file"] + + +def make_fsspec_url(scheme: str, fs_path: str, bucket_url: str) -> str: + """Creates url from `fs_path` and `scheme` using bucket_url as an `url` template + + Args: + scheme (str): scheme of the resulting url + fs_path (str): kind of absolute path that fsspec uses to locate resources for particular filesystem. + bucket_url (str): an url template. the structure of url will be preserved if possible + """ + _maker = MAKE_URI_DISPATCH.get(scheme) + if _maker: + return _maker(scheme, fs_path, bucket_url) + return f"{scheme}://{fs_path}" + + @configspec class FilesystemConfiguration(BaseConfiguration): """A configuration defining filesystem location and access credentials. @@ -59,7 +106,7 @@ class FilesystemConfiguration(BaseConfiguration): When configuration is resolved, `bucket_url` is used to extract a protocol and request corresponding credentials class. * s3 * gs, gcs - * az, abfs, adl + * az, abfs, adl, abfss, azure * file, memory * gdrive """ @@ -72,6 +119,8 @@ class FilesystemConfiguration(BaseConfiguration): "az": AnyAzureCredentials, "abfs": AnyAzureCredentials, "adl": AnyAzureCredentials, + "abfss": AnyAzureCredentials, + "azure": AnyAzureCredentials, } bucket_url: str = None @@ -93,17 +142,21 @@ def protocol(self) -> str: else: return urlparse(self.bucket_url).scheme + @property + def is_local_filesystem(self) -> bool: + return self.protocol == "file" + def on_resolved(self) -> None: - uri = urlparse(self.bucket_url) - if not uri.path and not uri.netloc: + url = urlparse(self.bucket_url) + if not url.path and not url.netloc: raise ConfigurationValueError( "File path and netloc are missing. Field bucket_url of" - " FilesystemClientConfiguration must contain valid uri with a path or host:password" + " FilesystemClientConfiguration must contain valid url with a path or host:password" " component." ) # this is just a path in a local file system if self.is_local_path(self.bucket_url): - self.bucket_url = self.make_file_uri(self.bucket_url) + self.bucket_url = self.make_file_url(self.bucket_url) @resolve_type("credentials") def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: @@ -122,44 +175,50 @@ def fingerprint(self) -> str: if self.is_local_path(self.bucket_url): return digest128("") - uri = urlparse(self.bucket_url) - return digest128(self.bucket_url.replace(uri.path, "")) + url = urlparse(self.bucket_url) + return digest128(self.bucket_url.replace(url.path, "")) + + def make_url(self, fs_path: str) -> str: + """Makes a full url (with scheme) form fs_path which is kind-of absolute path used by fsspec to identify resources. + This method will use `bucket_url` to infer the original form of the url. 
+ """ + return make_fsspec_url(self.protocol, fs_path, self.bucket_url) def __str__(self) -> str: """Return displayable destination location""" - uri = urlparse(self.bucket_url) + url = urlparse(self.bucket_url) # do not show passwords - if uri.password: - new_netloc = f"{uri.username}:****@{uri.hostname}" - if uri.port: - new_netloc += f":{uri.port}" - return uri._replace(netloc=new_netloc).geturl() + if url.password: + new_netloc = f"{url.username}:****@{url.hostname}" + if url.port: + new_netloc += f":{url.port}" + return url._replace(netloc=new_netloc).geturl() return self.bucket_url @staticmethod - def is_local_path(uri: str) -> bool: - """Checks if `uri` is a local path, without a schema""" - uri_parsed = urlparse(uri) + def is_local_path(url: str) -> bool: + """Checks if `url` is a local path, without a schema""" + url_parsed = urlparse(url) # this prevents windows absolute paths to be recognized as schemas - return not uri_parsed.scheme or os.path.isabs(uri) + return not url_parsed.scheme or os.path.isabs(url) @staticmethod - def make_local_path(file_uri: str) -> str: + def make_local_path(file_url: str) -> str: """Gets a valid local filesystem path from file:// scheme. Supports POSIX/Windows/UNC paths Returns: str: local filesystem path """ - uri = urlparse(file_uri) - if uri.scheme != "file": - raise ValueError(f"Must be file scheme but is {uri.scheme}") - if not uri.path and not uri.netloc: + url = urlparse(file_url) + if url.scheme != "file": + raise ValueError(f"Must be file scheme but is {url.scheme}") + if not url.path and not url.netloc: raise ConfigurationValueError("File path and netloc are missing.") - local_path = unquote(uri.path) - if uri.netloc: + local_path = unquote(url.path) + if url.netloc: # or UNC file://localhost/path - local_path = "//" + unquote(uri.netloc) + local_path + local_path = "//" + unquote(url.netloc) + local_path else: # if we are on windows, strip the POSIX root from path which is always absolute if os.path.sep != local_path[0]: @@ -172,11 +231,9 @@ def make_local_path(file_uri: str) -> str: return str(pathlib.Path(local_path)) @staticmethod - def make_file_uri(local_path: str) -> str: - """Creates a normalized file:// uri from a local path + def make_file_url(local_path: str) -> str: + """Creates a normalized file:// url from a local path netloc is never set. 
UNC paths are represented as file://host/path """ - p_ = pathlib.Path(local_path) - p_ = p_.expanduser().resolve() - return p_.as_uri() + return make_fsspec_url("file", local_path, None) diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index be9ae2bbb1..7da5ebabef 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -21,7 +21,7 @@ ) from urllib.parse import urlparse -from fsspec import AbstractFileSystem, register_implementation +from fsspec import AbstractFileSystem, register_implementation, get_filesystem_class from fsspec.core import url_to_fs from dlt import version @@ -32,7 +32,11 @@ AzureCredentials, ) from dlt.common.exceptions import MissingDependencyException -from dlt.common.storages.configuration import FileSystemCredentials, FilesystemConfiguration +from dlt.common.storages.configuration import ( + FileSystemCredentials, + FilesystemConfiguration, + make_fsspec_url, +) from dlt.common.time import ensure_pendulum_datetime from dlt.common.typing import DictStrAny @@ -65,18 +69,20 @@ class FileItem(TypedDict, total=False): MTIME_DISPATCH["gs"] = MTIME_DISPATCH["gcs"] MTIME_DISPATCH["s3a"] = MTIME_DISPATCH["s3"] MTIME_DISPATCH["abfs"] = MTIME_DISPATCH["az"] +MTIME_DISPATCH["abfss"] = MTIME_DISPATCH["az"] # Map of protocol to a filesystem type CREDENTIALS_DISPATCH: Dict[str, Callable[[FilesystemConfiguration], DictStrAny]] = { "s3": lambda config: cast(AwsCredentials, config.credentials).to_s3fs_credentials(), - "adl": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), "az": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), - "gcs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), "gs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), "gdrive": lambda config: {"credentials": cast(GcpCredentials, config.credentials)}, - "abfs": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), - "azure": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), } +CREDENTIALS_DISPATCH["adl"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["abfs"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["azure"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["abfss"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["gcs"] = CREDENTIALS_DISPATCH["gs"] def fsspec_filesystem( @@ -90,7 +96,7 @@ def fsspec_filesystem( Please supply credentials instance corresponding to the protocol. 
The `protocol` is just the code name of the filesystem i.e.: * s3 - * az, abfs + * az, abfs, abfss, adl, azure * gcs, gs also see filesystem_from_config @@ -136,7 +142,7 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys Authenticates following filesystems: * s3 - * az, abfs + * az, abfs, abfss, adl, azure * gcs, gs All other filesystems are not authenticated @@ -146,8 +152,14 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys fs_kwargs = prepare_fsspec_args(config) try: + # first get the class to check the protocol + fs_cls = get_filesystem_class(config.protocol) + if fs_cls.protocol == "abfs": + # if storage account is present in bucket_url and in credentials, az fsspec will fail + if urlparse(config.bucket_url).username: + fs_kwargs.pop("account_name") return url_to_fs(config.bucket_url, **fs_kwargs) # type: ignore - except ModuleNotFoundError as e: + except ImportError as e: raise MissingDependencyException( "filesystem", [f"{version.DLT_PKG_NAME}[{config.protocol}]"] ) from e @@ -291,10 +303,8 @@ def glob_files( """ is_local_fs = "file" in fs_client.protocol if is_local_fs and FilesystemConfiguration.is_local_path(bucket_url): - bucket_url = FilesystemConfiguration.make_file_uri(bucket_url) - bucket_url_parsed = urlparse(bucket_url) - else: - bucket_url_parsed = urlparse(bucket_url) + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + bucket_url_parsed = urlparse(bucket_url) if is_local_fs: root_dir = FilesystemConfiguration.make_local_path(bucket_url) @@ -302,7 +312,8 @@ def glob_files( files = glob.glob(str(pathlib.Path(root_dir).joinpath(file_glob)), recursive=True) glob_result = {file: fs_client.info(file) for file in files} else: - root_dir = bucket_url_parsed._replace(scheme="", query="").geturl().lstrip("/") + # convert to fs_path + root_dir = fs_client._strip_protocol(bucket_url) filter_url = posixpath.join(root_dir, file_glob) glob_result = fs_client.glob(filter_url, detail=True) if isinstance(glob_result, list): @@ -314,20 +325,23 @@ def glob_files( for file, md in glob_result.items(): if md["type"] != "file": continue + scheme = bucket_url_parsed.scheme + # relative paths are always POSIX if is_local_fs: - rel_path = pathlib.Path(file).relative_to(root_dir).as_posix() - file_url = FilesystemConfiguration.make_file_uri(file) + # use OS pathlib for local paths + loc_path = pathlib.Path(file) + file_name = loc_path.name + rel_path = loc_path.relative_to(root_dir).as_posix() + file_url = FilesystemConfiguration.make_file_url(file) else: - rel_path = posixpath.relpath(file.lstrip("/"), root_dir) - file_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, rel_path) - ).geturl() + file_name = posixpath.basename(file) + rel_path = posixpath.relpath(file, root_dir) + file_url = make_fsspec_url(scheme, file, bucket_url) - scheme = bucket_url_parsed.scheme mime_type, encoding = guess_mime_type(rel_path) yield FileItem( - file_name=posixpath.basename(rel_path), + file_name=file_name, relative_path=rel_path, file_url=file_url, mime_type=mime_type, diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index b28309b930..b3b2fbcf0f 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -34,7 +34,6 @@ from dlt.common import logger from dlt.common.exceptions import TerminalValueError -from dlt.common.storages.fsspec_filesystem import fsspec_from_config from dlt.common.utils import uniq_id, without_none 
from dlt.common.schema import TColumnSchema, Schema, TTableSchema from dlt.common.schema.typing import ( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 11326cf3ed..1dd4c727be 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -432,7 +432,7 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # append to table for merge loads (append to stage) and regular appends. table_name = table["name"] - # determine whether we load from local or uri + # determine whether we load from local or url bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] if ReferenceFollowupJobRequest.is_reference_job(file_path): diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 3bd2d12a5a..789dbedae9 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -43,6 +43,10 @@ def to_connector_params(self) -> Dict[str, Any]: class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = dataclasses.field(default="databricks", init=False, repr=False, compare=False) # type: ignore[misc] credentials: DatabricksCredentials = None + staging_credentials_name: Optional[str] = None + "If set, credentials with given name will be used in copy command" + is_staging_external_location: bool = False + """If true, the temporary credentials are not propagated to the COPY command""" def __str__(self) -> str: """Return displayable destination location""" diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 38412b2608..614e6e97c5 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, Any, Iterable, Type, cast +from typing import Optional, Sequence, List, cast from urllib.parse import urlparse, urlunparse from dlt import config @@ -6,20 +6,17 @@ from dlt.common.destination.reference import ( HasFollowupJobs, FollowupJobRequest, - TLoadJobState, RunnableLoadJob, - CredentialsConfiguration, SupportsStagingDestination, LoadJob, ) from dlt.common.configuration.specs import ( AwsCredentialsWithoutDefaults, - AzureCredentials, AzureCredentialsWithoutDefaults, ) from dlt.common.exceptions import TerminalValueError from dlt.common.storages.file_storage import FileStorage -from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns +from dlt.common.schema import TColumnSchema, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TSchemaTables, TTableFormat from dlt.common.schema.utils import table_schema_has_type from dlt.common.storages import FilesystemConfiguration, fsspec_from_config @@ -35,6 +32,9 @@ from dlt.destinations.type_mapping import TypeMapper +AZURE_BLOB_STORAGE_PROTOCOLS = ["az", "abfss", "abfs"] + + class DatabricksTypeMapper(TypeMapper): sct_to_unbound_dbt = { "complex": "STRING", # Databricks supports complex types like ARRAY @@ -137,41 +137,51 @@ def run(self) -> None: if bucket_path: bucket_url = urlparse(bucket_path) bucket_scheme = bucket_url.scheme - # referencing an staged files via a bucket URL requires explicit AWS credentials - if bucket_scheme == "s3" and isinstance( - staging_credentials, AwsCredentialsWithoutDefaults - ): - s3_creds = 
staging_credentials.to_session_credentials() - credentials_clause = f"""WITH(CREDENTIAL( - AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', - AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', - - AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' - )) - """ - from_clause = f"FROM '{bucket_path}'" - elif bucket_scheme in ["az", "abfs"] and isinstance( - staging_credentials, AzureCredentialsWithoutDefaults - ): - # Explicit azure credentials are needed to load from bucket without a named stage - credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" - # Converts an az:/// to abfss://@.dfs.core.windows.net/ - # as required by snowflake - _path = bucket_url.path - bucket_path = urlunparse( - bucket_url._replace( - scheme="abfss", - netloc=f"{bucket_url.netloc}@{staging_credentials.azure_storage_account_name}.dfs.core.windows.net", - path=_path, - ) - ) - from_clause = f"FROM '{bucket_path}'" - else: + + if bucket_scheme not in AZURE_BLOB_STORAGE_PROTOCOLS + ["s3"]: raise LoadJobTerminalException( self._file_path, f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and" " azure buckets are supported", ) + + if self._job_client.config.is_staging_external_location: + # just skip the credentials clause for external location + # https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location + pass + elif self._job_client.config.staging_credentials_name: + # add named credentials + credentials_clause = ( + f"WITH(CREDENTIAL {self._job_client.config.staging_credentials_name} )" + ) + else: + # referencing an staged files via a bucket URL requires explicit AWS credentials + if bucket_scheme == "s3": + assert isinstance(staging_credentials, AwsCredentialsWithoutDefaults) + s3_creds = staging_credentials.to_session_credentials() + credentials_clause = f"""WITH(CREDENTIAL( + AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', + AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', + + AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' + )) + """ + elif bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + # Explicit azure credentials are needed to load from bucket without a named stage + credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" + bucket_path = self.ensure_databricks_abfss_url( + bucket_path, staging_credentials.azure_storage_account_name + ) + + if bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + bucket_path = self.ensure_databricks_abfss_url( + bucket_path, staging_credentials.azure_storage_account_name + ) + + # always add FROM clause + from_clause = f"FROM '{bucket_path}'" else: raise LoadJobTerminalException( self._file_path, @@ -231,6 +241,34 @@ def run(self) -> None: """ self._sql_client.execute_sql(statement) + @staticmethod + def ensure_databricks_abfss_url( + bucket_path: str, azure_storage_account_name: str = None + ) -> str: + bucket_url = urlparse(bucket_path) + # Converts an az:/// to abfss://@.dfs.core.windows.net/ + if bucket_url.username: + # has the right form, ensure abfss schema + return urlunparse(bucket_url._replace(scheme="abfss")) + + if not azure_storage_account_name: + raise TerminalValueError( + f"Could not convert azure blob storage url {bucket_path} into form required by" + " Databricks" + " (abfss://@.dfs.core.windows.net/)" + " because storage 
account name is not known. Please use Databricks abfss://" + " canonical url as bucket_url in staging credentials" + ) + # as required by databricks + _path = bucket_url.path + return urlunparse( + bucket_url._replace( + scheme="abfss", + netloc=f"{bucket_url.netloc}@{azure_storage_account_name}.dfs.core.windows.net", + path=_path, + ) + ) + class DatabricksMergeJob(SqlMergeFollowupJob): @classmethod diff --git a/dlt/destinations/impl/databricks/factory.py b/dlt/destinations/impl/databricks/factory.py index 409d3bc4be..6108b69da9 100644 --- a/dlt/destinations/impl/databricks/factory.py +++ b/dlt/destinations/impl/databricks/factory.py @@ -54,6 +54,8 @@ def client_class(self) -> t.Type["DatabricksClient"]: def __init__( self, credentials: t.Union[DatabricksCredentials, t.Dict[str, t.Any], str] = None, + is_staging_external_location: t.Optional[bool] = False, + staging_credentials_name: t.Optional[str] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -65,10 +67,14 @@ def __init__( Args: credentials: Credentials to connect to the databricks database. Can be an instance of `DatabricksCredentials` or a connection string in the format `databricks://user:password@host:port/database` + is_staging_external_location: If true, the temporary credentials are not propagated to the COPY command + staging_credentials_name: If set, credentials with given name will be used in copy command **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, + is_staging_external_location=is_staging_external_location, + staging_credentials_name=staging_credentials_name, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index feb09369dc..fc87faaf5a 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -90,9 +90,9 @@ def run(self) -> None: def metrics(self) -> Optional[LoadJobMetrics]: m = super().metrics() - # add remote uri if there's followup job + # add remote url if there's followup job if self.config.create_followup_jobs: - m = m._replace(remote_uri=self._file_name) + m = m._replace(remote_url=self._file_name) return m diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 62263a10b9..ac5ffb9ef3 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -56,7 +56,7 @@ def __init__( self._job_client: FilesystemClient = None def run(self) -> None: - self.__is_local_filesystem = self._job_client.config.protocol == "file" + self.__is_local_filesystem = self._job_client.config.is_local_filesystem # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. 
# It `auto_mkdir` is disabled by default in fsspec so we made some @@ -88,13 +88,13 @@ def make_remote_path(self) -> str: path_utils.normalize_path_sep(pathlib, destination_file_name), ) - def make_remote_uri(self) -> str: - """Returns path on a remote filesystem as a full uri including scheme.""" - return self._job_client.make_remote_uri(self.make_remote_path()) + def make_remote_url(self) -> str: + """Returns path on a remote filesystem as a full url including scheme.""" + return self._job_client.make_remote_url(self.make_remote_path()) def metrics(self) -> Optional[LoadJobMetrics]: m = super().metrics() - return m._replace(remote_uri=self.make_remote_uri()) + return m._replace(remote_url=self.make_remote_url()) class DeltaLoadFilesystemJob(FilesystemLoadJob): @@ -112,7 +112,7 @@ def make_remote_path(self) -> str: return self._job_client.get_table_dir(self.load_table_name) def run(self) -> None: - logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()}") from dlt.common.libs.deltalake import write_delta_table, merge_delta_table @@ -133,7 +133,7 @@ def run(self) -> None: else: write_delta_table( table_or_uri=( - self.make_remote_uri() if self._delta_table is None else self._delta_table + self.make_remote_url() if self._delta_table is None else self._delta_table ), data=arrow_rbr, write_disposition=self._load_table["write_disposition"], @@ -151,7 +151,7 @@ def _storage_options(self) -> Dict[str, str]: def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import try_get_deltatable - return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + return try_get_deltatable(self.make_remote_url(), storage_options=self._storage_options) @property def _partition_columns(self) -> List[str]: @@ -166,7 +166,7 @@ def _create_or_evolve_delta_table(self) -> None: if self._delta_table is None: DeltaTable.create( - table_uri=self.make_remote_uri(), + table_uri=self.make_remote_url(), schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), mode="overwrite", partition_by=self._partition_columns, @@ -185,7 +185,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRe elif final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), - remote_paths=[self._job_client.make_remote_uri(self.make_remote_path())], + remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], ) jobs.append(ref_job) return jobs @@ -208,7 +208,7 @@ def __init__( ) -> None: super().__init__(schema, config, capabilities) self.fs_client, fs_path = fsspec_from_config(config) - self.is_local_filesystem = config.protocol == "file" + self.is_local_filesystem = config.is_local_filesystem self.bucket_path = ( config.make_local_path(config.bucket_url) if self.is_local_filesystem else fs_path ) @@ -319,7 +319,7 @@ def get_table_dir(self, table_name: str, remote: bool = False) -> str: table_prefix = self.get_table_prefix(table_name) table_dir: str = self.pathlib.dirname(table_prefix) if remote: - table_dir = self.make_remote_uri(table_dir) + table_dir = self.make_remote_url(table_dir) return table_dir def get_table_prefix(self, table_name: str) -> str: @@ -353,7 +353,7 @@ def list_files_with_prefixes(self, table_dir: str, prefixes: List[str]) -> List[ # we fallback to our own glob implementation that is tested to return consistent 
results for
 # filesystems we support. we were not able to use `find` or `walk` because they were selecting
 # files wrongly (on azure walk on path1/path2/ would also select files from path1/path2_v2/ but returning wrong dirs)
-        for details in glob_files(self.fs_client, self.make_remote_uri(table_dir), "**"):
+        for details in glob_files(self.fs_client, self.make_remote_url(table_dir), "**"):
             file = details["file_name"]
             filepath = self.pathlib.join(table_dir, details["relative_path"])
             # skip INIT files
@@ -388,12 +388,12 @@ def create_load_job(
         cls = FilesystemLoadJobWithFollowup if self.config.as_staging else FilesystemLoadJob
         return cls(file_path)

-    def make_remote_uri(self, remote_path: str) -> str:
+    def make_remote_url(self, remote_path: str) -> str:
        """Returns uri to the remote filesystem to which copy the file"""
         if self.is_local_filesystem:
-            return self.config.make_file_uri(remote_path)
+            return self.config.make_file_url(remote_path)
         else:
-            return f"{self.config.protocol}://{remote_path}"
+            return self.config.make_url(remote_path)

     def __enter__(self) -> "FilesystemClient":
         return self
diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index 6cd5767dcb..ddb82c95b2 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -117,6 +117,8 @@ access_token = "MY_ACCESS_TOKEN"
 catalog = "my_catalog"
 ```

+See [staging support](#staging-support) for authentication options when `dlt` copies files from buckets.
+
 ## Write disposition
 All write dispositions are supported

@@ -166,6 +168,11 @@ pipeline = dlt.pipeline(
 Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) for details on connecting your Azure Blob Storage container with the bucket_url and credentials.

+Databricks requires that you use ABFS urls in the following format:
+**abfss://container_name@storage_account_name.dfs.core.windows.net/path**
+
+`dlt` can adapt the other representation (i.e. **az://container-name/path**), but we recommend that you use the canonical ABFS form.
+
 Example to set up Databricks with Azure as a staging destination:

 ```py
@@ -175,10 +182,34 @@ Example to set up Databricks with Azure as a staging destination:
 pipeline = dlt.pipeline(
     pipeline_name='chess_pipeline',
     destination='databricks',
-    staging=dlt.destinations.filesystem('az://your-container-name'), # add this to activate the staging location
+    staging=dlt.destinations.filesystem('abfss://dlt-ci-data@dltdata.dfs.core.windows.net'), # add this to activate the staging location
     dataset_name='player_data'
 )
+ ```
+
+### Use external locations and stored credentials
+`dlt` forwards bucket credentials to the `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side instead.
+
+If you set up an external location for your staging path, you can tell `dlt` to use it:
+```toml
+[destination.databricks]
+is_staging_external_location=true
+```
+
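As a sketch, the same switch can also be passed to the `databricks` destination factory added in this patch and wired into a full pipeline (the bucket url is illustrative and the external location covering the staging path is assumed to already exist in Databricks):

```py
import dlt

# assumes an external location for the staging path is already defined in Databricks
bricks = dlt.destinations.databricks(is_staging_external_location=True)

pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination=bricks,
    staging=dlt.destinations.filesystem("abfss://dlt-ci-data@dltdata.dfs.core.windows.net"),
    dataset_name="player_data",
)
```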
+If you set up a Databricks credential named, for example, **credential_x**, you can tell `dlt` to use it:
+```toml
+[destination.databricks]
+staging_credentials_name="credential_x"
+```
+
+Both options are available from code:
+```py
+import dlt
+
+bricks = dlt.destinations.databricks(staging_credentials_name="credential_x")
+```
+
 ### dbt support
 This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks)
diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
index d08578c5a2..57e6db311d 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
@@ -176,7 +176,7 @@ Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and
 Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns):
 * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created.

-### Table and column identifiers
+## Table and column identifiers
 Snowflake supports both case sensitive and case insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate case sensitive identifiers that must be quoted in SQL statements.

diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md
index 3b5762612c..cc089a1393 100644
--- a/docs/website/docs/running-in-production/running.md
+++ b/docs/website/docs/running-in-production/running.md
@@ -271,7 +271,7 @@ load_info.raise_on_failed_jobs()
 ```

 You may also abort the load package with `LoadClientJobFailed` (terminal exception) on a first
-failed job. Such package is immediately moved to completed but its load id is not added to the
+failed job. Such a package will be completed but its load id is not added to the
 `_dlt_loads` table. All the jobs that were running in parallel are completed before raising. The
 dlt state, if present, will not be visible to `dlt`. Here's example `config.toml` to enable this
 option:
@@ -282,6 +282,20 @@ load.workers=1
 load.raise_on_failed_jobs=true
 ```

+:::caution
+Note that certain write dispositions will irreversibly modify your data:
+1. `replace` write disposition with the default `truncate-and-insert` [strategy](../general-usage/full-loading.md) will truncate tables before loading.
+2. `merge` write disposition will merge staging dataset tables into the destination dataset. This happens only when all data for the table (and its nested tables) has been loaded.
+
+Here's what you can do to deal with partially loaded packages:
+1. Retry the load step in case of transient errors.
+2. Use the replace strategy with a staging dataset, so the replace happens only after data for the table (and all nested tables) is fully loaded and is an atomic operation (where possible).
+3. Use only the "append" write disposition. When your load package fails, you can use `_dlt_load_id` to remove all unprocessed data, as shown in the sketch below.
+4. Use "staging append" (`merge` disposition without a primary key or merge key defined).
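A rough sketch of option 3 (names are illustrative; it assumes a SQL destination, a root table called `events`, and a load id taken from the failed package, e.g. from the pipeline trace):

```py
import dlt

pipeline = dlt.pipeline("events_pipeline", destination="duckdb", dataset_name="events_data")

# load id of the package that did not complete (made-up value for illustration)
failed_load_id = "1724848417.123456"

with pipeline.sql_client() as client:
    # with "append", every row in the root table carries the package's _dlt_load_id,
    # so the partially loaded rows can be removed in one statement
    client.execute_sql(f"DELETE FROM events WHERE _dlt_load_id = '{failed_load_id}'")
```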
+ +::: + + ### What `run` does inside Before adding retry to pipeline steps, note how `run` method actually works: diff --git a/tests/.dlt/config.toml b/tests/.dlt/config.toml index ba86edf417..292175569b 100644 --- a/tests/.dlt/config.toml +++ b/tests/.dlt/config.toml @@ -6,7 +6,8 @@ bucket_url_gs="gs://ci-test-bucket" bucket_url_s3="s3://dlt-ci-test-bucket" bucket_url_file="_storage" bucket_url_az="az://dlt-ci-test-bucket" +bucket_url_abfss="abfss://dlt-ci-test-bucket@dltdata.dfs.core.windows.net" bucket_url_r2="s3://dlt-ci-test-bucket" # use "/" as root path bucket_url_gdrive="gdrive://15eC3e5MNew2XAIefWNlG8VlEa0ISnnaG" -memory="memory://m" \ No newline at end of file +memory="memory:///m" \ No newline at end of file diff --git a/tests/common/cases/normalizers/sql_upper.py b/tests/common/cases/normalizers/sql_upper.py index f2175f06ad..eb88775f95 100644 --- a/tests/common/cases/normalizers/sql_upper.py +++ b/tests/common/cases/normalizers/sql_upper.py @@ -1,5 +1,3 @@ -from typing import Any, Sequence - from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention diff --git a/tests/common/storages/test_local_filesystem.py b/tests/common/storages/test_local_filesystem.py index 14e3cc23d4..1bfe6c0b5b 100644 --- a/tests/common/storages/test_local_filesystem.py +++ b/tests/common/storages/test_local_filesystem.py @@ -45,7 +45,7 @@ ) def test_local_path_win_configuration(bucket_url: str, file_url: str) -> None: assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -66,7 +66,7 @@ def test_local_path_win_configuration(bucket_url: str, file_url: str) -> None: def test_local_user_win_path_configuration(bucket_url: str) -> None: file_url = "file:///" + pathlib.Path(bucket_url).expanduser().as_posix().lstrip("/") assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -99,7 +99,7 @@ def test_file_win_configuration() -> None: ) def test_file_posix_configuration(bucket_url: str, file_url: str) -> None: assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -117,7 +117,7 @@ def test_file_posix_configuration(bucket_url: str, file_url: str) -> None: def test_local_user_posix_path_configuration(bucket_url: str) -> None: file_url = "file:///" + pathlib.Path(bucket_url).expanduser().as_posix().lstrip("/") assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -166,7 +166,7 @@ def test_file_filesystem_configuration( assert FilesystemConfiguration.make_local_path(bucket_url) == str( pathlib.Path(local_path).resolve() ) - assert FilesystemConfiguration.make_file_uri(local_path) == norm_bucket_url + assert 
FilesystemConfiguration.make_file_url(local_path) == norm_bucket_url if local_path == "": with pytest.raises(ConfigurationValueError): diff --git a/tests/destinations/test_destination_name_and_config.py b/tests/destinations/test_destination_name_and_config.py index 11de706722..1e432a7803 100644 --- a/tests/destinations/test_destination_name_and_config.py +++ b/tests/destinations/test_destination_name_and_config.py @@ -60,7 +60,7 @@ def test_set_name_and_environment() -> None: def test_preserve_destination_instance() -> None: dummy1 = dummy(destination_name="dummy1", environment="dev/null/1") filesystem1 = filesystem( - FilesystemConfiguration.make_file_uri(TEST_STORAGE_ROOT), + FilesystemConfiguration.make_file_url(TEST_STORAGE_ROOT), destination_name="local_fs", environment="devel", ) @@ -210,7 +210,7 @@ def test_destination_config_in_name(environment: DictStrStr) -> None: with pytest.raises(ConfigFieldMissingException): p.destination_client() - environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_uri( + environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_url( "_storage" ) assert p._fs_client().dataset_path.endswith(p.dataset_name) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index f6a06180c9..bb989a887c 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -3,9 +3,12 @@ pytest.importorskip("databricks") +from dlt.common.exceptions import TerminalValueError +from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob +from dlt.common.configuration import resolve_configuration +from dlt.destinations import databricks from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration -from dlt.common.configuration import resolve_configuration # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -34,3 +37,48 @@ def test_databricks_credentials_to_connector_params(): assert params["extra_a"] == "a" assert params["extra_b"] == "b" assert params["_socket_timeout"] == credentials.socket_timeout + + +def test_databricks_configuration() -> None: + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.is_staging_external_location is False + assert config.staging_credentials_name is None + + os.environ["IS_STAGING_EXTERNAL_LOCATION"] = "true" + os.environ["STAGING_CREDENTIALS_NAME"] = "credential" + config = bricks.configuration(None, accept_partial=True) + assert config.is_staging_external_location is True + assert config.staging_credentials_name == "credential" + + # explicit params + bricks = databricks(is_staging_external_location=None, staging_credentials_name="credential2") + config = bricks.configuration(None, accept_partial=True) + assert config.staging_credentials_name == "credential2" + assert config.is_staging_external_location is None + + +def test_databricks_abfss_converter() -> None: + with pytest.raises(TerminalValueError): + DatabricksLoadJob.ensure_databricks_abfss_url("az://dlt-ci-test-bucket") + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket", "my_account" + ) + assert abfss_url == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net" + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket/path/to/file.parquet", "my_account" + ) + assert ( + abfss_url + == 
"abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) + assert ( + abfss_url + == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 3cad7dda2c..29ca1a2b57 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -3,8 +3,8 @@ from typing import Tuple, Union, Dict from urllib.parse import urlparse - -from fsspec import AbstractFileSystem +from fsspec import AbstractFileSystem, get_filesystem_class, register_implementation +from fsspec.core import filesystem as fs_filesystem import pytest from tenacity import retry, stop_after_attempt, wait_fixed @@ -15,6 +15,7 @@ from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration +from dlt.common.storages.configuration import make_fsspec_url from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id from dlt.destinations import filesystem @@ -22,11 +23,12 @@ FilesystemDestinationClientConfiguration, ) from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders + +from tests.common.configuration.utils import environment from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files -from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET, WITH_GDRIVE_BUCKETS from tests.utils import autouse_test_storage -from .utils import self_signed_cert -from tests.common.configuration.utils import environment +from tests.load.filesystem.utils import self_signed_cert # mark all tests as essential, do not remove @@ -53,6 +55,24 @@ def test_filesystem_configuration() -> None: } +@pytest.mark.parametrize("bucket_url", WITH_GDRIVE_BUCKETS) +def test_remote_url(bucket_url: str) -> None: + # make absolute urls out of paths + scheme = urlparse(bucket_url).scheme + if not scheme: + scheme = "file" + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + if scheme == "gdrive": + from dlt.common.storages.fsspecs.google_drive import GoogleDriveFileSystem + + register_implementation("gdrive", GoogleDriveFileSystem, "GoogleDriveFileSystem") + + fs_class = get_filesystem_class(scheme) + fs_path = fs_class._strip_protocol(bucket_url) + # reconstitute url + assert make_fsspec_url(scheme, fs_path, bucket_url) == bucket_url + + def test_filesystem_instance(with_gdrive_buckets_env: str) -> None: @retry(stop=stop_after_attempt(10), wait=wait_fixed(1), reraise=True) def check_file_exists(filedir_: str, file_url_: str): @@ -72,10 +92,8 @@ def check_file_changed(file_url_: str): bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] config = get_config() # we do not add protocol to bucket_url (we need relative path) - assert bucket_url.startswith(config.protocol) or config.protocol == "file" + assert bucket_url.startswith(config.protocol) or config.is_local_filesystem filesystem, url = fsspec_from_config(config) - if config.protocol != "file": - assert bucket_url.endswith(url) # do a few file ops now = pendulum.now() filename = f"filesystem_common_{uniq_id()}" @@ -113,7 +131,9 @@ def 
test_glob_overlapping_path_files(with_gdrive_buckets_env: str) -> None: # "standard_source/sample" overlaps with a real existing "standard_source/samples". walk operation on azure # will return all files from "standard_source/samples" and report the wrong "standard_source/sample" path to the user # here we test we do not have this problem with out glob - bucket_url, _, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + bucket_url, config, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + if config.protocol in ["file"]: + pytest.skip(f"{config.protocol} not supported in this test") # use glob to get data all_file_items = list(glob_files(filesystem, bucket_url)) assert len(all_file_items) == 0 @@ -272,18 +292,18 @@ def glob_test_setup( config = get_config() # enable caches config.read_only = True - if config.protocol in ["file"]: - pytest.skip(f"{config.protocol} not supported in this test") # may contain query string - bucket_url_parsed = urlparse(bucket_url) - bucket_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, glob_folder) - ).geturl() - filesystem, _ = fsspec_from_config(config) + filesystem, fs_path = fsspec_from_config(config) + bucket_url = make_fsspec_url(config.protocol, posixpath.join(fs_path, glob_folder), bucket_url) if config.protocol == "memory": - mem_path = os.path.join("m", "standard_source") + mem_path = os.path.join("/m", "standard_source") if not filesystem.isdir(mem_path): filesystem.mkdirs(mem_path) filesystem.upload(TEST_SAMPLE_FILES, mem_path, recursive=True) + if config.protocol == "file": + file_path = os.path.join("_storage", "standard_source") + if not filesystem.isdir(file_path): + filesystem.mkdirs(file_path) + filesystem.upload(TEST_SAMPLE_FILES, file_path, recursive=True) return bucket_url, config, filesystem diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py new file mode 100644 index 0000000000..5f8641f9fa --- /dev/null +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -0,0 +1,85 @@ +import pytest +import os + +from dlt.common.utils import uniq_id +from tests.load.utils import DestinationTestConfiguration, destinations_configs, AZ_BUCKET +from tests.pipeline.utils import assert_load_info + + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, bucket_subset=(AZ_BUCKET), subset=("databricks",) + ), + ids=lambda x: x.name, +) +def test_databricks_external_location(destination_config: DestinationTestConfiguration) -> None: + # do not interfere with state + os.environ["RESTORE_FROM_DESTINATION"] = "False" + dataset_name = "test_databricks_external_location" + uniq_id() + + from dlt.destinations import databricks, filesystem + from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob + + abfss_bucket_url = DatabricksLoadJob.ensure_databricks_abfss_url(AZ_BUCKET, "dltdata") + stage = filesystem(abfss_bucket_url) + + # should load abfss formatted url just fine + bricks = databricks(is_staging_external_location=False) + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert_load_info(info) + # get metrics + metrics = info.metrics[info.loads_ids[0]][0] + remote_url = list(metrics["job_metrics"].values())[0].remote_url + 
# abfss form was preserved + assert remote_url.startswith(abfss_bucket_url) + + # should fail on internal config error as external location is not configured + bricks = databricks(is_staging_external_location=True) + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "Invalid configuration value detected" + in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) + + # should fail on non existing stored credentials + bricks = databricks(is_staging_external_location=False, staging_credentials_name="CREDENTIAL_X") + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) + + # should fail on non existing stored credentials + # auto stage with regular az:// used + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", dataset_name=dataset_name, destination=bricks + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index d88eba7c06..bc6cbd9848 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -300,16 +300,16 @@ def data_types(): assert len(rows) == 10 assert_all_data_types_row(rows[0], schema=column_schemas) - # make sure remote_uri is in metrics + # make sure remote_url is in metrics metrics = info.metrics[info.loads_ids[0]][0] - # TODO: only final copy job has remote_uri. not the initial (empty) job for particular files - # we could implement an empty job for delta that generates correct remote_uri - remote_uri = list(metrics["job_metrics"].values())[-1].remote_uri - assert remote_uri.endswith("data_types") - bucket_uri = destination_config.bucket_url - if FilesystemConfiguration.is_local_path(bucket_uri): - bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) - assert remote_uri.startswith(bucket_uri) + # TODO: only final copy job has remote_url. 
not the initial (empty) job for particular files + # we could implement an empty job for delta that generates correct remote_url + remote_url = list(metrics["job_metrics"].values())[-1].remote_url + assert remote_url.endswith("data_types") + bucket_url = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_url): + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + assert remote_url.startswith(bucket_url) # another run should append rows to the table info = pipeline.run(data_types()) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index f216fa3c05..42dee5fc8f 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -57,17 +57,17 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) - # checks if remote_uri is set correctly on copy jobs + # checks if remote_url is set correctly on copy jobs metrics = info.metrics[info.loads_ids[0]][0] for job_metrics in metrics["job_metrics"].values(): - remote_uri = job_metrics.remote_uri + remote_url = job_metrics.remote_url job_ext = os.path.splitext(job_metrics.job_id)[1] if job_ext not in (".reference", ".sql"): - assert remote_uri.endswith(job_ext) + assert remote_url.endswith(job_ext) bucket_uri = destination_config.bucket_url if FilesystemConfiguration.is_local_path(bucket_uri): - bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) - assert remote_uri.startswith(bucket_uri) + bucket_uri = FilesystemConfiguration.make_file_url(bucket_uri) + assert remote_url.startswith(bucket_uri) package_info = pipeline.get_load_package_info(info.loads_ids[0]) assert package_info.state == "loaded" diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 59b7acac15..72c5772668 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -1012,17 +1012,17 @@ def assert_complete_job( if state == "failed_jobs" else "completed" ) - remote_uri = job_metrics.remote_uri + remote_url = job_metrics.remote_url if load.initial_client_config.create_followup_jobs: # type: ignore - assert remote_uri.endswith(job.file_name()) + assert remote_url.endswith(job.file_name()) elif load.is_staging_destination_job(job.file_name()): # staging destination should contain reference to remote filesystem assert ( - FilesystemConfiguration.make_file_uri(REMOTE_FILESYSTEM) - in remote_uri + FilesystemConfiguration.make_file_url(REMOTE_FILESYSTEM) + in remote_url ) else: - assert remote_uri is None + assert remote_url is None else: assert job_metrics is None diff --git a/tests/load/utils.py b/tests/load/utils.py index 086109de8b..15b1e1575e 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -70,6 +70,7 @@ AWS_BUCKET = dlt.config.get("tests.bucket_url_s3", str) GCS_BUCKET = dlt.config.get("tests.bucket_url_gs", str) AZ_BUCKET = dlt.config.get("tests.bucket_url_az", str) +ABFS_BUCKET = dlt.config.get("tests.bucket_url_abfss", str) GDRIVE_BUCKET = dlt.config.get("tests.bucket_url_gdrive", str) FILE_BUCKET = dlt.config.get("tests.bucket_url_file", str) R2_BUCKET = dlt.config.get("tests.bucket_url_r2", str) @@ -79,6 +80,7 @@ "s3", "gs", "az", + "abfss", "gdrive", "file", "memory", @@ -86,7 +88,15 @@ ] # Filter out buckets not in all filesystem drivers -WITH_GDRIVE_BUCKETS = [GCS_BUCKET, AWS_BUCKET, FILE_BUCKET, MEMORY_BUCKET, AZ_BUCKET, GDRIVE_BUCKET] 
+WITH_GDRIVE_BUCKETS = [ + GCS_BUCKET, + AWS_BUCKET, + FILE_BUCKET, + MEMORY_BUCKET, + ABFS_BUCKET, + AZ_BUCKET, + GDRIVE_BUCKET, +] WITH_GDRIVE_BUCKETS = [ bucket for bucket in WITH_GDRIVE_BUCKETS diff --git a/tests/pipeline/cases/contracts/trace.schema.yaml b/tests/pipeline/cases/contracts/trace.schema.yaml index 89831977c0..c324818338 100644 --- a/tests/pipeline/cases/contracts/trace.schema.yaml +++ b/tests/pipeline/cases/contracts/trace.schema.yaml @@ -562,7 +562,7 @@ tables: finished_at: data_type: timestamp nullable: true - remote_uri: + remote_url: data_type: text nullable: true parent: trace__steps diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index b6a7feffc1..027a2b4e72 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2600,6 +2600,20 @@ def ids(_id=dlt.sources.incremental("_id", initial_value=2)): assert pipeline.last_trace.last_normalize_info.row_counts["_ids"] == 2 +def test_dlt_columns_nested_table_collisions() -> None: + # we generate all identifiers in upper case to test for a bug where dlt columns for nested tables were hardcoded to + # small caps. they got normalized to upper case after the first run and then added again as small caps + # generating duplicate columns and raising collision exception as duckdb is ci destination + duck = duckdb(naming_convention="tests.common.cases.normalizers.sql_upper") + pipeline = dlt.pipeline("test_dlt_columns_child_table_collisions", destination=duck) + customers = [ + {"id": 1, "name": "dave", "orders": [1, 2, 3]}, + ] + assert_load_info(pipeline.run(customers, table_name="CUSTOMERS")) + # this one would fail without bugfix + assert_load_info(pipeline.run(customers, table_name="CUSTOMERS")) + + def test_access_pipeline_in_resource() -> None: pipeline = dlt.pipeline("test_access_pipeline_in_resource", destination="duckdb") diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 4e52d2aa29..d2bb035a17 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -315,7 +315,7 @@ def data(): return data() - # create pipeline with staging to get remote_uri in load step job_metrics + # create pipeline with staging to get remote_url in load step job_metrics dummy_dest = dummy(completed_prob=1.0) pipeline = dlt.pipeline( pipeline_name="test_trace_schema", From 63f89542678c7af51089f94365aa6834ccca90e7 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Wed, 28 Aug 2024 13:20:16 +0200 Subject: [PATCH 32/34] bumps dlt version to 0.5.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 74161f5ccc..d32285572f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.4a0" +version = "0.5.4" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] From b48c7c3e7db9fb4ff321b668b9b22553b7882b31 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 28 Aug 2024 19:11:56 +0200 Subject: [PATCH 33/34] runs staging tests on athena (#1764) * always truncates staging tables on athena + replace without iceberg * adds athena staging configs to all staging configs * updates athena tests for staging destination --- dlt/common/destination/reference.py | 11 +++++ dlt/destinations/impl/athena/athena.py | 2 +- tests/load/pipeline/test_stage_loading.py | 23 ++++++++++- tests/load/utils.py | 49 ++++++++++++++--------- 4 files changed, 62 insertions(+), 23 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 0944b03bea..e7bba266df 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -586,10 +586,21 @@ class SupportsStagingDestination(ABC): def should_load_data_to_staging_dataset_on_staging_destination( self, table: TTableSchema ) -> bool: + """If set to True, and staging destination is configured, the data will be loaded to staging dataset on staging destination + instead of a regular dataset on staging destination. Currently it is used by Athena Iceberg which uses staging dataset + on staging destination to copy data to iceberg tables stored on regular dataset on staging destination. + The default is to load data to regular dataset on staging destination from where warehouses like Snowflake (that have their + own storage) will copy data. + """ return False @abstractmethod def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + """If set to True, data in `table` will be truncated on staging destination (regular dataset). This is the default behavior which + can be changed with a config flag. + For Athena + Iceberg this setting is always False - Athena uses regular dataset to store Iceberg tables and we avoid touching it. + For Athena we truncate those tables only on "replace" write disposition. 
+ """ pass diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index b3b2fbcf0f..a5a8ae2562 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -530,7 +530,7 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable if table["write_disposition"] == "replace" and not self._is_iceberg_table( self.prepare_load_table(table["name"]) ): - return self.config.truncate_tables_on_staging_destination_before_load + return True return False def should_load_data_to_staging_dataset_on_staging_destination( diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 42dee5fc8f..3bfa050fd7 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -218,7 +218,18 @@ def test_truncate_staging_dataset(destination_config: DestinationTestConfigurati # check there are two staging files _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) with staging_client: - assert len(staging_client.list_table_files(table_name)) == 2 # type: ignore[attr-defined] + # except Athena + Iceberg which does not store tables in staging dataset + if ( + destination_config.destination == "athena" + and destination_config.table_format == "iceberg" + ): + table_count = 0 + # but keeps them in staging dataset on staging destination - but only the last one + with staging_client.with_staging_dataset(): # type: ignore[attr-defined] + assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + else: + table_count = 2 + assert len(staging_client.list_table_files(table_name)) == table_count # type: ignore[attr-defined] # load the data with truncating, so only new file is on the staging pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = True @@ -231,7 +242,15 @@ def test_truncate_staging_dataset(destination_config: DestinationTestConfigurati # check there is only one staging file _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) with staging_client: - assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + # except for Athena which does not delete staging destination tables + if destination_config.destination == "athena": + if destination_config.table_format == "iceberg": + table_count = 0 + else: + table_count = 3 + else: + table_count = 1 + assert len(staging_client.list_table_files(table_name)) == table_count # type: ignore[attr-defined] @pytest.mark.parametrize( diff --git a/tests/load/utils.py b/tests/load/utils.py index 15b1e1575e..5427904d52 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -257,6 +257,27 @@ def destinations_configs( # build destination configs destination_configs: List[DestinationTestConfiguration] = [] + # default sql configs that are also default staging configs + default_sql_configs_with_staging = [ + # Athena needs filesystem staging, which will be automatically set; we have to supply a bucket url though. 
+ DestinationTestConfiguration( + destination="athena", + file_format="parquet", + supports_merge=False, + bucket_url=AWS_BUCKET, + ), + DestinationTestConfiguration( + destination="athena", + file_format="parquet", + bucket_url=AWS_BUCKET, + force_iceberg=True, + supports_merge=True, + supports_dbt=False, + table_format="iceberg", + extra_info="iceberg", + ), + ] + # default non staging sql based configs, one per destination if default_sql_configs: destination_configs += [ @@ -268,26 +289,10 @@ def destinations_configs( DestinationTestConfiguration(destination="duckdb", file_format="parquet"), DestinationTestConfiguration(destination="motherduck", file_format="insert_values"), ] - # Athena needs filesystem staging, which will be automatically set; we have to supply a bucket url though. - destination_configs += [ - DestinationTestConfiguration( - destination="athena", - file_format="parquet", - supports_merge=False, - bucket_url=AWS_BUCKET, - ) - ] - destination_configs += [ - DestinationTestConfiguration( - destination="athena", - file_format="parquet", - bucket_url=AWS_BUCKET, - force_iceberg=True, - supports_merge=True, - supports_dbt=False, - extra_info="iceberg", - ) - ] + + # add Athena staging configs + destination_configs += default_sql_configs_with_staging + destination_configs += [ DestinationTestConfiguration( destination="clickhouse", file_format="jsonl", supports_dbt=False @@ -332,6 +337,10 @@ def destinations_configs( DestinationTestConfiguration(destination="qdrant", extra_info="server"), ] + if (default_sql_configs or all_staging_configs) and not default_sql_configs: + # athena default configs not added yet + destination_configs += default_sql_configs_with_staging + if default_staging_configs or all_staging_configs: destination_configs += [ DestinationTestConfiguration( From e9c9ecfa8a644fdb516dd74aabca3bf75bafb154 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Wed, 28 Aug 2024 21:45:16 +0200 Subject: [PATCH 34/34] fixes staging tests for athena --- tests/load/pipeline/test_stage_loading.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 3bfa050fd7..6c4f6dfec8 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -74,8 +74,14 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: assert len(package_info.jobs["failed_jobs"]) == 0 # we have 4 parquet and 4 reference jobs plus one merge job - num_jobs = 4 + 4 + 1 if destination_config.supports_merge else 4 + 4 - assert len(package_info.jobs["completed_jobs"]) == num_jobs + num_jobs = 4 + 4 + num_sql_jobs = 0 + if destination_config.supports_merge: + num_sql_jobs += 1 + # sql job is used to copy parquet to Athena Iceberg table (_dlt_pipeline_state) + if destination_config.destination == "athena" and destination_config.table_format == "iceberg": + num_sql_jobs += 1 + assert len(package_info.jobs["completed_jobs"]) == num_jobs + num_sql_jobs assert ( len( [ @@ -110,7 +116,7 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: if x.job_file_info.file_format == "sql" ] ) - == 1 + == num_sql_jobs ) initial_counts = load_table_counts(