From 8b4fc8c25d2a46f86432d496b413983da2bd87eb Mon Sep 17 00:00:00 2001 From: Willi Date: Thu, 8 Aug 2024 19:27:34 +0530 Subject: [PATCH 01/34] RangePaginator: Stops pagination in case of page without data items --- dlt/sources/helpers/rest_client/client.py | 2 +- dlt/sources/helpers/rest_client/paginators.py | 62 +++++++++++-------- .../helpers/rest_client/test_client.py | 2 +- .../helpers/rest_client/test_paginators.py | 29 +++++++++ 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index 73ae064299..c05dabc30c 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -225,7 +225,7 @@ def raise_for_status(response: Response, *args: Any, **kwargs: Any) -> None: if paginator is None: paginator = self.detect_paginator(response, data) - paginator.update_state(response) + paginator.update_state(response, data) paginator.update_request(request) # yield data with context diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 4c8ce70bb2..078d4b0a87 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -1,6 +1,6 @@ import warnings from abc import ABC, abstractmethod -from typing import Optional, Dict, Any +from typing import Any, Dict, List, Optional from urllib.parse import urlparse, urljoin from requests import Response, Request @@ -39,7 +39,7 @@ def init_request(self, request: Request) -> None: # noqa: B027, optional overri pass @abstractmethod - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Updates the paginator's state based on the response from the API. 
This method should extract necessary pagination details (like next page @@ -73,7 +73,7 @@ def __str__(self) -> str: class SinglePagePaginator(BasePaginator): """A paginator for single-page API responses.""" - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: @@ -96,6 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", + stop_after_empty_page: bool = False, ): """ Args: @@ -127,6 +128,7 @@ def __init__( self.maximum_value = maximum_value self.total_path = jsonpath.compile_path(total_path) if total_path else None self.error_message_items = error_message_items + self.stop_after_empty_page = stop_after_empty_page def init_request(self, request: Request) -> None: if request.params is None: @@ -134,26 +136,32 @@ def init_request(self, request: Request) -> None: request.params[self.param_name] = self.current_value - def update_state(self, response: Response) -> None: - total = None - if self.total_path: - response_json = response.json() - values = jsonpath.find_values(self.total_path, response_json) - total = values[0] if values else None - if total is None: - self._handle_missing_total(response_json) - - try: - total = int(total) - except ValueError: - self._handle_invalid_total(total) - - self.current_value += self.value_step - - if (total is not None and self.current_value >= total + self.base_index) or ( - self.maximum_value is not None and self.current_value >= self.maximum_value - ): + def update_state(self, response: Response, data: List[Any] = None) -> None: + if self._stop_after_this_page(data): self._has_next_page = False + else: + total = None + if self.total_path: + response_json = response.json() + values = jsonpath.find_values(self.total_path, response_json) + total = values[0] if values else None + if total is None: + self._handle_missing_total(response_json) + + try: + total = int(total) + except ValueError: + self._handle_invalid_total(total) + + self.current_value += self.value_step + + if (total is not None and self.current_value >= total + self.base_index) or ( + self.maximum_value is not None and self.current_value >= self.maximum_value + ): + self._has_next_page = False + + def _stop_after_this_page(self, data: List[Any]) -> bool: + return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: raise ValueError( @@ -229,6 +237,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, + stop_after_empty_page: bool = False, ): """ Args: @@ -260,6 +269,7 @@ def __init__( value_step=1, maximum_value=maximum_page, error_message_items="pages", + stop_after_empty_page=stop_after_empty_page, ) def __str__(self) -> str: @@ -330,6 +340,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, + stop_after_empty_page: bool = False, ) -> None: """ Args: @@ -356,6 +367,7 @@ def __init__( total_path=total_path, value_step=limit, maximum_value=maximum_offset, + stop_after_empty_page=stop_after_empty_page, ) self.limit_param = limit_param self.limit = limit @@ -484,7 +496,7 @@ def __init__(self, links_next_key: str = "next") -> None: super().__init__() self.links_next_key = links_next_key - def update_state(self, response: Response) -> None: + def 
update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") @@ -539,7 +551,7 @@ def __init__( super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None @@ -618,7 +630,7 @@ def __init__( self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index f5de1ec5da..af914bf89d 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -400,7 +400,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response): + def update_state(self, response, data): self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 8a3c136e09..9e4ccada72 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -1,3 +1,4 @@ +from typing import Any, List from unittest.mock import Mock import pytest @@ -312,6 +313,19 @@ def test_client_pagination(self, rest_client): assert_pagination(pages) + def test_stop_after_empty_page(self): + paginator = OffsetPaginator( + offset=0, + limit=50, + maximum_offset=100, + total_path=None, + stop_after_empty_page=True, + ) + response = Mock(Response, json=lambda: {"items": []}) + no_data_found: List[Any] = [] + paginator.update_state(response, no_data_found) # Page 1 + assert paginator.has_next_page is False + @pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: @@ -372,6 +386,21 @@ def test_maximum_page(self): assert paginator.current_value == 3 assert paginator.has_next_page is False + def test_stop_after_empty_page(self): + paginator = PageNumberPaginator( + base_page=1, + page=1, + maximum_page=5, + stop_after_empty_page=True, + total_path=None, + ) + response = Mock(Response, json=lambda: {"items": []}) + no_data_found: List[Any] = [] + assert paginator.has_next_page is True + paginator.update_state(response, no_data_found) + assert paginator.current_value == 1 + assert paginator.has_next_page is False + def test_client_pagination_one_based(self, rest_client): pages_iter = rest_client.paginate( "/posts", From 8d4ffa9e49f083866c507de68c57196332e0493c Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 9 Aug 2024 15:58:55 +0530 Subject: [PATCH 02/34] Defaults RangePaginator to stop after having received an empty page --- dlt/sources/helpers/rest_client/paginators.py | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 078d4b0a87..a96413d84e 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -96,7 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ): """ Args: @@ -237,7 +237,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ): """ Args: @@ -340,7 +340,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ) -> None: """ Args: From e9ecf88a741033034b7beff4fcb0c3e8a12d12e9 Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 15:31:04 +0530 Subject: [PATCH 03/34] Documents how to stop paginator, updates docs on json_link --- dlt/sources/helpers/rest_client/paginators.py | 12 +++-- .../verified-sources/rest_api.md | 8 +-- .../docs/general-usage/http/rest-client.md | 51 +++++++++++++++---- 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index a96413d84e..083b95da18 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -96,7 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ): """ Args: @@ -117,6 +117,8 @@ def __init__( If not provided, `maximum_value` must be specified. error_message_items (str): The name of the items in the error message. Defaults to 'items'. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ super().__init__() if total_path is None and maximum_value is None: @@ -237,7 +239,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ): """ Args: @@ -255,6 +257,8 @@ def __init__( will stop once this page is reached or exceeded, even if more data is available. This allows you to limit the maximum number of pages for pagination. Defaults to None. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ if total_path is None and maximum_page is None: raise ValueError("Either `total_path` or `maximum_page` must be provided.") @@ -340,7 +344,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ) -> None: """ Args: @@ -358,6 +362,8 @@ def __init__( pagination will stop once this offset is reached or exceeded, even if more data is available. This allows you to limit the maximum range for pagination. Defaults to None. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. 
""" if total_path is None and maximum_offset is None: raise ValueError("Either `total_path` or `maximum_offset` must be provided.") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 4b72b3276e..b4d2d08daa 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -371,7 +371,7 @@ You can configure the pagination for the `posts` resource like this: { "path": "posts", "paginator": { - "type": "json_response", + "type": "json_link", "next_url_path": "pagination.next", } } @@ -380,7 +380,7 @@ You can configure the pagination for the `posts` resource like this: Alternatively, you can use the paginator instance directly: ```py -from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator +from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator # ... @@ -402,8 +402,8 @@ These are the available paginators: | ------------ | -------------- | ----------- | | `json_link` | [JSONLinkPaginator](../../general-usage/http/rest-client.md#jsonresponsepaginator) | The link to the next page is in the body (JSON) of the response.
*Parameters:*
  • `next_url_path` (str) - the JSONPath to the next page URL
| | `header_link` | [HeaderLinkPaginator](../../general-usage/http/rest-client.md#headerlinkpaginator) | The links to the next page are in the response headers.
*Parameters:*
  • `links_next_key` (str) - the name of the header containing the links. Default is "next".
| -| `offset` | [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
| -| `page_number` | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
| +| `offset` | [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| +| `page_number` | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| | `cursor` | [JSONResponseCursorPaginator](../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON).
*Parameters:*
  • `cursor_path` (str) - the JSONPath to the cursor value. Defaults to "cursors.next"
  • `cursor_param` (str) - the query parameter name for the cursor. Defaults to "after"
|
| `single_page` | SinglePagePaginator | The response will be interpreted as a single-page response, ignoring possible pagination metadata. |
| `auto` | `None` | Explicitly specify that the source should automatically detect the pagination method. |

diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md
index ddd66a233b..9451ca689d 100644
--- a/docs/website/docs/general-usage/http/rest-client.md
+++ b/docs/website/docs/general-usage/http/rest-client.md
@@ -183,8 +183,9 @@ need to specify the paginator when the API uses a different relation type.
 - `offset`: The initial offset for the first request. Defaults to `0`.
 - `offset_param`: The name of the query parameter used to specify the offset. Defaults to `"offset"`.
 - `limit_param`: The name of the query parameter used to specify the limit. Defaults to `"limit"`.
-- `total_path`: A JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset`.
+- `total_path`: A JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`.
 - `maximum_offset`: Optional maximum offset value. Limits pagination even without total count.
+- `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`.
 
 **Example:**
 
@@ -198,7 +199,7 @@ E.g. `https://api.example.com/items?offset=0&limit=100`, `https://api.example.co
 }
 ```
 
-You can paginate through responses from this API using `OffsetPaginator`:
+You can paginate through responses from this API using the `OffsetPaginator`:
 
 ```py
 client = RESTClient(
@@ -210,20 +211,34 @@ client = RESTClient(
     )
 )
 ```
 
-In a different scenario where the API does not provide the total count, you can use `maximum_offset` to limit the pagination:
+Pagination stops by default when a page contains no records. This is especially useful when the API does not provide the total item count.
+Here, the `total_path` parameter is set to `None` because the API does not provide the total count.
 
 ```py
 client = RESTClient(
     base_url="https://api.example.com",
     paginator=OffsetPaginator(
         limit=100,
-        maximum_offset=1000,
-        total_path=None
+        total_path=None,
     )
 )
 ```
 
-Note, that in this case, the `total_path` parameter is set explicitly to `None` to indicate that the API does not provide the total count.
+Additionally, you can limit pagination with `maximum_offset`, for example during development. If `maximum_offset` is reached before the first empty page, pagination stops:
+
+```py
+client = RESTClient(
+    base_url="https://api.example.com",
+    paginator=OffsetPaginator(
+        limit=10,
+        maximum_offset=20, # limits response to 20 records
+        total_path=None,
+    )
+)
+```
+
+You can disable automatic stopping of pagination by setting `stop_after_empty_page=False`. In this case, you must provide either `total_path` or `maximum_offset` to guarantee that the paginator terminates.
+

#### PageNumberPaginator

- `base_page`: The index of the initial page from the API perspective. Normally, it's 0-based or 1-based (e.g., 1, 2, 3, ...) indexing for the pages. Defaults to 0.
- `page`: The page number for the first request. If not provided, the initial value will be set to `base_page`.
- `page_param`: The query parameter name for the page number. Defaults to `"page"`.
-- `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`.
+- `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`.
 - `maximum_page`: Optional maximum page number. Stops pagination once this page is reached.
+- `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`.
 
 **Example:**
 
@@ -248,7 +264,7 @@ Assuming an API endpoint `https://api.example.com/items` paginates by page numbe
 }
 ```
 
-You can paginate through responses from this API using `PageNumberPaginator`:
+You can paginate through responses from this API using the `PageNumberPaginator`:
 
 ```py
 client = RESTClient(
@@ -259,19 +275,32 @@ client = RESTClient(
     )
 )
 ```
 
-If the API does not provide the total number of pages:
+Pagination stops by default when a page contains no records. This is especially useful when the API does not provide the total item count.
+Here, the `total_path` parameter is set to `None` because the API does not provide the total count.
 
 ```py
 client = RESTClient(
     base_url="https://api.example.com",
     paginator=PageNumberPaginator(
-        maximum_page=5, # Stops after fetching 5 pages
         total_path=None
     )
 )
 ```
 
-Note, that in the case above, the `total_path` parameter is set explicitly to `None` to indicate that the API does not provide the total count.
+Additionally, you can limit pagination with `maximum_page`, for example during development. If `maximum_page` is reached before the first empty page, pagination stops:
+
+```py
+client = RESTClient(
+    base_url="https://api.example.com",
+    paginator=PageNumberPaginator(
+        maximum_page=2, # limits response to 2 pages
+        total_path=None,
+    )
+)
+```
+
+You can disable automatic stopping of pagination by setting `stop_after_empty_page=False`. In this case, you must provide either `total_path` or `maximum_page` to guarantee that the paginator terminates.
+

#### JSONResponseCursorPaginator

From 5e78dcc45efc3400811c00dc1ce1bb7564ba3f6c Mon Sep 17 00:00:00 2001
From: Willi
Date: Mon, 12 Aug 2024 17:05:28 +0530
Subject: [PATCH 04/34] Either total_path or maximum_value or stop_after_empty_pages is required

---
 dlt/sources/helpers/rest_client/paginators.py | 18 ++++--
 .../helpers/rest_client/test_paginators.py | 56 +++++++++++++++++++
 2 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py
index 083b95da18..888539a64d 100644
--- a/dlt/sources/helpers/rest_client/paginators.py
+++ b/dlt/sources/helpers/rest_client/paginators.py
@@ -121,8 +121,10 @@ def __init__(
             a page contains no result items. Defaults to `True`.
         """
         super().__init__()
-        if total_path is None and maximum_value is None:
-            raise ValueError("Either `total_path` or `maximum_value` must be provided.")
+        if total_path is None and maximum_value is None and not stop_after_empty_page:
+            raise ValueError(
+                "Either `total_path` or `maximum_value` or stop_after_empty_page must be provided."
+            )
         self.param_name = param_name
         self.current_value = initial_value
         self.value_step = value_step
@@ -260,8 +262,10 @@ def __init__(
             stop_after_empty_page (bool): Whether pagination should stop when
             a page contains no result items. Defaults to `True`.
""" - if total_path is None and maximum_page is None: - raise ValueError("Either `total_path` or `maximum_page` must be provided.") + if total_path is None and maximum_page is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_page` or `stop_after_empty_page` must be provided." + ) page = page if page is not None else base_page @@ -365,8 +369,10 @@ def __init__( stop_after_empty_page (bool): Whether pagination should stop when a page contains no result items. Defaults to `True`. """ - if total_path is None and maximum_offset is None: - raise ValueError("Either `total_path` or `maximum_offset` must be provided.") + if total_path is None and maximum_offset is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be provided." + ) super().__init__( param_name=offset_param, initial_value=offset, diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 9e4ccada72..7357169101 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -326,6 +326,36 @@ def test_stop_after_empty_page(self): paginator.update_state(response, no_data_found) # Page 1 assert paginator.has_next_page is False + def test_guarantee_termination(self): + OffsetPaginator( + limit=10, + total_path=None, + ) + + OffsetPaginator( + limit=10, + total_path=None, + maximum_offset=1, + stop_after_empty_page=False, + ) + + with pytest.raises(ValueError) as e: + OffsetPaginator( + limit=10, + total_path=None, + stop_after_empty_page=False, + ) + assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + + with pytest.raises(ValueError) as e: + OffsetPaginator( + limit=10, + total_path=None, + stop_after_empty_page=False, + maximum_offset=None, + ) + assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + @pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: @@ -431,6 +461,32 @@ def test_client_pagination_zero_based(self, rest_client): assert_pagination(pages) + def test_guarantee_termination(self): + PageNumberPaginator( + total_path=None, + ) + + PageNumberPaginator( + total_path=None, + maximum_page=1, + stop_after_empty_page=False, + ) + + with pytest.raises(ValueError) as e: + PageNumberPaginator( + total_path=None, + stop_after_empty_page=False, + ) + assert e.match("`total_path` or `maximum_page` or `stop_after_empty_page` must be provided") + + with pytest.raises(ValueError) as e: + PageNumberPaginator( + total_path=None, + stop_after_empty_page=False, + maximum_page=None, + ) + assert e.match("`total_path` or `maximum_page` or `stop_after_empty_page` must be provided") + @pytest.mark.usefixtures("mock_api_server") class TestJSONResponseCursorPaginator: From 44b82749365592de0b12879fb564d56c05120c72 Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 17:55:20 +0530 Subject: [PATCH 05/34] updates docs to new type signature --- dlt/sources/helpers/rest_client/paginators.py | 14 +++++++------- .../website/docs/general-usage/http/rest-client.md | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 888539a64d..993cbf7f26 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -39,7 +39,7 @@ def 
init_request(self, request: Request) -> None: # noqa: B027, optional overri pass @abstractmethod - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Updates the paginator's state based on the response from the API. This method should extract necessary pagination details (like next page @@ -73,7 +73,7 @@ def __str__(self) -> str: class SinglePagePaginator(BasePaginator): """A paginator for single-page API responses.""" - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: @@ -140,7 +140,7 @@ def init_request(self, request: Request) -> None: request.params[self.param_name] = self.current_value - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: if self._stop_after_this_page(data): self._has_next_page = False else: @@ -164,7 +164,7 @@ def update_state(self, response: Response, data: List[Any] = None) -> None: ): self._has_next_page = False - def _stop_after_this_page(self, data: List[Any]) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]]) -> bool: return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: @@ -508,7 +508,7 @@ def __init__(self, links_next_key: str = "next") -> None: super().__init__() self.links_next_key = links_next_key - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") @@ -563,7 +563,7 @@ def __init__( super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None @@ -642,7 +642,7 @@ def __init__( self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 9451ca689d..40c83f8c5b 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -339,7 +339,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee - `init_request(request: Request) -> None`: This method is called before making the first API call in the `RESTClient.paginate` method. You can use this method to set up the initial request query parameters, headers, etc. For example, you can set the initial page number or cursor value. 
-- `update_state(response: Response) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. +- `update_state(response: Response, data: Optional[List[Any]]) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. - `update_request(request: Request) -> None`: Before making the next API call in `RESTClient.paginate` method, `update_request` is used to modify the request with the necessary parameters to fetch the next page (based on the current state of the paginator). For example, you can add query parameters to the request, or modify the URL. @@ -348,6 +348,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee Suppose an API uses query parameters for pagination, incrementing an page parameter for each subsequent page, without providing direct links to next pages in its responses. E.g. `https://api.example.com/posts?page=1`, `https://api.example.com/posts?page=2`, etc. Here's how you could implement a paginator for this scheme: ```py +from typing import Any, List, Optional from dlt.sources.helpers.rest_client.paginators import BasePaginator from dlt.sources.helpers.requests import Response, Request @@ -361,7 +362,7 @@ class QueryParamPaginator(BasePaginator): # This will set the initial page number (e.g. page=1) self.update_request(request) - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: # Assuming the API returns an empty list when no more data is available if not response.json(): self._has_next_page = False @@ -399,6 +400,7 @@ def get_data(): Some APIs use POST requests for pagination, where the next page is fetched by sending a POST request with a cursor or other parameters in the request body. This is frequently used in "search" API endpoints or other endpoints with big payloads. 
Here's how you could implement a paginator for a case like this: ```py +from typing import Any, List, Optional from dlt.sources.helpers.rest_client.paginators import BasePaginator from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.requests import Response, Request @@ -408,7 +410,7 @@ class PostBodyPaginator(BasePaginator): super().__init__() self.cursor = None - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: # Assuming the API returns an empty list when no more data is available if not response.json(): self._has_next_page = False From e42f4d729f9f65e049ba87fb72f2ae7652867264 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:18:06 +0530 Subject: [PATCH 06/34] Updated the docs: Using pipeline.default_schema.toprettyyaml() (#1660) --- docs/website/docs/general-usage/schema.md | 24 ++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 0e3e3bba1f..df405de1af 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -352,8 +352,30 @@ load_info = pipeline.run(source_data) ``` This example iterates through MongoDB collections, applying the complex [data type](schema#data-types) to a specified column, and then processes the data with `pipeline.run`. -## Export and import schema files +## View and print the schema +To view and print the default schema in a clear YAML format use the command: + +```py +pipeline.default_schema.to_pretty_yaml() +``` +This can be used in a pipeline as: +```py +# Create a pipeline +pipeline = dlt.pipeline( + pipeline_name="chess_pipeline", + destination='duckdb', + dataset_name="games_data") + +# Run the pipeline +load_info = pipeline.run(source) + +# Print the default schema in a pretty YAML format +print(pipeline.default_schema.to_pretty_yaml()) +``` +This will display a structured YAML representation of your schema, showing details like tables, columns, data types, and metadata, including version, version_hash, and engine_version. + +## Export and import schema files Please follow the guide on [how to adjust a schema](../walkthroughs/adjust-a-schema.md) to export and import `yaml` schema files in your pipeline. From a9c29586ff108de6ccf0bb5aace945df7ba94765 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:51:38 +0400 Subject: [PATCH 07/34] Add `storage_options` to `DeltaTable.create` (#1686) * add storage_options to delta table create statement --- dlt/common/schema/exceptions.py | 9 ++++++--- dlt/common/schema/utils.py | 4 +++- dlt/destinations/impl/filesystem/filesystem.py | 1 + dlt/normalize/schema.py | 5 ++++- tests/load/pipeline/test_filesystem_pipeline.py | 2 +- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 1055163942..2e75b4b3a1 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -246,12 +246,15 @@ def __init__(self, schema_name: str, table_name: str, column: TColumnSchemaBase) elif column.get("primary_key"): key_type = "primary key" - msg = f"The column {column['name']} in table {table_name} did not receive any data during this load. 
" + msg = ( + f"The column {column['name']} in table {table_name} did not receive any data during" + " this load. " + ) if key_type or not nullable: msg += f"It is marked as non-nullable{' '+key_type} and it must have values. " msg += ( - "This can happen if you specify the column manually, for example using the 'merge_key', 'primary_key' or 'columns' argument " - "but it does not exist in the data." + "This can happen if you specify the column manually, for example using the 'merge_key'," + " 'primary_key' or 'columns' argument but it does not exist in the data." ) super().__init__(schema_name, msg) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index d879c21b3c..8b87a7e5fe 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -357,7 +357,9 @@ def is_nullable_column(col: TColumnSchemaBase) -> bool: return col.get("nullable", True) -def find_incomplete_columns(tables: List[TTableSchema]) -> Iterable[Tuple[str, TColumnSchemaBase, bool]]: +def find_incomplete_columns( + tables: List[TTableSchema], +) -> Iterable[Tuple[str, TColumnSchemaBase, bool]]: """Yields (table_name, column, nullable) for all incomplete columns in `tables`""" for table in tables: for col in table["columns"].values(): diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 449d5c1862..7009ad95ac 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -123,6 +123,7 @@ def run(self) -> None: table_uri=dt_path, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", + storage_options=storage_options, ) return diff --git a/dlt/normalize/schema.py b/dlt/normalize/schema.py index 4967fab18f..c01d184c92 100644 --- a/dlt/normalize/schema.py +++ b/dlt/normalize/schema.py @@ -3,13 +3,16 @@ from dlt.common.schema.exceptions import UnboundColumnException from dlt.common import logger + def verify_normalized_schema(schema: Schema) -> None: """Verify the schema is valid for next stage after normalization. 1. Log warning if any incomplete nullable columns are in any data tables 2. Raise `UnboundColumnException` on incomplete non-nullable columns (e.g. 
missing merge/primary key) """ - for table_name, column, nullable in find_incomplete_columns(schema.data_tables(seen_data_only=True)): + for table_name, column, nullable in find_incomplete_columns( + schema.data_tables(seen_data_only=True) + ): exc = UnboundColumnException(schema.name, table_name, column) if nullable: logger.warning(str(exc)) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 0554b1ef3c..71620e889d 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -442,7 +442,7 @@ def complex_table(): destinations_configs( table_format_filesystem_configs=True, table_format="delta", - bucket_subset=(FILE_BUCKET), + bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) From 61fc190fa580d0efc32ce6a074a088e674554040 Mon Sep 17 00:00:00 2001 From: Willi Date: Wed, 14 Aug 2024 17:42:49 +0530 Subject: [PATCH 08/34] documents pluggable custom auth --- .../docs/dlt-ecosystem/verified-sources/rest_api.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 4b72b3276e..ca9a9360c5 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -553,6 +553,19 @@ Available authentication types: For more complex authentication methods, you can implement a [custom authentication class](../../general-usage/http/rest-client.md#implementing-custom-authentication) and use it in the configuration. +You can use the dictionary configuration syntax also for custom authentication classes after registering them as follows: + +```py +rest_api.config_setup.register_auth("custom_auth", CustomAuth) + +{ + # ... + "auth": { + "type": "custom_auth", + "api_key": dlt.secrets["sources.my_source.my_api_key"], + } +} +``` ### Define resource relationships From 9bd0b2e258cc8abd3745598e0be9339e15b59847 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 14 Aug 2024 14:30:02 +0200 Subject: [PATCH 09/34] bumps to pre release 0.5.4a0 (#1689) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fcf508f95b..f33bbbefcf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.3" +version = "0.5.4a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] From 122fc7f7ae3d7f00a899cedcb7bb1d77a15accb7 Mon Sep 17 00:00:00 2001 From: VioletM Date: Wed, 14 Aug 2024 08:46:09 -0400 Subject: [PATCH 10/34] Allow different from credentials project_id for BigQuery (#1680) --- dlt/destinations/impl/bigquery/bigquery.py | 1 + .../impl/bigquery/configuration.py | 4 ++- dlt/destinations/impl/bigquery/sql_client.py | 8 ++++-- .../dlt-ecosystem/destinations/bigquery.md | 12 ++++++++ tests/load/bigquery/test_bigquery_client.py | 28 +++++++++++++++++++ 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index ef4e31acd1..c6bf2e7654 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -225,6 +225,7 @@ def __init__( config.credentials, capabilities, config.get_location(), + config.project_id, config.http_timeout, config.retry_deadline, ) diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 47cc997a4a..3d71b0c8ea 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -1,6 +1,6 @@ import dataclasses import warnings -from typing import ClassVar, List, Final +from typing import ClassVar, List, Final, Optional from dlt.common.configuration import configspec from dlt.common.configuration.specs import GcpServiceAccountCredentials @@ -14,6 +14,8 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = dataclasses.field(default="bigquery", init=False, repr=False, compare=False) # type: ignore credentials: GcpServiceAccountCredentials = None location: str = "US" + project_id: Optional[str] = None + """Note, that this is BigQuery project_id which could be different from credentials.project_id""" has_case_sensitive_identifiers: bool = True """If True then dlt expects to load data into case sensitive dataset""" should_set_case_sensitivity_on_new_dataset: bool = False diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index dfc4094e7b..c56742f1ff 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -82,14 +82,16 @@ def __init__( credentials: GcpServiceAccountCredentialsWithoutDefaults, capabilities: DestinationCapabilitiesContext, location: str = "US", + project_id: Optional[str] = None, http_timeout: float = 15.0, retry_deadline: float = 60.0, ) -> None: self._client: bigquery.Client = None self.credentials: GcpServiceAccountCredentialsWithoutDefaults = credentials self.location = location + self.project_id = project_id or self.credentials.project_id self.http_timeout = http_timeout - super().__init__(credentials.project_id, dataset_name, staging_dataset_name, capabilities) + super().__init__(self.project_id, dataset_name, staging_dataset_name, capabilities) self._default_retry = bigquery.DEFAULT_RETRY.with_deadline(retry_deadline) self._default_query = bigquery.QueryJobConfig( @@ -100,7 +102,7 @@ def __init__( @raise_open_connection_error def open_connection(self) -> bigquery.Client: self._client = bigquery.Client( - self.credentials.project_id, + self.project_id, credentials=self.credentials.to_native_credentials(), location=self.location, ) @@ -240,7 +242,7 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> 
Iterator[DB conn.close() def catalog_name(self, escape: bool = True) -> Optional[str]: - project_id = self.capabilities.casefold_identifier(self.credentials.project_id) + project_id = self.capabilities.casefold_identifier(self.project_id) if escape: project_id = self.capabilities.escape_identifier(project_id) return project_id diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 51d124251a..334e08c4a7 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -112,6 +112,18 @@ VMs available on GCP (cloud functions, Composer runners, Colab notebooks) have a location = "US" ``` +### Using Different `project_id` + +You can set the `project_id` in your configuration to be different from the one in your credentials, provided your account has access to it: +```toml +[destination.bigquery] +project_id = "project_id_destination" + +[destination.bigquery.credentials] +project_id = "project_id_credentials" +``` +In this scenario, `project_id_credentials` will be used for authentication, while `project_id_destination` will be used as the data destination. + ## Write Disposition All write dispositions are supported. diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index 80bd008730..c92f18e159 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -32,6 +32,7 @@ prepare_table, yield_client_with_storage, cm_yield_client_with_storage, + cm_yield_client, ) # mark all tests as essential, do not remove @@ -53,6 +54,18 @@ def auto_delete_storage() -> None: delete_test_storage() +@pytest.fixture +def bigquery_project_id() -> Iterator[str]: + project_id = "different_project_id" + project_id_key = "DESTINATION__BIGQUERY__PROJECT_ID" + saved_project_id = os.environ.get(project_id_key) + os.environ[project_id_key] = project_id + yield project_id + del os.environ[project_id_key] + if saved_project_id: + os.environ[project_id_key] = saved_project_id + + def test_service_credentials_with_default(environment: Any) -> None: gcpc = GcpServiceAccountCredentials() # resolve will miss values and try to find default credentials on the machine @@ -247,6 +260,21 @@ def test_bigquery_configuration() -> None: ) +def test_bigquery_different_project_id(bigquery_project_id) -> None: + """Test scenario when bigquery project_id different from gcp credentials project_id.""" + config = resolve_configuration( + BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "bigquery"), + ) + assert config.project_id == bigquery_project_id + with cm_yield_client( + "bigquery", + dataset_name="dataset", + default_config_values={"project_id": bigquery_project_id}, + ) as client: + assert bigquery_project_id in client.sql_client.catalog_name() + + def test_bigquery_autodetect_configuration(client: BigQueryClient) -> None: # no schema autodetect assert client._should_autodetect_schema("event_slot") is False From 982b448d533303cef803b681a2150d3f3a531f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Thu, 15 Aug 2024 13:59:12 +0200 Subject: [PATCH 11/34] improves formatting in error message Co-authored-by: Anton Burnashev --- dlt/sources/helpers/rest_client/paginators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py 
b/dlt/sources/helpers/rest_client/paginators.py index 993cbf7f26..f87eaea873 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -123,7 +123,7 @@ def __init__( super().__init__() if total_path is None and maximum_value is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_value` or stop_after_empty_page must be provided." + "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be provided." ) self.param_name = param_name self.current_value = initial_value From a4dbd5d479659820c41a42f4bdf255836e96f7af Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 15 Aug 2024 21:29:11 +0400 Subject: [PATCH 12/34] fix delta table dangling parquet file bug (#1695) --- .../impl/filesystem/filesystem.py | 3 +- .../load/pipeline/test_filesystem_pipeline.py | 45 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 7009ad95ac..9683617db8 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -36,6 +36,7 @@ from dlt.destinations.job_impl import ( ReferenceFollowupJob, FinalizedLoadJob, + FinalizedLoadJobWithFollowupJobs, ) from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations import path_utils @@ -366,7 +367,7 @@ def create_load_job( if ReferenceFollowupJob.is_reference_job(file_path): return DeltaLoadFilesystemJob(file_path) # otherwise just continue - return FilesystemLoadJobWithFollowup(file_path) + return FinalizedLoadJobWithFollowupJobs(file_path) cls = FilesystemLoadJobWithFollowup if self.config.as_staging else FilesystemLoadJob return cls(file_path) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 71620e889d..f9196cc909 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -314,6 +314,51 @@ def data_types(): assert len(rows) == 10 +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +def test_delta_table_does_not_contain_job_files( + destination_config: DestinationTestConfiguration, +) -> None: + """Asserts Parquet job files do not end up in Delta table.""" + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + @dlt.resource(table_format="delta") + def delta_table(): + yield [{"foo": 1}] + + # create Delta table + info = pipeline.run(delta_table()) + assert_load_info(info) + + # get Parquet jobs + completed_jobs = info.load_packages[0].jobs["completed_jobs"] + parquet_jobs = [ + job + for job in completed_jobs + if job.job_file_info.table_name == "delta_table" and job.file_path.endswith(".parquet") + ] + assert len(parquet_jobs) == 1 + + # get Parquet files in Delta table folder + with pipeline.destination_client() as client: + assert isinstance(client, FilesystemClient) + table_dir = client.get_table_dir("delta_table") + parquet_files = [f for f in client.fs_client.ls(table_dir) if f.endswith(".parquet")] + assert len(parquet_files) == 1 + + # Parquet file should not be the job file + file_id = parquet_jobs[0].job_file_info.file_id + assert file_id not in parquet_files[0] + 
+ @pytest.mark.parametrize( "destination_config", destinations_configs( From 01423f7892a1dc8d50447f5f27c9e8573e5e254a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 15 Aug 2024 22:46:27 +0400 Subject: [PATCH 13/34] Add `delta` table partitioning support (#1696) * add delta table partitioning support * document delta table partitioning support * Update docs/website/docs/dlt-ecosystem/destinations/filesystem.md --------- Co-authored-by: Anton Burnashev --- dlt/common/libs/deltalake.py | 9 +- .../impl/filesystem/filesystem.py | 5 ++ .../dlt-ecosystem/destinations/filesystem.md | 17 ++++ .../load/pipeline/test_filesystem_pipeline.py | 85 +++++++++++++++++++ 4 files changed, 114 insertions(+), 2 deletions(-) diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index e6cd91bd0a..d98795d07c 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Union +from typing import Optional, Dict, Union, List from pathlib import Path from dlt import version, Pipeline @@ -71,9 +71,13 @@ def write_delta_table( table_or_uri: Union[str, Path, DeltaTable], data: Union[pa.Table, pa.RecordBatchReader], write_disposition: TWriteDisposition, + partition_by: Optional[Union[List[str], str]] = None, storage_options: Optional[Dict[str, str]] = None, ) -> None: - """Writes in-memory Arrow table to on-disk Delta table.""" + """Writes in-memory Arrow table to on-disk Delta table. + + Thin wrapper around `deltalake.write_deltalake`. + """ # throws warning for `s3` protocol: https://github.com/delta-io/delta-rs/issues/2460 # TODO: upgrade `deltalake` lib after https://github.com/delta-io/delta-rs/pull/2500 @@ -81,6 +85,7 @@ def write_delta_table( write_deltalake( # type: ignore[call-overload] table_or_uri=table_or_uri, data=ensure_delta_compatible_arrow_data(data), + partition_by=partition_by, mode=get_delta_write_mode(write_disposition), schema_mode="merge", # enable schema evolution (adding new columns) storage_options=storage_options, diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 9683617db8..f2466f25a2 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -115,6 +115,9 @@ def run(self) -> None: storage_options = _deltalake_storage_options(self._job_client.config) dt = try_get_deltatable(dt_path, storage_options=storage_options) + # get partition columns + part_cols = get_columns_names_with_prop(self._load_table, "partition") + # explicitly check if there is data # (https://github.com/delta-io/delta-rs/issues/2686) if arrow_ds.head(1).num_rows == 0: @@ -124,6 +127,7 @@ def run(self) -> None: table_uri=dt_path, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", + partition_by=part_cols, storage_options=storage_options, ) return @@ -159,6 +163,7 @@ def run(self) -> None: table_or_uri=dt_path if dt is None else dt, data=arrow_rbr, write_disposition=self._load_table["write_disposition"], + partition_by=part_cols, storage_options=storage_options, ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index bba0ff3df3..018b838363 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -530,6 +530,23 @@ def my_delta_resource(): > `dlt` always uses `parquet` 
as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. +#### Delta table partitioning +A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column: + +```py +@dlt.resource( + table_format="delta", + columns={"foo": {"partition": True}} +) +def my_delta_resource(): + ... +``` + +:::caution +It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. +::: + + #### Storage options You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index f9196cc909..759f443546 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -482,6 +482,91 @@ def complex_table(): assert len(rows_dict["complex_table__child__grandchild"]) == 5 +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +def test_delta_table_partitioning( + destination_config: DestinationTestConfiguration, +) -> None: + """Tests partitioning for `delta` table format.""" + + from dlt.common.libs.deltalake import get_delta_tables + from tests.pipeline.utils import users_materialize_table_schema + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + # zero partition columns + @dlt.resource(table_format="delta") + def zero_part(): + yield {"foo": 1, "bar": 1} + + info = pipeline.run(zero_part()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "zero_part")["zero_part"] + assert dt.metadata().partition_columns == [] + assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 + + # one partition column + @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + def one_part(): + yield [ + {"c1": "foo", "c2": 1}, + {"c1": "foo", "c2": 2}, + {"c1": "bar", "c2": 3}, + {"c1": "baz", "c2": 4}, + ] + + info = pipeline.run(one_part()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "one_part")["one_part"] + assert dt.metadata().partition_columns == ["c1"] + assert load_table_counts(pipeline, "one_part")["one_part"] == 4 + + # two partition columns + @dlt.resource( + table_format="delta", columns={"c1": {"partition": True}, "c2": {"partition": True}} + ) + def two_part(): + yield [ + {"c1": "foo", "c2": 1, "c3": True}, + {"c1": "foo", "c2": 2, "c3": True}, + {"c1": "bar", "c2": 1, "c3": True}, + {"c1": "baz", "c2": 1, "c3": True}, + ] + + info = pipeline.run(two_part()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "two_part")["two_part"] + assert dt.metadata().partition_columns == ["c1", "c2"] + assert load_table_counts(pipeline, "two_part")["two_part"] == 4 + + # test partitioning with empty source + users_materialize_table_schema.apply_hints( + table_format="delta", + columns={"id": {"partition": True}}, + ) + info = pipeline.run(users_materialize_table_schema()) + assert_load_info(info) + dt = get_delta_tables(pipeline, "users")["users"] + assert dt.metadata().partition_columns == ["id"] + assert load_table_counts(pipeline, "users")["users"] == 0 + + # changing partitioning after 
initial table creation is not supported + zero_part.apply_hints(columns={"foo": {"partition": True}}) + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + dt = get_delta_tables(pipeline, "zero_part")["zero_part"] + assert dt.metadata().partition_columns == [] + + @pytest.mark.parametrize( "destination_config", destinations_configs( From 49b45fb4592e53e2d0d7eaf09c1c4279927b7853 Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 16 Aug 2024 17:12:21 +0530 Subject: [PATCH 14/34] sets default argument to None --- dlt/sources/helpers/rest_client/paginators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index f87eaea873..91b364c395 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -164,7 +164,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> ): self._has_next_page = False - def _stop_after_this_page(self, data: Optional[List[Any]]) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: From 1f26fe74587fb13046ce0646fe97426150283b65 Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 16 Aug 2024 17:13:39 +0530 Subject: [PATCH 15/34] passes non-empty list to paginator.update_state() and interprets both None and [] as "no data" --- dlt/sources/helpers/rest_client/paginators.py | 2 +- .../helpers/rest_client/test_paginators.py | 34 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 91b364c395..632c93d0c7 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -165,7 +165,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> self._has_next_page = False def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: - return self.stop_after_empty_page and data == [] + return self.stop_after_empty_page and not data def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: raise ValueError( diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 7357169101..7ae6aa10dc 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -242,7 +242,7 @@ class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) response = Mock(Response, json=lambda: {"total": 20}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -253,7 +253,7 @@ def test_update_state(self): def test_update_state_with_string_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "20"}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -261,13 +261,13 @@ def test_update_state_with_invalid_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, 
json=lambda: {"total": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_init_request(self): paginator = OffsetPaginator(offset=123, limit=42) @@ -281,7 +281,7 @@ def test_init_request(self): response = Mock(Response, json=lambda: {"total": 200}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) # Test for the next request next_request = Mock(spec=Request) @@ -295,11 +295,11 @@ def test_init_request(self): def test_maximum_offset(self): paginator = OffsetPaginator(offset=0, limit=50, maximum_offset=100, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response) # Offset 0 to 50 + paginator.update_state(response, data=[{}]) # Offset 0 to 50 assert paginator.current_value == 50 assert paginator.has_next_page is True - paginator.update_state(response) # Offset 50 to 100 + paginator.update_state(response, data=[{}]) # Offset 50 to 100 assert paginator.current_value == 100 assert paginator.has_next_page is False @@ -362,22 +362,22 @@ class TestPageNumberPaginator: def test_update_state(self): paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 3 assert paginator.has_next_page is True # Test for reaching the end - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 2 assert paginator.has_next_page is True @@ -385,34 +385,34 @@ def test_update_state_with_invalid_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_state_without_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_request(self): paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) request.params = {} paginator.update_request(request) assert request.params["page"] == 2 - paginator.update_state(response) + paginator.update_state(response, data=[{}]) paginator.update_request(request) assert request.params["page"] == 3 def test_maximum_page(self): paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - 
paginator.update_state(response) # Page 1 + paginator.update_state(response, data=[{}]) # Page 1 assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response) # Page 2 + paginator.update_state(response, data=[{}]) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False From 5bf78ae3f04dfbfe26f4c6b737756f2e46b970f8 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 16 Aug 2024 16:40:09 +0200 Subject: [PATCH 16/34] fixes load job counter (#1702) * displays log counter when new counter is created * initializes load job counters only when package starts --- dlt/common/runtime/collector.py | 1 + dlt/load/load.py | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dlt/common/runtime/collector.py b/dlt/common/runtime/collector.py index 95117b70cc..be5453cdd3 100644 --- a/dlt/common/runtime/collector.py +++ b/dlt/common/runtime/collector.py @@ -170,6 +170,7 @@ def update( total=total, ) self.messages[counter_key] = None + self.last_log_time = None self.counters[counter_key] += inc if message is not None: diff --git a/dlt/load/load.py b/dlt/load/load.py index 34b7e2b5b7..99a12d69ee 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -471,7 +471,7 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) f"All jobs completed, archiving package {load_id} with aborted set to {aborted}" ) - def update_loadpackage_info(self, load_id: str) -> None: + def update_load_package_info(self, load_id: str) -> None: # update counter we only care about the jobs that are scheduled to be loaded package_jobs = self.load_storage.normalized_packages.get_load_package_jobs(load_id) total_jobs = reduce(lambda p, c: p + len(c), package_jobs.values(), 0) @@ -492,6 +492,8 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: dropped_tables = current_load_package()["state"].get("dropped_tables", []) truncated_tables = current_load_package()["state"].get("truncated_tables", []) + self.update_load_package_info(load_id) + # initialize analytical storage ie. 
create dataset required by passed schema with self.get_destination_client(schema) as job_client: if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: @@ -539,14 +541,10 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: pending_exception: Optional[LoadClientJobException] = None while True: try: - # we continously spool new jobs and complete finished ones + # we continuously spool new jobs and complete finished ones running_jobs, finalized_jobs, new_pending_exception = self.complete_jobs( load_id, running_jobs, schema ) - # update load package info if any jobs where finalized - if finalized_jobs: - self.update_loadpackage_info(load_id) - pending_exception = pending_exception or new_pending_exception # do not spool new jobs if there was a signal or an exception was encountered From 83bab151a81ad3e3beaad8b4486741bd3e28d2fa Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 19 Aug 2024 17:55:44 +0530 Subject: [PATCH 17/34] refactors magic to telling name --- .../helpers/rest_client/test_paginators.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 7ae6aa10dc..5c9f484bbc 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -17,6 +17,8 @@ from .conftest import assert_pagination +NON_EMPTY_PAGE = [{"some": "data"}] + @pytest.mark.usefixtures("mock_api_server") class TestHeaderLinkPaginator: @@ -242,7 +244,7 @@ class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) response = Mock(Response, json=lambda: {"total": 20}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -253,7 +255,7 @@ def test_update_state(self): def test_update_state_with_string_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "20"}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -261,13 +263,13 @@ def test_update_state_with_invalid_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_init_request(self): paginator = OffsetPaginator(offset=123, limit=42) @@ -281,7 +283,7 @@ def test_init_request(self): response = Mock(Response, json=lambda: {"total": 200}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) # Test for the next request next_request = Mock(spec=Request) @@ -295,11 +297,11 @@ def test_init_request(self): def test_maximum_offset(self): paginator = OffsetPaginator(offset=0, limit=50, maximum_offset=100, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response, data=[{}]) # Offset 0 to 50 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Offset 
0 to 50 assert paginator.current_value == 50 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) # Offset 50 to 100 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Offset 50 to 100 assert paginator.current_value == 100 assert paginator.has_next_page is False @@ -362,22 +364,22 @@ class TestPageNumberPaginator: def test_update_state(self): paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 3 assert paginator.has_next_page is True # Test for reaching the end - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 2 assert paginator.has_next_page is True @@ -385,34 +387,34 @@ def test_update_state_with_invalid_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_state_without_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_request(self): paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) request.params = {} paginator.update_request(request) assert request.params["page"] == 2 - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) paginator.update_request(request) assert request.params["page"] == 3 def test_maximum_page(self): paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response, data=[{}]) # Page 1 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Page 1 assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) # Page 2 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False From 843b658fdec9e71fd5129fd3a726878a0b29d83f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:24:32 +0400 Subject: [PATCH 18/34] Enable `scd2` record reinsert (#1707) * make sorting optional * enable scd2 record reinsertion by dropping unique constraint * document scd2 row id uniqueness characteristics * assert unique constraint is not dropped when users bring their own hash --- 
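In practice, the reinsertion scenario enabled here looks as follows. A minimal sketch, assuming a local `duckdb` destination; the resource name, table name and records are illustrative and mirror the test added in this patch:

```py
import dlt


@dlt.resource(
    table_name="dim_test",
    write_disposition={"disposition": "merge", "strategy": "scd2"},
)
def dim_test(data):
    yield data


pipeline = dlt.pipeline(pipeline_name="scd2_reinsert_sketch", destination="duckdb")

r1 = {"nk": 1, "c1": "foo"}
r2 = {"nk": 2, "c1": "bar"}

pipeline.run(dim_test([r1, r2]))  # load 1: both records get an open validity window
pipeline.run(dim_test([r2]))      # load 2: r1 is absent, so its validity window is closed
pipeline.run(dim_test([r1, r2]))  # load 3: r1 reappears and is inserted again
```

After the third load the root table holds two rows with the same row hash in `_dlt_id` (one retired, one active), which is why this change drops the unique constraint on that column for `scd2` root tables.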
dlt/destinations/sql_jobs.py | 8 +- dlt/extract/hints.py | 5 + .../docs/general-usage/incremental-loading.md | 13 +++ tests/load/pipeline/test_scd2.py | 108 +++++++++++++++--- 4 files changed, 115 insertions(+), 19 deletions(-) diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index cddae52bb7..51e5b95a0e 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -749,20 +749,20 @@ def gen_scd2_sql( INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) SELECT {col_str}, {boundary_ts} AS {from_}, {active_record_literal} AS {to} FROM {staging_root_table_name} AS s - WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name}); + WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name} WHERE {is_active_clause}); """) # insert list elements for new active records in child tables child_tables = table_chain[1:] if child_tables: - unique_column = escape_column_id( - cls._get_unique_col(table_chain, sql_client, root_table) - ) # TODO: - based on deterministic child hashes (OK) # - if row hash changes all is right # - if it does not we only capture new records, while we should replace existing with those in stage # - this write disposition is way more similar to regular merge (how root tables are handled is different, other tables handled same) for table in child_tables: + unique_column = escape_column_id( + cls._get_unique_col(table_chain, sql_client, table) + ) table_name, staging_table_name = sql_client.get_qualified_table_names(table["name"]) sql.append(f""" INSERT INTO {table_name} diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index dce375afb0..123a8455e1 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -465,11 +465,16 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: "x-valid-to": True, "x-active-record-timestamp": mddict.get("active_record_timestamp"), } + # unique constraint is dropped for C_DLT_ID when used to store + # SCD2 row hash (only applies to root table) hash_ = mddict.get("row_version_column_name", DataItemNormalizer.C_DLT_ID) dict_["columns"][hash_] = { "name": hash_, "nullable": False, "x-row-version": True, + # duplicate value in row hash column is possible in case + # of insert-delete-reinsert pattern + "unique": False, } @staticmethod diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index b130f7a4f5..8eb1002dcf 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -251,6 +251,19 @@ executed. You can achieve the same in the decorator `@dlt.source(root_key=True)` ### `scd2` strategy `dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. The resource is expected to provide a full extract of the source table each run. A row hash is stored in `_dlt_id` and used as surrogate key to identify source records that have been inserted, updated, or deleted. A `NULL` value is used by default to indicate an active record, but it's possible to use a configurable high timestamp (e.g. 9999-12-31 00:00:00.000000) instead. +:::note +The `unique` hint for `_dlt_id` in the root table is set to `false` when using `scd2`. This differs from [default behavior](./destination-tables.md#child-and-parent-tables). 
The reason is that the surrogate key stored in `_dlt_id` contains duplicates after an _insert-delete-reinsert_ pattern: +1. record with surrogate key X is inserted in a load at `t1` +2. record with surrogate key X is deleted in a later load at `t2` +3. record with surrogate key X is reinserted in an even later load at `t3` + +After this pattern, the `scd2` table in the destination has two records for surrogate key X: one for validity window `[t1, t2]`, and one for `[t3, NULL]`. A duplicate value exists in `_dlt_id` because both records have the same surrogate key. + +Note that: +- the composite key `(_dlt_id, _dlt_valid_from)` is unique +- `_dlt_id` remains unique for child tables—`scd2` does not affect this +::: + #### Example: `scd2` merge strategy ```py @dlt.resource( diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 8b41c354b2..8f2c0c2486 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -46,7 +46,7 @@ def get_load_package_created_at(pipeline: dlt.Pipeline, load_info: LoadInfo) -> def get_table( - pipeline: dlt.Pipeline, table_name: str, sort_column: str, include_root_id: bool = True + pipeline: dlt.Pipeline, table_name: str, sort_column: str = None, include_root_id: bool = True ) -> List[Dict[str, Any]]: """Returns destination table contents as list of dictionaries.""" @@ -54,6 +54,21 @@ def strip_timezone(ts: datetime) -> datetime: """Converts timezone of datetime object to UTC and removes timezone awareness.""" return ensure_pendulum_datetime(ts).astimezone(tz=timezone.utc).replace(tzinfo=None) + table = [ + { + k: strip_timezone(v) if isinstance(v, datetime) else v + for k, v in r.items() + if not k.startswith("_dlt") + or k in DEFAULT_VALIDITY_COLUMN_NAMES + or (k == "_dlt_root_id" if include_root_id else False) + } + for r in load_tables_to_dicts(pipeline, table_name)[table_name] + ] + + if sort_column is None: + return table + return sorted(table, key=lambda d: d[sort_column]) + return sorted( [ { @@ -139,8 +154,8 @@ def r(data): assert table["columns"][from_]["x-valid-from"] # type: ignore[typeddict-item] assert table["columns"][to]["x-valid-to"] # type: ignore[typeddict-item] assert table["columns"]["_dlt_id"]["x-row-version"] # type: ignore[typeddict-item] - # _dlt_id is still unique - assert table["columns"]["_dlt_id"]["unique"] + # root table _dlt_id is not unique with `scd2` merge strategy + assert not table["columns"]["_dlt_id"]["unique"] # assert load results ts_1 = get_load_package_created_at(p, info) @@ -288,7 +303,7 @@ def r(data): {from_: ts_2, to: None, "nk": 1, "c1": "foo_updated"}, # new ] assert_records_as_set( - get_table(p, "dim_test__c2", cname), + get_table(p, "dim_test__c2"), [ {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, # new @@ -310,7 +325,7 @@ def r(data): ts_3 = get_load_package_created_at(p, info) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test", "c1"), + get_table(p, "dim_test"), [ {from_: ts_1, to: None, "nk": 2, "c1": "bar"}, {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, @@ -326,7 +341,7 @@ def r(data): {"_dlt_root_id": get_row_hash(l3_1), cname: 2}, # new {"_dlt_root_id": get_row_hash(l1_2), cname: 3}, ] - assert_records_as_set(get_table(p, "dim_test__c2", cname), exp_3) + assert_records_as_set(get_table(p, "dim_test__c2"), exp_3) # load 4 — delete a record dim_snap = [ @@ -336,7 +351,7 @@ def r(data): ts_4 = get_load_package_created_at(p, info) assert_load_info(info) assert_records_as_set( - 
get_table(p, "dim_test", "c1"), + get_table(p, "dim_test"), [ {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, # updated {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, @@ -345,7 +360,7 @@ def r(data): ], ) assert_records_as_set( - get_table(p, "dim_test__c2", cname), exp_3 + get_table(p, "dim_test__c2"), exp_3 ) # deletes should not alter child tables # load 5 — insert a record @@ -357,7 +372,7 @@ def r(data): ts_5 = get_load_package_created_at(p, info) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test", "c1"), + get_table(p, "dim_test"), [ {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, {from_: ts_5, to: None, "nk": 3, "c1": "baz"}, # new @@ -367,7 +382,7 @@ def r(data): ], ) assert_records_as_set( - get_table(p, "dim_test__c2", cname), + get_table(p, "dim_test__c2"), [ {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, @@ -403,7 +418,7 @@ def r(data): info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test__c2__cc1", "value"), + get_table(p, "dim_test__c2__cc1"), [ {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, @@ -419,7 +434,7 @@ def r(data): info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) assert_records_as_set( - (get_table(p, "dim_test__c2__cc1", "value")), + (get_table(p, "dim_test__c2__cc1")), [ {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, @@ -443,7 +458,7 @@ def r(data): {"_dlt_root_id": get_row_hash(l1_2), "value": 2}, {"_dlt_root_id": get_row_hash(l3_1), "value": 2}, # new ] - assert_records_as_set(get_table(p, "dim_test__c2__cc1", "value"), exp_3) + assert_records_as_set(get_table(p, "dim_test__c2__cc1"), exp_3) # load 4 — delete a record dim_snap = [ @@ -451,7 +466,7 @@ def r(data): ] info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) - assert_records_as_set(get_table(p, "dim_test__c2__cc1", "value"), exp_3) + assert_records_as_set(get_table(p, "dim_test__c2__cc1"), exp_3) # load 5 — insert a record dim_snap = [ @@ -461,7 +476,7 @@ def r(data): info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) assert_load_info(info) assert_records_as_set( - get_table(p, "dim_test__c2__cc1", "value"), + get_table(p, "dim_test__c2__cc1"), [ {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, @@ -474,6 +489,67 @@ def r(data): ) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +def test_record_reinsert(destination_config: DestinationTestConfiguration) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + + @dlt.resource( + table_name="dim_test", write_disposition={"disposition": "merge", "strategy": "scd2"} + ) + def r(data): + yield data + + # load 1 — initial load + dim_snap = [ + r1 := {"nk": 1, "c1": "foo", "c2": "foo", "child": [1]}, + r2 := {"nk": 2, "c1": "bar", "c2": "bar", "child": [2, 3]}, + ] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + assert load_table_counts(p, "dim_test__child")["dim_test__child"] == 3 + ts_1 = get_load_package_created_at(p, info) + + # load 2 — delete natural key 1 + dim_snap = [r2] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert 
load_table_counts(p, "dim_test")["dim_test"] == 2 + assert load_table_counts(p, "dim_test__child")["dim_test__child"] == 3 + ts_2 = get_load_package_created_at(p, info) + + # load 3 — reinsert natural key 1 + dim_snap = [r1, r2] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 3 + assert load_table_counts(p, "dim_test__child")["dim_test__child"] == 3 # no new record + ts_3 = get_load_package_created_at(p, info) + + # assert parent records + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + r1_no_child = {k: v for k, v in r1.items() if k != "child"} + r2_no_child = {k: v for k, v in r2.items() if k != "child"} + expected = [ + {**{from_: ts_1, to: ts_2}, **r1_no_child}, + {**{from_: ts_3, to: None}, **r1_no_child}, + {**{from_: ts_1, to: None}, **r2_no_child}, + ] + assert_records_as_set(get_table(p, "dim_test"), expected) + + # assert child records + expected = [ + {"_dlt_root_id": get_row_hash(r1), "value": 1}, # links to two records in parent + {"_dlt_root_id": get_row_hash(r2), "value": 2}, + {"_dlt_root_id": get_row_hash(r2), "value": 3}, + ] + assert_records_as_set(get_table(p, "dim_test__child"), expected) + + @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), @@ -633,6 +709,8 @@ def r(data): table = p.default_schema.get_table("dim_test") assert table["columns"]["row_hash"]["x-row-version"] # type: ignore[typeddict-item] assert "x-row-version" not in table["columns"]["_dlt_id"] + # _dlt_id unique constraint should not be dropped when users bring their own hash + assert table["columns"]["_dlt_id"]["unique"] # load 2 — update and delete a record dim_snap = [ From 6f778ebf047967c358e5bb0d20e99efb12063261 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 22 Aug 2024 19:04:44 +0400 Subject: [PATCH 19/34] `scd2` custom "valid from" / "valid to" value feature (#1709) * black format * remove code remnant * add scd2 custom boundary timestamp feature * add invalid scd2 active record timestamp test * document scd2 boundary timestamp argument --- dlt/common/schema/typing.py | 1 + dlt/destinations/sql_jobs.py | 18 ++- dlt/extract/hints.py | 14 ++ dlt/sources/helpers/rest_client/paginators.py | 8 +- .../docs/general-usage/incremental-loading.md | 18 ++- tests/load/pipeline/test_scd2.py | 143 +++++++++++++++--- .../helpers/rest_client/test_paginators.py | 8 +- 7 files changed, 174 insertions(+), 36 deletions(-) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 9a4dd51d4b..284c55caac 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -187,6 +187,7 @@ class TMergeDispositionDict(TWriteDispositionDict, total=False): strategy: Optional[TLoaderMergeStrategy] validity_column_names: Optional[List[str]] active_record_timestamp: Optional[TAnyDateTime] + boundary_timestamp: Optional[TAnyDateTime] row_version_column_name: Optional[str] diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 51e5b95a0e..a1e38a2c20 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Sequence, Tuple, cast, TypedDict, Optional, Callable, Union import yaml -from dlt.common.logger import pretty_format_exception +from dlt.common.time import ensure_pendulum_datetime from dlt.common.schema.typing import ( TTableSchema, @@ -721,10 +721,18 @@ def gen_scd2_sql( format_datetime_literal = ( 
DestinationCapabilitiesContext.generic_capabilities().format_datetime_literal ) - boundary_ts = format_datetime_literal( - current_load_package()["state"]["created_at"], + + boundary_ts = ensure_pendulum_datetime( + root_table.get( # type: ignore[arg-type] + "x-boundary-timestamp", + current_load_package()["state"]["created_at"], + ) + ) + boundary_literal = format_datetime_literal( + boundary_ts, caps.timestamp_precision, ) + active_record_timestamp = get_active_record_timestamp(root_table) if active_record_timestamp is None: active_record_literal = "NULL" @@ -737,7 +745,7 @@ def gen_scd2_sql( # retire updated and deleted records sql.append(f""" - {cls.gen_update_table_prefix(root_table_name)} {to} = {boundary_ts} + {cls.gen_update_table_prefix(root_table_name)} {to} = {boundary_literal} WHERE {is_active_clause} AND {hash_} NOT IN (SELECT {hash_} FROM {staging_root_table_name}); """) @@ -747,7 +755,7 @@ def gen_scd2_sql( col_str = ", ".join([c for c in columns if c not in (from_, to)]) sql.append(f""" INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) - SELECT {col_str}, {boundary_ts} AS {from_}, {active_record_literal} AS {to} + SELECT {col_str}, {boundary_literal} AS {from_}, {active_record_literal} AS {to} FROM {staging_root_table_name} AS s WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name} WHERE {is_active_clause}); """) diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 123a8455e1..67a6b3e83a 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -26,6 +26,7 @@ new_table, ) from dlt.common.typing import TDataItem +from dlt.common.time import ensure_pendulum_datetime from dlt.common.utils import clone_dict_nested from dlt.common.normalizers.json.relational import DataItemNormalizer from dlt.common.validation import validate_dict_ignoring_xkeys @@ -444,6 +445,8 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: mddict: TMergeDispositionDict = deepcopy(dict_["write_disposition"]) if mddict is not None: dict_["x-merge-strategy"] = mddict.get("strategy", DEFAULT_MERGE_STRATEGY) + if "boundary_timestamp" in mddict: + dict_["x-boundary-timestamp"] = mddict["boundary_timestamp"] # add columns for `scd2` merge strategy if dict_.get("x-merge-strategy") == "scd2": if mddict.get("validity_column_names") is None: @@ -512,3 +515,14 @@ def validate_write_disposition_hint(wd: TTableHintTemplate[TWriteDispositionConf f'`{wd["strategy"]}` is not a valid merge strategy. ' f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""" ) + + for ts in ("active_record_timestamp", "boundary_timestamp"): + if ts == "active_record_timestamp" and wd.get("active_record_timestamp") is None: + continue # None is allowed for active_record_timestamp + if ts in wd: + try: + ensure_pendulum_datetime(wd[ts]) # type: ignore[literal-required] + except Exception: + raise ValueError( + f'could not parse `{ts}` value "{wd[ts]}"' # type: ignore[literal-required] + ) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 632c93d0c7..872d4f34e8 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -123,7 +123,8 @@ def __init__( super().__init__() if total_path is None and maximum_value is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be provided." + "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be" + " provided." 
) self.param_name = param_name self.current_value = initial_value @@ -164,7 +165,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> ): self._has_next_page = False - def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]] = None) -> bool: return self.stop_after_empty_page and not data def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: @@ -371,7 +372,8 @@ def __init__( """ if total_path is None and maximum_offset is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be provided." + "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be" + " provided." ) super().__init__( param_name=offset_param, diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 8eb1002dcf..68fc46e6dc 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -348,7 +348,23 @@ You can configure the literal used to indicate an active record with `active_rec write_disposition={ "disposition": "merge", "strategy": "scd2", - "active_record_timestamp": "9999-12-31", # e.g. datetime.datetime(9999, 12, 31) is also accepted + # accepts various types of date/datetime objects + "active_record_timestamp": "9999-12-31", + } +) +def dim_customer(): + ... +``` + +#### Example: configure boundary timestamp +You can configure the "boundary timestamp" used for record validity windows with `boundary_timestamp`. The provided date(time) value is used as "valid from" for new records and as "valid to" for retired records. The timestamp at which a load package is created is used if `boundary_timestamp` is omitted. 
+```py +@dlt.resource( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + # accepts various types of date/datetime objects + "boundary_timestamp": "2024-08-21T12:15:00+00:00", } ) def dim_customer(): diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 8f2c0c2486..065da5ce94 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -3,6 +3,7 @@ import pytest from typing import List, Dict, Any, Optional from datetime import date, datetime, timezone # noqa: I251 +from contextlib import nullcontext as does_not_raise import dlt from dlt.common.typing import TAnyDateTime @@ -45,15 +46,16 @@ def get_load_package_created_at(pipeline: dlt.Pipeline, load_info: LoadInfo) -> return reduce_pendulum_datetime_precision(created_at, caps.timestamp_precision) +def strip_timezone(ts: TAnyDateTime) -> pendulum.DateTime: + """Converts timezone of datetime object to UTC and removes timezone awareness.""" + return ensure_pendulum_datetime(ts).astimezone(tz=timezone.utc).replace(tzinfo=None) + + def get_table( pipeline: dlt.Pipeline, table_name: str, sort_column: str = None, include_root_id: bool = True ) -> List[Dict[str, Any]]: """Returns destination table contents as list of dictionaries.""" - def strip_timezone(ts: datetime) -> datetime: - """Converts timezone of datetime object to UTC and removes timezone awareness.""" - return ensure_pendulum_datetime(ts).astimezone(tz=timezone.utc).replace(tzinfo=None) - table = [ { k: strip_timezone(v) if isinstance(v, datetime) else v @@ -69,20 +71,6 @@ def strip_timezone(ts: datetime) -> datetime: return table return sorted(table, key=lambda d: d[sort_column]) - return sorted( - [ - { - k: strip_timezone(v) if isinstance(v, datetime) else v - for k, v in r.items() - if not k.startswith("_dlt") - or k in DEFAULT_VALIDITY_COLUMN_NAMES - or (k == "_dlt_root_id" if include_root_id else False) - } - for r in load_tables_to_dicts(pipeline, table_name)[table_name] - ], - key=lambda d: d[sort_column], - ) - @pytest.mark.essential @pytest.mark.parametrize( @@ -596,6 +584,7 @@ def r(data): "9999-12-31T00:00:00", "9999-12-31T00:00:00+00:00", "9999-12-31T00:00:00+01:00", + "i_am_not_a_timestamp", ], ) def test_active_record_timestamp( @@ -604,22 +593,126 @@ def test_active_record_timestamp( ) -> None: p = destination_config.setup_pipeline("abstract", dev_mode=True) + context = does_not_raise() + if active_record_timestamp == "i_am_not_a_timestamp": + context = pytest.raises(ValueError) # type: ignore[assignment] + + with context: + + @dlt.resource( + table_name="dim_test", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "active_record_timestamp": active_record_timestamp, + }, + ) + def r(): + yield {"foo": "bar"} + + p.run(r()) + actual_active_record_timestamp = ensure_pendulum_datetime( + load_tables_to_dicts(p, "dim_test")["dim_test"][0]["_dlt_valid_to"] + ) + assert actual_active_record_timestamp == ensure_pendulum_datetime(active_record_timestamp) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +def test_boundary_timestamp( + destination_config: DestinationTestConfiguration, +) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + + ts1 = "2024-08-21T12:15:00+00:00" + ts2 = "2024-08-22" + ts3 = date(2024, 8, 20) # earlier than ts1 and ts2 + ts4 = "i_am_not_a_timestamp" + @dlt.resource( table_name="dim_test", write_disposition={ "disposition": 
"merge", "strategy": "scd2", - "active_record_timestamp": active_record_timestamp, + "boundary_timestamp": ts1, }, ) - def r(): - yield {"foo": "bar"} + def r(data): + yield data - p.run(r()) - actual_active_record_timestamp = ensure_pendulum_datetime( - load_tables_to_dicts(p, "dim_test")["dim_test"][0]["_dlt_valid_to"] + # load 1 — initial load + dim_snap = [ + l1_1 := {"nk": 1, "foo": "foo"}, + l1_2 := {"nk": 2, "foo": "foo"}, + ] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + expected = [ + {**{from_: strip_timezone(ts1), to: None}, **l1_1}, + {**{from_: strip_timezone(ts1), to: None}, **l1_2}, + ] + assert get_table(p, "dim_test", "nk") == expected + + # load 2 — different source records, different boundary timestamp + r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "boundary_timestamp": ts2, + } ) - assert actual_active_record_timestamp == ensure_pendulum_datetime(active_record_timestamp) + dim_snap = [ + l2_1 := {"nk": 1, "foo": "bar"}, # natural key 1 updated + # l1_2, # natural key 2 no longer present + l2_3 := {"nk": 3, "foo": "foo"}, # new natural key + ] + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 4 + expected = [ + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_1}, # retired + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_2}, # retired + {**{from_: strip_timezone(ts2), to: None}, **l2_1}, # new + {**{from_: strip_timezone(ts2), to: None}, **l2_3}, # new + ] + assert_records_as_set(get_table(p, "dim_test"), expected) + + # load 3 — earlier boundary timestamp + # we naively apply any valid timestamp + # may lead to "valid from" > "valid to", as in this test case + r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "boundary_timestamp": ts3, + } + ) + dim_snap = [l2_1] # natural key 3 no longer present + info = p.run(r(dim_snap)) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 4 + expected = [ + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_1}, # unchanged + {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_2}, # unchanged + {**{from_: strip_timezone(ts2), to: None}, **l2_1}, # unchanged + {**{from_: strip_timezone(ts2), to: strip_timezone(ts3)}, **l2_3}, # retired + ] + assert_records_as_set(get_table(p, "dim_test"), expected) + + # invalid boundary timestamp should raise error + with pytest.raises(ValueError): + r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "boundary_timestamp": ts4, + } + ) @pytest.mark.parametrize( diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 5c9f484bbc..39e3d767a0 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -347,7 +347,9 @@ def test_guarantee_termination(self): total_path=None, stop_after_empty_page=False, ) - assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + assert e.match( + "`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided" + ) with pytest.raises(ValueError) as e: OffsetPaginator( @@ -356,7 +358,9 @@ def test_guarantee_termination(self): stop_after_empty_page=False, maximum_offset=None, ) - assert e.match("`total_path` or 
`maximum_offset` or `stop_after_empty_page` must be provided") + assert e.match( + "`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided" + ) @pytest.mark.usefixtures("mock_api_server") From 49dabb87b92ad1ba916348b38031f7b4fd6d7b7c Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 22 Aug 2024 19:47:34 +0400 Subject: [PATCH 20/34] Make `make lint` fail on `black` format diff (#1716) * make lint fail on black format diff and add diff coloring * format with black --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 15fb895a9f..f47047a3fe 100644 --- a/Makefile +++ b/Makefile @@ -52,7 +52,7 @@ lint: poetry run mypy --config-file mypy.ini dlt tests poetry run flake8 --max-line-length=200 dlt poetry run flake8 --max-line-length=200 tests --exclude tests/reflection/module_cases - poetry run black dlt docs tests --diff --extend-exclude=".*syntax_error.py" + poetry run black dlt docs tests --check --diff --color --extend-exclude=".*syntax_error.py" # poetry run isort ./ --diff # $(MAKE) lint-security From c51445c007bf7167d2e52facf2df0b2be00cb08a Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 23 Aug 2024 19:11:56 +0530 Subject: [PATCH 21/34] Docs/issue 1661 add tip to source docs and update weaviate docs (#1662) --- .../dlt-ecosystem/destinations/lancedb.md | 17 +++++++++++++++- .../docs/dlt-ecosystem/destinations/qdrant.md | 19 ++++++++++++++++-- .../dlt-ecosystem/destinations/weaviate.md | 16 +++++++++++++++ docs/website/docs/general-usage/source.md | 20 +++++++++++++++++++ 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md index dbf90da4b9..8b7f3854ee 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md @@ -144,7 +144,22 @@ lancedb_adapter( ) ``` -Bear in mind that you can't use an adapter on a [dlt source](../../general-usage/source.md), only a [dlt resource](../../general-usage/resource.md). +When using the `lancedb_adapter`, it's important to apply it directly to resources, not to the whole source. Here's an example: + +```py +products_tables = sql_database().with_resources("products", "customers") + +pipeline = dlt.pipeline( + pipeline_name="postgres_to_lancedb_pipeline", + destination="lancedb", + ) + +# apply adapter to the needed resources +lancedb_adapter(products_tables.products, embed="description") +lancedb_adapter(products_tables.customers, embed="bio") + +info = pipeline.run(products_tables) +``` ## Write disposition diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 9f19007227..5fc8097440 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -106,10 +106,25 @@ qdrant_adapter( ) ``` -:::tip +When using the `qdrant_adapter`, it's important to apply it directly to resources, not to the whole source. Here's an example: -A more comprehensive pipeline would load data from some API or use one of dlt's [verified sources](../verified-sources/). 
+```py +products_tables = sql_database().with_resources("products", "customers") + +pipeline = dlt.pipeline( + pipeline_name="postgres_to_qdrant_pipeline", + destination="qdrant", + ) +# apply adapter to the needed resources +qdrant_adapter(products_tables.products, embed="description") +qdrant_adapter(products_tables.customers, embed="bio") + +info = pipeline.run(products_tables) +``` + +:::tip +A more comprehensive pipeline would load data from some API or use one of dlt's [verified sources](../verified-sources/). ::: ## Write disposition diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index c6597fadce..43bd85ce41 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -116,6 +116,22 @@ weaviate_adapter( tokenization={"title": "word", "description": "whitespace"}, ) ``` +When using the `weaviate_adapter`, it's important to apply it directly to resources, not to the whole source. Here's an example: + +```py +products_tables = sql_database().with_resources("products", "customers") + +pipeline = dlt.pipeline( + pipeline_name="postgres_to_weaviate_pipeline", + destination="weaviate", + ) + +# apply adapter to the needed resources +weaviate_adapter(products_tables.products, vectorize="description") +weaviate_adapter(products_tables.customers, vectorize="bio") + +info = pipeline.run(products_tables) +``` :::tip diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index 936a3160f0..98c7a13b81 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -187,6 +187,26 @@ Several data sources are prone to contain semi-structured documents with very de MongoDB databases. Our practical experience is that setting the `max_nesting_level` to 2 or 3 produces the clearest and human-readable schemas. +:::tip +The `max_table_nesting` parameter at the source level doesn't automatically apply to individual +resources when accessed directly (e.g., using `source.resources["resource_1"])`. To make sure it +works, either use `source.with_resources("resource_1")` or set the parameter directly on the resource. +::: + + +You can directly configure the `max_table_nesting` parameter on the resource level as: + +```py +@dlt.resource(max_table_nesting=0) +def my_resource(): + ... +``` +or +```py +my_source = source() +my_source.my_resource.max_table_nesting = 0 +``` + ### Modify schema The schema is available via `schema` property of the source. 
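Taken together, the tips added in this patch come down to one rule: adapters and table hints bind to individual resources, not to the source object. A minimal sketch combining both, assuming the `sql_database` verified source used in the snippets above and a configured Qdrant destination; resource and column names are illustrative:

```py
import dlt
from dlt.destinations.adapters import qdrant_adapter

# assumes the `sql_database` verified source scaffolded into the project with `dlt init`
from sql_database import sql_database

# select the resources first so that hints set afterwards apply as expected
products_tables = sql_database().with_resources("products", "customers")

# per-resource hint: keep nested product attributes as JSON instead of child tables
products_tables.products.max_table_nesting = 0

# per-resource adapter: only the listed columns are embedded
qdrant_adapter(products_tables.products, embed="description")
qdrant_adapter(products_tables.customers, embed="bio")

pipeline = dlt.pipeline(
    pipeline_name="postgres_to_qdrant_pipeline",
    destination="qdrant",
)
info = pipeline.run(products_tables)
print(info)
```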
From 6f7591e2d79c544e82accb205780171d3962863f Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 23 Aug 2024 20:03:48 +0530 Subject: [PATCH 22/34] Add custom parent-child relationships example (#1678) --- .../parent_child_relationship/__init__.py | 0 .../parent_child_relationship.py | 69 ++++++++++++++++ .../test_parent_child_relationship.py | 78 +++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 docs/examples/parent_child_relationship/__init__.py create mode 100644 docs/examples/parent_child_relationship/parent_child_relationship.py create mode 100644 docs/examples/parent_child_relationship/test_parent_child_relationship.py diff --git a/docs/examples/parent_child_relationship/__init__.py b/docs/examples/parent_child_relationship/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py new file mode 100644 index 0000000000..39c9f577cc --- /dev/null +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -0,0 +1,69 @@ +""" +--- +title: Load parent table records into child table +description: Learn how to integrate custom parent keys into child records +keywords: [parent child relationship, parent key] +--- + +This example demonstrates handling data with parent-child relationships using the `dlt` library. +You learn how to integrate specific fields (e.g., primary, foreign keys) from a parent record into each child record. + +In this example, we'll explore how to: + +- Add `parent_id` into each child record using `add_parent_id` function +- Use the [`add_map` function](https://dlthub.com/docs/api_reference/extract/resource#add_map) to apply this +custom logic to every record in the dataset + +:::note important +Please note that dlt metadata, including `_dlt_id` and `_dlt_load_id`, will still be loaded into the tables. 
+::: +""" + +from typing import List, Dict, Any, Generator +import dlt + +# Define a dlt resource with write disposition to 'merge' +@dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) +def data_source() -> Generator[List[Dict[str, Any]], None, None]: + # Example data + data = [ + { + "parent_id": 1, + "parent_name": "Alice", + "children": [ + {"child_id": 1, "child_name": "Child 1"}, + {"child_id": 2, "child_name": "Child 2"}, + ], + }, + { + "parent_id": 2, + "parent_name": "Bob", + "children": [{"child_id": 3, "child_name": "Child 3"}], + }, + ] + + yield data + +# Function to add parent_id to each child record within a parent record +def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: + parent_id_key = "parent_id" + for child in record["children"]: + child[parent_id_key] = record[parent_id_key] + return record + +if __name__ == "__main__": + # Create and configure the dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="generic_pipeline", + destination="duckdb", + dataset_name="dataset", + ) + + # Run the pipeline + load_info = pipeline.run( + data_source() + .add_map(add_parent_id), + primary_key="parent_id" + ) + # Output the load information after pipeline execution + print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py new file mode 100644 index 0000000000..f671040823 --- /dev/null +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -0,0 +1,78 @@ + +import pytest + +from tests.utils import skipifgithubfork + + +""" +--- +title: Load parent table records into child table +description: Learn how to integrate custom parent keys into child records +keywords: [parent child relationship, parent key] +--- + +This example demonstrates handling data with parent-child relationships using +the `dlt` library. You learn how to integrate specific fields (e.g., primary, +foreign keys) from a parent record into each child record. + +In this example, we'll explore how to: + +- Add `parent_id` into each child record using `add_parent_id` function +- Use the [`add_map` function](https://dlthub.com/docs/api_reference/extract/resource#add_map) to apply this +custom logic to every record in the dataset + +:::note important +Please note that dlt metadata, including `_dlt_id` and `_dlt_load_id`, will still be loaded into the tables. 
+::: +""" + +from typing import List, Dict, Any, Generator +import dlt + +# Define a dlt resource with write disposition to 'merge' +@dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) +def data_source() -> Generator[List[Dict[str, Any]], None, None]: + # Example data + data = [ + { + "parent_id": 1, + "parent_name": "Alice", + "children": [ + {"child_id": 1, "child_name": "Child 1"}, + {"child_id": 2, "child_name": "Child 2"}, + ], + }, + { + "parent_id": 2, + "parent_name": "Bob", + "children": [{"child_id": 3, "child_name": "Child 3"}], + }, + ] + + yield data + +# Function to add parent_id to each child record within a parent record +def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: + parent_id_key = "parent_id" + for child in record["children"]: + child[parent_id_key] = record[parent_id_key] + return record + +@skipifgithubfork +@pytest.mark.forked +def test_parent_child_relationship(): + # Create and configure the dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="generic_pipeline", + destination="duckdb", + dataset_name="dataset", + ) + + # Run the pipeline + load_info = pipeline.run( + data_source() + .add_map(add_parent_id), + primary_key="parent_id" + ) + # Output the load information after pipeline execution + print(load_info) From d9a7b93ca74d237ea6d92a774a017eef1013f3f5 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sun, 25 Aug 2024 14:50:49 -0600 Subject: [PATCH 23/34] Correct the library name for mem stats to `psutil` (#1733) --- docs/website/docs/reference/performance.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 075d351553..0ee62acec7 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -62,7 +62,7 @@ Several [text file formats](../dlt-ecosystem/file-formats/) have `gzip` compress Keep in mind load packages are buffered to disk and are left for any troubleshooting, so you can [clear disk space by setting the `delete_completed_jobs` option](../running-in-production/running.md#data-left-behind). ### Observing cpu and memory usage -Please make sure that you have the `psutils` package installed (note that Airflow installs it by default). Then you can dump the stats periodically by setting the [progress](../general-usage/pipeline.md#display-the-loading-progress) to `log` in `config.toml`: +Please make sure that you have the `psutil` package installed (note that Airflow installs it by default). Then you can dump the stats periodically by setting the [progress](../general-usage/pipeline.md#display-the-loading-progress) to `log` in `config.toml`: ```toml progress="log" ``` @@ -258,4 +258,4 @@ DLT_USE_JSON=simplejson ## Using the built in requests wrapper or RESTClient for API calls -Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. \ No newline at end of file +Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. 
From 7d7c14f71d14612f0de873110eaa6d300a4544c2 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 26 Aug 2024 02:23:43 +0530 Subject: [PATCH 24/34] Replaced "full_refresh" with "dev_mode" (#1735) --- docs/technical/general_usage.md | 10 +++++----- .../dlt-ecosystem/verified-sources/sql_database.md | 2 +- .../docs/dlt-ecosystem/verified-sources/stripe.md | 2 +- .../docs/dlt-ecosystem/verified-sources/workable.md | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/technical/general_usage.md b/docs/technical/general_usage.md index 336c892c66..2df903b062 100644 --- a/docs/technical/general_usage.md +++ b/docs/technical/general_usage.md @@ -47,7 +47,7 @@ Pipeline can be explicitly created and configured via `dlt.pipeline()` that retu 4. dataset_name - name of the dataset where the data goes (see later the default names) 5. import_schema_path - default is None 6. export_schema_path - default is None -7. full_refresh - if set to True the pipeline working dir will be erased and the dataset name will get the unique suffix (current timestamp). ie the `my_data` becomes `my_data_20221107164856`. +7. dev_mode - if set to True the pipeline working dir will be erased and the dataset name will get the unique suffix (current timestamp). ie the `my_data` becomes `my_data_20221107164856`. > **Achtung** as per `secrets_and_config.md` the arguments passed to `dlt.pipeline` are configurable and if skipped will be injected by the config providers. **the values provided explicitly in the code have a full precedence over all config providers** @@ -101,7 +101,7 @@ In case **there are more schemas in the pipeline**, the data will be loaded into 1. `spotify` tables and `labels` will load into `spotify_data_1` 2. `mel` resource will load into `spotify_data_1_echonest` -The `full_refresh` option: dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. This allows a non destructive full refresh. Nothing is being deleted/dropped from the destination. +The `dev_mode` option: dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. This allows a non destructive full refresh. Nothing is being deleted/dropped from the destination. ## pipeline working directory and state Another fundamental concept is the pipeline working directory. This directory keeps the following information: @@ -117,7 +117,7 @@ The `restore_from_destination` argument to `dlt.pipeline` let's the user restore The state is being stored in the destination together with other data. So only when all pipeline stages are completed the state is available for restoration. -The pipeline cannot be restored if `full_refresh` flag is set. +The pipeline cannot be restored if `dev_mode` flag is set. The other way to trigger full refresh is to drop destination dataset. `dlt` detects that and resets the pipeline local working folder. @@ -155,8 +155,8 @@ The default json normalizer will convert json documents into tables. All the key ❗ [more here](working_with_schemas.md) -### Full refresh mode -If `full_refresh` flag is passed to `dlt.pipeline` then +### Dev mode mode +If `dev_mode` flag is passed to `dlt.pipeline` then 1. the pipeline working dir is fully wiped out (state, schemas, temp files) 2. dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. 3. 
pipeline will not be restored from the destination diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index eeb717515a..c89a63a524 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -652,6 +652,6 @@ resource. Below we show you an example on how to pseudonymize the data before it print(info) ``` -1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. Altering these names could initiate a "[full_refresh](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). +1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. Altering these names could initiate a "[dev_mode](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 8c39a5090e..fdbefeddf1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -232,6 +232,6 @@ verified source. load_info = pipeline.run(data=[source_single, source_incremental]) print(load_info) ``` - > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“full_refresh”](../../general-usage/pipeline#do-experiments-with-full-refresh), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). + > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 472f48a28f..9229ddca7e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -272,7 +272,7 @@ To create your data pipeline using single loading and destination dataset names. The pipeline name helps retrieve the [state](https://dlthub.com/docs/general-usage/state) of the last run, essential for incremental data loading. 
Changing these names might trigger a - [“full_refresh”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh), + [“dev_mode”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode), disrupting metadata tracking for [incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading). From 011d7ff508f3d5a2da666e418a7137fb79acab49 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sun, 25 Aug 2024 23:07:02 +0200 Subject: [PATCH 25/34] feat/1681 collects load job metrics and adds remote uri (#1708) * collects basic load job metrics in LoadJob * adds remote uri to filesystem copy jobs metrics * adds job id to load package info * adds table name to job metrics * skips run step when serializing trace * adds trace shape test with trace schema * tests job file name too long * docs running pipelines with the same name for different envs * extracts step metrics in common, renames followup jobs * fixes tests * fixes tests * tests delta filesystem for remote_uri * adds exec_info to trace contract test * tests remote_uri for filesystem copy * fixes platform test --- dlt/common/data_writers/__init__.py | 2 - dlt/common/data_writers/buffered.py | 3 +- dlt/common/data_writers/writers.py | 20 +- dlt/common/destination/reference.py | 27 +- dlt/common/metrics.py | 71 ++ dlt/common/pipeline.py | 103 +-- dlt/common/storages/__init__.py | 4 +- dlt/common/storages/data_item_storage.py | 7 +- dlt/common/storages/load_package.py | 51 +- dlt/common/storages/load_storage.py | 8 +- dlt/destinations/impl/athena/athena.py | 10 +- dlt/destinations/impl/bigquery/bigquery.py | 12 +- .../impl/clickhouse/clickhouse.py | 12 +- .../impl/databricks/databricks.py | 12 +- dlt/destinations/impl/dremio/dremio.py | 12 +- dlt/destinations/impl/dummy/configuration.py | 2 +- dlt/destinations/impl/dummy/dummy.py | 33 +- .../impl/filesystem/filesystem.py | 43 +- dlt/destinations/impl/mssql/mssql.py | 8 +- dlt/destinations/impl/postgres/postgres.py | 4 +- dlt/destinations/impl/redshift/redshift.py | 10 +- dlt/destinations/impl/snowflake/snowflake.py | 6 +- dlt/destinations/impl/synapse/synapse.py | 8 +- dlt/destinations/job_client_impl.py | 18 +- dlt/destinations/job_impl.py | 11 +- dlt/destinations/sql_jobs.py | 6 +- dlt/extract/extractors.py | 2 +- dlt/extract/storage.py | 3 +- dlt/load/load.py | 41 +- dlt/load/utils.py | 4 +- dlt/normalize/items_normalizers.py | 2 +- dlt/normalize/normalize.py | 2 +- dlt/normalize/worker.py | 2 +- dlt/pipeline/trace.py | 2 +- docs/website/docs/general-usage/pipeline.md | 13 + .../common/data_writers/test_data_writers.py | 7 +- tests/common/storages/utils.py | 4 +- .../data_writers/test_buffered_writer.py | 2 +- .../data_writers/test_data_item_storage.py | 3 +- .../load/pipeline/test_filesystem_pipeline.py | 58 ++ tests/load/pipeline/test_postgres.py | 15 + tests/load/pipeline/test_stage_loading.py | 16 + tests/load/test_dummy_client.py | 110 ++- tests/load/utils.py | 5 +- .../cases/contracts/trace.schema.yaml | 772 ++++++++++++++++++ tests/pipeline/test_pipeline.py | 53 +- tests/pipeline/test_pipeline_trace.py | 169 +++- tests/pipeline/test_platform_connection.py | 3 +- tests/pipeline/utils.py | 3 + tests/utils.py | 5 +- 50 files changed, 1552 insertions(+), 247 deletions(-) create mode 100644 dlt/common/metrics.py create mode 100644 tests/pipeline/cases/contracts/trace.schema.yaml diff --git a/dlt/common/data_writers/__init__.py b/dlt/common/data_writers/__init__.py index 945e74a37b..9966590c06 100644 --- 
a/dlt/common/data_writers/__init__.py +++ b/dlt/common/data_writers/__init__.py @@ -1,6 +1,5 @@ from dlt.common.data_writers.writers import ( DataWriter, - DataWriterMetrics, TDataItemFormat, FileWriterSpec, create_import_spec, @@ -22,7 +21,6 @@ "resolve_best_writer_spec", "get_best_writer_spec", "is_native_writer", - "DataWriterMetrics", "TDataItemFormat", "BufferedDataWriter", "new_file_id", diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 8077007edb..945fca6580 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -3,6 +3,7 @@ import contextlib from typing import ClassVar, Iterator, List, IO, Any, Optional, Type, Generic +from dlt.common.metrics import DataWriterMetrics from dlt.common.typing import TDataItem, TDataItems from dlt.common.data_writers.exceptions import ( BufferedDataWriterClosed, @@ -10,7 +11,7 @@ FileImportNotFound, InvalidFileNameTemplateException, ) -from dlt.common.data_writers.writers import TWriter, DataWriter, DataWriterMetrics, FileWriterSpec +from dlt.common.data_writers.writers import TWriter, DataWriter, FileWriterSpec from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.configuration import with_config, known_sections, configspec from dlt.common.configuration.specs import BaseConfiguration diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index d324792a83..abd3343ea1 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -34,6 +34,7 @@ TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS, ) +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.typing import StrAny @@ -59,25 +60,6 @@ class FileWriterSpec(NamedTuple): supports_compression: bool = False -class DataWriterMetrics(NamedTuple): - file_path: str - items_count: int - file_size: int - created: float - last_modified: float - - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: - if isinstance(other, DataWriterMetrics): - return DataWriterMetrics( - "", # path is not known - self.items_count + other.items_count, - self.file_size + other.file_size, - min(self.created, other.created), - max(self.last_modified, other.last_modified), - ) - return NotImplemented - - EMPTY_DATA_WRITER_METRICS = DataWriterMetrics("", 0, 0, 2**32, 0.0) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 3af7dcff13..b6c7041592 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -24,10 +24,11 @@ from copy import deepcopy import inspect -from dlt.common import logger +from dlt.common import logger, pendulum from dlt.common.configuration.specs.base_configuration import extract_inner_hint from dlt.common.destination.utils import verify_schema_capabilities from dlt.common.exceptions import TerminalValueError +from dlt.common.metrics import LoadJobMetrics from dlt.common.normalizers.naming import NamingConvention from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.utils import ( @@ -284,6 +285,8 @@ def __init__(self, file_path: str) -> None: # NOTE: we only accept a full filepath in the constructor assert self._file_name != self._file_path self._parsed_file_name = ParsedLoadJobFileName.parse(self._file_name) + self._started_at: pendulum.DateTime = None + self._finished_at: pendulum.DateTime = None def job_id(self) -> str: """The job id that is derived from the file 
name and does not changes during job lifecycle""" @@ -306,6 +309,18 @@ def exception(self) -> str: """The exception associated with failed or retry states""" pass + def metrics(self) -> Optional[LoadJobMetrics]: + """Returns job execution metrics""" + return LoadJobMetrics( + self._parsed_file_name.job_id(), + self._file_path, + self._parsed_file_name.table_name, + self._started_at, + self._finished_at, + self.state(), + None, + ) + class RunnableLoadJob(LoadJob, ABC): """Represents a runnable job that loads a single file @@ -361,6 +376,7 @@ def run_managed( # filepath is now moved to running try: self._state = "running" + self._started_at = pendulum.now() self._job_client.prepare_load_job_execution(self) self.run() self._state = "completed" @@ -371,6 +387,7 @@ def run_managed( self._state = "retry" self._exception = e finally: + self._finished_at = pendulum.now() # sanity check assert self._state in ("completed", "retry", "failed") @@ -391,7 +408,7 @@ def exception(self) -> str: return str(self._exception) -class FollowupJob: +class FollowupJobRequest: """Base class for follow up jobs that should be created""" @abstractmethod @@ -403,8 +420,8 @@ def new_file_path(self) -> str: class HasFollowupJobs: """Adds a trait that allows to create single or table chain followup jobs""" - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: - """Return list of new jobs. `final_state` is state to which this job transits""" + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: + """Return list of jobs requests for jobs that should be created. `final_state` is state to which this job transits""" return [] @@ -479,7 +496,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py new file mode 100644 index 0000000000..5cccee4045 --- /dev/null +++ b/dlt/common/metrics.py @@ -0,0 +1,71 @@ +import datetime # noqa: I251 +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict # noqa: 251 + + +class DataWriterMetrics(NamedTuple): + file_path: str + items_count: int + file_size: int + created: float + last_modified: float + + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + if isinstance(other, DataWriterMetrics): + return DataWriterMetrics( + self.file_path if self.file_path == other.file_path else "", + # self.table_name if self.table_name == other.table_name else "", + self.items_count + other.items_count, + self.file_size + other.file_size, + min(self.created, other.created), + max(self.last_modified, other.last_modified), + ) + return NotImplemented + + +class StepMetrics(TypedDict): + """Metrics for particular package processed in particular pipeline step""" + + started_at: datetime.datetime + """Start of package processing""" + finished_at: datetime.datetime + """End of package processing""" + + +class ExtractDataInfo(TypedDict): + name: str + data_type: str + + +class ExtractMetrics(StepMetrics): + schema_name: str + job_metrics: Dict[str, DataWriterMetrics] + """Metrics collected per job id during writing of job file""" + table_metrics: Dict[str, DataWriterMetrics] + """Job metrics aggregated by table""" + resource_metrics: Dict[str, DataWriterMetrics] + """Job metrics 
aggregated by resource""" + dag: List[Tuple[str, str]] + """A resource dag where elements of the list are graph edges""" + hints: Dict[str, Dict[str, Any]] + """Hints passed to the resources""" + + +class NormalizeMetrics(StepMetrics): + job_metrics: Dict[str, DataWriterMetrics] + """Metrics collected per job id during writing of job file""" + table_metrics: Dict[str, DataWriterMetrics] + """Job metrics aggregated by table""" + + +class LoadJobMetrics(NamedTuple): + job_id: str + file_path: str + table_name: str + started_at: datetime.datetime + finished_at: datetime.datetime + state: Optional[str] + remote_uri: Optional[str] + + +class LoadMetrics(StepMetrics): + job_metrics: Dict[str, LoadJobMetrics] diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 1e1416eb53..8a07ddbd33 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -16,7 +16,6 @@ Optional, Protocol, Sequence, - TYPE_CHECKING, Tuple, TypeVar, TypedDict, @@ -36,6 +35,14 @@ from dlt.common.destination import TDestinationReferenceArg, TDestination from dlt.common.destination.exceptions import DestinationHasFailedJobs from dlt.common.exceptions import PipelineStateNotAvailable, SourceSectionNotAvailable +from dlt.common.metrics import ( + DataWriterMetrics, + ExtractDataInfo, + ExtractMetrics, + LoadMetrics, + NormalizeMetrics, + StepMetrics, +) from dlt.common.schema import Schema from dlt.common.schema.typing import ( TColumnNames, @@ -44,11 +51,12 @@ TSchemaContract, ) from dlt.common.source import get_current_pipe_name +from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath -from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat +from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts from dlt.common.versioned_state import TVersionedState @@ -68,15 +76,6 @@ class _StepInfo(NamedTuple): finished_at: datetime.datetime -class StepMetrics(TypedDict): - """Metrics for particular package processed in particular pipeline step""" - - started_at: datetime.datetime - """Start of package processing""" - finished_at: datetime.datetime - """End of package processing""" - - TStepMetricsCo = TypeVar("TStepMetricsCo", bound=StepMetrics, covariant=True) @@ -154,17 +153,20 @@ def _load_packages_asstr(load_packages: List[LoadPackageInfo], verbosity: int) - return msg @staticmethod - def job_metrics_asdict( + def writer_metrics_asdict( job_metrics: Dict[str, DataWriterMetrics], key_name: str = "job_id", extend: StrAny = None ) -> List[DictStrAny]: - jobs = [] - for job_id, metrics in job_metrics.items(): + entities = [] + for entity_id, metrics in job_metrics.items(): d = metrics._asdict() if extend: d.update(extend) - d[key_name] = job_id - jobs.append(d) - return jobs + d[key_name] = entity_id + # add job-level info if known + if metrics.file_path: + d["table_name"] = ParsedLoadJobFileName.parse(metrics.file_path).table_name + entities.append(d) + return entities def _astuple(self) -> _StepInfo: return _StepInfo( @@ -177,25 +179,6 @@ def _astuple(self) -> _StepInfo: ) -class ExtractDataInfo(TypedDict): - name: str - data_type: str - - -class ExtractMetrics(StepMetrics): - schema_name: str - job_metrics: Dict[str, DataWriterMetrics] - """Metrics collected 
per job id during writing of job file""" - table_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by table""" - resource_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by resource""" - dag: List[Tuple[str, str]] - """A resource dag where elements of the list are graph edges""" - hints: Dict[str, Dict[str, Any]] - """Hints passed to the resources""" - - class _ExtractInfo(NamedTuple): """NamedTuple cannot be part of the derivation chain so we must re-declare all fields to use it as mixin later""" @@ -228,16 +211,8 @@ def asdict(self) -> DictStrAny: for load_id, metrics_list in self.metrics.items(): for idx, metrics in enumerate(metrics_list): extend = {"load_id": load_id, "extract_idx": idx} - load_metrics["job_metrics"].extend( - self.job_metrics_asdict(metrics["job_metrics"], extend=extend) - ) - load_metrics["table_metrics"].extend( - self.job_metrics_asdict( - metrics["table_metrics"], key_name="table_name", extend=extend - ) - ) load_metrics["resource_metrics"].extend( - self.job_metrics_asdict( + self.writer_metrics_asdict( metrics["resource_metrics"], key_name="resource_name", extend=extend ) ) @@ -253,6 +228,15 @@ def asdict(self) -> DictStrAny: for name, hints in metrics["hints"].items() ] ) + load_metrics["job_metrics"].extend( + self.writer_metrics_asdict(metrics["job_metrics"], extend=extend) + ) + load_metrics["table_metrics"].extend( + self.writer_metrics_asdict( + metrics["table_metrics"], key_name="table_name", extend=extend + ) + ) + d.update(load_metrics) return d @@ -260,13 +244,6 @@ def asstr(self, verbosity: int = 0) -> str: return self._load_packages_asstr(self.load_packages, verbosity) -class NormalizeMetrics(StepMetrics): - job_metrics: Dict[str, DataWriterMetrics] - """Metrics collected per job id during writing of job file""" - table_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by table""" - - class _NormalizeInfo(NamedTuple): pipeline: "SupportsPipeline" metrics: Dict[str, List[NormalizeMetrics]] @@ -305,10 +282,10 @@ def asdict(self) -> DictStrAny: for idx, metrics in enumerate(metrics_list): extend = {"load_id": load_id, "extract_idx": idx} load_metrics["job_metrics"].extend( - self.job_metrics_asdict(metrics["job_metrics"], extend=extend) + self.writer_metrics_asdict(metrics["job_metrics"], extend=extend) ) load_metrics["table_metrics"].extend( - self.job_metrics_asdict( + self.writer_metrics_asdict( metrics["table_metrics"], key_name="table_name", extend=extend ) ) @@ -326,10 +303,6 @@ def asstr(self, verbosity: int = 0) -> str: return msg -class LoadMetrics(StepMetrics): - pass - - class _LoadInfo(NamedTuple): pipeline: "SupportsPipeline" metrics: Dict[str, List[LoadMetrics]] @@ -354,7 +327,19 @@ class LoadInfo(StepInfo[LoadMetrics], _LoadInfo): # type: ignore[misc] def asdict(self) -> DictStrAny: """A dictionary representation of LoadInfo that can be loaded with `dlt`""" - return super().asdict() + d = super().asdict() + # transform metrics + d.pop("metrics") + load_metrics: Dict[str, List[Any]] = {"job_metrics": []} + for load_id, metrics_list in self.metrics.items(): + # one set of metrics per package id + assert len(metrics_list) == 1 + metrics = metrics_list[0] + for job_metrics in metrics["job_metrics"].values(): + load_metrics["job_metrics"].append({"load_id": load_id, **job_metrics._asdict()}) + + d.update(load_metrics) + return d def asstr(self, verbosity: int = 0) -> str: msg = f"Pipeline {self.pipeline.pipeline_name} load step completed in " diff --git a/dlt/common/storages/__init__.py 
b/dlt/common/storages/__init__.py index 7bb3c0cf97..50876a01cd 100644 --- a/dlt/common/storages/__init__.py +++ b/dlt/common/storages/__init__.py @@ -8,7 +8,7 @@ LoadJobInfo, LoadPackageInfo, PackageStorage, - TJobState, + TPackageJobState, create_load_id, ) from .data_item_storage import DataItemStorage @@ -40,7 +40,7 @@ "LoadJobInfo", "LoadPackageInfo", "PackageStorage", - "TJobState", + "TPackageJobState", "create_load_id", "fsspec_from_config", "fsspec_filesystem", diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 29a9da8acf..0f70c04bc5 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -1,14 +1,13 @@ -from pathlib import Path -from typing import Dict, Any, List, Sequence +from typing import Dict, Any, List from abc import ABC, abstractmethod from dlt.common import logger +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema import TTableSchemaColumns -from dlt.common.typing import StrAny, TDataItems +from dlt.common.typing import TDataItems from dlt.common.data_writers import ( BufferedDataWriter, DataWriter, - DataWriterMetrics, FileWriterSpec, ) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index b0ed93f734..d569fbe662 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -143,8 +143,8 @@ def create_load_id() -> str: # folders to manage load jobs in a single load package -TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] -WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) +TPackageJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] +WORKING_FOLDERS: Set[TPackageJobState] = set(get_args(TPackageJobState)) TLoadPackageStatus = Literal["new", "extracted", "normalized", "loaded", "aborted"] @@ -191,7 +191,7 @@ def __str__(self) -> str: class LoadJobInfo(NamedTuple): - state: TJobState + state: TPackageJobState file_path: str file_size: int created_at: datetime.datetime @@ -204,6 +204,7 @@ def asdict(self) -> DictStrAny: # flatten del d["job_file_info"] d.update(self.job_file_info._asdict()) + d["job_id"] = self.job_file_info.job_id() return d def asstr(self, verbosity: int = 0) -> str: @@ -241,7 +242,7 @@ class _LoadPackageInfo(NamedTuple): schema: Schema schema_update: TSchemaTables completed_at: datetime.datetime - jobs: Dict[TJobState, List[LoadJobInfo]] + jobs: Dict[TPackageJobState, List[LoadJobInfo]] class LoadPackageInfo(SupportsHumanize, _LoadPackageInfo): @@ -298,10 +299,10 @@ def __str__(self) -> str: class PackageStorage: - NEW_JOBS_FOLDER: ClassVar[TJobState] = "new_jobs" - FAILED_JOBS_FOLDER: ClassVar[TJobState] = "failed_jobs" - STARTED_JOBS_FOLDER: ClassVar[TJobState] = "started_jobs" - COMPLETED_JOBS_FOLDER: ClassVar[TJobState] = "completed_jobs" + NEW_JOBS_FOLDER: ClassVar[TPackageJobState] = "new_jobs" + FAILED_JOBS_FOLDER: ClassVar[TPackageJobState] = "failed_jobs" + STARTED_JOBS_FOLDER: ClassVar[TPackageJobState] = "started_jobs" + COMPLETED_JOBS_FOLDER: ClassVar[TPackageJobState] = "completed_jobs" SCHEMA_FILE_NAME: ClassVar[str] = "schema.json" SCHEMA_UPDATES_FILE_NAME = ( # updates to the tables in schema created by normalizer @@ -330,11 +331,11 @@ def get_package_path(self, load_id: str) -> str: """Gets path of the package relative to storage root""" return load_id - def get_job_state_folder_path(self, load_id: str, state: TJobState) -> str: + def get_job_state_folder_path(self, load_id: str, 
state: TPackageJobState) -> str: """Gets path to the jobs in `state` in package `load_id`, relative to the storage root""" return os.path.join(self.get_package_path(load_id), state) - def get_job_file_path(self, load_id: str, state: TJobState, file_name: str) -> str: + def get_job_file_path(self, load_id: str, state: TPackageJobState, file_name: str) -> str: """Get path to job with `file_name` in `state` in package `load_id`, relative to the storage root""" return os.path.join(self.get_job_state_folder_path(load_id, state), file_name) @@ -369,12 +370,12 @@ def list_failed_jobs(self, load_id: str) -> Sequence[str]: def list_job_with_states_for_table( self, load_id: str, table_name: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: return self.filter_jobs_for_table(self.list_all_jobs_with_states(load_id), table_name) def list_all_jobs_with_states( self, load_id: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: info = self.get_load_package_jobs(load_id) state_jobs = [] for state, jobs in info.items(): @@ -413,7 +414,7 @@ def is_package_completed(self, load_id: str) -> bool: # def import_job( - self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + self, load_id: str, job_file_path: str, job_state: TPackageJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" self.storage.atomic_import( @@ -568,12 +569,14 @@ def get_load_package_state_path(self, load_id: str) -> str: # Get package info # - def get_load_package_jobs(self, load_id: str) -> Dict[TJobState, List[ParsedLoadJobFileName]]: + def get_load_package_jobs( + self, load_id: str + ) -> Dict[TPackageJobState, List[ParsedLoadJobFileName]]: """Gets all jobs in a package and returns them as lists assigned to a particular state.""" package_path = self.get_package_path(load_id) if not self.storage.has_folder(package_path): raise LoadPackageNotFound(load_id) - all_jobs: Dict[TJobState, List[ParsedLoadJobFileName]] = {} + all_jobs: Dict[TPackageJobState, List[ParsedLoadJobFileName]] = {} for state in WORKING_FOLDERS: jobs: List[ParsedLoadJobFileName] = [] with contextlib.suppress(FileNotFoundError): @@ -616,7 +619,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: schema = Schema.from_dict(self._load_schema(load_id)) # read jobs with all statuses - all_job_infos: Dict[TJobState, List[LoadJobInfo]] = {} + all_job_infos: Dict[TPackageJobState, List[LoadJobInfo]] = {} for state, jobs in package_jobs.items(): all_job_infos[state] = [ self._read_job_file_info(load_id, state, job, package_created_at) for job in jobs @@ -643,7 +646,7 @@ def get_job_failed_message(self, load_id: str, job: ParsedLoadJobFileName) -> st return failed_message def job_to_job_info( - self, load_id: str, state: TJobState, job: ParsedLoadJobFileName + self, load_id: str, state: TPackageJobState, job: ParsedLoadJobFileName ) -> LoadJobInfo: """Creates partial job info by converting job object. 
size, mtime and failed message will not be populated""" full_path = os.path.join( @@ -660,7 +663,11 @@ def job_to_job_info( ) def _read_job_file_info( - self, load_id: str, state: TJobState, job: ParsedLoadJobFileName, now: DateTime = None + self, + load_id: str, + state: TPackageJobState, + job: ParsedLoadJobFileName, + now: DateTime = None, ) -> LoadJobInfo: """Creates job info by reading additional props from storage""" failed_message = None @@ -687,8 +694,8 @@ def _read_job_file_info( def _move_job( self, load_id: str, - source_folder: TJobState, - dest_folder: TJobState, + source_folder: TPackageJobState, + dest_folder: TPackageJobState, file_name: str, new_file_name: str = None, ) -> str: @@ -736,8 +743,8 @@ def _job_elapsed_time_seconds(file_path: str, now_ts: float = None) -> float: @staticmethod def filter_jobs_for_table( - all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], table_name: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + all_jobs: Iterable[Tuple[TPackageJobState, ParsedLoadJobFileName]], table_name: str + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: return [job for job in all_jobs if job[1].table_name == table_name] diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 00e95fbad9..8ac1d74e9a 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -17,7 +17,7 @@ LoadPackageInfo, PackageStorage, ParsedLoadJobFileName, - TJobState, + TPackageJobState, TLoadPackageState, TJobFileFormat, ) @@ -141,16 +141,16 @@ def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> N """Marks schema update as processed and stores the update that was applied at the destination""" load_path = self.get_normalized_package_path(load_id) schema_update_file = join(load_path, PackageStorage.SCHEMA_UPDATES_FILE_NAME) - processed_schema_update_file = join( + applied_schema_update_file = join( load_path, PackageStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME ) # delete initial schema update self.storage.delete(schema_update_file) # save applied update - self.storage.save(processed_schema_update_file, json.dumps(applied_update)) + self.storage.save(applied_schema_update_file, json.dumps(applied_update)) def import_new_job( - self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + self, load_id: str, job_file_path: str, job_state: TPackageJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" # TODO: use normalize storage and add file type checks diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 371c1bae22..1429b28240 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -46,7 +46,7 @@ from dlt.common.schema.utils import table_schema_has_type from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob -from dlt.common.destination.reference import FollowupJob, SupportsStagingDestination +from dlt.common.destination.reference import FollowupJobRequest, SupportsStagingDestination from dlt.common.data_writers.escape import escape_hive_identifier from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlMergeFollowupJob @@ -490,7 +490,7 @@ def create_load_job( def _create_append_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if 
self._is_iceberg_table(self.prepare_load_table(table_chain[0]["name"])): return [ SqlStagingCopyFollowupJob.from_table_chain( @@ -501,7 +501,7 @@ def _create_append_followup_jobs( def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self._is_iceberg_table(self.prepare_load_table(table_chain[0]["name"])): return [ SqlStagingCopyFollowupJob.from_table_chain( @@ -510,7 +510,9 @@ def _create_replace_followup_jobs( ] return super()._create_replace_followup_jobs(table_chain) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [AthenaMergeJob.from_table_chain(table_chain, self.sql_client)] def _is_iceberg_table(self, table: TTableSchema) -> bool: diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index c6bf2e7654..8291415434 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -16,7 +16,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, SupportsStagingDestination, @@ -51,7 +51,7 @@ from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.impl.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper from dlt.destinations.utils import parse_db_data_type_str_with_precision @@ -234,7 +234,9 @@ def __init__( self.sql_client: BigQuerySqlClient = sql_client # type: ignore self.type_mapper = BigQueryTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)] def create_load_job( @@ -433,8 +435,8 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # determine whether we load from local or uri bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] - if ReferenceFollowupJob.is_reference_job(file_path): - bucket_path = ReferenceFollowupJob.resolve_reference(file_path) + if ReferenceFollowupJobRequest.is_reference_job(file_path): + bucket_path = ReferenceFollowupJobRequest.resolve_reference(file_path) ext = os.path.splitext(bucket_path)[1][1:] # Select a correct source format diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 5bd34e0e0d..5f17a5a18c 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -20,7 +20,7 @@ TLoadJobState, HasFollowupJobs, RunnableLoadJob, - FollowupJob, + FollowupJobRequest, LoadJob, ) from dlt.common.schema import Schema, TColumnSchema @@ -52,7 +52,7 @@ SqlJobClientBase, SqlJobClientWithStaging, ) -from dlt.destinations.job_impl import ReferenceFollowupJob, FinalizedLoadJobWithFollowupJobs +from 
dlt.destinations.job_impl import ReferenceFollowupJobRequest, FinalizedLoadJobWithFollowupJobs from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper @@ -141,8 +141,8 @@ def run(self) -> None: bucket_path = None file_name = self._file_name - if ReferenceFollowupJob.is_reference_job(self._file_path): - bucket_path = ReferenceFollowupJob.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path): + bucket_path = ReferenceFollowupJobRequest.resolve_reference(self._file_path) file_name = FileStorage.get_file_name_from_file_path(bucket_path) bucket_url = urlparse(bucket_path) bucket_scheme = bucket_url.scheme @@ -288,7 +288,9 @@ def __init__( self.active_hints = deepcopy(HINT_TO_CLICKHOUSE_ATTR) self.type_mapper = ClickHouseTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [ClickHouseMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 0a203c21b6..2f23e88ea0 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -5,7 +5,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, CredentialsConfiguration, @@ -31,7 +31,7 @@ from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient from dlt.destinations.sql_jobs import SqlMergeFollowupJob -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.type_mapping import TypeMapper @@ -121,8 +121,8 @@ def run(self) -> None: staging_credentials = self._staging_config.credentials # extract and prepare some vars bucket_path = orig_bucket_path = ( - ReferenceFollowupJob.resolve_reference(self._file_path) - if ReferenceFollowupJob.is_reference_job(self._file_path) + ReferenceFollowupJobRequest.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path) else "" ) file_name = ( @@ -279,7 +279,9 @@ def create_load_job( ) return job - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [DatabricksMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 3611665f6c..68a3fedc31 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -7,7 +7,7 @@ TLoadJobState, RunnableLoadJob, SupportsStagingDestination, - FollowupJob, + FollowupJobRequest, LoadJob, ) from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -19,7 +19,7 @@ from dlt.destinations.impl.dremio.sql_client import DremioSqlClient from dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.job_impl 
import FinalizedLoadJobWithFollowupJobs -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper from dlt.destinations.sql_client import SqlClientBase @@ -101,8 +101,8 @@ def run(self) -> None: # extract and prepare some vars bucket_path = ( - ReferenceFollowupJob.resolve_reference(self._file_path) - if ReferenceFollowupJob.is_reference_job(self._file_path) + ReferenceFollowupJobRequest.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path) else "" ) @@ -201,7 +201,9 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [DremioMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 7bc1d9e943..023b88e51a 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -25,7 +25,7 @@ class DummyClientConfiguration(DestinationClientConfiguration): retry_prob: float = 0.0 """probability of job retry""" completed_prob: float = 0.0 - """probablibitly of successful job completion""" + """probability of successful job completion""" exception_prob: float = 0.0 """probability of exception transient exception when running job""" timeout: float = 10.0 diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 7d406c969f..49b55ec65d 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -14,6 +14,7 @@ ) import os import time +from dlt.common.metrics import LoadJobMetrics from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.storages import FileStorage @@ -25,7 +26,7 @@ ) from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, SupportsStagingDestination, TLoadJobState, RunnableLoadJob, @@ -37,10 +38,9 @@ from dlt.destinations.exceptions import ( LoadJobNotExistsException, - LoadJobInvalidStateTransitionException, ) from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest class LoadDummyBaseJob(RunnableLoadJob): @@ -78,18 +78,25 @@ def run(self) -> None: c_r = random.random() if self.config.retry_prob >= c_r: # this will make the job go to a retry state - raise DestinationTransientException("a random retry occured") + raise DestinationTransientException("a random retry occurred") # fail prob c_r = random.random() if self.config.fail_prob >= c_r: # this will make the the job go to a failed state - raise DestinationTerminalException("a random fail occured") + raise DestinationTerminalException("a random fail occurred") time.sleep(0.1) + def metrics(self) -> Optional[LoadJobMetrics]: + m = super().metrics() + # add remote uri if there's followup job + if self.config.create_followup_jobs: + m = m._replace(remote_uri=self._file_name) + 
return m -class DummyFollowupJob(ReferenceFollowupJob): + +class DummyFollowupJobRequest(ReferenceFollowupJobRequest): def __init__( self, original_file_name: str, remote_paths: List[str], config: DummyClientConfiguration ) -> None: @@ -100,9 +107,9 @@ def __init__( class LoadDummyJob(LoadDummyBaseJob, HasFollowupJobs): - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: if self.config.create_followup_jobs and final_state == "completed": - new_job = DummyFollowupJob( + new_job = DummyFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._file_name], config=self.config, @@ -113,8 +120,8 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: JOBS: Dict[str, LoadDummyBaseJob] = {} -CREATED_FOLLOWUP_JOBS: Dict[str, FollowupJob] = {} -CREATED_TABLE_CHAIN_FOLLOWUP_JOBS: Dict[str, FollowupJob] = {} +CREATED_FOLLOWUP_JOBS: Dict[str, FollowupJobRequest] = {} +CREATED_TABLE_CHAIN_FOLLOWUP_JOBS: Dict[str, FollowupJobRequest] = {} RETRIED_JOBS: Dict[str, LoadDummyBaseJob] = {} @@ -173,7 +180,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs that should be executed after a table chain is completed""" # if sql job follow up is configure we schedule a merge job that will always fail @@ -184,7 +191,7 @@ def create_table_chain_completed_followup_jobs( if self.config.create_followup_table_chain_reference_jobs: table_job_paths = [job.file_path for job in completed_table_chain_jobs] file_name = FileStorage.get_file_name_from_file_path(table_job_paths[0]) - job = ReferenceFollowupJob(file_name, table_job_paths) + job = ReferenceFollowupJobRequest(file_name, table_job_paths) CREATED_TABLE_CHAIN_FOLLOWUP_JOBS[job.job_id()] = job return [job] return [] @@ -212,7 +219,7 @@ def __exit__( pass def _create_job(self, job_id: str) -> LoadDummyBaseJob: - if ReferenceFollowupJob.is_reference_job(job_id): + if ReferenceFollowupJobRequest.is_reference_job(job_id): return LoadDummyBaseJob(job_id, config=self.config) else: return LoadDummyJob(job_id, config=self.config) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index f2466f25a2..2e09871ba9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -9,6 +9,7 @@ import dlt from dlt.common import logger, time, json, pendulum +from dlt.common.metrics import LoadJobMetrics from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema @@ -21,7 +22,7 @@ ) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, JobClientBase, @@ -34,7 +35,7 @@ ) from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.destinations.job_impl import ( - ReferenceFollowupJob, + ReferenceFollowupJobRequest, FinalizedLoadJob, FinalizedLoadJobWithFollowupJobs, ) @@ -87,6 +88,13 @@ def make_remote_path(self) -> str: path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), ) + def make_remote_uri(self) -> str: + return 
self._job_client.make_remote_uri(self.make_remote_path()) + + def metrics(self) -> Optional[LoadJobMetrics]: + m = super().metrics() + return m._replace(remote_uri=self.make_remote_uri()) + class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: @@ -95,6 +103,15 @@ def __init__(self, file_path: str) -> None: ) def run(self) -> None: + # pick local filesystem pathlib or posix for buckets + # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard + # to write a handler with those two line below only in FilesystemLoadJob + self.is_local_filesystem = self._job_client.config.protocol == "file" + self.pathlib = os.path if self.is_local_filesystem else posixpath + self.destination_file_name = self._job_client.make_remote_uri( + self._job_client.get_table_dir(self.load_table_name) + ) + from dlt.common.libs.pyarrow import pyarrow as pa from dlt.common.libs.deltalake import ( DeltaTable, @@ -105,15 +122,13 @@ def run(self) -> None: ) # create Arrow dataset from Parquet files - file_paths = ReferenceFollowupJob.resolve_references(self._file_path) + file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) arrow_ds = pa.dataset.dataset(file_paths) # create Delta table object - dt_path = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(dt_path, storage_options=storage_options) + dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) # get partition columns part_cols = get_columns_names_with_prop(self._load_table, "partition") @@ -124,7 +139,7 @@ def run(self) -> None: if dt is None: # create new empty Delta table with schema from Arrow table DeltaTable.create( - table_uri=dt_path, + table_uri=self.destination_file_name, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", partition_by=part_cols, @@ -160,7 +175,7 @@ def run(self) -> None: else: write_delta_table( - table_or_uri=dt_path if dt is None else dt, + table_or_uri=self.destination_file_name if dt is None else dt, data=arrow_rbr, write_disposition=self._load_table["write_disposition"], partition_by=part_cols, @@ -169,13 +184,13 @@ def run(self) -> None: class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) if self._load_table.get("table_format") == "delta": # delta table jobs only require table chain followup jobs pass elif final_state == "completed": - ref_job = ReferenceFollowupJob( + ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_uri(self.make_remote_path())], ) @@ -369,7 +384,7 @@ def create_load_job( import dlt.common.libs.deltalake # assert dependencies are installed # a reference job for a delta table indicates a table chain followup job - if ReferenceFollowupJob.is_reference_job(file_path): + if ReferenceFollowupJobRequest.is_reference_job(file_path): return DeltaLoadFilesystemJob(file_path) # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -578,7 +593,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> 
List[FollowupJob]: + ) -> List[FollowupJobRequest]: assert completed_table_chain_jobs is not None jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs @@ -591,5 +606,5 @@ def create_table_chain_completed_followup_jobs( if job.job_file_info.table_name == table["name"] ] file_name = FileStorage.get_file_name_from_file_path(table_job_paths[0]) - jobs.append(ReferenceFollowupJob(file_name, table_job_paths)) + jobs.append(ReferenceFollowupJobRequest(file_name, table_job_paths)) return jobs diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index a67423a873..750dc93a10 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -1,7 +1,7 @@ from typing import Dict, Optional, Sequence, List, Any from dlt.common.exceptions import TerminalValueError -from dlt.common.destination.reference import FollowupJob +from dlt.common.destination.reference import FollowupJobRequest from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat @@ -160,7 +160,9 @@ def __init__( self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {} self.type_mapper = MsSqlTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( @@ -189,7 +191,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self.config.replace_strategy == "staging-optimized": return [MsSqlStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 5ae5f27a6e..a832bfe07f 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -9,7 +9,7 @@ from dlt.common.destination.reference import ( HasFollowupJobs, RunnableLoadJob, - FollowupJob, + FollowupJobRequest, LoadJob, TLoadJobState, ) @@ -246,7 +246,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self.config.replace_strategy == "staging-optimized": return [PostgresStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 81abd57803..93827c8163 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -14,7 +14,7 @@ from dlt.common.destination.reference import ( - FollowupJob, + FollowupJobRequest, CredentialsConfiguration, SupportsStagingDestination, LoadJob, @@ -33,7 +33,7 @@ from dlt.destinations.job_client_impl import CopyRemoteFileLoadJob from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.destinations.impl.redshift.configuration import 
RedshiftClientConfiguration -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper @@ -238,7 +238,9 @@ def __init__( self.config: RedshiftClientConfiguration = config self.type_mapper = RedshiftTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: @@ -258,7 +260,7 @@ def create_load_job( """Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs""" job = super().create_load_job(table, file_path, load_id, restore) if not job: - assert ReferenceFollowupJob.is_reference_job( + assert ReferenceFollowupJobRequest.is_reference_job( file_path ), "Redshift must use staging to load files" job = RedshiftCopyFileLoadJob( diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 904b524791..8b4eabc961 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -29,7 +29,7 @@ from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.type_mapping import TypeMapper @@ -98,11 +98,11 @@ def run(self) -> None: self._sql_client = self._job_client.sql_client # resolve reference - is_local_file = not ReferenceFollowupJob.is_reference_job(self._file_path) + is_local_file = not ReferenceFollowupJobRequest.is_reference_job(self._file_path) file_url = ( self._file_path if is_local_file - else ReferenceFollowupJob.resolve_reference(self._file_path) + else ReferenceFollowupJobRequest.resolve_reference(self._file_path) ) # take file name file_name = FileStorage.get_file_name_from_file_path(file_url) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index d1b38f73bd..e43e2a6dfa 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -5,7 +5,7 @@ from urllib.parse import urlparse, urlunparse from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import SupportsStagingDestination, FollowupJob, LoadJob +from dlt.common.destination.reference import SupportsStagingDestination, FollowupJobRequest, LoadJob from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint from dlt.common.schema.utils import ( @@ -19,7 +19,7 @@ AzureServicePrincipalCredentialsWithoutDefaults, ) -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import ( SqlJobClientBase, @@ -131,7 +131,7 @@ def _get_columstore_valid_column(self, c: TColumnSchema) -> TColumnSchema: def _create_replace_followup_jobs( self, table_chain: 
Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: return SqlJobClientBase._create_replace_followup_jobs(self, table_chain) def prepare_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: @@ -163,7 +163,7 @@ def create_load_job( ) -> LoadJob: job = super().create_load_job(table, file_path, load_id, restore) if not job: - assert ReferenceFollowupJob.is_reference_job( + assert ReferenceFollowupJobRequest.is_reference_job( file_path ), "Synapse must use staging to load files" job = SynapseCopyFileLoadJob( diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 7fdd979c5d..92132dd751 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -42,7 +42,7 @@ WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, - FollowupJob, + FollowupJobRequest, WithStagingDataset, RunnableLoadJob, LoadJob, @@ -53,7 +53,7 @@ from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations.job_impl import ( - ReferenceFollowupJob, + ReferenceFollowupJobRequest, ) from dlt.destinations.sql_jobs import SqlMergeFollowupJob, SqlStagingCopyFollowupJob from dlt.destinations.typing import TNativeConn @@ -118,7 +118,7 @@ def __init__( super().__init__(file_path) self._job_client: "SqlJobClientBase" = None self._staging_credentials = staging_credentials - self._bucket_path = ReferenceFollowupJob.resolve_reference(file_path) + self._bucket_path = ReferenceFollowupJobRequest.resolve_reference(file_path) class SqlJobClientBase(JobClientBase, WithStateSync): @@ -216,16 +216,18 @@ def should_truncate_table_before_load(self, table: TTableSchema) -> bool: def _create_append_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: return [] - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [SqlMergeFollowupJob.from_table_chain(table_chain, self.sql_client)] def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: - jobs: List[FollowupJob] = [] + ) -> List[FollowupJobRequest]: + jobs: List[FollowupJobRequest] = [] if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: jobs.append( SqlStagingCopyFollowupJob.from_table_chain( @@ -238,7 +240,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs diff --git a/dlt/destinations/job_impl.py b/dlt/destinations/job_impl.py index 41c939f482..1f54913064 100644 --- a/dlt/destinations/job_impl.py +++ b/dlt/destinations/job_impl.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod import os import tempfile # noqa: 251 -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Optional from dlt.common.json import json from dlt.common.destination.reference import ( @@ -9,9 +9,10 @@ TLoadJobState, RunnableLoadJob, JobClientBase, - FollowupJob, + FollowupJobRequest, LoadJob, ) +from dlt.common.metrics import LoadJobMetrics from 
dlt.common.storages.load_package import commit_load_package_state from dlt.common.schema import Schema, TTableSchema from dlt.common.storages import FileStorage @@ -56,7 +57,7 @@ class FinalizedLoadJobWithFollowupJobs(FinalizedLoadJob, HasFollowupJobs): pass -class FollowupJobImpl(FollowupJob): +class FollowupJobRequestImpl(FollowupJobRequest): """ Class to create a new loadjob, not stateful and not runnable """ @@ -79,7 +80,7 @@ def job_id(self) -> str: return self._parsed_file_name.job_id() -class ReferenceFollowupJob(FollowupJobImpl): +class ReferenceFollowupJobRequest(FollowupJobRequestImpl): def __init__(self, original_file_name: str, remote_paths: List[str]) -> None: file_name = os.path.splitext(original_file_name)[0] + "." + "reference" self._remote_paths = remote_paths @@ -98,7 +99,7 @@ def resolve_references(file_path: str) -> List[str]: @staticmethod def resolve_reference(file_path: str) -> str: - refs = ReferenceFollowupJob.resolve_references(file_path) + refs = ReferenceFollowupJobRequest.resolve_references(file_path) assert len(refs) == 1 return refs[0] diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index a1e38a2c20..d5f005ee9a 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -21,7 +21,7 @@ from dlt.common.utils import uniq_id from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.destinations.exceptions import MergeDispositionException -from dlt.destinations.job_impl import FollowupJobImpl +from dlt.destinations.job_impl import FollowupJobRequestImpl from dlt.destinations.sql_client import SqlClientBase from dlt.common.destination.exceptions import DestinationTransientException @@ -45,7 +45,7 @@ def __init__(self, original_exception: Exception, table_chain: Sequence[TTableSc ) -class SqlFollowupJob(FollowupJobImpl): +class SqlFollowupJob(FollowupJobRequestImpl): """Sql base job for jobs that rely on the whole tablechain""" @classmethod @@ -54,7 +54,7 @@ def from_table_chain( table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None, - ) -> FollowupJobImpl: + ) -> FollowupJobRequestImpl: """Generates a list of sql statements, that will be executed by the sql client when the job is executed in the loader. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 4a1de2517d..8a91dd7477 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -4,9 +4,9 @@ from dlt.common.configuration import known_sections, resolve_configuration, with_config from dlt.common import logger from dlt.common.configuration.specs import BaseConfiguration, configspec -from dlt.common.data_writers import DataWriterMetrics from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.exceptions import MissingDependencyException +from dlt.common.metrics import DataWriterMetrics from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.typing import TDataItems, TDataItem, TLoaderFileFormat from dlt.common.schema import Schema, utils diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py index de777ad60e..395366b09e 100644 --- a/dlt/extract/storage.py +++ b/dlt/extract/storage.py @@ -1,7 +1,8 @@ import os from typing import Dict, List -from dlt.common.data_writers import TDataItemFormat, DataWriterMetrics, DataWriter, FileWriterSpec +from dlt.common.data_writers import TDataItemFormat, DataWriter, FileWriterSpec +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema import Schema from dlt.common.storages import ( NormalizeStorageConfiguration, diff --git a/dlt/load/load.py b/dlt/load/load.py index 99a12d69ee..f084c9d3d9 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -5,12 +5,17 @@ import os from dlt.common import logger +from dlt.common.metrics import LoadJobMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.pipeline import LoadInfo, LoadMetrics, SupportsPipeline, WithStepInfo from dlt.common.schema.utils import get_top_level_table -from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState +from dlt.common.storages.load_storage import ( + LoadPackageInfo, + ParsedLoadJobFileName, + TPackageJobState, +) from dlt.common.storages.load_package import ( LoadPackageStateInjectableContext, load_package as current_load_package, @@ -29,7 +34,7 @@ Destination, RunnableLoadJob, LoadJob, - FollowupJob, + FollowupJobRequest, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination, @@ -84,6 +89,7 @@ def __init__( self.pool = NullExecutor() self.load_storage: LoadStorage = self.create_storage(is_storage_owner) self._loaded_packages: List[LoadPackageInfo] = [] + self._job_metrics: Dict[str, LoadJobMetrics] = {} self._run_loop_sleep_duration: float = ( 1.0 # amount of time to sleep between querying completed jobs ) @@ -308,7 +314,7 @@ def create_followup_jobs( where they will be picked up for execution """ - jobs: List[FollowupJob] = [] + jobs: List[FollowupJobRequest] = [] if isinstance(starting_job, HasFollowupJobs): # check for merge jobs only for jobs executing on the destination, the staging destination jobs must be excluded # NOTE: we may move that logic to the interface @@ -392,6 +398,11 @@ def complete_jobs( # create followup jobs self.create_followup_jobs(load_id, state, job, schema) + # preserve metrics + metrics = job.metrics() + if metrics: + self._job_metrics[job.job_id()] = metrics + # try to get exception message from job failed_message = job.exception() self.load_storage.normalized_packages.fail_job( @@ -423,7 +434,7 @@ def complete_jobs( if r_c > 0 and r_c % self.config.raise_on_max_retries == 0: 
pending_exception = LoadClientJobRetry( load_id, - job.job_file_info().job_id(), + job.job_id(), r_c, self.config.raise_on_max_retries, retry_message=retry_message, @@ -431,6 +442,15 @@ def complete_jobs( elif state == "completed": # create followup jobs self.create_followup_jobs(load_id, state, job, schema) + + # preserve metrics + # TODO: metrics should be persisted. this is different vs. all other steps because load step + # may be restarted in the middle of execution + # NOTE: we could use package state but cases with 100k jobs must be tested + metrics = job.metrics() + if metrics: + self._job_metrics[job.job_id()] = metrics + # move to completed folder after followup jobs are created # in case of exception when creating followup job, the loader will retry operation and try to complete again self.load_storage.normalized_packages.complete_job(load_id, job.file_name()) @@ -464,14 +484,18 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) self.load_storage.complete_load_package(load_id, aborted) # collect package info self._loaded_packages.append(self.load_storage.get_load_package_info(load_id)) - self._step_info_complete_load_id(load_id, metrics={"started_at": None, "finished_at": None}) + # TODO: job metrics must be persisted + self._step_info_complete_load_id( + load_id, + metrics={"started_at": None, "finished_at": None, "job_metrics": self._job_metrics}, + ) # delete jobs only now self.load_storage.maybe_remove_completed_jobs(load_id) logger.info( f"All jobs completed, archiving package {load_id} with aborted set to {aborted}" ) - def update_load_package_info(self, load_id: str) -> None: + def init_jobs_counter(self, load_id: str) -> None: # update counter we only care about the jobs that are scheduled to be loaded package_jobs = self.load_storage.normalized_packages.get_load_package_jobs(load_id) total_jobs = reduce(lambda p, c: p + len(c), package_jobs.values(), 0) @@ -492,7 +516,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: dropped_tables = current_load_package()["state"].get("dropped_tables", []) truncated_tables = current_load_package()["state"].get("truncated_tables", []) - self.update_load_package_info(load_id) + self.init_jobs_counter(load_id) # initialize analytical storage ie. 
create dataset required by passed schema with self.get_destination_client(schema) as job_client: @@ -606,7 +630,8 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: ) ): # the same load id may be processed across multiple runs - if not self.current_load_id: + if self.current_load_id is None: + self._job_metrics = {} self._step_info_start_load_id(load_id) self.load_single_package(load_id, schema) diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 9750f89d4b..741c01f249 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -2,7 +2,7 @@ from itertools import groupby from dlt.common import logger -from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TJobState +from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TPackageJobState from dlt.common.schema.utils import ( fill_hints_from_parent_and_clone_table, get_child_tables, @@ -19,7 +19,7 @@ def get_completed_table_chain( schema: Schema, - all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], + all_jobs: Iterable[Tuple[TPackageJobState, ParsedLoadJobFileName]], top_merged_table: TTableSchema, being_completed_job_id: str = None, ) -> List[TTableSchema]: diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 5f84d57d7a..650d10c268 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -3,9 +3,9 @@ from dlt.common import logger from dlt.common.json import json -from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import ArrowToObjectAdapter from dlt.common.json import custom_pua_decode, may_have_pua +from dlt.common.metrics import DataWriterMetrics from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.runtime import signals from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index e80931605c..3df060b141 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -4,10 +4,10 @@ from concurrent.futures import Future, Executor from dlt.common import logger +from dlt.common.metrics import DataWriterMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config -from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import EMPTY_DATA_WRITER_METRICS from dlt.common.runners import TRunMetrics, Runnable, NullExecutor from dlt.common.runtime import signals diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index 10d0a00eb1..b8969f64a3 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -4,12 +4,12 @@ from dlt.common.configuration.container import Container from dlt.common.data_writers import ( DataWriter, - DataWriterMetrics, create_import_spec, resolve_best_writer_spec, get_best_writer_spec, is_native_writer, ) +from dlt.common.metrics import DataWriterMetrics from dlt.common.utils import chunks from dlt.common.schema.typing import TStoredSchema, TTableSchema from dlt.common.storages import ( diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 29770966a6..2f857e5fd5 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -168,7 +168,7 @@ def asdict(self) -> DictStrAny: """A dictionary representation of PipelineTrace that can be loaded with `dlt`""" d = self._asdict() # run step is the same as load 
step
-        d["steps"] = [step.asdict() for step in self.steps]  # if step.step != "run"
+        d["steps"] = [step.asdict() for step in self.steps if step.step != "run"]
         return d
 
     @property
diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md
index f21d6f0686..40f9419bc2 100644
--- a/docs/website/docs/general-usage/pipeline.md
+++ b/docs/website/docs/general-usage/pipeline.md
@@ -85,6 +85,20 @@ You can inspect stored artifacts using the command
 > 💡 You can attach `Pipeline` instance to an existing working folder, without creating a new
 > pipeline with `dlt.attach`.
 
+### Separate working environments with `pipelines_dir`
+You can run several pipelines with the same name but with different configurations, i.e. to target development / staging / production environments.
+Set the `pipelines_dir` argument to store all the working folders in a specific place. For example:
+```py
+import os
+import dlt
+from dlt.common.pipeline import get_dlt_pipelines_dir
+
+dev_pipelines_dir = os.path.join(get_dlt_pipelines_dir(), "dev")
+pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence", pipelines_dir=dev_pipelines_dir)
+```
+stores the pipeline working folder in `~/.dlt/pipelines/dev/`. Mind that you need to pass this path
+to all CLI commands (the `--pipelines-dir` option) to get info/trace for that pipeline.
+
 ## Do experiments with dev mode
 
 If you [create a new pipeline script](../walkthroughs/create-a-pipeline.md) you will be
diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py
index 9b4e61a2f7..03723b7b55 100644
--- a/tests/common/data_writers/test_data_writers.py
+++ b/tests/common/data_writers/test_data_writers.py
@@ -5,6 +5,7 @@
 from dlt.common import pendulum, json
 from dlt.common.data_writers.exceptions import DataWriterNotFound, SpecLookupFailed
+from dlt.common.metrics import DataWriterMetrics
 from dlt.common.typing import AnyFun
 
 from dlt.common.data_writers.escape import (
@@ -25,7 +26,6 @@
     ArrowToTypedJsonlListWriter,
     CsvWriter,
     DataWriter,
-    DataWriterMetrics,
     EMPTY_DATA_WRITER_METRICS,
     ImportFileWriter,
     InsertValuesWriter,
@@ -180,12 +180,13 @@ def test_data_writer_metrics_add() -> None:
     metrics = DataWriterMetrics("file", 10, 100, now, now + 10)
     add_m: DataWriterMetrics = metrics + EMPTY_DATA_WRITER_METRICS  # type: ignore[assignment]
     assert add_m == DataWriterMetrics("", 10, 100, now, now + 10)
-    assert metrics + metrics == DataWriterMetrics("", 20, 200, now, now + 10)
+    # will keep "file" because it is in both
+    assert metrics + metrics == DataWriterMetrics("file", 20, 200, now, now + 10)
     assert sum((metrics, metrics, metrics), EMPTY_DATA_WRITER_METRICS) == DataWriterMetrics(
         "", 30, 300, now, now + 10
     )
     # time range extends when added
-    add_m = metrics + DataWriterMetrics("file", 99, 120, now - 10, now + 20)  # type: ignore[assignment]
+    add_m = metrics + DataWriterMetrics("fileX", 99, 120, now - 10, now + 20)  # type: ignore[assignment]
     assert add_m == DataWriterMetrics("", 109, 220, now - 10, now + 20)
 
 
diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py
index baac3b7af5..a1334ba1da 100644
--- a/tests/common/storages/utils.py
+++ b/tests/common/storages/utils.py
@@ -16,7 +16,7 @@
     LoadStorageConfiguration,
     FilesystemConfiguration,
     LoadPackageInfo,
-    TJobState,
+    TPackageJobState,
     LoadStorage,
 )
 from dlt.common.storages import DataItemStorage, FileStorage
@@ -195,7 +195,7 @@ def assert_package_info(
     storage: LoadStorage,
     load_id: str,
     package_state: str,
-    job_state: TJobState,
+
job_state: TPackageJobState, jobs_count: int = 1, ) -> LoadPackageInfo: package_info = storage.get_load_package_info(load_id) diff --git a/tests/extract/data_writers/test_buffered_writer.py b/tests/extract/data_writers/test_buffered_writer.py index 5cad5a35b9..205e3f83dc 100644 --- a/tests/extract/data_writers/test_buffered_writer.py +++ b/tests/extract/data_writers/test_buffered_writer.py @@ -7,12 +7,12 @@ from dlt.common.data_writers.exceptions import BufferedDataWriterClosed from dlt.common.data_writers.writers import ( DataWriter, - DataWriterMetrics, InsertValuesWriter, JsonlWriter, ALL_WRITERS, ) from dlt.common.destination.capabilities import TLoaderFileFormat, DestinationCapabilitiesContext +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage diff --git a/tests/extract/data_writers/test_data_item_storage.py b/tests/extract/data_writers/test_data_item_storage.py index feda51c229..558eeec79e 100644 --- a/tests/extract/data_writers/test_data_item_storage.py +++ b/tests/extract/data_writers/test_data_item_storage.py @@ -3,8 +3,9 @@ import pytest from dlt.common.configuration.container import Container -from dlt.common.data_writers.writers import DataWriterMetrics, DataWriter +from dlt.common.data_writers.writers import DataWriter from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.utils import new_column from dlt.common.storages.data_item_storage import DataItemStorage diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 759f443546..4b8707e989 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -12,6 +12,7 @@ from dlt.common import json from dlt.common import pendulum +from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id from dlt.common.exceptions import DependencyVersionException @@ -299,6 +300,17 @@ def data_types(): assert len(rows) == 10 assert_all_data_types_row(rows[0], schema=column_schemas) + # make sure remote_uri is in metrics + metrics = info.metrics[info.loads_ids[0]][0] + # TODO: only final copy job has remote_uri. 
not the initial (empty) job for particular files + # we could implement an empty job for delta that generates correct remote_uri + remote_uri = list(metrics["job_metrics"].values())[-1].remote_uri + assert remote_uri.endswith("data_types") + bucket_uri = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_uri): + bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) + assert remote_uri.startswith(bucket_uri) + # another run should append rows to the table info = pipeline.run(data_types()) assert_load_info(info) @@ -567,6 +579,7 @@ def two_part(): assert dt.metadata().partition_columns == [] +@pytest.mark.essential @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -798,6 +811,51 @@ def parent_delta(): get_delta_tables(pipeline, "non_existing_table") +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET,), + ), + ids=lambda x: x.name, +) +def test_parquet_to_delta_upgrade(destination_config: DestinationTestConfiguration): + # change the resource to start creating delta tables + from dlt.common.libs.deltalake import get_delta_tables + + @dlt.resource() + def foo(): + yield [{"foo": 1}, {"foo": 2}] + + pipeline = destination_config.setup_pipeline("fs_pipe") + + info = pipeline.run(foo()) + assert_load_info(info) + delta_tables = get_delta_tables(pipeline) + assert set(delta_tables.keys()) == set() + + # drop the pipeline + pipeline.deactivate() + + # redefine the resource + + @dlt.resource(table_format="delta") # type: ignore + def foo(): + yield [{"foo": 1}, {"foo": 2}] + + pipeline = destination_config.setup_pipeline("fs_pipe") + + info = pipeline.run(foo()) + assert_load_info(info) + delta_tables = get_delta_tables(pipeline) + assert set(delta_tables.keys()) == {"foo"} + + # optimize all delta tables to make sure storage is there + for table in delta_tables.values(): + table.vacuum() + + TEST_LAYOUTS = ( "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", "{schema_name}.{table_name}.{load_id}.{file_id}.{ext}", diff --git a/tests/load/pipeline/test_postgres.py b/tests/load/pipeline/test_postgres.py index a4001b7faa..5cadf701a2 100644 --- a/tests/load/pipeline/test_postgres.py +++ b/tests/load/pipeline/test_postgres.py @@ -42,3 +42,18 @@ def test_postgres_encoded_binary( # print(bytes(data["table"][0]["hash"])) # data in postgres equals unencoded blob assert data["table"][0]["hash"].tobytes() == blob + + +# TODO: uncomment and finalize when we implement encoding for psycopg2 +# @pytest.mark.parametrize( +# "destination_config", +# destinations_configs(default_sql_configs=True, subset=["postgres"]), +# ids=lambda x: x.name, +# ) +# def test_postgres_encoding(destination_config: DestinationTestConfiguration): +# from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient +# pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) +# client: Psycopg2SqlClient = pipeline.sql_client() +# # client.credentials.query["encoding"] = "ru" +# with client: +# print(client.native_connection.encoding) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 7f1427f20f..a760c86526 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -4,6 +4,7 @@ import dlt, os from dlt.common import json, sleep from copy import deepcopy +from dlt.common.storages.configuration import FilesystemConfiguration from 
dlt.common.utils import uniq_id from dlt.common.schema.typing import TDataType @@ -16,6 +17,9 @@ ) from tests.cases import table_update_and_row +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + @dlt.resource( table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url") @@ -46,6 +50,18 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) + # checks if remote_uri is set correctly on copy jobs + metrics = info.metrics[info.loads_ids[0]][0] + for job_metrics in metrics["job_metrics"].values(): + remote_uri = job_metrics.remote_uri + job_ext = os.path.splitext(job_metrics.job_id)[1] + if job_ext not in (".reference", ".sql"): + assert remote_uri.endswith(job_ext) + bucket_uri = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_uri): + bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) + assert remote_uri.startswith(bucket_uri) + package_info = pipeline.get_load_package_info(info.loads_ids[0]) assert package_info.state == "loaded" diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index b55f4ceece..9f0bca6ac5 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -8,7 +8,8 @@ from dlt.common.exceptions import TerminalException, TerminalValueError from dlt.common.storages import FileStorage, PackageStorage, ParsedLoadJobFileName -from dlt.common.storages.load_package import LoadJobInfo, TJobState +from dlt.common.storages.configuration import FilesystemConfiguration +from dlt.common.storages.load_package import LoadJobInfo, TPackageJobState from dlt.common.storages.load_storage import JobFileFormatUnsupported from dlt.common.destination.reference import RunnableLoadJob, TDestination from dlt.common.schema.utils import ( @@ -32,6 +33,7 @@ from dlt.load.utils import get_completed_table_chain, init_client, _extend_tables_with_table_chain from tests.utils import ( + MockPipeline, clean_test_storage, init_test_logging, TEST_DICT_CONFIG_PROVIDER, @@ -78,10 +80,14 @@ def test_spool_job_started() -> None: load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name() ) ) + assert_job_metrics(job, "completed") jobs.append(job) remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 assert len(finalized_jobs) == 2 + assert len(load._job_metrics) == 2 + for job in jobs: + assert load._job_metrics[job.job_id()] == job.metrics() def test_unsupported_writer_type() -> None: @@ -199,7 +205,9 @@ def test_spool_job_failed() -> None: load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name() ) ) + assert_job_metrics(job, "failed") jobs.append(job) + assert len(jobs) == 2 # complete files remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 @@ -215,6 +223,8 @@ def test_spool_job_failed() -> None: load_id, PackageStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception" ) ) + # load should collect two jobs + assert load._job_metrics[job.job_id()] == job.metrics() started_files = load.load_storage.normalized_packages.list_started_jobs(load_id) assert len(started_files) == 0 @@ -226,6 +236,13 @@ def test_spool_job_failed() -> None: assert package_info.state == "loaded" # all jobs failed assert len(package_info.jobs["failed_jobs"]) == 2 + # check metrics + load_info = load.get_step_info(MockPipeline("pipe", 
True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 + for job in jobs: + assert job.job_id() in metrics + assert metrics[job.job_id()].state == "failed" def test_spool_job_failed_terminally_exception_init() -> None: @@ -244,6 +261,11 @@ def test_spool_job_failed_terminally_exception_init() -> None: assert len(package_info.jobs["started_jobs"]) == 0 # load id was never committed complete_load.assert_not_called() + # metrics can be gathered + assert len(load._job_metrics) == 2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 def test_spool_job_failed_transiently_exception_init() -> None: @@ -264,6 +286,10 @@ def test_spool_job_failed_transiently_exception_init() -> None: # load id was never committed complete_load.assert_not_called() + # no metrics were gathered + assert len(load._job_metrics) == 0 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + assert len(load_info.metrics) == 0 def test_spool_job_failed_exception_complete() -> None: @@ -279,6 +305,11 @@ def test_spool_job_failed_exception_complete() -> None: # both failed - we wait till the current loop is completed and then raise assert len(package_info.jobs["failed_jobs"]) == 2 assert len(package_info.jobs["started_jobs"]) == 0 + # metrics can be gathered + assert len(load._job_metrics) == 2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 def test_spool_job_retry_new() -> None: @@ -328,6 +359,7 @@ def test_spool_job_retry_started() -> None: remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 assert len(finalized_jobs) == 0 + assert len(load._job_metrics) == 0 # clear retry flag dummy_impl.JOBS = {} files = load.load_storage.normalized_packages.list_new_jobs(load_id) @@ -407,6 +439,8 @@ def test_failing_followup_jobs() -> None: assert len(dummy_impl.JOBS) == 2 assert len(dummy_impl.RETRIED_JOBS) == 0 assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 + # no metrics were collected + assert len(load._job_metrics) == 0 # now we can retry the same load, it will restart the two jobs and successfully create the followup jobs load.initial_client_config.fail_followup_job_creation = False # type: ignore @@ -436,6 +470,8 @@ def test_failing_table_chain_followup_jobs() -> None: assert len(dummy_impl.JOBS) == 2 assert len(dummy_impl.RETRIED_JOBS) == 0 assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 + # no metrics were collected + assert len(load._job_metrics) == 0 # now we can retry the same load, it will restart the two jobs and successfully create the table chain followup jobs load.initial_client_config.fail_table_chain_followup_job_creation = False # type: ignore @@ -662,11 +698,11 @@ def test_get_completed_table_chain_cases() -> None: # child completed, parent not event_user = schema.get_table("event_user") event_user_entities = schema.get_table("event_user__parse_data__entities") - event_user_job: Tuple[TJobState, ParsedLoadJobFileName] = ( + event_user_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = ( "started_jobs", ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl"), ) - event_user_entities_job: Tuple[TJobState, ParsedLoadJobFileName] = ( + event_user_entities_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = ( 
"completed_jobs", ParsedLoadJobFileName( "event_user__parse_data__entities", "event_user__parse_data__entities_id", 0, "jsonl" @@ -857,6 +893,33 @@ def test_dummy_staging_filesystem() -> None: assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 +def test_load_multiple_packages() -> None: + load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0)) + load.config.pool_type = "none" + load_id_1, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) + sleep(0.1) + load_id_2, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) + run_metrics = load.run(None) + assert run_metrics.pending_items == 1 + # assert load._current_load_id is None + metrics_id_1 = load._job_metrics + assert len(metrics_id_1) == 2 + assert load._step_info_metrics(load_id_1)[0]["job_metrics"] == metrics_id_1 + run_metrics = load.run(None) + assert run_metrics.pending_items == 0 + metrics_id_2 = load._job_metrics + assert len(metrics_id_2) == 2 + assert load._step_info_metrics(load_id_2)[0]["job_metrics"] == metrics_id_2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + assert load_id_1 in load_info.metrics + assert load_id_2 in load_info.metrics + assert load_info.metrics[load_id_1][0]["job_metrics"] == metrics_id_1 + assert load_info.metrics[load_id_2][0]["job_metrics"] == metrics_id_2 + # execute empty run + load.run(None) + assert len(load_info.metrics) == 2 + + def test_terminal_exceptions() -> None: try: raise TerminalValueError("a") @@ -866,6 +929,15 @@ def test_terminal_exceptions() -> None: raise AssertionError() +def assert_job_metrics(job: RunnableLoadJob, expected_state: str) -> None: + metrics = job.metrics() + assert metrics.state == expected_state + assert metrics.started_at <= metrics.finished_at + assert metrics.job_id == job.job_id() + assert metrics.table_name == job._parsed_file_name.table_name + assert metrics.file_path == job._file_path + + def assert_complete_job( load: Load, should_delete_completed: bool = False, load_id: str = None, jobs_per_case: int = 1 ) -> None: @@ -910,6 +982,32 @@ def assert_complete_job( assert load.load_storage.loaded_packages.storage.has_folder(completed_path) # complete load on client was called complete_load.assert_called_once_with(load_id) + # assert if all jobs in final state have metrics + metrics = load.get_step_info(MockPipeline("pipe", True)).metrics[load_id][0] # type: ignore[abstract] + package_info = load.load_storage.loaded_packages.get_load_package_jobs(load_id) + for state, jobs in package_info.items(): + for job in jobs: + job_metrics = metrics["job_metrics"].get(job.job_id()) + if state in ("failed_jobs", "completed_jobs"): + assert job_metrics is not None + assert ( + metrics["job_metrics"][job.job_id()].state == "failed" + if state == "failed_jobs" + else "completed" + ) + remote_uri = job_metrics.remote_uri + if load.initial_client_config.create_followup_jobs: # type: ignore + assert remote_uri.endswith(job.file_name()) + elif load.is_staging_destination_job(job.file_name()): + # staging destination should contain reference to remote filesystem + assert ( + FilesystemConfiguration.make_file_uri(REMOTE_FILESYSTEM) + in remote_uri + ) + else: + assert remote_uri is None + else: + assert job_metrics is None def run_all(load: Load) -> None: @@ -941,9 +1039,9 @@ def setup_loader( staging = None if filesystem_staging: # do not accept jsonl to not conflict with filesystem destination - client_config = client_config or DummyClientConfiguration( - loader_file_format="reference", 
completed_prob=1 - ) + # client_config = client_config or DummyClientConfiguration( + # loader_file_format="reference", completed_prob=1 + # ) staging_system_config = FilesystemDestinationClientConfiguration()._bind_dataset_name( dataset_name="dummy" ) diff --git a/tests/load/utils.py b/tests/load/utils.py index d649343c63..086109de8b 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -45,6 +45,7 @@ from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration from dlt.common.schema.utils import new_table, normalize_table_identifiers from dlt.common.storages import ParsedLoadJobFileName, LoadStorage, PackageStorage +from dlt.common.storages.load_package import create_load_id from dlt.common.typing import StrAny from dlt.common.utils import uniq_id @@ -712,7 +713,7 @@ def expect_load_file( query = query.encode("utf-8") # type: ignore[assignment] file_storage.save(file_name, query) table = client.prepare_load_table(table_name) - load_id = uniq_id() + load_id = create_load_id() job = client.create_load_job(table, file_storage.make_full_path(file_name), load_id) if isinstance(job, RunnableLoadJob): @@ -873,7 +874,7 @@ def prepare_load_package( Create a load package with explicitely provided files job_per_case multiplies the amount of load jobs, for big packages use small files """ - load_id = uniq_id() + load_id = create_load_id() load_storage.new_packages.create_package(load_id) for case in cases: path = f"./tests/load/cases/loading/{case}" diff --git a/tests/pipeline/cases/contracts/trace.schema.yaml b/tests/pipeline/cases/contracts/trace.schema.yaml new file mode 100644 index 0000000000..89831977c0 --- /dev/null +++ b/tests/pipeline/cases/contracts/trace.schema.yaml @@ -0,0 +1,772 @@ +version: 4 +version_hash: JE62zVwqT2T/qHTi2Qdnn2d1A/JzCzyGtDwc+qUmbTs= +engine_version: 9 +name: trace +tables: + _dlt_version: + columns: + version: + data_type: bigint + nullable: false + engine_version: + data_type: bigint + nullable: false + inserted_at: + data_type: timestamp + nullable: false + schema_name: + data_type: text + nullable: false + version_hash: + data_type: text + nullable: false + schema: + data_type: text + nullable: false + write_disposition: skip + description: Created by DLT. Tracks schema updates + _dlt_loads: + columns: + load_id: + data_type: text + nullable: false + schema_name: + data_type: text + nullable: true + status: + data_type: bigint + nullable: false + inserted_at: + data_type: timestamp + nullable: false + schema_version_hash: + data_type: text + nullable: true + write_disposition: skip + description: Created by DLT. 
Tracks completed loads + trace: + columns: + transaction_id: + data_type: text + nullable: true + pipeline_name: + data_type: text + nullable: true + execution_context__ci_run: + data_type: bool + nullable: true + execution_context__python: + data_type: text + nullable: true + execution_context__cpu: + data_type: bigint + nullable: true + execution_context__os__name: + data_type: text + nullable: true + execution_context__os__version: + data_type: text + nullable: true + execution_context__library__name: + data_type: text + nullable: true + execution_context__library__version: + data_type: text + nullable: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + engine_version: + data_type: bigint + nullable: true + _dlt_load_id: + data_type: text + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + write_disposition: append + trace__execution_context__exec_info: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace + trace__steps: + columns: + span_id: + data_type: text + nullable: true + step: + data_type: text + nullable: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + step_info__pipeline__pipeline_name: + data_type: text + nullable: true + step_info__first_run: + data_type: bool + nullable: true + step_info__started_at: + data_type: timestamp + nullable: true + step_info__finished_at: + data_type: timestamp + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + load_info__destination_type: + data_type: text + nullable: true + load_info__destination_displayable_credentials: + data_type: text + nullable: true + load_info__destination_name: + data_type: text + nullable: true + load_info__staging_type: + data_type: text + nullable: true + load_info__staging_name: + data_type: text + nullable: true + load_info__staging_displayable_credentials: + data_type: text + nullable: true + load_info__destination_fingerprint: + data_type: text + nullable: true + step_exception: + data_type: text + nullable: true + parent: trace + trace__steps__extract_info__job_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + job_id: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__table_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + 
data_type: bigint + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__resource_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + resource_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__dag: + columns: + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + parent_name: + data_type: text + nullable: true + resource_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__hints: + columns: + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + resource_name: + data_type: text + nullable: true + columns: + data_type: text + nullable: true + write_disposition: + data_type: text + nullable: true + schema_contract: + data_type: text + nullable: true + table_format: + data_type: text + nullable: true + file_format: + data_type: text + nullable: true + original_columns: + data_type: text + nullable: true + primary_key: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__step_info__loads_ids: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__steps + trace__steps__step_info__load_packages: + columns: + load_id: + data_type: text + nullable: true + package_path: + data_type: text + nullable: true + state: + data_type: text + nullable: true + schema_hash: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + completed_at: + data_type: timestamp + nullable: true + parent: trace__steps + trace__steps__step_info__load_packages__jobs: + columns: + state: + data_type: text + nullable: true + file_path: + data_type: text + nullable: true + file_size: + data_type: bigint + nullable: true + created_at: + data_type: timestamp + nullable: true + elapsed: + data_type: double + nullable: true + table_name: + data_type: text + nullable: true + file_id: + data_type: text + nullable: true + retry_count: + data_type: bigint + nullable: true + file_format: + 
data_type: text + nullable: true + job_id: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps__step_info__load_packages + trace__steps__normalize_info__job_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + job_id: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__normalize_info__table_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__load_info__job_metrics: + columns: + load_id: + data_type: text + nullable: true + job_id: + data_type: text + nullable: true + file_path: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + state: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + remote_uri: + data_type: text + nullable: true + parent: trace__steps + trace__steps__step_info__load_packages__tables: + columns: + write_disposition: + data_type: text + nullable: true + schema_contract: + data_type: text + nullable: true + table_format: + data_type: text + nullable: true + file_format: + data_type: text + nullable: true + name: + data_type: text + nullable: true + resource: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + load_id: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: + data_type: text + nullable: true + x_normalizer__seen_data: + data_type: bool + nullable: true + parent: trace__steps__step_info__load_packages + trace__steps__step_info__load_packages__tables__columns: + columns: + name: + data_type: text + nullable: true + data_type: + data_type: text + nullable: true + nullable: + data_type: bool + nullable: true + primary_key: + data_type: bool + nullable: true + table_name: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + load_id: + data_type: text + 
nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + unique: + data_type: bool + nullable: true + foreign_key: + data_type: bool + nullable: true + parent: trace__steps__step_info__load_packages__tables + trace__resolved_config_values: + columns: + key: + data_type: text + nullable: true + is_secret_hint: + data_type: bool + nullable: true + provider_name: + data_type: text + nullable: true + config_type_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace + trace__resolved_config_values__sections: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__resolved_config_values + trace__steps__exception_traces: + columns: + message: + data_type: text + nullable: true + exception_type: + data_type: text + nullable: true + is_terminal: + data_type: bool + nullable: true + docstring: + data_type: text + nullable: true + load_id: + data_type: text + nullable: true + pipeline_name: + data_type: text + nullable: true + exception_attrs: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__exception_traces__stack_trace: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__steps__exception_traces +settings: + detections: + - iso_timestamp + default_hints: + not_null: + - _dlt_id + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + - _dlt_load_id + foreign_key: + - _dlt_parent_id + root_key: + - _dlt_root_id + unique: + - _dlt_id +normalizers: + names: snake_case + json: + module: dlt.common.normalizers.json.relational +previous_hashes: +- 9Ysjq/W0xpxkI/vBiYm8Qbr2nDP3JMt6KvGKUS/FCyI= +- NYeAxJ2r+T+dKFnXFhBEPzBP6SO+ORdhOfgQRo/XqBU= +- RV9jvZSD5dM+ZGjEL3HqokLvtf22K4zMNc3zWRahEw4= diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 0ab1f61d72..b6a7feffc1 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -39,7 +39,7 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations import filesystem, redshift, dummy +from dlt.destinations import filesystem, redshift, dummy, duckdb from dlt.destinations.impl.filesystem.filesystem import INIT_FILE_NAME from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractStorage @@ -2637,6 +2637,57 @@ def comments(user_id: str): assert pipeline.last_trace.last_normalize_info.row_counts["user_comments"] == 3 +def test_exceed_job_file_name_length() -> None: + # use very long table name both for parent and for a child + data = { + "id": 1, + "child use very long table name both for parent and for a child use very long table name 
both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": [ + 1, + 2, + 3, + ], + "col use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": ( + "data" + ), + } + + table_name = ( + "parent use very long table name both for parent and for a child use very long table name" + " both for parent and for a child use very long table name both for parent and for a child" + " use very long table name both for parent and for a child use very long table name both" + " for parent and for a child use very long table name both for parent and for a child " + ) + + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination="duckdb", + ) + # path too long + with pytest.raises(PipelineStepFailed) as os_err: + pipeline.run([data], table_name=table_name) + assert isinstance(os_err.value.__cause__, OSError) + + # fit into 255 + 1 + suffix_len = len(".b61d3af76c.0.insert-values") + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination=duckdb( + max_identifier_length=255 - suffix_len + 1, + ), + ) + # path too long + with pytest.raises(PipelineStepFailed): + pipeline.run([data], table_name=table_name) + + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination=duckdb( + max_identifier_length=255 - suffix_len, + ), + ) + pipeline.run([data], table_name=table_name) + + def assert_imported_file( pipeline: Pipeline, table_name: str, diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 3239e01bab..69c0f01b8b 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pytest import requests_mock +import yaml import dlt @@ -19,6 +20,8 @@ from dlt.common.typing import DictStrAny, StrStr, DictStrStr, TSecretValue from dlt.common.utils import digest128 +from dlt.destinations import dummy, filesystem + from dlt.pipeline.exceptions import PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.trace import ( @@ -31,7 +34,8 @@ from dlt.extract.extract import describe_extract_data from dlt.extract.pipe import Pipe -from tests.utils import start_test_telemetry +from tests.pipeline.utils import PIPELINE_TEST_CASES_PATH +from tests.utils import TEST_STORAGE_ROOT, start_test_telemetry from tests.common.configuration.utils import toml_providers, environment @@ -122,7 +126,7 @@ def data(): resolved = _find_resolved_value(trace.resolved_config_values, "credentials", ["databricks"]) assert resolved.is_secret_hint is True assert resolved.value == databricks_creds - assert_trace_printable(trace) + assert_trace_serializable(trace) # activate pipeline because other was running in assert trace p.activate() @@ -153,7 +157,7 @@ def data(): assert isinstance(step.step_info, ExtractInfo) assert len(step.exception_traces) > 0 assert step.step_info.extract_data_info == [{"name": "async_exception", "data_type": "source"}] - assert_trace_printable(trace) + assert_trace_serializable(trace) extract_info = step.step_info # only new (unprocessed) package is present, all other metrics are empty, state won't be extracted 
@@ -174,7 +178,7 @@ def data(): step = trace.steps[2] assert step.step == "normalize" assert step.step_info is norm_info - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) assert p.last_trace.last_normalize_info.row_counts == {"_dlt_pipeline_state": 1, "data": 3} @@ -216,7 +220,7 @@ def data(): assert resolved.is_secret_hint is False assert resolved.value == "1.0" assert resolved.config_type_name == "DummyClientConfiguration" - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_load_info, LoadInfo) p.activate() @@ -234,12 +238,157 @@ def data(): assert step.step == "load" assert step.step_info is load_info # same load info assert trace.steps[0].step_info is not extract_info - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_load_info, LoadInfo) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) assert isinstance(p.last_trace.last_extract_info, ExtractInfo) +def test_trace_schema() -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" + os.environ["RESTORE_FROM_DESTINATION"] = "False" + + # mock runtime env + os.environ["CIRCLECI"] = "1" + os.environ["AWS_LAMBDA_FUNCTION_NAME"] = "lambda" + + @dlt.source(section="many_hints") + def many_hints( + api_type=dlt.config.value, + credentials: str = dlt.secrets.value, + secret_value: TSecretValue = TSecretValue("123"), # noqa: B008 + ): + # TODO: create table / column schema from typed dicts, not explicitly + @dlt.resource( + write_disposition="replace", + primary_key="id", + table_format="delta", + file_format="jsonl", + schema_contract="evolve", + columns=[ + { + "name": "multi", + "data_type": "decimal", + "nullable": True, + "cluster": True, + "description": "unknown", + "merge_key": True, + "precision": 9, + "scale": 3, + "sort": True, + "variant": True, + "partition": True, + } + ], + ) + def data(): + yield [{"id": 1, "multi": "1.2"}, {"id": 2}, {"id": 3}] + + return data() + + @dlt.source + def github(): + @dlt.resource + def get_shuffled_events(): + for _ in range(1): + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", + "r", + encoding="utf-8", + ) as f: + issues = json.load(f) + yield issues + + return get_shuffled_events() + + @dlt.source + def async_exception(max_range=1): + async def get_val(v): + await asyncio.sleep(0.1) + if v % 3 == 0: + raise ValueError(v) + return v + + @dlt.resource + def data(): + yield from [get_val(v) for v in range(1, max_range)] + + return data() + + # create pipeline with staging to get remote_uri in load step job_metrics + dummy_dest = dummy(completed_prob=1.0) + pipeline = dlt.pipeline( + pipeline_name="test_trace_schema", + destination=dummy_dest, + staging=filesystem(os.path.abspath(os.path.join(TEST_STORAGE_ROOT, "_remote_filesystem"))), + dataset_name="various", + ) + + # mock config + os.environ["API_TYPE"] = "REST" + os.environ["SOURCES__MANY_HINTS__CREDENTIALS"] = "CREDS" + + info = pipeline.run([many_hints(), github()]) + info.raise_on_failed_jobs() + + trace = pipeline.last_trace + pipeline._schema_storage.storage.save("trace.json", json.dumps(trace, pretty=True)) + + schema = dlt.Schema("trace") + trace_pipeline = dlt.pipeline( + pipeline_name="test_trace_schema_traces", destination=dummy(completed_prob=1.0) + ) + info = trace_pipeline.run([trace], table_name="trace", schema=schema) + info.raise_on_failed_jobs() + + # add exception trace + with 
pytest.raises(PipelineStepFailed): + pipeline.extract(async_exception(max_range=4)) + + trace_exception = pipeline.last_trace + pipeline._schema_storage.storage.save( + "trace_exception.json", json.dumps(trace_exception, pretty=True) + ) + + info = trace_pipeline.run([trace_exception], table_name="trace") + info.raise_on_failed_jobs() + inferred_trace_contract = trace_pipeline.schemas["trace"] + inferred_contract_str = inferred_trace_contract.to_pretty_yaml(remove_processing_hints=True) + + # NOTE: this saves actual inferred contract (schema) to schema storage, move it to test cases if you update + # trace shapes + # TODO: create a proper schema for dlt trace and tables/columns + pipeline._schema_storage.storage.save("trace.schema.yaml", inferred_contract_str) + # print(pipeline._schema_storage.storage.storage_path) + + # load the schema and use it as contract + with open(f"{PIPELINE_TEST_CASES_PATH}/contracts/trace.schema.yaml", encoding="utf-8") as f: + imported_schema = yaml.safe_load(f) + trace_contract = Schema.from_dict(imported_schema, remove_processing_hints=True) + # compare pretty forms of the schemas, they must be identical + # NOTE: if this fails you can comment this out and use contract run below to find first offending difference + # assert trace_contract.to_pretty_yaml() == inferred_contract_str + + # use trace contract to load data again + contract_trace_pipeline = dlt.pipeline( + pipeline_name="test_trace_schema_traces_contract", destination=dummy(completed_prob=1.0) + ) + info = contract_trace_pipeline.run( + [trace_exception, trace], + table_name="trace", + schema=trace_contract, + schema_contract="freeze", + ) + + # assert inferred_trace_contract.version_hash == trace_contract.version_hash + + # print(trace_pipeline.schemas["trace"].to_pretty_yaml()) + # print(pipeline._schema_storage.storage.storage_path) + + +# def test_trace_schema_contract() -> None: + + def test_save_load_trace() -> None: os.environ["COMPLETED_PROB"] = "1.0" info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") @@ -255,7 +404,7 @@ def test_save_load_trace() -> None: assert resolved.is_secret_hint is False assert resolved.value == "1.0" assert resolved.config_type_name == "DummyClientConfiguration" - assert_trace_printable(trace) + assert_trace_serializable(trace) # check row counts assert pipeline.last_trace.last_normalize_info.row_counts == { "_dlt_pipeline_state": 1, @@ -296,7 +445,7 @@ def data(): assert run_step.step == "run" assert run_step.step_exception is not None assert step.step_exception == run_step.step_exception - assert_trace_printable(trace) + assert_trace_serializable(trace) assert pipeline.last_trace.last_normalize_info is None @@ -306,7 +455,7 @@ def test_save_load_empty_trace() -> None: pipeline = dlt.pipeline() pipeline.run([], table_name="data", destination="dummy") trace = pipeline.last_trace - assert_trace_printable(trace) + assert_trace_serializable(trace) assert len(trace.steps) == 4 pipeline.activate() @@ -529,7 +678,7 @@ def _mock_sentry_before_send(event: DictStrAny, _unused_hint: Any = None) -> Dic return event -def assert_trace_printable(trace: PipelineTrace) -> None: +def assert_trace_serializable(trace: PipelineTrace) -> None: str(trace) trace.asstr(0) trace.asstr(1) diff --git a/tests/pipeline/test_platform_connection.py b/tests/pipeline/test_platform_connection.py index fa5b143ff5..aa46019382 100644 --- a/tests/pipeline/test_platform_connection.py +++ b/tests/pipeline/test_platform_connection.py @@ -65,7 +65,8 @@ def data(): # basic check of 
trace result assert trace_result, "no trace" assert trace_result["pipeline_name"] == "platform_test_pipeline" - assert len(trace_result["steps"]) == 4 + # just extract, normalize and load steps. run step is not serialized to trace (it was just a copy of load) + assert len(trace_result["steps"]) == 3 assert trace_result["execution_context"]["library"]["name"] == "dlt" # basic check of state result diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index dfdb9c8e40..d3d87f0e0b 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -98,6 +98,9 @@ def users_materialize_table_schema(): def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: """Asserts that expected number of packages was loaded and there are no failed jobs""" + # make sure we can serialize + info.asstr(verbosity=2) + info.asdict() assert len(info.loads_ids) == expected_load_packages # all packages loaded assert all(p.completed_at is not None for p in info.load_packages) is True diff --git a/tests/utils.py b/tests/utils.py index 976a623c0b..1b81881470 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -189,8 +189,9 @@ def wipe_pipeline(preserve_environ) -> Iterator[None]: yield if container[PipelineContext].is_active(): # take existing pipeline - p = dlt.pipeline() - p._wipe_working_folder() + # NOTE: no more needed. test storage is wiped fully when test starts + # p = dlt.pipeline() + # p._wipe_working_folder() # deactivate context container[PipelineContext].deactivate() From 2788235572de105ff01aaf5c1ebcbe4ea40b249b Mon Sep 17 00:00:00 2001 From: Akela Drissner-Schmid <32450038+akelad@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:32:22 +0200 Subject: [PATCH 26/34] Update snowflake.md --- docs/website/docs/dlt-ecosystem/destinations/snowflake.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 181d024a2f..d08578c5a2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -136,7 +136,12 @@ If you set the [`replace` strategy](../../general-usage/full-loading.md) to `sta recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. ## Data loading -The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). +The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. 
Stage files are kept by default, unless specified otherwise via the `keep_staged_files` parameter: + +```toml +[destination.snowflake] +keep_staged_files = false +``` ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default From 935dc09efd067549fbcb87b906ccb560d945bd26 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 00:20:06 +0200 Subject: [PATCH 27/34] Feat/1711 create with not exists dlt tables (#1740) * uses normalized column names when linking tables in relational * destination cap if create table if not exits supported * generates IF NOT EXISTS for dlt tables * adds logging for terminal and retry exception in run_managed of load job * passes schema update to be collected in trace in filesystem * fixes job log exception message --- dlt/common/destination/capabilities.py | 1 + dlt/common/destination/reference.py | 4 ++++ dlt/common/normalizers/json/relational.py | 12 +++++------- dlt/destinations/impl/athena/athena.py | 2 +- .../impl/filesystem/filesystem.py | 5 ++++- dlt/destinations/impl/mssql/factory.py | 1 + dlt/destinations/impl/synapse/factory.py | 4 ++++ dlt/destinations/job_client_impl.py | 19 ++++++++++++++----- .../parent_child_relationship.py | 9 ++++----- .../test_parent_child_relationship.py | 10 ++++------ tests/load/mssql/test_mssql_table_builder.py | 12 ++++++++++-- .../postgres/test_postgres_table_builder.py | 11 ++++++++++- tests/pipeline/test_pipeline_trace.py | 2 +- 13 files changed, 63 insertions(+), 29 deletions(-) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index be71cb50e9..52e7d74833 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -76,6 +76,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): # use naming convention in the schema naming_convention: TNamingConventionReferenceArg = None alter_add_multi_column: bool = True + supports_create_table_if_not_exists: bool = True supports_truncate_command: bool = True schema_supports_numeric_precision: bool = True timestamp_precision: int = 6 diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index b6c7041592..744cbbd1f5 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -383,9 +383,13 @@ def run_managed( except (DestinationTerminalException, TerminalValueError) as e: self._state = "failed" self._exception = e + logger.exception(f"Terminal exception in job {self.job_id()} in file {self._file_path}") except (DestinationTransientException, Exception) as e: self._state = "retry" self._exception = e + logger.exception( + f"Transient exception in job {self.job_id()} in file {self._file_path}" + ) finally: self._finished_at = pendulum.now() # sanity check diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 8e296445eb..1dbcec4bff 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -184,11 +184,10 @@ def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> # and all child tables must be lists return digest128(f"{parent_row_id}_{child_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - @staticmethod - def _link_row(row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: + def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id - row["_dlt_parent_id"] = parent_row_id - row["_dlt_list_idx"] = list_idx + 
row[self.c_dlt_parent_id] = parent_row_id + row[self.c_dlt_list_idx] = list_idx return row @@ -227,7 +226,7 @@ def _add_row_id( if row_id_type == "row_hash": row_id = DataItemNormalizer._get_child_row_hash(parent_row_id, table, pos) # link to parent table - DataItemNormalizer._link_row(flattened_row, parent_row_id, pos) + self._link_row(flattened_row, parent_row_id, pos) flattened_row[self.c_dlt_id] = row_id return row_id @@ -260,7 +259,6 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - v: DictStrAny = None table = self.schema.naming.shorten_fragments(*parent_path, *ident_path) for idx, v in enumerate(seq): @@ -285,7 +283,7 @@ def _normalize_list( child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) wrap_v = wrap_in_dict(v) wrap_v[self.c_dlt_id] = child_row_hash - e = DataItemNormalizer._link_row(wrap_v, parent_row_id, idx) + e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) yield (table, self.schema.naming.shorten_fragments(*parent_path)), e diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 1429b28240..0c90d171a3 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -452,7 +452,7 @@ def _get_table_update_sql( partition_clause = self._iceberg_partition_clause( cast(Optional[Dict[str, str]], table.get(PARTITION_HINT)) ) - sql.append(f"""CREATE TABLE {qualified_table_name} + sql.append(f"""{self._make_create_table(qualified_table_name, table)} ({columns}) {partition_clause} LOCATION '{location.rstrip('/')}' diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 2e09871ba9..5445fd2ae9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -303,6 +303,7 @@ def update_stored_schema( only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, ) -> TSchemaTables: + applied_update = super().update_stored_schema(only_tables, expected_update) # create destination dirs for all tables table_names = only_tables or self.schema.tables.keys() dirs_to_create = self.get_table_dirs(table_names) @@ -316,7 +317,9 @@ def update_stored_schema( if not self.config.as_staging: self._store_current_schema() - return expected_update + # we assume that expected_update == applied_update so table schemas in dest were not + # externally changed + return applied_update def get_table_dir(self, table_name: str, remote: bool = False) -> str: # dlt tables do not respect layout (for now) diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index 85c94c21b7..f1a8bb136a 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -37,6 +37,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False caps.supports_ddl_transactions = True + caps.supports_create_table_if_not_exists = False # IF NOT EXISTS not supported caps.max_rows_per_insert = 1000 caps.timestamp_precision = 7 caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index bb117e48d2..d5a0281bec 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -63,6 +63,10 @@ def 
_raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_transactions = True caps.supports_ddl_transactions = False + caps.supports_create_table_if_not_exists = ( + False # IF NOT EXISTS on CREATE TABLE not supported + ) + # Synapse throws "Some part of your SQL statement is nested too deeply. Rewrite the query or break it up into smaller queries." # if number of records exceeds a certain number. Which exact number that is seems not deterministic: # in tests, I've seen a query with 12230 records run succesfully on one run, but fail on a subsequent run, while the query remained exactly the same. diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 92132dd751..1d6403a2c8 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -522,22 +522,31 @@ def _make_add_column_sql( """Make one or more ADD COLUMN sql clauses to be joined in ALTER TABLE statement(s)""" return [f"ADD COLUMN {self._get_column_def_sql(c, table_format)}" for c in new_columns] + def _make_create_table(self, qualified_name: str, table: TTableSchema) -> str: + not_exists_clause = " " + if ( + table["name"] in self.schema.dlt_table_names() + and self.capabilities.supports_create_table_if_not_exists + ): + not_exists_clause = " IF NOT EXISTS " + return f"CREATE TABLE{not_exists_clause}{qualified_name}" + def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: # build sql - canonical_name = self.sql_client.make_qualified_table_name(table_name) + qualified_name = self.sql_client.make_qualified_table_name(table_name) table = self.prepare_load_table(table_name) table_format = table.get("table_format") sql_result: List[str] = [] if not generate_alter: # build CREATE - sql = f"CREATE TABLE {canonical_name} (\n" + sql = self._make_create_table(qualified_name, table) + " (\n" sql += ",\n".join([self._get_column_def_sql(c, table_format) for c in new_columns]) sql += ")" sql_result.append(sql) else: - sql_base = f"ALTER TABLE {canonical_name}\n" + sql_base = f"ALTER TABLE {qualified_name}\n" add_column_statements = self._make_add_column_sql(new_columns, table_format) if self.capabilities.alter_add_multi_column: column_sql = ",\n" @@ -561,13 +570,13 @@ def _get_table_update_sql( if hint == "not_null": logger.warning( f"Column(s) {hint_columns} with NOT NULL are being added to existing" - f" table {canonical_name}. If there's data in the table the operation" + f" table {qualified_name}. If there's data in the table the operation" " will fail." ) else: logger.warning( f"Column(s) {hint_columns} with hint {hint} are being added to existing" - f" table {canonical_name}. Several hint types may not be added to" + f" table {qualified_name}. Several hint types may not be added to" " existing tables." 
) return sql_result diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py index 39c9f577cc..6de00ffb28 100644 --- a/docs/examples/parent_child_relationship/parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -22,6 +22,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -44,6 +45,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -51,6 +53,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + if __name__ == "__main__": # Create and configure the dlt pipeline pipeline = dlt.pipeline( @@ -60,10 +63,6 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py index f671040823..95d1bade97 100644 --- a/docs/examples/parent_child_relationship/test_parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -1,4 +1,3 @@ - import pytest from tests.utils import skipifgithubfork @@ -29,6 +28,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -51,6 +51,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -58,6 +59,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + @skipifgithubfork @pytest.mark.forked def test_parent_child_relationship(): @@ -69,10 +71,6 @@ def test_parent_child_relationship(): ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index d6cf3ec3e8..3f3896de6c 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -55,8 +55,8 @@ def test_alter_table(client: MsSqlJobClient) -> None: # existing table has no columns sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)[0] sqlfluff.parse(sql, dialect="tsql") - canonical_name = 
client.sql_client.make_qualified_table_name("event_test_table") - assert sql.count(f"ALTER TABLE {canonical_name}\nADD") == 1 + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert sql.count(f"ALTER TABLE {qualified_name}\nADD") == 1 assert "event_test_table" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" float NOT NULL' in sql @@ -75,3 +75,11 @@ def test_alter_table(client: MsSqlJobClient) -> None: assert '"col6_precision" decimal(6,2) NOT NULL' in sql assert '"col7_precision" varbinary(19)' in sql assert '"col11_precision" time(3) NOT NULL' in sql + + +def test_create_dlt_table(client: MsSqlJobClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="tsql") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE {qualified_name}" in sql diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 86bd67db9a..28fd4eec9d 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -57,7 +57,8 @@ def test_create_table(client: PostgresClient) -> None: # non existing table sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] sqlfluff.parse(sql, dialect="postgres") - assert "event_test_table" in sql + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert f"CREATE TABLE {qualified_name}" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" double precision NOT NULL' in sql assert '"col3" boolean NOT NULL' in sql @@ -173,3 +174,11 @@ def test_create_table_case_sensitive(cs_client: PostgresClient) -> None: # every line starts with "Col" for line in sql.split("\n")[1:]: assert line.startswith('"Col') + + +def test_create_dlt_table(client: PostgresClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="postgres") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE IF NOT EXISTS {qualified_name}" in sql diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 69c0f01b8b..4e52d2aa29 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -551,7 +551,7 @@ def test_trace_telemetry() -> None: for item in SENTRY_SENT_ITEMS: # print(item) print(item["logentry"]["message"]) - assert len(SENTRY_SENT_ITEMS) == 2 + assert len(SENTRY_SENT_ITEMS) == 4 # trace with exception @dlt.resource From 08e5e7afca0f328da107d6e8eda7ca3c01366d33 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:59:43 +0400 Subject: [PATCH 28/34] Enable schema evolution for `merge` write disposition with `delta` table format (#1742) * black format * increase minimum deltalake version dependency * enable schema evolution for delta table merge * extract delta table merge logic into separate function * remove big decimal exclusion due to upstream bugfix * evolve delta table schema in empty source case * refactor DeltaLoadFilesystemJob * uses right table path format in delta lake load job * allows to pass schema name when getting delta tables and computing table counts * cleansup usage of remote paths and uris in filesystem load jobs * removes tempfile from file_storage 
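
For illustration, a minimal sketch of the evolve-then-upsert flow this patch describes, assuming the `deltalake` Python package and using only calls that appear in the diff below (`Field.from_pyarrow`, `alter.add_columns`, `DeltaTable.merge`); the function and parameter names (`upsert_with_schema_evolution`, `table_uri`, `data`, `key`) are simplified placeholders, not part of the dlt API:

```python
import deltalake
import pyarrow as pa
from deltalake import DeltaTable


def upsert_with_schema_evolution(table_uri: str, data: pa.Table, key: str) -> None:
    """Sketch: add columns missing from the Delta table, then upsert on `key`."""
    dt = DeltaTable(table_uri)
    # schema evolution: DeltaTable.merge does not evolve the table schema automatically,
    # so columns present in the incoming data but absent from the table are added first
    new_fields = [
        deltalake.Field.from_pyarrow(field)
        for field in data.schema
        if field not in dt.to_pyarrow_dataset().schema
    ]
    if new_fields:
        dt.alter.add_columns(new_fields)
    # upsert: update rows matching on the key, insert the rest
    (
        dt.merge(
            source=data,
            predicate=f"target.{key} = source.{key}",
            source_alias="source",
            target_alias="target",
        )
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .execute()
    )
```

The patch applies the same idea in `merge_delta_table` and `_evolve_delta_table_schema`, keyed on the primary key columns, or on the unique column for child tables.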
--------- Co-authored-by: Marcin Rudolf --- dlt/common/libs/deltalake.py | 77 ++++++-- dlt/common/storages/file_storage.py | 17 +- dlt/destinations/fs_client.py | 3 + .../impl/filesystem/filesystem.py | 166 +++++++++--------- poetry.lock | 162 ++++++++--------- pyproject.toml | 2 +- tests/libs/test_deltalake.py | 14 +- .../load/pipeline/test_filesystem_pipeline.py | 133 ++++++++++++-- tests/pipeline/utils.py | 17 +- 9 files changed, 358 insertions(+), 233 deletions(-) diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index d98795d07c..d4cb46c600 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -5,13 +5,15 @@ from dlt.common import logger from dlt.common.libs.pyarrow import pyarrow as pa from dlt.common.libs.pyarrow import cast_arrow_schema_types -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableSchema +from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: + import deltalake from deltalake import write_deltalake, DeltaTable from deltalake.writer import try_get_deltatable except ModuleNotFoundError: @@ -74,7 +76,7 @@ def write_delta_table( partition_by: Optional[Union[List[str], str]] = None, storage_options: Optional[Dict[str, str]] = None, ) -> None: - """Writes in-memory Arrow table to on-disk Delta table. + """Writes in-memory Arrow data to on-disk Delta table. Thin wrapper around `deltalake.write_deltalake`. """ @@ -93,31 +95,73 @@ def write_delta_table( ) -def get_delta_tables(pipeline: Pipeline, *tables: str) -> Dict[str, DeltaTable]: - """Returns Delta tables in `pipeline.default_schema` as `deltalake.DeltaTable` objects. +def merge_delta_table( + table: DeltaTable, + data: Union[pa.Table, pa.RecordBatchReader], + schema: TTableSchema, +) -> None: + """Merges in-memory Arrow data into on-disk Delta table.""" + + strategy = schema["x-merge-strategy"] # type: ignore[typeddict-item] + if strategy == "upsert": + # `DeltaTable.merge` does not support automatic schema evolution + # https://github.com/delta-io/delta-rs/issues/2282 + _evolve_delta_table_schema(table, data.schema) + + if "parent" in schema: + unique_column = get_first_column_name_with_prop(schema, "unique") + predicate = f"target.{unique_column} = source.{unique_column}" + else: + primary_keys = get_columns_names_with_prop(schema, "primary_key") + predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) + + qry = ( + table.merge( + source=ensure_delta_compatible_arrow_data(data), + predicate=predicate, + source_alias="source", + target_alias="target", + ) + .when_matched_update_all() + .when_not_matched_insert_all() + ) + + qry.execute() + else: + ValueError(f'Merge strategy "{strategy}" not supported.') + + +def get_delta_tables( + pipeline: Pipeline, *tables: str, schema_name: str = None +) -> Dict[str, DeltaTable]: + """Returns Delta tables in `pipeline.default_schema (default)` as `deltalake.DeltaTable` objects. Returned object is a dictionary with table names as keys and `DeltaTable` objects as values. Optionally filters dictionary by table names specified as `*tables*`. - Raises ValueError if table name specified as `*tables` is not found. 
+ Raises ValueError if table name specified as `*tables` is not found. You may try to switch to other + schemas via `schema_name` argument. """ from dlt.common.schema.utils import get_table_format - with pipeline.destination_client() as client: + with pipeline.destination_client(schema_name=schema_name) as client: assert isinstance( client, FilesystemClient ), "The `get_delta_tables` function requires a `filesystem` destination." schema_delta_tables = [ t["name"] - for t in pipeline.default_schema.tables.values() - if get_table_format(pipeline.default_schema.tables, t["name"]) == "delta" + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "delta" ] if len(tables) > 0: invalid_tables = set(tables) - set(schema_delta_tables) if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" raise ValueError( - "Schema does not contain Delta tables with these names: " - f"{', '.join(invalid_tables)}." + f"Schema {client.schema.name} does not contain Delta tables with these names: " + f"{', '.join(invalid_tables)}.{available_schemas}" ) schema_delta_tables = [t for t in schema_delta_tables if t in tables] table_dirs = client.get_table_dirs(schema_delta_tables, remote=True) @@ -145,3 +189,16 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str + ". dlt will use the values in `deltalake_storage_options`." ) return {**creds, **extra_options} + + +def _evolve_delta_table_schema(delta_table: DeltaTable, arrow_schema: pa.Schema) -> None: + """Evolves `delta_table` schema if different from `arrow_schema`. + + Adds column(s) to `delta_table` present in `arrow_schema` but not in `delta_table`. 
+ """ + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(arrow_schema) + if field not in delta_table.to_pyarrow_dataset().schema + ] + delta_table.alter.add_columns(new_fields) diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 7d14b8f7f7..f26cc060a3 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -3,7 +3,6 @@ import re import stat import errno -import tempfile import shutil import pathvalidate from typing import IO, Any, Optional, List, cast @@ -29,10 +28,8 @@ def save(self, relative_path: str, data: Any) -> str: @staticmethod def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str: mode = "w" + file_type - with tempfile.NamedTemporaryFile( - dir=storage_path, mode=mode, delete=False, encoding=encoding_for_mode(mode) - ) as f: - tmp_path = f.name + tmp_path = os.path.join(storage_path, uniq_id(8)) + with open(tmp_path, mode=mode, encoding=encoding_for_mode(mode)) as f: f.write(data) try: dest_path = os.path.join(storage_path, relative_path) @@ -116,11 +113,11 @@ def open_file(self, relative_path: str, mode: str = "r") -> IO[Any]: return FileStorage.open_zipsafe_ro(self.make_full_path(relative_path), mode) return open(self.make_full_path(relative_path), mode, encoding=encoding_for_mode(mode)) - def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: - mode = mode + file_type or self.file_type - return tempfile.NamedTemporaryFile( - dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) - ) + # def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: + # mode = mode + file_type or self.file_type + # return tempfile.NamedTemporaryFile( + # dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) + # ) def has_file(self, relative_path: str) -> bool: return os.path.isfile(self.make_full_path(relative_path)) diff --git a/dlt/destinations/fs_client.py b/dlt/destinations/fs_client.py index 3233446594..14e77b6b4e 100644 --- a/dlt/destinations/fs_client.py +++ b/dlt/destinations/fs_client.py @@ -3,9 +3,12 @@ from abc import ABC, abstractmethod from fsspec import AbstractFileSystem +from dlt.common.schema import Schema + class FSClientBase(ABC): fs_client: AbstractFileSystem + schema: Schema @property @abstractmethod diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 5445fd2ae9..05261ccb1b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -3,7 +3,7 @@ import base64 from types import TracebackType -from typing import ClassVar, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast +from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast from fsspec import AbstractFileSystem from contextlib import contextmanager @@ -13,7 +13,7 @@ from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema -from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop +from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.storages import FileStorage, fsspec_from_config from dlt.common.storages.load_package import ( LoadJobInfo, @@ -56,36 +56,36 @@ def __init__( self._job_client: 
FilesystemClient = None def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - - self.destination_file_name = path_utils.create_path( - self._job_client.config.layout, - self._file_name, - self._job_client.schema.name, - self._load_id, - current_datetime=self._job_client.config.current_datetime, - load_package_timestamp=dlt.current.load_package()["state"]["created_at"], - extra_placeholders=self._job_client.config.extra_placeholders, - ) + self.__is_local_filesystem = self._job_client.config.protocol == "file" # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. # It `auto_mkdir` is disabled by default in fsspec so we made some # trade offs between different options and decided on this. # remote_path = f"{client.config.protocol}://{posixpath.join(dataset_path, destination_file_name)}" remote_path = self.make_remote_path() - if self.is_local_filesystem: - self._job_client.fs_client.makedirs(self.pathlib.dirname(remote_path), exist_ok=True) + if self.__is_local_filesystem: + # use os.path for local file name + self._job_client.fs_client.makedirs(os.path.dirname(remote_path), exist_ok=True) self._job_client.fs_client.put_file(self._file_path, remote_path) def make_remote_path(self) -> str: """Returns path on the remote filesystem to which copy the file, without scheme. For local filesystem a native path is used""" + destination_file_name = path_utils.create_path( + self._job_client.config.layout, + self._file_name, + self._job_client.schema.name, + self._load_id, + current_datetime=self._job_client.config.current_datetime, + load_package_timestamp=dlt.current.load_package()["state"]["created_at"], + extra_placeholders=self._job_client.config.extra_placeholders, + ) + # pick local filesystem pathlib or posix for buckets + pathlib = os.path if self.__is_local_filesystem else posixpath # path.join does not normalize separators and available # normalization functions are very invasive and may string the trailing separator - return self.pathlib.join( # type: ignore[no-any-return] + return pathlib.join( # type: ignore[no-any-return] self._job_client.dataset_path, - path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), + path_utils.normalize_path_sep(pathlib, destination_file_name), ) def make_remote_uri(self) -> str: @@ -98,89 +98,81 @@ def metrics(self) -> Optional[LoadJobMetrics]: class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: - super().__init__( - file_path=file_path, - ) - - def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard - # to write a handler with those two line below only in FilesystemLoadJob - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - self.destination_file_name = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + super().__init__(file_path=file_path) + # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa - from dlt.common.libs.deltalake import ( - DeltaTable, - write_delta_table, - ensure_delta_compatible_arrow_schema, - _deltalake_storage_options, - try_get_deltatable, - ) - # create Arrow dataset from 
Parquet files - file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) - arrow_ds = pa.dataset.dataset(file_paths) + self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) + self.arrow_ds = pa.dataset.dataset(self.file_paths) - # create Delta table object + def make_remote_path(self) -> str: + # remote path is table dir - delta will create its file structure inside it + return self._job_client.get_table_dir(self.load_table_name) - storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) + def run(self) -> None: + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") - # get partition columns - part_cols = get_columns_names_with_prop(self._load_table, "partition") + from dlt.common.libs.deltalake import write_delta_table, merge_delta_table # explicitly check if there is data # (https://github.com/delta-io/delta-rs/issues/2686) - if arrow_ds.head(1).num_rows == 0: - if dt is None: - # create new empty Delta table with schema from Arrow table - DeltaTable.create( - table_uri=self.destination_file_name, - schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), - mode="overwrite", - partition_by=part_cols, - storage_options=storage_options, - ) + if self.arrow_ds.head(1).num_rows == 0: + self._create_or_evolve_delta_table() return - arrow_rbr = arrow_ds.scanner().to_reader() # RecordBatchReader - - if self._load_table["write_disposition"] == "merge" and dt is not None: - assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] - - if self._load_table["x-merge-strategy"] == "upsert": # type: ignore[typeddict-item] - if "parent" in self._load_table: - unique_column = get_first_column_name_with_prop(self._load_table, "unique") - predicate = f"target.{unique_column} = source.{unique_column}" - else: - primary_keys = get_columns_names_with_prop(self._load_table, "primary_key") - predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) - - qry = ( - dt.merge( - source=arrow_rbr, - predicate=predicate, - source_alias="source", - target_alias="target", - ) - .when_matched_update_all() - .when_not_matched_insert_all() + with self.arrow_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader + if self._load_table["write_disposition"] == "merge" and self._delta_table is not None: + assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] + merge_delta_table( + table=self._delta_table, + data=arrow_rbr, + schema=self._load_table, + ) + else: + write_delta_table( + table_or_uri=( + self.make_remote_uri() if self._delta_table is None else self._delta_table + ), + data=arrow_rbr, + write_disposition=self._load_table["write_disposition"], + partition_by=self._partition_columns, + storage_options=self._storage_options, ) - qry.execute() + @property + def _storage_options(self) -> Dict[str, str]: + from dlt.common.libs.deltalake import _deltalake_storage_options + + return _deltalake_storage_options(self._job_client.config) - else: - write_delta_table( - table_or_uri=self.destination_file_name if dt is None else dt, - data=arrow_rbr, - write_disposition=self._load_table["write_disposition"], - partition_by=part_cols, - storage_options=storage_options, + @property + def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # 
noqa: F821 + from dlt.common.libs.deltalake import try_get_deltatable + + return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + def _create_or_evolve_delta_table(self) -> None: + from dlt.common.libs.deltalake import ( + DeltaTable, + ensure_delta_compatible_arrow_schema, + _evolve_delta_table_schema, + ) + + if self._delta_table is None: + DeltaTable.create( + table_uri=self.make_remote_uri(), + schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), + mode="overwrite", + partition_by=self._partition_columns, + storage_options=self._storage_options, ) + else: + _evolve_delta_table_schema(self._delta_table, self.arrow_ds.schema) class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): diff --git a/poetry.lock b/poetry.lock index d54a73a2ef..230b354b97 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -2102,27 +2102,27 @@ typing-extensions = ">=3.10.0" [[package]] name = "databricks-sql-connector" -version = "3.1.2" +version = "3.3.0" description = "Databricks SQL Connector for Python" optional = true python-versions = "<4.0.0,>=3.8.0" files = [ - {file = "databricks_sql_connector-3.1.2-py3-none-any.whl", hash = "sha256:5292bc25b4d8d58d301079b55086331764f067e24862c9365698b2eeddedb737"}, - {file = "databricks_sql_connector-3.1.2.tar.gz", hash = "sha256:da0df114e0824d49ccfea36c4679c95689fe359191b056ad516446a058307c37"}, + {file = "databricks_sql_connector-3.3.0-py3-none-any.whl", hash = "sha256:55ee5a4a11291bf91a235ac76e41b419ddd66a9a321065a8bfaf119acbb26d6b"}, + {file = "databricks_sql_connector-3.3.0.tar.gz", hash = "sha256:19e82965da4c86574adfe9f788c17b4494d98eb8075ba4fd4306573d2edbf194"}, ] [package.dependencies] lz4 = ">=4.0.2,<5.0.0" numpy = [ - {version = ">=1.16.6", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, - {version = ">=1.23.4", markers = "python_version >= \"3.11\""}, + {version = ">=1.16.6,<2.0.0", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, + {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.11\""}, ] oauthlib = ">=3.1.0,<4.0.0" openpyxl = ">=3.0.10,<4.0.0" pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""} -pyarrow = ">=14.0.1,<15.0.0" +pyarrow = ">=14.0.1,<17" requests = ">=2.18.1,<3.0.0" -thrift = ">=0.16.0,<0.17.0" +thrift = ">=0.16.0,<0.21.0" urllib3 = ">=1.26" [package.extras] @@ -2377,25 +2377,24 @@ files = [ [[package]] name = "deltalake" -version = "0.17.4" +version = "0.19.1" description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" optional = true python-versions = ">=3.8" files = [ - {file = "deltalake-0.17.4-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3f048bd4cdd3500fbb0d1b34046966ca4b7cefd1e9df71460b881ee8ad7f844a"}, - {file = "deltalake-0.17.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:b539265d8293794872e1dc3b2daad50abe05ab425e961824b3ac1155bb294604"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55e6be5f5ab8d5d34d2ea58d86e93eec2da5d2476e3c15e9520239457618bca4"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:94dde6c2d0a07e9ce47be367d016541d3a499839350852205819353441e1a9c1"}, - {file = "deltalake-0.17.4-cp38-abi3-win_amd64.whl", hash = "sha256:f51f499d50dad88bdc18c5ed7c2319114759f3220f83aa2d32166c19accee4ce"}, - {file = "deltalake-0.17.4.tar.gz", hash = "sha256:c3c10577afc46d4b10ed16246d814a8c40b3663099066681eeba89f908373814"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ddaaaa9c85a17791c3997cf320ac11dc1725d16cf4b6f0ff1b130853e7b56cd0"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0184d5a3f0d4f4f1fb992c3bdc8736329b78b6a4faf1a278109ec35d9945c1d"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec9d117fcf6c198f3d554be2f3a6291ca3838530650db236741ff48d4d47abb4"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:447ef721319ed15f7b5f6da507efd5fed0e6172e5ae55ac044d5b8fc9b812e47"}, + {file = "deltalake-0.19.1-cp38-abi3-win_amd64.whl", hash = "sha256:b15bc343a9f8f3de80fbedcebd5d9472b539eb0f538a71739c7fcf699089127e"}, + {file = "deltalake-0.19.1.tar.gz", hash = "sha256:5e09fabb221fb81e989c283c16278eaffb6e85706d98364abcda5c0c6ca73598"}, ] [package.dependencies] -pyarrow = ">=8" -pyarrow-hotfix = "*" +pyarrow = ">=16" [package.extras] -devel = ["mypy (>=1.8.0,<1.9.0)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (>=0.3.0,<0.4.0)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] +devel = ["azure-storage-blob (==12.20.0)", "mypy (==1.10.1)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (==0.5.2)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] pandas = ["pandas"] pyspark = ["delta-spark", "numpy (==1.22.2)", "pyspark"] @@ -4567,17 +4566,17 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", [[package]] name = "lancedb" -version = "0.9.0" +version = "0.13.0b1" description = "lancedb" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "lancedb-0.9.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:b1ca08797c72c93ae512aa1078f1891756da157d910fbae8e194fac3528fc1ac"}, - {file = "lancedb-0.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:15129791f03c2c04b95f914ced2c1556b43d73a24710207b9af77b6e4008bdeb"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f093d89447a2039b820d2540a0b64df3024e4549b6808ebd26b44fbe0345cc6"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:a8c1f6777e217d2277451038866d280fa5fb38bd161795e51703b043c26dd345"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:78dd5800a1148f89d33b7e98d1c8b1c42dee146f03580abc1ca83cb05273ff7f"}, - {file = "lancedb-0.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:ba5bdc727d3bc131f17414f42372acde5817073feeb553793a3d20003caa1658"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:687b9a08be55e6fa9520255b1b06dcd2e6ba6c64c947410821e9a3a52b2f48ec"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ac00684f7e90ffc1b386298670e2c4ddaea8c0b61b6eb1b51dbd4e74feb87a86"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbe8fc15bfeec89b6b2a4a42b4b919b6d3e138cf8684af35f77f361d73fe90cd"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_24_aarch64.whl", hash = 
"sha256:231e1f00d724c468922f7951d902622d4ccb21c2db2a148b845beaebee5d35b3"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:fecdd71f137e52193bfb5843610f32fe025a60a1edf5f80530704de879706c6b"}, + {file = "lancedb-0.13.0b1-cp38-abi3-win_amd64.whl", hash = "sha256:7852d9c04a4402407af06bbbf78bf339a169f1df2bf5c70da586ca733ec40a68"}, ] [package.dependencies] @@ -4587,7 +4586,7 @@ deprecation = "*" overrides = ">=0.7" packaging = "*" pydantic = ">=1.10" -pylance = "0.13.0" +pylance = "0.16.1" ratelimiter = ">=1.0,<2.0" requests = ">=2.31.0" retry = ">=0.9.2" @@ -4598,8 +4597,8 @@ azure = ["adlfs (>=2024.2.0)"] clip = ["open-clip", "pillow", "torch"] dev = ["pre-commit", "ruff"] docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] -embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] -tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "ibm-watsonx-ai (>=1.1.2)", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19,<=1.3.0)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] [[package]] name = "lazy-object-proxy" @@ -6660,63 +6659,52 @@ files = [ [[package]] name = "pyarrow" -version = "14.0.2" +version = "16.1.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, - {file = "pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, - {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, - 
{file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, - {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, - {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, - {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, - {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, - {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, + {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, + {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, + {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, + 
{file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, + {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, + {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, + {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, ] [package.dependencies] numpy = ">=1.16.6" -[[package]] -name = "pyarrow-hotfix" -version = "0.6" -description = "" -optional = true -python-versions = ">=3.5" -files = [ - {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, - {file = "pyarrow_hotfix-0.6.tar.gz", hash = "sha256:79d3e030f7ff890d408a100ac16d6f00b14d44a502d7897cd9fc3e3a534e9945"}, -] - [[package]] name = "pyasn1" version = "0.5.0" @@ -6993,22 +6981,22 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pylance" -version = "0.13.0" +version = "0.16.1" description = "python wrapper for Lance columnar format" optional = false python-versions = ">=3.9" files = [ - {file = "pylance-0.13.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2f3d6f9eec1f59f45dccb01075ba79868b8d37c8371d6210bcf6418217a0dd8b"}, - {file = "pylance-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f4861ab466c94b0f9a4b4e6de6e1dfa02f40e7242d8db87447bc7bb7d89606ac"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3cb92547e145f5bfb0ea7d6f483953913b9bdd44c45bea84fc95a18da9f5853"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d1ddd7700924bc6b6b0774ea63d2aa23f9210a86cd6d6af0cdfa987df776d50d"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:c51d4b6e59cf4dc97c11a35b299f11e80dbdf392e2d8dc498573c26474a3c19e"}, - {file = "pylance-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:4018ba016f1445874960a4ba2ad5c80cb380f3116683282ee8beabd38fa8989d"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:7092303ae21bc162edd98e20fc39785fa1ec6b67f04132977ac0fd63110ba16f"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7c2ebdf89928c68f053ab9e369a5477da0a2ba70d47c00075dc10a37039d9e90"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4525c2fd8095830b753a3efb7285f358b016836086683fe977f9f1de8e6866c"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:645f0ab338bc4bd42bf3321bbb4053261979117aefd8477c2192ba624de27778"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3a7464d60aca51e89196a79c638bcbff0bddb77158946e2ea6b5fcbc6cfc63e1"}, + {file = "pylance-0.16.1-cp39-abi3-win_amd64.whl", hash = "sha256:d12c628dfbd49efde15a5512247065341f3efb29989dd08fb5a7023f013471ee"}, ] [package.dependencies] -numpy = ">=1.22" -pyarrow = ">=12,<15.0.1" +numpy = ">=1.22,<2" +pyarrow = ">=12" [package.extras] benchmarks = ["pytest-benchmark"] @@ -9696,4 +9684,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a64fdd2845d27c9abc344809be68cba08f46641aabdc07416c37c802450fe4f3" +content-hash = "2b8d00f91f33a380b2399989dcac0d1d106d0bd2cd8865c5b7e27a19885753b5" diff --git a/pyproject.toml b/pyproject.toml index f33bbbefcf..74161f5ccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'", allow-prereleases = true } -deltalake = { version = ">=0.17.4", optional = true } +deltalake = { version = ">=0.19.0", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] diff --git a/tests/libs/test_deltalake.py b/tests/libs/test_deltalake.py index 3e2d7cc3f6..dc5586eb32 100644 --- a/tests/libs/test_deltalake.py +++ b/tests/libs/test_deltalake.py @@ -95,21 +95,9 @@ def arrow_data( # type: ignore[return] client = cast(FilesystemClient, client) storage_options = _deltalake_storage_options(client.config) - with pytest.raises(Exception): - # bug in `delta-rs` causes error when writing big decimal values - # https://github.com/delta-io/delta-rs/issues/2510 - # if this test fails, the bug has been fixed and we should remove this - # note from the docs: - write_delta_table( - remote_dir + "/corrupt_delta_table", - arrow_table_all_data_types("arrow-table", include_decimal_default_precision=True)[0], - write_disposition="append", - storage_options=storage_options, - ) - arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, num_rows=2, )[0] diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 4b8707e989..d88eba7c06 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from 
dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.exceptions import DependencyVersionException +from dlt.common.schema.typing import TWriteDisposition from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders @@ -580,6 +580,103 @@ def two_part(): @pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "write_disposition", + ( + "append", + "replace", + pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), + ), +) +def test_delta_table_schema_evolution( + destination_config: DestinationTestConfiguration, + write_disposition: TWriteDisposition, +) -> None: + """Tests schema evolution (adding new columns) for `delta` table format.""" + from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + from dlt.common.libs.pyarrow import pyarrow + + @dlt.resource( + write_disposition=write_disposition, + primary_key="pk", + table_format="delta", + ) + def delta_table(data): + yield data + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + # create Arrow table with one column, one row + pk_field = pyarrow.field("pk", pyarrow.int64(), nullable=False) + schema = pyarrow.schema([pk_field]) + arrow_table = pyarrow.Table.from_pydict({"pk": [1]}, schema=schema) + assert arrow_table.shape == (1, 1) + + # initial load + info = pipeline.run(delta_table(arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + assert actual.equals(expected) + + # create Arrow table with many columns, two rows + arrow_table = arrow_table_all_data_types( + "arrow-table", + include_decimal_default_precision=True, + include_decimal_arrow_max_precision=True, + include_not_normalized_name=False, + include_null=False, + num_rows=2, + )[0] + arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) + + # second load — this should evolve the schema (i.e. 
add the new columns) + info = pipeline.run(delta_table(arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + actual = dt.to_pyarrow_table() + expected = ensure_delta_compatible_arrow_data(arrow_table) + if write_disposition == "append": + # just check shape and schema for `append`, because table comparison is + # more involved than with the other dispositions + assert actual.num_rows == 3 + assert actual.schema.equals(expected.schema) + else: + assert actual.sort_by("pk").equals(expected.sort_by("pk")) + + # create empty Arrow table with additional column + arrow_table = arrow_table.append_column( + pyarrow.field("another_new_column", pyarrow.string()), + [["foo", "foo"]], + ) + empty_arrow_table = arrow_table.schema.empty_table() + + # load 3 — this should evolve the schema without changing data + info = pipeline.run(delta_table(empty_arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + actual = dt.to_pyarrow_table() + expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema + assert actual.schema.equals(expected_schema) + expected_num_rows = 3 if write_disposition == "append" else 2 + assert actual.num_rows == expected_num_rows + # new column should have NULLs only + assert ( + actual.column("another_new_column").combine_chunks().to_pylist() + == [None] * expected_num_rows + ) + + @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -607,7 +704,7 @@ def delta_table(data): # create empty Arrow table with schema arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, include_not_normalized_name=False, include_null=False, @@ -643,22 +740,6 @@ def delta_table(data): ensure_delta_compatible_arrow_data(empty_arrow_table).schema ) - # run 3: empty Arrow table with different schema - # this should not alter the Delta table - empty_arrow_table_2 = pa.schema( - [pa.field("foo", pa.int64()), pa.field("bar", pa.string())] - ).empty_table() - - info = pipeline.run(delta_table(empty_arrow_table_2)) - assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 # still 1, no new commit was done - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) # shape did not change - assert dt_arrow_table.schema.equals( # schema did not change - ensure_delta_compatible_arrow_data(empty_arrow_table).schema - ) - # test `dlt.mark.materialize_table_schema()` users_materialize_table_schema.apply_hints(table_format="delta") info = pipeline.run(users_materialize_table_schema()) @@ -810,6 +891,22 @@ def parent_delta(): with pytest.raises(ValueError): get_delta_tables(pipeline, "non_existing_table") + # test unknown schema + with pytest.raises(FileNotFoundError): + get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + + # load to a new schema and under new name + aux_schema = dlt.Schema("aux_2") + # NOTE: you cannot have a file with name + info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + # also state in separate package + assert_load_info(info, expected_load_packages=2) + delta_tables = get_delta_tables(pipeline, schema_name="aux_2") + assert "aux_delta__child" in delta_tables.keys() + get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + with pytest.raises(ValueError): + get_delta_tables(pipeline, 
"aux_delta") + @pytest.mark.parametrize( "destination_config", diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index d3d87f0e0b..dfb5f3f82d 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -177,24 +177,27 @@ def _load_file(client: FSClientBase, filepath) -> List[Dict[str, Any]]: # -def _load_tables_to_dicts_fs(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: +def _load_tables_to_dicts_fs( + p: dlt.Pipeline, *table_names: str, schema_name: str = None +) -> Dict[str, List[Dict[str, Any]]]: """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" - client = p._fs_client() + client = p._fs_client(schema_name=schema_name) + assert isinstance(client, FilesystemClient) + result: Dict[str, Any] = {} delta_table_names = [ table_name for table_name in table_names - if get_table_format(p.default_schema.tables, table_name) == "delta" + if get_table_format(client.schema.tables, table_name) == "delta" ] if len(delta_table_names) > 0: from dlt.common.libs.deltalake import get_delta_tables - delta_tables = get_delta_tables(p, *table_names) + delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) for table_name in table_names: - if table_name in p.default_schema.data_table_names() and table_name in delta_table_names: - assert isinstance(client, FilesystemClient) + if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() else: @@ -244,7 +247,7 @@ def _sort_list_of_dicts(list_: List[Dict[str, Any]], sortkey: str) -> List[Dict[ return sorted(list_, key=lambda d: d[sortkey]) if _is_filesystem(p): - result = _load_tables_to_dicts_fs(p, *table_names) + result = _load_tables_to_dicts_fs(p, *table_names, schema_name=schema_name) else: result = _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name) From e337cca079ab21742339e097eb381635eafc5de5 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 18:32:07 +0200 Subject: [PATCH 29/34] provides detail exception messages when cursor stored value cannot be coerced to data in incremental (#1748) --- .../impl/filesystem/filesystem.py | 1 + dlt/extract/incremental/exceptions.py | 26 ++++++++ dlt/extract/incremental/transform.py | 63 ++++++++++++++++--- tests/extract/test_incremental.py | 21 ++++++- 4 files changed, 101 insertions(+), 10 deletions(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 05261ccb1b..62263a10b9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -89,6 +89,7 @@ def make_remote_path(self) -> str: ) def make_remote_uri(self) -> str: + """Returns path on a remote filesystem as a full uri including scheme.""" return self._job_client.make_remote_uri(self.make_remote_path()) def metrics(self) -> Optional[LoadJobMetrics]: diff --git a/dlt/extract/incremental/exceptions.py b/dlt/extract/incremental/exceptions.py index e318a028dc..a5f94c2974 100644 --- a/dlt/extract/incremental/exceptions.py +++ b/dlt/extract/incremental/exceptions.py @@ -1,3 +1,5 @@ +from typing import Any + from dlt.extract.exceptions import PipeException from dlt.common.typing import TDataItem @@ -13,6 +15,30 @@ def __init__(self, pipe_name: str, json_path: str, item: TDataItem, msg: str = N super().__init__(pipe_name, msg) +class IncrementalCursorInvalidCoercion(PipeException): + 
def __init__( + self, + pipe_name: str, + cursor_path: str, + cursor_value: TDataItem, + cursor_value_type: str, + item: TDataItem, + item_type: Any, + details: str, + ) -> None: + self.cursor_path = cursor_path + self.cursor_value = cursor_value + self.cursor_value_type = cursor_value_type + self.item = item + msg = ( + f"Could not coerce {cursor_value_type} with value {cursor_value} and type" + f" {type(cursor_value)} to actual data item {item} at path {cursor_path} with type" + f" {item_type}: {details}. You need to use different data type for" + f" {cursor_value_type} or cast your data ie. by using `add_map` on this resource." + ) + super().__init__(pipe_name, msg) + + class IncrementalPrimaryKeyMissing(PipeException): def __init__(self, pipe_name: str, primary_key_column: str, item: TDataItem) -> None: self.primary_key_column = primary_key_column diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 947e21f7b8..0ac9fdf520 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -8,6 +8,7 @@ from dlt.common.typing import TDataItem from dlt.common.jsonpath import find_values, JSONPathFields, compile_path from dlt.extract.incremental.exceptions import ( + IncrementalCursorInvalidCoercion, IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) @@ -158,14 +159,36 @@ def __call__( # Check whether end_value has been reached # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value - if self.end_value is not None and ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): - return None, False, True - + if self.end_value is not None: + try: + if ( + last_value_func((row_value, self.end_value)) != self.end_value + or last_value_func((row_value,)) == self.end_value + ): + return None, False, True + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + self.cursor_path, + self.end_value, + "end_value", + row_value, + type(row_value).__name__, + str(ex), + ) from ex check_values = (row_value,) + ((last_value,) if last_value is not None else ()) - new_value = last_value_func(check_values) + try: + new_value = last_value_func(check_values) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + self.cursor_path, + last_value, + "start_value/initial_value", + row_value, + type(row_value).__name__, + str(ex), + ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: # use func to compute row_value into last_value compatible @@ -294,14 +317,36 @@ def __call__( # If end_value is provided, filter to include table rows that are "less" than end_value if self.end_value is not None: - end_value_scalar = to_arrow_scalar(self.end_value, cursor_data_type) + try: + end_value_scalar = to_arrow_scalar(self.end_value, cursor_data_type) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + cursor_path, + self.end_value, + "end_value", + "", + cursor_data_type, + str(ex), + ) from ex tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: - start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + try: + start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + cursor_path, + self.start_value, + "start_value/initial_value", + "", + cursor_data_type, + str(ex), + ) from ex # Remove rows lower or equal than the last start value keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index f4082a7d86..c401552fb2 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -30,6 +30,7 @@ from dlt.sources.helpers.transform import take_first from dlt.extract.incremental import IncrementalResourceWrapper, Incremental from dlt.extract.incremental.exceptions import ( + IncrementalCursorInvalidCoercion, IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) @@ -1303,7 +1304,7 @@ def some_data( ) # will cause invalid comparison if item_type == "object": - with pytest.raises(InvalidStepFunctionArguments): + with pytest.raises(IncrementalCursorInvalidCoercion): list(resource) else: data = data_item_to_list(item_type, list(resource)) @@ -2065,3 +2066,21 @@ def test_source(): incremental_steps = test_source_incremental().table_name._pipe._steps assert isinstance(incremental_steps[-2], ValidateItem) assert isinstance(incremental_steps[-1], IncrementalResourceWrapper) + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_cursor_date_coercion(item_type: TestDataItemFormat) -> None: + today = datetime.today().date() + + @dlt.resource() + def updated_is_int(updated_at=dlt.sources.incremental("updated_at", initial_value=today)): + data = [{"updated_at": d} for d in [1, 2, 3]] + yield data_to_item_format(item_type, data) + + pip_1_name = "test_pydantic_columns_validator_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pip_1_name, destination="duckdb") + + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(updated_is_int()) + assert isinstance(pip_ex.value.__cause__, IncrementalCursorInvalidCoercion) + assert pip_ex.value.__cause__.cursor_path == "updated_at" From 98ca505fd06b8146a4355c6355174abe8b45ef66 Mon Sep 17 00:00:00 2001 From: VioletM Date: Wed, 28 Aug 2024 06:28:50 -0400 Subject: [PATCH 30/34] Expose staging tables truncation to config (#1717) * Expose staging tables truncation to config * Fix comments, add tests * Fix tests * Move implementation from mixing, add tests * Fix docs grammar --- dlt/common/destination/reference.py | 8 ++- dlt/destinations/impl/athena/athena.py | 2 +- dlt/destinations/impl/bigquery/bigquery.py | 3 + .../impl/clickhouse/clickhouse.py | 3 + .../impl/databricks/databricks.py | 3 + dlt/destinations/impl/dremio/dremio.py | 3 + dlt/destinations/impl/dummy/configuration.py | 2 + dlt/destinations/impl/dummy/dummy.py | 3 + dlt/destinations/impl/redshift/redshift.py | 3 + dlt/destinations/impl/snowflake/snowflake.py | 3 + dlt/destinations/impl/synapse/synapse.py | 3 + dlt/load/utils.py | 7 +- docs/website/docs/dlt-ecosystem/staging.md | 72 ++++++++++++------- tests/load/pipeline/test_stage_loading.py | 57 ++++++++++++++- tests/load/test_dummy_client.py | 17 +++++ 15 files changed, 152 insertions(+), 37 
deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 744cbbd1f5..0944b03bea 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -269,6 +269,8 @@ class DestinationClientDwhWithStagingConfiguration(DestinationClientDwhConfigura staging_config: Optional[DestinationClientStagingConfiguration] = None """configuration of the staging, if present, injected at runtime""" + truncate_tables_on_staging_destination_before_load: bool = True + """If dlt should truncate the tables on staging destination before loading data.""" TLoadJobState = Literal["ready", "running", "failed", "retry", "completed"] @@ -578,7 +580,7 @@ def with_staging_dataset(self) -> ContextManager["JobClientBase"]: return self # type: ignore -class SupportsStagingDestination: +class SupportsStagingDestination(ABC): """Adds capability to support a staging destination for the load""" def should_load_data_to_staging_dataset_on_staging_destination( @@ -586,9 +588,9 @@ def should_load_data_to_staging_dataset_on_staging_destination( ) -> bool: return False + @abstractmethod def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: - # the default is to truncate the tables on the staging destination... - return True + pass # TODO: type Destination properly diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 0c90d171a3..b28309b930 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -531,7 +531,7 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable if table["write_disposition"] == "replace" and not self._is_iceberg_table( self.prepare_load_table(table["name"]) ): - return True + return self.config.truncate_tables_on_staging_destination_before_load return False def should_load_data_to_staging_dataset_on_staging_destination( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 8291415434..11326cf3ed 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -503,6 +503,9 @@ def _should_autodetect_schema(self, table_name: str) -> bool: self.schema._schema_tables, table_name, AUTODETECT_SCHEMA_HINT, allow_none=True ) or (self.config.autodetect_schema and table_name not in self.schema.dlt_table_names()) + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + def _streaming_load( items: List[Dict[Any, Any]], table: Dict[str, Any], job_client: BigQueryClient diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 5f17a5a18c..282fbaf338 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -372,3 +372,6 @@ def _from_db_type( self, ch_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: return self.type_mapper.from_db_type(ch_t, precision, scale) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 2f23e88ea0..38412b2608 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ 
b/dlt/destinations/impl/databricks/databricks.py @@ -325,3 +325,6 @@ def _get_storage_table_query_columns(self) -> List[str]: "full_data_type" ) return fields + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 68a3fedc31..149d106dcd 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -210,3 +210,6 @@ def _make_add_column_sql( self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None ) -> List[str]: return ["ADD COLUMNS (" + ", ".join(self._get_column_def_sql(c) for c in new_columns) + ")"] + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 023b88e51a..a066479294 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -34,6 +34,8 @@ class DummyClientConfiguration(DestinationClientConfiguration): """raise terminal exception in job init""" fail_transiently_in_init: bool = False """raise transient exception in job init""" + truncate_tables_on_staging_destination_before_load: bool = True + """truncate tables on staging destination""" # new jobs workflows create_followup_jobs: bool = False diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 49b55ec65d..feb09369dc 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -202,6 +202,9 @@ def complete_load(self, load_id: str) -> None: def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: return super().should_load_data_to_staging_dataset(table) + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + @contextmanager def with_staging_dataset(self) -> Iterator[JobClientBase]: try: diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 93827c8163..0e201dc4e0 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -274,3 +274,6 @@ def _from_db_type( self, pq_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 8b4eabc961..6688b5bc17 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -325,3 +325,6 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non return ( f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 
e43e2a6dfa..750a4895f0 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -173,6 +173,9 @@ def create_load_job( ) return job + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + class SynapseCopyFileLoadJob(CopyRemoteFileLoadJob): def __init__( diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 741c01f249..e3a2ebcd79 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -179,9 +179,10 @@ def _init_dataset_and_update_schema( applied_update = job_client.update_stored_schema( only_tables=update_tables, expected_update=expected_update ) - logger.info( - f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" - ) + if truncate_tables: + logger.info( + f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" + ) job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index 05e31a574b..789189b7dd 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -1,36 +1,33 @@ --- title: Staging -description: Configure an s3 or gcs bucket for staging before copying into the destination +description: Configure an S3 or GCS bucket for staging before copying into the destination keywords: [staging, destination] --- # Staging -The goal of staging is to bring the data closer to the database engine so the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two -staging areas: +The goal of staging is to bring the data closer to the database engine so that the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two staging areas: 1. A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental_loading) to deduplicate and merge data with the destination. -2. A **staging storage** which is typically a s3/gcp bucket where [loader files](file-formats/) are copied before they are loaded by the destination. +2. A **staging storage** which is typically an S3/GCP bucket where [loader files](file-formats/) are copied before they are loaded by the destination. ## Staging dataset -`dlt` creates a staging dataset when write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the -main dataset. Data in staging tables is truncated when load step begins and only for tables that will participate in it. -Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the -configured datasets. +`dlt` creates a staging dataset when the write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the main dataset. Data in staging tables is truncated when the load step begins and only for tables that will participate in it. +Such a staging dataset has the same name as the dataset passed to `dlt.pipeline` but with a `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the configured datasets. 
```toml [destination.postgres] staging_dataset_name_layout="staging_%s" ``` -Entry above switches the pattern to `staging_` prefix and for example for dataset with name **github_data** `dlt` will create **staging_github_data**. +The entry above switches the pattern to `staging_` prefix and for example, for a dataset with the name **github_data**, `dlt` will create **staging_github_data**. -To configure static staging dataset name, you can do the following (we use destination factory) +To configure a static staging dataset name, you can do the following (we use the destination factory) ```py import dlt dest_ = dlt.destinations.postgres(staging_dataset_name_layout="_dlt_staging") ``` -All pipelines using `dest_` as destination will use **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. +All pipelines using `dest_` as the destination will use the **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. -### Cleanup up staging dataset automatically -`dlt` does not truncate tables in staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. +### Cleanup staging dataset automatically +`dlt` does not truncate tables in the staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. If you prefer to truncate it, put the following line in `config.toml`: ```toml @@ -39,19 +36,23 @@ truncate_staging_dataset=true ``` ## Staging storage -`dlt` allows to chain destinations where the first one (`staging`) is responsible for uploading the files from local filesystem to the remote storage. It then generates followup jobs for the second destination that (typically) copy the files from remote storage into destination. +`dlt` allows chaining destinations where the first one (`staging`) is responsible for uploading the files from the local filesystem to the remote storage. It then generates follow-up jobs for the second destination that (typically) copy the files from remote storage into the destination. -Currently, only one destination the [filesystem](destinations/filesystem.md) can be used as a staging. Following destinations can copy remote files: -1. [Redshift.](destinations/redshift.md#staging-support) -2. [Bigquery.](destinations/bigquery.md#staging-support) -3. [Snowflake.](destinations/snowflake.md#staging-support) +Currently, only one destination, the [filesystem](destinations/filesystem.md), can be used as staging. The following destinations can copy remote files: + +1. [Azure Synapse](destinations/synapse#staging-support) +1. [Athena](destinations/athena#staging-support) +1. [Bigquery](destinations/bigquery.md#staging-support) +1. [Dremio](destinations/dremio#staging-support) +1. [Redshift](destinations/redshift.md#staging-support) +1. [Snowflake](destinations/snowflake.md#staging-support) ### How to use -In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into `Redshift` destination. +In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into the `Redshift` destination. -1. **Set up the s3 bucket and filesystem staging.** +1. 
**Set up the S3 bucket and filesystem staging.** - Please follow our guide in [filesystem destination documentation](destinations/filesystem.md). Test the staging as standalone destination to make sure that files go where you want them. In your `secrets.toml` you should now have a working `filesystem` configuration: + Please follow our guide in the [filesystem destination documentation](destinations/filesystem.md). Test the staging as a standalone destination to make sure that files go where you want them. In your `secrets.toml`, you should now have a working `filesystem` configuration: ```toml [destination.filesystem] bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, @@ -63,15 +64,15 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel 2. **Set up the Redshift destination.** - Please follow our guide in [redshift destination documentation](destinations/redshift.md). In your `secrets.toml` you added: + Please follow our guide in the [redshift destination documentation](destinations/redshift.md). In your `secrets.toml`, you added: ```toml # keep it at the top of your toml file! before any section starts destination.redshift.credentials="redshift://loader:@localhost/dlt_data?connect_timeout=15" ``` -3. **Authorize Redshift cluster to access the staging bucket.** +3. **Authorize the Redshift cluster to access the staging bucket.** - By default `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step. + By default, `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step. 4. **Chain staging to destination and request `parquet` file format.** @@ -79,7 +80,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel ```py # Create a dlt pipeline that will load # chess player data to the redshift destination - # via staging on s3 + # via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='redshift', @@ -87,7 +88,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel dataset_name='player_data' ) ``` - `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify `parquet` file format (just to demonstrate how to do it): + `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify the `parquet` file format (just to demonstrate how to do it): ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` @@ -96,4 +97,21 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel Run the pipeline script as usual. -> 💡 Please note that `dlt` does not delete loaded files from the staging storage after the load is complete. +:::tip +Please note that `dlt` does not delete loaded files from the staging storage after the load is complete, but it truncates previously loaded files. +::: + +### How to prevent staging files truncation + +Before `dlt` loads data to the staging storage, it truncates previously loaded files. To prevent it and keep the whole history +of loaded files, you can use the following parameter: + +```toml +[destination.redshift] +truncate_tables_on_staging_destination_before_load=false +``` + +:::caution +The [Athena](destinations/athena#staging-support) destination only truncates non-iceberg tables with the `replace` write disposition. 
+Therefore, the parameter `truncate_tables_on_staging_destination_before_load` only controls the truncation of corresponding files for these tables. +::: diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index a760c86526..f216fa3c05 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -1,12 +1,12 @@ import pytest -from typing import Dict, Any, List +from typing import List import dlt, os -from dlt.common import json, sleep -from copy import deepcopy +from dlt.common import json from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.utils import uniq_id from dlt.common.schema.typing import TDataType +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from tests.load.pipeline.test_merge_disposition import github from tests.pipeline.utils import load_table_counts, assert_load_info @@ -40,6 +40,13 @@ def load_modified_issues(): yield from issues +@dlt.resource(table_name="events", write_disposition="append", primary_key="timestamp") +def event_many_load_2(): + with open("tests/normalize/cases/event.event.many_load_2.json", "r", encoding="utf-8") as f: + events = json.load(f) + yield from events + + @pytest.mark.parametrize( "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name ) @@ -183,6 +190,50 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: assert replace_counts == initial_counts +@pytest.mark.parametrize( + "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name +) +def test_truncate_staging_dataset(destination_config: DestinationTestConfiguration) -> None: + """This test checks if tables truncation on the staging destination is done according to the configuration. 
+ + Test loads data to the destination three times: + * with truncation + * without truncation (after this 2 staging files should be left) + * with truncation (after this 1 staging file should be left) + """ + pipeline = destination_config.setup_pipeline( + pipeline_name="test_stage_loading", dataset_name="test_staging_load" + uniq_id() + ) + resource = event_many_load_2() + table_name: str = resource.table_name # type: ignore[assignment] + + # load the data, files stay on the stage after the load + info = pipeline.run(resource) + assert_load_info(info) + + # load the data without truncating of the staging, should see two files on staging + pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = False + info = pipeline.run(resource) + assert_load_info(info) + # check there are two staging files + _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) + with staging_client: + assert len(staging_client.list_table_files(table_name)) == 2 # type: ignore[attr-defined] + + # load the data with truncating, so only new file is on the staging + pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = True + info = pipeline.run(resource) + assert_load_info(info) + # check that table exists in the destination + with pipeline.sql_client() as sql_client: + qual_name = sql_client.make_qualified_table_name + assert len(sql_client.execute_sql(f"SELECT * from {qual_name(table_name)}")) > 4 + # check there is only one staging file + _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) + with staging_client: + assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + + @pytest.mark.parametrize( "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name ) diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 9f0bca6ac5..59b7acac15 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -548,6 +548,23 @@ def test_completed_loop_with_delete_completed() -> None: assert_complete_job(load, should_delete_completed=True) +@pytest.mark.parametrize("to_truncate", [True, False]) +def test_truncate_table_before_load_on_stanging(to_truncate) -> None: + load = setup_loader( + client_config=DummyClientConfiguration( + truncate_tables_on_staging_destination_before_load=to_truncate + ) + ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) + destination_client = load.get_destination_client(schema) + assert ( + destination_client.should_truncate_table_before_load_on_staging_destination( # type: ignore + schema.tables["_dlt_version"] + ) + == to_truncate + ) + + def test_retry_on_new_loop() -> None: # test job that retries sitting in new jobs load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) From 4e1c6077c7ed4bbaf127e34a2cbc7d87fe48d924 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 28 Aug 2024 13:17:11 +0200 Subject: [PATCH 31/34] enables external location and named credential in databricks (#1755) * allows to configure external location and named credential for databricks * fixes #1703 * normalizes 'value' when wrapping simple objects in relational, fixes #1754 * simplifies fsspec globbing and allows various url formats that are preserved when reconstituting full url, allows abfss databricks format * adds info on partially loaded packages to docs * renames remote_uri to remote_url in traces * fixes delta for abfss * adds nested 
tables dlt columns collision test --- .github/workflows/test_destinations.yml | 1 + .../configuration/specs/azure_credentials.py | 2 + dlt/common/libs/deltalake.py | 3 +- dlt/common/metrics.py | 2 +- dlt/common/normalizers/json/__init__.py | 4 +- dlt/common/normalizers/json/relational.py | 4 +- dlt/common/storages/configuration.py | 119 +++++++++++++----- dlt/common/storages/fsspec_filesystem.py | 58 +++++---- dlt/destinations/impl/athena/athena.py | 1 - dlt/destinations/impl/bigquery/bigquery.py | 2 +- .../impl/databricks/configuration.py | 4 + .../impl/databricks/databricks.py | 108 ++++++++++------ dlt/destinations/impl/databricks/factory.py | 6 + dlt/destinations/impl/dummy/dummy.py | 4 +- .../impl/filesystem/filesystem.py | 32 ++--- .../dlt-ecosystem/destinations/databricks.md | 33 ++++- .../dlt-ecosystem/destinations/snowflake.md | 2 +- .../docs/running-in-production/running.md | 16 ++- tests/.dlt/config.toml | 3 +- tests/common/cases/normalizers/sql_upper.py | 2 - .../common/storages/test_local_filesystem.py | 10 +- .../test_destination_name_and_config.py | 4 +- .../test_databricks_configuration.py | 50 +++++++- .../load/filesystem/test_filesystem_common.py | 54 +++++--- .../load/pipeline/test_databricks_pipeline.py | 85 +++++++++++++ .../load/pipeline/test_filesystem_pipeline.py | 18 +-- tests/load/pipeline/test_stage_loading.py | 10 +- tests/load/test_dummy_client.py | 10 +- tests/load/utils.py | 12 +- .../cases/contracts/trace.schema.yaml | 2 +- tests/pipeline/test_pipeline.py | 14 +++ tests/pipeline/test_pipeline_trace.py | 2 +- 32 files changed, 510 insertions(+), 167 deletions(-) create mode 100644 tests/load/pipeline/test_databricks_pipeline.py diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index a034ac7eb0..7fae69ff9e 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -29,6 +29,7 @@ env: # Test redshift and filesystem with all buckets # postgres runs again here so we can test on mac/windows ACTIVE_DESTINATIONS: "[\"redshift\", \"postgres\", \"duckdb\", \"filesystem\", \"dummy\"]" + # note that all buckets are enabled for testing jobs: get_docs_changes: diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 7fa34fa00f..6794b581ce 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -32,6 +32,8 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: creds = self.to_adlfs_credentials() if creds["sas_token"] is None: creds.pop("sas_token") + if creds["account_key"] is None: + creds.pop("account_key") return creds def create_sas_token(self) -> None: diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index d4cb46c600..38b23ea27a 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -176,7 +176,8 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str """Returns dict that can be passed as `storage_options` in `deltalake` library.""" creds = {} extra_options = {} - if config.protocol in ("az", "gs", "s3"): + # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery + if hasattr(config.credentials, "to_object_store_rs_credentials"): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/metrics.py 
b/dlt/common/metrics.py index 5cccee4045..d6acf19d0d 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -64,7 +64,7 @@ class LoadJobMetrics(NamedTuple): started_at: datetime.datetime finished_at: datetime.datetime state: Optional[str] - remote_uri: Optional[str] + remote_url: Optional[str] class LoadMetrics(StepMetrics): diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index a13bab15f4..725f6a8355 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -54,9 +54,9 @@ class SupportsDataItemNormalizer(Protocol): """A class with a name DataItemNormalizer deriving from normalizers.json.DataItemNormalizer""" -def wrap_in_dict(item: Any) -> DictStrAny: +def wrap_in_dict(label: str, item: Any) -> DictStrAny: """Wraps `item` that is not a dictionary into dictionary that can be json normalized""" - return {"value": item} + return {label: item} __all__ = [ diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 1dbcec4bff..33184640f0 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -281,7 +281,7 @@ def _normalize_list( else: # list of simple types child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) - wrap_v = wrap_in_dict(v) + wrap_v = wrap_in_dict(self.c_value, v) wrap_v[self.c_dlt_id] = child_row_hash e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) @@ -387,7 +387,7 @@ def normalize_data_item( ) -> TNormalizedRowIterator: # wrap items that are not dictionaries in dictionary, otherwise they cannot be processed by the JSON normalizer if not isinstance(item, dict): - item = wrap_in_dict(item) + item = wrap_in_dict(self.c_value, item) # we will extend event with all the fields necessary to load it as root row row = cast(DictStrAny, item) # identify load id if loaded data must be processed after loading incrementally diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index b2bdb3a7b6..04780528c4 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -1,7 +1,7 @@ import os import pathlib from typing import Any, Literal, Optional, Type, get_args, ClassVar, Dict, Union -from urllib.parse import urlparse, unquote +from urllib.parse import urlparse, unquote, urlunparse from dlt.common.configuration import configspec, resolve_type from dlt.common.configuration.exceptions import ConfigurationValueError @@ -52,6 +52,53 @@ class LoadStorageConfiguration(BaseConfiguration): ] +def _make_az_url(scheme: str, fs_path: str, bucket_url: str) -> str: + parsed_bucket_url = urlparse(bucket_url) + if parsed_bucket_url.username: + # az://@.dfs.core.windows.net/ + # fs_path always starts with container + split_path = fs_path.split("/", maxsplit=1) + if len(split_path) == 1: + split_path.append("") + container, path = split_path + netloc = f"{container}@{parsed_bucket_url.hostname}" + return urlunparse(parsed_bucket_url._replace(path=path, scheme=scheme, netloc=netloc)) + return f"{scheme}://{fs_path}" + + +def _make_file_url(scheme: str, fs_path: str, bucket_url: str) -> str: + """Creates a normalized file:// url from a local path + + netloc is never set. 
UNC paths are represented as file://host/path + """ + p_ = pathlib.Path(fs_path) + p_ = p_.expanduser().resolve() + return p_.as_uri() + + +MAKE_URI_DISPATCH = {"az": _make_az_url, "file": _make_file_url} + +MAKE_URI_DISPATCH["adl"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["abfs"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["azure"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["abfss"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["local"] = MAKE_URI_DISPATCH["file"] + + +def make_fsspec_url(scheme: str, fs_path: str, bucket_url: str) -> str: + """Creates url from `fs_path` and `scheme` using bucket_url as an `url` template + + Args: + scheme (str): scheme of the resulting url + fs_path (str): kind of absolute path that fsspec uses to locate resources for particular filesystem. + bucket_url (str): an url template. the structure of url will be preserved if possible + """ + _maker = MAKE_URI_DISPATCH.get(scheme) + if _maker: + return _maker(scheme, fs_path, bucket_url) + return f"{scheme}://{fs_path}" + + @configspec class FilesystemConfiguration(BaseConfiguration): """A configuration defining filesystem location and access credentials. @@ -59,7 +106,7 @@ class FilesystemConfiguration(BaseConfiguration): When configuration is resolved, `bucket_url` is used to extract a protocol and request corresponding credentials class. * s3 * gs, gcs - * az, abfs, adl + * az, abfs, adl, abfss, azure * file, memory * gdrive """ @@ -72,6 +119,8 @@ class FilesystemConfiguration(BaseConfiguration): "az": AnyAzureCredentials, "abfs": AnyAzureCredentials, "adl": AnyAzureCredentials, + "abfss": AnyAzureCredentials, + "azure": AnyAzureCredentials, } bucket_url: str = None @@ -93,17 +142,21 @@ def protocol(self) -> str: else: return urlparse(self.bucket_url).scheme + @property + def is_local_filesystem(self) -> bool: + return self.protocol == "file" + def on_resolved(self) -> None: - uri = urlparse(self.bucket_url) - if not uri.path and not uri.netloc: + url = urlparse(self.bucket_url) + if not url.path and not url.netloc: raise ConfigurationValueError( "File path and netloc are missing. Field bucket_url of" - " FilesystemClientConfiguration must contain valid uri with a path or host:password" + " FilesystemClientConfiguration must contain valid url with a path or host:password" " component." ) # this is just a path in a local file system if self.is_local_path(self.bucket_url): - self.bucket_url = self.make_file_uri(self.bucket_url) + self.bucket_url = self.make_file_url(self.bucket_url) @resolve_type("credentials") def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: @@ -122,44 +175,50 @@ def fingerprint(self) -> str: if self.is_local_path(self.bucket_url): return digest128("") - uri = urlparse(self.bucket_url) - return digest128(self.bucket_url.replace(uri.path, "")) + url = urlparse(self.bucket_url) + return digest128(self.bucket_url.replace(url.path, "")) + + def make_url(self, fs_path: str) -> str: + """Makes a full url (with scheme) form fs_path which is kind-of absolute path used by fsspec to identify resources. + This method will use `bucket_url` to infer the original form of the url. 
+ """ + return make_fsspec_url(self.protocol, fs_path, self.bucket_url) def __str__(self) -> str: """Return displayable destination location""" - uri = urlparse(self.bucket_url) + url = urlparse(self.bucket_url) # do not show passwords - if uri.password: - new_netloc = f"{uri.username}:****@{uri.hostname}" - if uri.port: - new_netloc += f":{uri.port}" - return uri._replace(netloc=new_netloc).geturl() + if url.password: + new_netloc = f"{url.username}:****@{url.hostname}" + if url.port: + new_netloc += f":{url.port}" + return url._replace(netloc=new_netloc).geturl() return self.bucket_url @staticmethod - def is_local_path(uri: str) -> bool: - """Checks if `uri` is a local path, without a schema""" - uri_parsed = urlparse(uri) + def is_local_path(url: str) -> bool: + """Checks if `url` is a local path, without a schema""" + url_parsed = urlparse(url) # this prevents windows absolute paths to be recognized as schemas - return not uri_parsed.scheme or os.path.isabs(uri) + return not url_parsed.scheme or os.path.isabs(url) @staticmethod - def make_local_path(file_uri: str) -> str: + def make_local_path(file_url: str) -> str: """Gets a valid local filesystem path from file:// scheme. Supports POSIX/Windows/UNC paths Returns: str: local filesystem path """ - uri = urlparse(file_uri) - if uri.scheme != "file": - raise ValueError(f"Must be file scheme but is {uri.scheme}") - if not uri.path and not uri.netloc: + url = urlparse(file_url) + if url.scheme != "file": + raise ValueError(f"Must be file scheme but is {url.scheme}") + if not url.path and not url.netloc: raise ConfigurationValueError("File path and netloc are missing.") - local_path = unquote(uri.path) - if uri.netloc: + local_path = unquote(url.path) + if url.netloc: # or UNC file://localhost/path - local_path = "//" + unquote(uri.netloc) + local_path + local_path = "//" + unquote(url.netloc) + local_path else: # if we are on windows, strip the POSIX root from path which is always absolute if os.path.sep != local_path[0]: @@ -172,11 +231,9 @@ def make_local_path(file_uri: str) -> str: return str(pathlib.Path(local_path)) @staticmethod - def make_file_uri(local_path: str) -> str: - """Creates a normalized file:// uri from a local path + def make_file_url(local_path: str) -> str: + """Creates a normalized file:// url from a local path netloc is never set. 
UNC paths are represented as file://host/path """ - p_ = pathlib.Path(local_path) - p_ = p_.expanduser().resolve() - return p_.as_uri() + return make_fsspec_url("file", local_path, None) diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index be9ae2bbb1..7da5ebabef 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -21,7 +21,7 @@ ) from urllib.parse import urlparse -from fsspec import AbstractFileSystem, register_implementation +from fsspec import AbstractFileSystem, register_implementation, get_filesystem_class from fsspec.core import url_to_fs from dlt import version @@ -32,7 +32,11 @@ AzureCredentials, ) from dlt.common.exceptions import MissingDependencyException -from dlt.common.storages.configuration import FileSystemCredentials, FilesystemConfiguration +from dlt.common.storages.configuration import ( + FileSystemCredentials, + FilesystemConfiguration, + make_fsspec_url, +) from dlt.common.time import ensure_pendulum_datetime from dlt.common.typing import DictStrAny @@ -65,18 +69,20 @@ class FileItem(TypedDict, total=False): MTIME_DISPATCH["gs"] = MTIME_DISPATCH["gcs"] MTIME_DISPATCH["s3a"] = MTIME_DISPATCH["s3"] MTIME_DISPATCH["abfs"] = MTIME_DISPATCH["az"] +MTIME_DISPATCH["abfss"] = MTIME_DISPATCH["az"] # Map of protocol to a filesystem type CREDENTIALS_DISPATCH: Dict[str, Callable[[FilesystemConfiguration], DictStrAny]] = { "s3": lambda config: cast(AwsCredentials, config.credentials).to_s3fs_credentials(), - "adl": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), "az": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), - "gcs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), "gs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), "gdrive": lambda config: {"credentials": cast(GcpCredentials, config.credentials)}, - "abfs": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), - "azure": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), } +CREDENTIALS_DISPATCH["adl"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["abfs"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["azure"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["abfss"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["gcs"] = CREDENTIALS_DISPATCH["gs"] def fsspec_filesystem( @@ -90,7 +96,7 @@ def fsspec_filesystem( Please supply credentials instance corresponding to the protocol. 
The `protocol` is just the code name of the filesystem i.e.: * s3 - * az, abfs + * az, abfs, abfss, adl, azure * gcs, gs also see filesystem_from_config @@ -136,7 +142,7 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys Authenticates following filesystems: * s3 - * az, abfs + * az, abfs, abfss, adl, azure * gcs, gs All other filesystems are not authenticated @@ -146,8 +152,14 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys fs_kwargs = prepare_fsspec_args(config) try: + # first get the class to check the protocol + fs_cls = get_filesystem_class(config.protocol) + if fs_cls.protocol == "abfs": + # if storage account is present in bucket_url and in credentials, az fsspec will fail + if urlparse(config.bucket_url).username: + fs_kwargs.pop("account_name") return url_to_fs(config.bucket_url, **fs_kwargs) # type: ignore - except ModuleNotFoundError as e: + except ImportError as e: raise MissingDependencyException( "filesystem", [f"{version.DLT_PKG_NAME}[{config.protocol}]"] ) from e @@ -291,10 +303,8 @@ def glob_files( """ is_local_fs = "file" in fs_client.protocol if is_local_fs and FilesystemConfiguration.is_local_path(bucket_url): - bucket_url = FilesystemConfiguration.make_file_uri(bucket_url) - bucket_url_parsed = urlparse(bucket_url) - else: - bucket_url_parsed = urlparse(bucket_url) + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + bucket_url_parsed = urlparse(bucket_url) if is_local_fs: root_dir = FilesystemConfiguration.make_local_path(bucket_url) @@ -302,7 +312,8 @@ def glob_files( files = glob.glob(str(pathlib.Path(root_dir).joinpath(file_glob)), recursive=True) glob_result = {file: fs_client.info(file) for file in files} else: - root_dir = bucket_url_parsed._replace(scheme="", query="").geturl().lstrip("/") + # convert to fs_path + root_dir = fs_client._strip_protocol(bucket_url) filter_url = posixpath.join(root_dir, file_glob) glob_result = fs_client.glob(filter_url, detail=True) if isinstance(glob_result, list): @@ -314,20 +325,23 @@ def glob_files( for file, md in glob_result.items(): if md["type"] != "file": continue + scheme = bucket_url_parsed.scheme + # relative paths are always POSIX if is_local_fs: - rel_path = pathlib.Path(file).relative_to(root_dir).as_posix() - file_url = FilesystemConfiguration.make_file_uri(file) + # use OS pathlib for local paths + loc_path = pathlib.Path(file) + file_name = loc_path.name + rel_path = loc_path.relative_to(root_dir).as_posix() + file_url = FilesystemConfiguration.make_file_url(file) else: - rel_path = posixpath.relpath(file.lstrip("/"), root_dir) - file_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, rel_path) - ).geturl() + file_name = posixpath.basename(file) + rel_path = posixpath.relpath(file, root_dir) + file_url = make_fsspec_url(scheme, file, bucket_url) - scheme = bucket_url_parsed.scheme mime_type, encoding = guess_mime_type(rel_path) yield FileItem( - file_name=posixpath.basename(rel_path), + file_name=file_name, relative_path=rel_path, file_url=file_url, mime_type=mime_type, diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index b28309b930..b3b2fbcf0f 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -34,7 +34,6 @@ from dlt.common import logger from dlt.common.exceptions import TerminalValueError -from dlt.common.storages.fsspec_filesystem import fsspec_from_config from dlt.common.utils import uniq_id, without_none 
from dlt.common.schema import TColumnSchema, Schema, TTableSchema from dlt.common.schema.typing import ( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 11326cf3ed..1dd4c727be 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -432,7 +432,7 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # append to table for merge loads (append to stage) and regular appends. table_name = table["name"] - # determine whether we load from local or uri + # determine whether we load from local or url bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] if ReferenceFollowupJobRequest.is_reference_job(file_path): diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 3bd2d12a5a..789dbedae9 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -43,6 +43,10 @@ def to_connector_params(self) -> Dict[str, Any]: class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = dataclasses.field(default="databricks", init=False, repr=False, compare=False) # type: ignore[misc] credentials: DatabricksCredentials = None + staging_credentials_name: Optional[str] = None + "If set, credentials with given name will be used in copy command" + is_staging_external_location: bool = False + """If true, the temporary credentials are not propagated to the COPY command""" def __str__(self) -> str: """Return displayable destination location""" diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 38412b2608..614e6e97c5 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, Any, Iterable, Type, cast +from typing import Optional, Sequence, List, cast from urllib.parse import urlparse, urlunparse from dlt import config @@ -6,20 +6,17 @@ from dlt.common.destination.reference import ( HasFollowupJobs, FollowupJobRequest, - TLoadJobState, RunnableLoadJob, - CredentialsConfiguration, SupportsStagingDestination, LoadJob, ) from dlt.common.configuration.specs import ( AwsCredentialsWithoutDefaults, - AzureCredentials, AzureCredentialsWithoutDefaults, ) from dlt.common.exceptions import TerminalValueError from dlt.common.storages.file_storage import FileStorage -from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns +from dlt.common.schema import TColumnSchema, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TSchemaTables, TTableFormat from dlt.common.schema.utils import table_schema_has_type from dlt.common.storages import FilesystemConfiguration, fsspec_from_config @@ -35,6 +32,9 @@ from dlt.destinations.type_mapping import TypeMapper +AZURE_BLOB_STORAGE_PROTOCOLS = ["az", "abfss", "abfs"] + + class DatabricksTypeMapper(TypeMapper): sct_to_unbound_dbt = { "complex": "STRING", # Databricks supports complex types like ARRAY @@ -137,41 +137,51 @@ def run(self) -> None: if bucket_path: bucket_url = urlparse(bucket_path) bucket_scheme = bucket_url.scheme - # referencing an staged files via a bucket URL requires explicit AWS credentials - if bucket_scheme == "s3" and isinstance( - staging_credentials, AwsCredentialsWithoutDefaults - ): - s3_creds = 
staging_credentials.to_session_credentials() - credentials_clause = f"""WITH(CREDENTIAL( - AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', - AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', - - AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' - )) - """ - from_clause = f"FROM '{bucket_path}'" - elif bucket_scheme in ["az", "abfs"] and isinstance( - staging_credentials, AzureCredentialsWithoutDefaults - ): - # Explicit azure credentials are needed to load from bucket without a named stage - credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" - # Converts an az:/// to abfss://@.dfs.core.windows.net/ - # as required by snowflake - _path = bucket_url.path - bucket_path = urlunparse( - bucket_url._replace( - scheme="abfss", - netloc=f"{bucket_url.netloc}@{staging_credentials.azure_storage_account_name}.dfs.core.windows.net", - path=_path, - ) - ) - from_clause = f"FROM '{bucket_path}'" - else: + + if bucket_scheme not in AZURE_BLOB_STORAGE_PROTOCOLS + ["s3"]: raise LoadJobTerminalException( self._file_path, f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and" " azure buckets are supported", ) + + if self._job_client.config.is_staging_external_location: + # just skip the credentials clause for external location + # https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location + pass + elif self._job_client.config.staging_credentials_name: + # add named credentials + credentials_clause = ( + f"WITH(CREDENTIAL {self._job_client.config.staging_credentials_name} )" + ) + else: + # referencing an staged files via a bucket URL requires explicit AWS credentials + if bucket_scheme == "s3": + assert isinstance(staging_credentials, AwsCredentialsWithoutDefaults) + s3_creds = staging_credentials.to_session_credentials() + credentials_clause = f"""WITH(CREDENTIAL( + AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', + AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', + + AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' + )) + """ + elif bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + # Explicit azure credentials are needed to load from bucket without a named stage + credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" + bucket_path = self.ensure_databricks_abfss_url( + bucket_path, staging_credentials.azure_storage_account_name + ) + + if bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + bucket_path = self.ensure_databricks_abfss_url( + bucket_path, staging_credentials.azure_storage_account_name + ) + + # always add FROM clause + from_clause = f"FROM '{bucket_path}'" else: raise LoadJobTerminalException( self._file_path, @@ -231,6 +241,34 @@ def run(self) -> None: """ self._sql_client.execute_sql(statement) + @staticmethod + def ensure_databricks_abfss_url( + bucket_path: str, azure_storage_account_name: str = None + ) -> str: + bucket_url = urlparse(bucket_path) + # Converts an az:/// to abfss://@.dfs.core.windows.net/ + if bucket_url.username: + # has the right form, ensure abfss schema + return urlunparse(bucket_url._replace(scheme="abfss")) + + if not azure_storage_account_name: + raise TerminalValueError( + f"Could not convert azure blob storage url {bucket_path} into form required by" + " Databricks" + " (abfss://@.dfs.core.windows.net/)" + " because storage 
account name is not known. Please use Databricks abfss://" + " canonical url as bucket_url in staging credentials" + ) + # as required by databricks + _path = bucket_url.path + return urlunparse( + bucket_url._replace( + scheme="abfss", + netloc=f"{bucket_url.netloc}@{azure_storage_account_name}.dfs.core.windows.net", + path=_path, + ) + ) + class DatabricksMergeJob(SqlMergeFollowupJob): @classmethod diff --git a/dlt/destinations/impl/databricks/factory.py b/dlt/destinations/impl/databricks/factory.py index 409d3bc4be..6108b69da9 100644 --- a/dlt/destinations/impl/databricks/factory.py +++ b/dlt/destinations/impl/databricks/factory.py @@ -54,6 +54,8 @@ def client_class(self) -> t.Type["DatabricksClient"]: def __init__( self, credentials: t.Union[DatabricksCredentials, t.Dict[str, t.Any], str] = None, + is_staging_external_location: t.Optional[bool] = False, + staging_credentials_name: t.Optional[str] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -65,10 +67,14 @@ def __init__( Args: credentials: Credentials to connect to the databricks database. Can be an instance of `DatabricksCredentials` or a connection string in the format `databricks://user:password@host:port/database` + is_staging_external_location: If true, the temporary credentials are not propagated to the COPY command + staging_credentials_name: If set, credentials with given name will be used in copy command **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, + is_staging_external_location=is_staging_external_location, + staging_credentials_name=staging_credentials_name, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index feb09369dc..fc87faaf5a 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -90,9 +90,9 @@ def run(self) -> None: def metrics(self) -> Optional[LoadJobMetrics]: m = super().metrics() - # add remote uri if there's followup job + # add remote url if there's followup job if self.config.create_followup_jobs: - m = m._replace(remote_uri=self._file_name) + m = m._replace(remote_url=self._file_name) return m diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 62263a10b9..ac5ffb9ef3 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -56,7 +56,7 @@ def __init__( self._job_client: FilesystemClient = None def run(self) -> None: - self.__is_local_filesystem = self._job_client.config.protocol == "file" + self.__is_local_filesystem = self._job_client.config.is_local_filesystem # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. 
# It `auto_mkdir` is disabled by default in fsspec so we made some @@ -88,13 +88,13 @@ def make_remote_path(self) -> str: path_utils.normalize_path_sep(pathlib, destination_file_name), ) - def make_remote_uri(self) -> str: - """Returns path on a remote filesystem as a full uri including scheme.""" - return self._job_client.make_remote_uri(self.make_remote_path()) + def make_remote_url(self) -> str: + """Returns path on a remote filesystem as a full url including scheme.""" + return self._job_client.make_remote_url(self.make_remote_path()) def metrics(self) -> Optional[LoadJobMetrics]: m = super().metrics() - return m._replace(remote_uri=self.make_remote_uri()) + return m._replace(remote_url=self.make_remote_url()) class DeltaLoadFilesystemJob(FilesystemLoadJob): @@ -112,7 +112,7 @@ def make_remote_path(self) -> str: return self._job_client.get_table_dir(self.load_table_name) def run(self) -> None: - logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()}") from dlt.common.libs.deltalake import write_delta_table, merge_delta_table @@ -133,7 +133,7 @@ def run(self) -> None: else: write_delta_table( table_or_uri=( - self.make_remote_uri() if self._delta_table is None else self._delta_table + self.make_remote_url() if self._delta_table is None else self._delta_table ), data=arrow_rbr, write_disposition=self._load_table["write_disposition"], @@ -151,7 +151,7 @@ def _storage_options(self) -> Dict[str, str]: def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import try_get_deltatable - return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + return try_get_deltatable(self.make_remote_url(), storage_options=self._storage_options) @property def _partition_columns(self) -> List[str]: @@ -166,7 +166,7 @@ def _create_or_evolve_delta_table(self) -> None: if self._delta_table is None: DeltaTable.create( - table_uri=self.make_remote_uri(), + table_uri=self.make_remote_url(), schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), mode="overwrite", partition_by=self._partition_columns, @@ -185,7 +185,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRe elif final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), - remote_paths=[self._job_client.make_remote_uri(self.make_remote_path())], + remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], ) jobs.append(ref_job) return jobs @@ -208,7 +208,7 @@ def __init__( ) -> None: super().__init__(schema, config, capabilities) self.fs_client, fs_path = fsspec_from_config(config) - self.is_local_filesystem = config.protocol == "file" + self.is_local_filesystem = config.is_local_filesystem self.bucket_path = ( config.make_local_path(config.bucket_url) if self.is_local_filesystem else fs_path ) @@ -319,7 +319,7 @@ def get_table_dir(self, table_name: str, remote: bool = False) -> str: table_prefix = self.get_table_prefix(table_name) table_dir: str = self.pathlib.dirname(table_prefix) if remote: - table_dir = self.make_remote_uri(table_dir) + table_dir = self.make_remote_url(table_dir) return table_dir def get_table_prefix(self, table_name: str) -> str: @@ -353,7 +353,7 @@ def list_files_with_prefixes(self, table_dir: str, prefixes: List[str]) -> List[ # we fallback to our own glob implementation that is tested to return consistent 
results for
 # filesystems we support. we were not able to use `find` or `walk` because they were selecting
 # files wrongly (on azure walk on path1/path2/ would also select files from path1/path2_v2/ but returning wrong dirs)
-        for details in glob_files(self.fs_client, self.make_remote_uri(table_dir), "**"):
+        for details in glob_files(self.fs_client, self.make_remote_url(table_dir), "**"):
             file = details["file_name"]
             filepath = self.pathlib.join(table_dir, details["relative_path"])
             # skip INIT files
@@ -388,12 +388,12 @@ def create_load_job(
         cls = FilesystemLoadJobWithFollowup if self.config.as_staging else FilesystemLoadJob
         return cls(file_path)

-    def make_remote_uri(self, remote_path: str) -> str:
+    def make_remote_url(self, remote_path: str) -> str:
        """Returns uri to the remote filesystem to which copy the file"""
         if self.is_local_filesystem:
-            return self.config.make_file_uri(remote_path)
+            return self.config.make_file_url(remote_path)
         else:
-            return f"{self.config.protocol}://{remote_path}"
+            return self.config.make_url(remote_path)

     def __enter__(self) -> "FilesystemClient":
         return self
diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index 6cd5767dcb..ddb82c95b2 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -117,6 +117,8 @@ access_token = "MY_ACCESS_TOKEN"
 catalog = "my_catalog"
 ```

+See [staging support](#staging-support) for authentication options when `dlt` copies files from buckets.
+
 ## Write disposition
 All write dispositions are supported

@@ -166,6 +168,11 @@ pipeline = dlt.pipeline(
 Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) for details on connecting your Azure Blob Storage container with the bucket_url and credentials.

+Databricks requires that you use ABFS urls in the following format:
+**abfss://container_name@storage_account_name.dfs.core.windows.net/path**
+
+`dlt` can adapt the other representation (i.e. **az://container-name/path**), but we recommend that you use the canonical ABFS form.
+
 Example to set up Databricks with Azure as a staging destination:

 ```py
@@ -175,10 +182,34 @@ Example to set up Databricks with Azure as a staging destination:
 pipeline = dlt.pipeline(
     pipeline_name='chess_pipeline',
     destination='databricks',
-    staging=dlt.destinations.filesystem('az://your-container-name'), # add this to activate the staging location
+    staging=dlt.destinations.filesystem('abfss://dlt-ci-data@dltdata.dfs.core.windows.net'), # add this to activate the staging location
     dataset_name='player_data'
 )
+ ```
+
+### Use external locations and stored credentials
+`dlt` forwards bucket credentials to the `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side instead.
+
+If you set up an external location for your staging path, you can tell `dlt` to use it:
+```toml
+[destination.databricks]
+is_staging_external_location=true
+```
+
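As a sketch, the same switch can also be passed to the `databricks` destination factory added in this patch and wired into a full pipeline (the bucket url is illustrative and the external location covering the staging path is assumed to already exist in Databricks):

```py
import dlt

# assumes an external location for the staging path is already defined in Databricks
bricks = dlt.destinations.databricks(is_staging_external_location=True)

pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination=bricks,
    staging=dlt.destinations.filesystem("abfss://dlt-ci-data@dltdata.dfs.core.windows.net"),
    dataset_name="player_data",
)
```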
+If you set up a Databricks credential named, for example, **credential_x**, you can tell `dlt` to use it:
+```toml
+[destination.databricks]
+staging_credentials_name="credential_x"
+```
+
+Both options are available from code:
+```py
+import dlt
+
+bricks = dlt.destinations.databricks(staging_credentials_name="credential_x")
+```
+
 ### dbt support
 This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks)
diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
index d08578c5a2..57e6db311d 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
@@ -176,7 +176,7 @@ Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and
 Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns):
 * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created.

-### Table and column identifiers
+## Table and column identifiers
 Snowflake supports both case sensitive and case insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate case sensitive identifiers that must be quoted in SQL statements.

diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md
index 3b5762612c..cc089a1393 100644
--- a/docs/website/docs/running-in-production/running.md
+++ b/docs/website/docs/running-in-production/running.md
@@ -271,7 +271,7 @@ load_info.raise_on_failed_jobs()
 ```

 You may also abort the load package with `LoadClientJobFailed` (terminal exception) on a first
-failed job. Such package is immediately moved to completed but its load id is not added to the
+failed job. Such a package will be completed but its load id is not added to the
 `_dlt_loads` table. All the jobs that were running in parallel are completed before raising. The
 dlt state, if present, will not be visible to `dlt`. Here's example `config.toml` to enable this
 option:
@@ -282,6 +282,20 @@ load.workers=1
 load.raise_on_failed_jobs=true
 ```

+:::caution
+Note that certain write dispositions will irreversibly modify your data:
+1. `replace` write disposition with the default `truncate-and-insert` [strategy](../general-usage/full-loading.md) will truncate tables before loading.
+2. `merge` write disposition will merge staging dataset tables into the destination dataset. This happens only when all data for the table (and its nested tables) has been loaded.
+
+Here's what you can do to deal with partially loaded packages:
+1. Retry the load step in case of transient errors.
+2. Use the replace strategy with a staging dataset, so the replace happens only after data for the table (and all nested tables) is fully loaded and is an atomic operation (where possible).
+3. Use only the "append" write disposition. When your load package fails, you can use `_dlt_load_id` to remove all unprocessed data, as shown in the sketch below.
+4. Use "staging append" (`merge` disposition without a primary key or merge key defined).
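A rough sketch of option 3 (names are illustrative; it assumes a SQL destination, a root table called `events`, and a load id taken from the failed package, e.g. from the pipeline trace):

```py
import dlt

pipeline = dlt.pipeline("events_pipeline", destination="duckdb", dataset_name="events_data")

# load id of the package that did not complete (made-up value for illustration)
failed_load_id = "1724848417.123456"

with pipeline.sql_client() as client:
    # with "append", every row in the root table carries the package's _dlt_load_id,
    # so the partially loaded rows can be removed in one statement
    client.execute_sql(f"DELETE FROM events WHERE _dlt_load_id = '{failed_load_id}'")
```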
+ +::: + + ### What `run` does inside Before adding retry to pipeline steps, note how `run` method actually works: diff --git a/tests/.dlt/config.toml b/tests/.dlt/config.toml index ba86edf417..292175569b 100644 --- a/tests/.dlt/config.toml +++ b/tests/.dlt/config.toml @@ -6,7 +6,8 @@ bucket_url_gs="gs://ci-test-bucket" bucket_url_s3="s3://dlt-ci-test-bucket" bucket_url_file="_storage" bucket_url_az="az://dlt-ci-test-bucket" +bucket_url_abfss="abfss://dlt-ci-test-bucket@dltdata.dfs.core.windows.net" bucket_url_r2="s3://dlt-ci-test-bucket" # use "/" as root path bucket_url_gdrive="gdrive://15eC3e5MNew2XAIefWNlG8VlEa0ISnnaG" -memory="memory://m" \ No newline at end of file +memory="memory:///m" \ No newline at end of file diff --git a/tests/common/cases/normalizers/sql_upper.py b/tests/common/cases/normalizers/sql_upper.py index f2175f06ad..eb88775f95 100644 --- a/tests/common/cases/normalizers/sql_upper.py +++ b/tests/common/cases/normalizers/sql_upper.py @@ -1,5 +1,3 @@ -from typing import Any, Sequence - from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention diff --git a/tests/common/storages/test_local_filesystem.py b/tests/common/storages/test_local_filesystem.py index 14e3cc23d4..1bfe6c0b5b 100644 --- a/tests/common/storages/test_local_filesystem.py +++ b/tests/common/storages/test_local_filesystem.py @@ -45,7 +45,7 @@ ) def test_local_path_win_configuration(bucket_url: str, file_url: str) -> None: assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -66,7 +66,7 @@ def test_local_path_win_configuration(bucket_url: str, file_url: str) -> None: def test_local_user_win_path_configuration(bucket_url: str) -> None: file_url = "file:///" + pathlib.Path(bucket_url).expanduser().as_posix().lstrip("/") assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -99,7 +99,7 @@ def test_file_win_configuration() -> None: ) def test_file_posix_configuration(bucket_url: str, file_url: str) -> None: assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -117,7 +117,7 @@ def test_file_posix_configuration(bucket_url: str, file_url: str) -> None: def test_local_user_posix_path_configuration(bucket_url: str) -> None: file_url = "file:///" + pathlib.Path(bucket_url).expanduser().as_posix().lstrip("/") assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -166,7 +166,7 @@ def test_file_filesystem_configuration( assert FilesystemConfiguration.make_local_path(bucket_url) == str( pathlib.Path(local_path).resolve() ) - assert FilesystemConfiguration.make_file_uri(local_path) == norm_bucket_url + assert 
FilesystemConfiguration.make_file_url(local_path) == norm_bucket_url if local_path == "": with pytest.raises(ConfigurationValueError): diff --git a/tests/destinations/test_destination_name_and_config.py b/tests/destinations/test_destination_name_and_config.py index 11de706722..1e432a7803 100644 --- a/tests/destinations/test_destination_name_and_config.py +++ b/tests/destinations/test_destination_name_and_config.py @@ -60,7 +60,7 @@ def test_set_name_and_environment() -> None: def test_preserve_destination_instance() -> None: dummy1 = dummy(destination_name="dummy1", environment="dev/null/1") filesystem1 = filesystem( - FilesystemConfiguration.make_file_uri(TEST_STORAGE_ROOT), + FilesystemConfiguration.make_file_url(TEST_STORAGE_ROOT), destination_name="local_fs", environment="devel", ) @@ -210,7 +210,7 @@ def test_destination_config_in_name(environment: DictStrStr) -> None: with pytest.raises(ConfigFieldMissingException): p.destination_client() - environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_uri( + environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_url( "_storage" ) assert p._fs_client().dataset_path.endswith(p.dataset_name) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index f6a06180c9..bb989a887c 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -3,9 +3,12 @@ pytest.importorskip("databricks") +from dlt.common.exceptions import TerminalValueError +from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob +from dlt.common.configuration import resolve_configuration +from dlt.destinations import databricks from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration -from dlt.common.configuration import resolve_configuration # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -34,3 +37,48 @@ def test_databricks_credentials_to_connector_params(): assert params["extra_a"] == "a" assert params["extra_b"] == "b" assert params["_socket_timeout"] == credentials.socket_timeout + + +def test_databricks_configuration() -> None: + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.is_staging_external_location is False + assert config.staging_credentials_name is None + + os.environ["IS_STAGING_EXTERNAL_LOCATION"] = "true" + os.environ["STAGING_CREDENTIALS_NAME"] = "credential" + config = bricks.configuration(None, accept_partial=True) + assert config.is_staging_external_location is True + assert config.staging_credentials_name == "credential" + + # explicit params + bricks = databricks(is_staging_external_location=None, staging_credentials_name="credential2") + config = bricks.configuration(None, accept_partial=True) + assert config.staging_credentials_name == "credential2" + assert config.is_staging_external_location is None + + +def test_databricks_abfss_converter() -> None: + with pytest.raises(TerminalValueError): + DatabricksLoadJob.ensure_databricks_abfss_url("az://dlt-ci-test-bucket") + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket", "my_account" + ) + assert abfss_url == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net" + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket/path/to/file.parquet", "my_account" + ) + assert ( + abfss_url + == 
"abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) + assert ( + abfss_url + == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 3cad7dda2c..29ca1a2b57 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -3,8 +3,8 @@ from typing import Tuple, Union, Dict from urllib.parse import urlparse - -from fsspec import AbstractFileSystem +from fsspec import AbstractFileSystem, get_filesystem_class, register_implementation +from fsspec.core import filesystem as fs_filesystem import pytest from tenacity import retry, stop_after_attempt, wait_fixed @@ -15,6 +15,7 @@ from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration +from dlt.common.storages.configuration import make_fsspec_url from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id from dlt.destinations import filesystem @@ -22,11 +23,12 @@ FilesystemDestinationClientConfiguration, ) from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders + +from tests.common.configuration.utils import environment from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files -from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET, WITH_GDRIVE_BUCKETS from tests.utils import autouse_test_storage -from .utils import self_signed_cert -from tests.common.configuration.utils import environment +from tests.load.filesystem.utils import self_signed_cert # mark all tests as essential, do not remove @@ -53,6 +55,24 @@ def test_filesystem_configuration() -> None: } +@pytest.mark.parametrize("bucket_url", WITH_GDRIVE_BUCKETS) +def test_remote_url(bucket_url: str) -> None: + # make absolute urls out of paths + scheme = urlparse(bucket_url).scheme + if not scheme: + scheme = "file" + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + if scheme == "gdrive": + from dlt.common.storages.fsspecs.google_drive import GoogleDriveFileSystem + + register_implementation("gdrive", GoogleDriveFileSystem, "GoogleDriveFileSystem") + + fs_class = get_filesystem_class(scheme) + fs_path = fs_class._strip_protocol(bucket_url) + # reconstitute url + assert make_fsspec_url(scheme, fs_path, bucket_url) == bucket_url + + def test_filesystem_instance(with_gdrive_buckets_env: str) -> None: @retry(stop=stop_after_attempt(10), wait=wait_fixed(1), reraise=True) def check_file_exists(filedir_: str, file_url_: str): @@ -72,10 +92,8 @@ def check_file_changed(file_url_: str): bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] config = get_config() # we do not add protocol to bucket_url (we need relative path) - assert bucket_url.startswith(config.protocol) or config.protocol == "file" + assert bucket_url.startswith(config.protocol) or config.is_local_filesystem filesystem, url = fsspec_from_config(config) - if config.protocol != "file": - assert bucket_url.endswith(url) # do a few file ops now = pendulum.now() filename = f"filesystem_common_{uniq_id()}" @@ -113,7 +131,9 @@ def 
test_glob_overlapping_path_files(with_gdrive_buckets_env: str) -> None: # "standard_source/sample" overlaps with a real existing "standard_source/samples". walk operation on azure # will return all files from "standard_source/samples" and report the wrong "standard_source/sample" path to the user # here we test we do not have this problem with out glob - bucket_url, _, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + bucket_url, config, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + if config.protocol in ["file"]: + pytest.skip(f"{config.protocol} not supported in this test") # use glob to get data all_file_items = list(glob_files(filesystem, bucket_url)) assert len(all_file_items) == 0 @@ -272,18 +292,18 @@ def glob_test_setup( config = get_config() # enable caches config.read_only = True - if config.protocol in ["file"]: - pytest.skip(f"{config.protocol} not supported in this test") # may contain query string - bucket_url_parsed = urlparse(bucket_url) - bucket_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, glob_folder) - ).geturl() - filesystem, _ = fsspec_from_config(config) + filesystem, fs_path = fsspec_from_config(config) + bucket_url = make_fsspec_url(config.protocol, posixpath.join(fs_path, glob_folder), bucket_url) if config.protocol == "memory": - mem_path = os.path.join("m", "standard_source") + mem_path = os.path.join("/m", "standard_source") if not filesystem.isdir(mem_path): filesystem.mkdirs(mem_path) filesystem.upload(TEST_SAMPLE_FILES, mem_path, recursive=True) + if config.protocol == "file": + file_path = os.path.join("_storage", "standard_source") + if not filesystem.isdir(file_path): + filesystem.mkdirs(file_path) + filesystem.upload(TEST_SAMPLE_FILES, file_path, recursive=True) return bucket_url, config, filesystem diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py new file mode 100644 index 0000000000..5f8641f9fa --- /dev/null +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -0,0 +1,85 @@ +import pytest +import os + +from dlt.common.utils import uniq_id +from tests.load.utils import DestinationTestConfiguration, destinations_configs, AZ_BUCKET +from tests.pipeline.utils import assert_load_info + + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, bucket_subset=(AZ_BUCKET), subset=("databricks",) + ), + ids=lambda x: x.name, +) +def test_databricks_external_location(destination_config: DestinationTestConfiguration) -> None: + # do not interfere with state + os.environ["RESTORE_FROM_DESTINATION"] = "False" + dataset_name = "test_databricks_external_location" + uniq_id() + + from dlt.destinations import databricks, filesystem + from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob + + abfss_bucket_url = DatabricksLoadJob.ensure_databricks_abfss_url(AZ_BUCKET, "dltdata") + stage = filesystem(abfss_bucket_url) + + # should load abfss formatted url just fine + bricks = databricks(is_staging_external_location=False) + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert_load_info(info) + # get metrics + metrics = info.metrics[info.loads_ids[0]][0] + remote_url = list(metrics["job_metrics"].values())[0].remote_url + 
# abfss form was preserved + assert remote_url.startswith(abfss_bucket_url) + + # should fail on internal config error as external location is not configured + bricks = databricks(is_staging_external_location=True) + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "Invalid configuration value detected" + in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) + + # should fail on non existing stored credentials + bricks = databricks(is_staging_external_location=False, staging_credentials_name="CREDENTIAL_X") + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) + + # should fail on non existing stored credentials + # auto stage with regular az:// used + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", dataset_name=dataset_name, destination=bricks + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index d88eba7c06..bc6cbd9848 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -300,16 +300,16 @@ def data_types(): assert len(rows) == 10 assert_all_data_types_row(rows[0], schema=column_schemas) - # make sure remote_uri is in metrics + # make sure remote_url is in metrics metrics = info.metrics[info.loads_ids[0]][0] - # TODO: only final copy job has remote_uri. not the initial (empty) job for particular files - # we could implement an empty job for delta that generates correct remote_uri - remote_uri = list(metrics["job_metrics"].values())[-1].remote_uri - assert remote_uri.endswith("data_types") - bucket_uri = destination_config.bucket_url - if FilesystemConfiguration.is_local_path(bucket_uri): - bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) - assert remote_uri.startswith(bucket_uri) + # TODO: only final copy job has remote_url. 
not the initial (empty) job for particular files + # we could implement an empty job for delta that generates correct remote_url + remote_url = list(metrics["job_metrics"].values())[-1].remote_url + assert remote_url.endswith("data_types") + bucket_url = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_url): + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + assert remote_url.startswith(bucket_url) # another run should append rows to the table info = pipeline.run(data_types()) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index f216fa3c05..42dee5fc8f 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -57,17 +57,17 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) - # checks if remote_uri is set correctly on copy jobs + # checks if remote_url is set correctly on copy jobs metrics = info.metrics[info.loads_ids[0]][0] for job_metrics in metrics["job_metrics"].values(): - remote_uri = job_metrics.remote_uri + remote_url = job_metrics.remote_url job_ext = os.path.splitext(job_metrics.job_id)[1] if job_ext not in (".reference", ".sql"): - assert remote_uri.endswith(job_ext) + assert remote_url.endswith(job_ext) bucket_uri = destination_config.bucket_url if FilesystemConfiguration.is_local_path(bucket_uri): - bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) - assert remote_uri.startswith(bucket_uri) + bucket_uri = FilesystemConfiguration.make_file_url(bucket_uri) + assert remote_url.startswith(bucket_uri) package_info = pipeline.get_load_package_info(info.loads_ids[0]) assert package_info.state == "loaded" diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 59b7acac15..72c5772668 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -1012,17 +1012,17 @@ def assert_complete_job( if state == "failed_jobs" else "completed" ) - remote_uri = job_metrics.remote_uri + remote_url = job_metrics.remote_url if load.initial_client_config.create_followup_jobs: # type: ignore - assert remote_uri.endswith(job.file_name()) + assert remote_url.endswith(job.file_name()) elif load.is_staging_destination_job(job.file_name()): # staging destination should contain reference to remote filesystem assert ( - FilesystemConfiguration.make_file_uri(REMOTE_FILESYSTEM) - in remote_uri + FilesystemConfiguration.make_file_url(REMOTE_FILESYSTEM) + in remote_url ) else: - assert remote_uri is None + assert remote_url is None else: assert job_metrics is None diff --git a/tests/load/utils.py b/tests/load/utils.py index 086109de8b..15b1e1575e 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -70,6 +70,7 @@ AWS_BUCKET = dlt.config.get("tests.bucket_url_s3", str) GCS_BUCKET = dlt.config.get("tests.bucket_url_gs", str) AZ_BUCKET = dlt.config.get("tests.bucket_url_az", str) +ABFS_BUCKET = dlt.config.get("tests.bucket_url_abfss", str) GDRIVE_BUCKET = dlt.config.get("tests.bucket_url_gdrive", str) FILE_BUCKET = dlt.config.get("tests.bucket_url_file", str) R2_BUCKET = dlt.config.get("tests.bucket_url_r2", str) @@ -79,6 +80,7 @@ "s3", "gs", "az", + "abfss", "gdrive", "file", "memory", @@ -86,7 +88,15 @@ ] # Filter out buckets not in all filesystem drivers -WITH_GDRIVE_BUCKETS = [GCS_BUCKET, AWS_BUCKET, FILE_BUCKET, MEMORY_BUCKET, AZ_BUCKET, GDRIVE_BUCKET] 
+WITH_GDRIVE_BUCKETS = [ + GCS_BUCKET, + AWS_BUCKET, + FILE_BUCKET, + MEMORY_BUCKET, + ABFS_BUCKET, + AZ_BUCKET, + GDRIVE_BUCKET, +] WITH_GDRIVE_BUCKETS = [ bucket for bucket in WITH_GDRIVE_BUCKETS diff --git a/tests/pipeline/cases/contracts/trace.schema.yaml b/tests/pipeline/cases/contracts/trace.schema.yaml index 89831977c0..c324818338 100644 --- a/tests/pipeline/cases/contracts/trace.schema.yaml +++ b/tests/pipeline/cases/contracts/trace.schema.yaml @@ -562,7 +562,7 @@ tables: finished_at: data_type: timestamp nullable: true - remote_uri: + remote_url: data_type: text nullable: true parent: trace__steps diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index b6a7feffc1..027a2b4e72 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2600,6 +2600,20 @@ def ids(_id=dlt.sources.incremental("_id", initial_value=2)): assert pipeline.last_trace.last_normalize_info.row_counts["_ids"] == 2 +def test_dlt_columns_nested_table_collisions() -> None: + # we generate all identifiers in upper case to test for a bug where dlt columns for nested tables were hardcoded to + # small caps. they got normalized to upper case after the first run and then added again as small caps + # generating duplicate columns and raising collision exception as duckdb is ci destination + duck = duckdb(naming_convention="tests.common.cases.normalizers.sql_upper") + pipeline = dlt.pipeline("test_dlt_columns_child_table_collisions", destination=duck) + customers = [ + {"id": 1, "name": "dave", "orders": [1, 2, 3]}, + ] + assert_load_info(pipeline.run(customers, table_name="CUSTOMERS")) + # this one would fail without bugfix + assert_load_info(pipeline.run(customers, table_name="CUSTOMERS")) + + def test_access_pipeline_in_resource() -> None: pipeline = dlt.pipeline("test_access_pipeline_in_resource", destination="duckdb") diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 4e52d2aa29..d2bb035a17 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -315,7 +315,7 @@ def data(): return data() - # create pipeline with staging to get remote_uri in load step job_metrics + # create pipeline with staging to get remote_url in load step job_metrics dummy_dest = dummy(completed_prob=1.0) pipeline = dlt.pipeline( pipeline_name="test_trace_schema", From 63f89542678c7af51089f94365aa6834ccca90e7 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Wed, 28 Aug 2024 13:20:16 +0200 Subject: [PATCH 32/34] bumps dlt version to 0.5.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 74161f5ccc..d32285572f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.4a0" +version = "0.5.4" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] From b48c7c3e7db9fb4ff321b668b9b22553b7882b31 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 28 Aug 2024 19:11:56 +0200 Subject: [PATCH 33/34] runs staging tests on athena (#1764) * always truncates staging tables on athena + replace without iceberg * adds athena staging configs to all staging configs * updates athena tests for staging destination --- dlt/common/destination/reference.py | 11 +++++ dlt/destinations/impl/athena/athena.py | 2 +- tests/load/pipeline/test_stage_loading.py | 23 ++++++++++- tests/load/utils.py | 49 ++++++++++++++--------- 4 files changed, 62 insertions(+), 23 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 0944b03bea..e7bba266df 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -586,10 +586,21 @@ class SupportsStagingDestination(ABC): def should_load_data_to_staging_dataset_on_staging_destination( self, table: TTableSchema ) -> bool: + """If set to True, and staging destination is configured, the data will be loaded to staging dataset on staging destination + instead of a regular dataset on staging destination. Currently it is used by Athena Iceberg which uses staging dataset + on staging destination to copy data to iceberg tables stored on regular dataset on staging destination. + The default is to load data to regular dataset on staging destination from where warehouses like Snowflake (that have their + own storage) will copy data. + """ return False @abstractmethod def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + """If set to True, data in `table` will be truncated on staging destination (regular dataset). This is the default behavior which + can be changed with a config flag. + For Athena + Iceberg this setting is always False - Athena uses regular dataset to store Iceberg tables and we avoid touching it. + For Athena we truncate those tables only on "replace" write disposition. 
+ """ pass diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index b3b2fbcf0f..a5a8ae2562 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -530,7 +530,7 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable if table["write_disposition"] == "replace" and not self._is_iceberg_table( self.prepare_load_table(table["name"]) ): - return self.config.truncate_tables_on_staging_destination_before_load + return True return False def should_load_data_to_staging_dataset_on_staging_destination( diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 42dee5fc8f..3bfa050fd7 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -218,7 +218,18 @@ def test_truncate_staging_dataset(destination_config: DestinationTestConfigurati # check there are two staging files _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) with staging_client: - assert len(staging_client.list_table_files(table_name)) == 2 # type: ignore[attr-defined] + # except Athena + Iceberg which does not store tables in staging dataset + if ( + destination_config.destination == "athena" + and destination_config.table_format == "iceberg" + ): + table_count = 0 + # but keeps them in staging dataset on staging destination - but only the last one + with staging_client.with_staging_dataset(): # type: ignore[attr-defined] + assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + else: + table_count = 2 + assert len(staging_client.list_table_files(table_name)) == table_count # type: ignore[attr-defined] # load the data with truncating, so only new file is on the staging pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = True @@ -231,7 +242,15 @@ def test_truncate_staging_dataset(destination_config: DestinationTestConfigurati # check there is only one staging file _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) with staging_client: - assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + # except for Athena which does not delete staging destination tables + if destination_config.destination == "athena": + if destination_config.table_format == "iceberg": + table_count = 0 + else: + table_count = 3 + else: + table_count = 1 + assert len(staging_client.list_table_files(table_name)) == table_count # type: ignore[attr-defined] @pytest.mark.parametrize( diff --git a/tests/load/utils.py b/tests/load/utils.py index 15b1e1575e..5427904d52 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -257,6 +257,27 @@ def destinations_configs( # build destination configs destination_configs: List[DestinationTestConfiguration] = [] + # default sql configs that are also default staging configs + default_sql_configs_with_staging = [ + # Athena needs filesystem staging, which will be automatically set; we have to supply a bucket url though. 
+ DestinationTestConfiguration( + destination="athena", + file_format="parquet", + supports_merge=False, + bucket_url=AWS_BUCKET, + ), + DestinationTestConfiguration( + destination="athena", + file_format="parquet", + bucket_url=AWS_BUCKET, + force_iceberg=True, + supports_merge=True, + supports_dbt=False, + table_format="iceberg", + extra_info="iceberg", + ), + ] + # default non staging sql based configs, one per destination if default_sql_configs: destination_configs += [ @@ -268,26 +289,10 @@ def destinations_configs( DestinationTestConfiguration(destination="duckdb", file_format="parquet"), DestinationTestConfiguration(destination="motherduck", file_format="insert_values"), ] - # Athena needs filesystem staging, which will be automatically set; we have to supply a bucket url though. - destination_configs += [ - DestinationTestConfiguration( - destination="athena", - file_format="parquet", - supports_merge=False, - bucket_url=AWS_BUCKET, - ) - ] - destination_configs += [ - DestinationTestConfiguration( - destination="athena", - file_format="parquet", - bucket_url=AWS_BUCKET, - force_iceberg=True, - supports_merge=True, - supports_dbt=False, - extra_info="iceberg", - ) - ] + + # add Athena staging configs + destination_configs += default_sql_configs_with_staging + destination_configs += [ DestinationTestConfiguration( destination="clickhouse", file_format="jsonl", supports_dbt=False @@ -332,6 +337,10 @@ def destinations_configs( DestinationTestConfiguration(destination="qdrant", extra_info="server"), ] + if (default_sql_configs or all_staging_configs) and not default_sql_configs: + # athena default configs not added yet + destination_configs += default_sql_configs_with_staging + if default_staging_configs or all_staging_configs: destination_configs += [ DestinationTestConfiguration( From e9c9ecfa8a644fdb516dd74aabca3bf75bafb154 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Wed, 28 Aug 2024 21:45:16 +0200 Subject: [PATCH 34/34] fixes staging tests for athena --- tests/load/pipeline/test_stage_loading.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 3bfa050fd7..6c4f6dfec8 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -74,8 +74,14 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: assert len(package_info.jobs["failed_jobs"]) == 0 # we have 4 parquet and 4 reference jobs plus one merge job - num_jobs = 4 + 4 + 1 if destination_config.supports_merge else 4 + 4 - assert len(package_info.jobs["completed_jobs"]) == num_jobs + num_jobs = 4 + 4 + num_sql_jobs = 0 + if destination_config.supports_merge: + num_sql_jobs += 1 + # sql job is used to copy parquet to Athena Iceberg table (_dlt_pipeline_state) + if destination_config.destination == "athena" and destination_config.table_format == "iceberg": + num_sql_jobs += 1 + assert len(package_info.jobs["completed_jobs"]) == num_jobs + num_sql_jobs assert ( len( [ @@ -110,7 +116,7 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: if x.job_file_info.file_format == "sql" ] ) - == 1 + == num_sql_jobs ) initial_counts = load_table_counts(