From 76dff2043d00e2bf2fd217cc267cd834e98caa98 Mon Sep 17 00:00:00 2001 From: Julien Pinchelimouroux Date: Wed, 6 Nov 2024 09:42:06 +0100 Subject: [PATCH] fix: merge api results after their transformations (TCTC-9634) (#1807) * fix: merge api results after their transformations * fix: use in pandas v2 * fix: use pd.concat to concat dataframes --- CHANGELOG.md | 8 +++ tests/http_api/test_http_api.py | 66 +++++++++++++++++++ .../http_api/http_api_connector.py | 16 ++--- .../http_api/pagination_configs.py | 11 +++- 4 files changed, 90 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14280a7aa..92ef6f5b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ ## Unreleased +### Fix + +- HTTP API: API results are now correctly merged even if they need to be filtered or flattened. + +### Added + +- HTTP API: Add `data_filter` offset pagination config field to determine which part of data must be used to compute the data length. + ## [7.1.1] 2024-10-28 ### Fix diff --git a/tests/http_api/test_http_api.py b/tests/http_api/test_http_api.py index 4e09c8b4f..46e3a1b3a 100644 --- a/tests/http_api/test_http_api.py +++ b/tests/http_api/test_http_api.py @@ -158,6 +158,72 @@ def test_get_df_with_offset_pagination( assert len(responses.calls) == 3 +@responses.activate +def test_get_df_with_offset_pagination_and_flatten_option( + connector: HttpAPIConnector, data_source: HttpAPIDataSource, offset_pagination: OffsetLimitPaginationConfig +) -> None: + # first page + responses.add( + responses.GET, + "https://jsonplaceholder.typicode.com/comments?super_offset=0&super_limit=5", + json={ + "totalItems": 12, + "items": { + "product_category": "sofa", + "products": [ + {"name": "p1", "price": 1}, + {"name": "p2", "price": 1}, + {"name": "p3", "price": 1}, + {"name": "p4", "price": 1}, + {"name": "p5", "price": 1}, + ], + }, + }, + ) + + # second page + responses.add( + responses.GET, + "https://jsonplaceholder.typicode.com/comments?super_offset=5&super_limit=5", + json={ + "totalItems": 12, + "items": { + "product_category": "kitchen", + "products": [ + {"name": "p6", "price": 1}, + {"name": "p7", "price": 1}, + {"name": "p8", "price": 1}, + {"name": "p9", "price": 1}, + {"name": "p10", "price": 1}, + ], + }, + }, + ) + + # last page + responses.add( + responses.GET, + "https://jsonplaceholder.typicode.com/comments?super_offset=10&super_limit=5", + json={ + "totalItems": 12, + "items": { + "product_category": "bedroom", + "products": [ + {"name": "p11", "price": 1}, + {"name": "p12", "price": 1}, + ], + }, + }, + ) + offset_pagination.data_filter = ".items.products" + data_source.filter = ".items" + data_source.flatten_column = "products" + data_source.http_pagination_config = offset_pagination + df = connector.get_df(data_source) + assert df.shape == (12, 4) + assert len(responses.calls) == 3 + + @responses.activate def test_get_df_with_page_pagination( connector: HttpAPIConnector, data_source: HttpAPIDataSource, page_pagination: PageBasedPaginationConfig diff --git a/toucan_connectors/http_api/http_api_connector.py b/toucan_connectors/http_api/http_api_connector.py index 216f6c95e..ab57e5b0b 100644 --- a/toucan_connectors/http_api/http_api_connector.py +++ b/toucan_connectors/http_api/http_api_connector.py @@ -167,7 +167,7 @@ def perform_requests(self, data_source: HttpAPIDataSource, session: "Session") - pagination_config = pagination_config.get_next_pagination_config( result=parsed_result, pagination_info=parsed_pagination_info ) - results += parsed_result + results.append(parsed_result) return results def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame": @@ -177,12 +177,11 @@ def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame": session = Session() # Try retrieve dataset try: - results = pd.DataFrame( - self.perform_requests( - data_source=data_source, - session=session, - ) + results = self.perform_requests( + data_source=data_source, + session=session, ) + dfs = [pd.DataFrame(result) for result in results] except HTTPError as exc: if exc.response.status_code == TOO_MANY_REQUESTS: raise HttpAPIConnectorError( @@ -192,10 +191,9 @@ def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame": ) from exc else: raise - if data_source.flatten_column: - return json_to_table(results, columns=[data_source.flatten_column]) - return results + dfs = [json_to_table(df, columns=[data_source.flatten_column]) for df in dfs] + return pd.concat(dfs) def _render_query(self, data_source): query = nosql_apply_parameters_to_query( diff --git a/toucan_connectors/http_api/pagination_configs.py b/toucan_connectors/http_api/pagination_configs.py index 2278700c3..abe23c4de 100644 --- a/toucan_connectors/http_api/pagination_configs.py +++ b/toucan_connectors/http_api/pagination_configs.py @@ -56,6 +56,13 @@ class OffsetLimitPaginationConfig(PaginationConfig): offset: int = Field(0, **UI_HIDDEN) limit_name: str = "limit" limit: int + data_filter: str = Field( + ".", + description=( + "Filter to access the received data. Allows to compare its length to the limit value. " + "It must point to a list of results. " + FilterSchemaDescription + ), + ) def plan_pagination_updates_to_data_source(self, request_params: dict[str, Any] | None) -> dict[str, Any]: offset_limit_params = {self.offset_name: self.offset, self.limit_name: self.limit} @@ -68,7 +75,7 @@ def plan_pagination_updates_to_data_source(self, request_params: dict[str, Any] def get_next_pagination_config( self, result: Any, pagination_info: Any | None ) -> Optional["OffsetLimitPaginationConfig"]: - if len(result) < self.limit: + if len(pagination_info) < self.limit: return None else: return self.model_copy(update={"offset": self.offset + self.limit}) @@ -77,7 +84,7 @@ def get_error_status_whitelist(self) -> list[str] | None: return None def get_pagination_info_filter(self) -> str | None: - return None + return self.data_filter class PageBasedPaginationConfig(PaginationConfig):