Skip to content

Commit

Permalink
fix: merge api results after their transformations (TCTC-9634) (#1807)
Browse files Browse the repository at this point in the history
* fix: merge api results after their transformations

* fix: use `pd.concat` in pandas v2

* fix: use pd.concat to concat dataframes
  • Loading branch information
julien-pinchelimouroux authored Nov 6, 2024
1 parent 70f9769 commit 76dff20
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 11 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

## Unreleased

### Fix

- HTTP API: API results are now correctly merged even if they need to be filtered or flattened.

### Added

- HTTP API: Add `data_filter` offset pagination config field to determine which part of data must be used to compute the data length.

## [7.1.1] 2024-10-28

### Fix
Expand Down
66 changes: 66 additions & 0 deletions tests/http_api/test_http_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,72 @@ def test_get_df_with_offset_pagination(
assert len(responses.calls) == 3


@responses.activate
def test_get_df_with_offset_pagination_and_flatten_option(
    connector: HttpAPIConnector, data_source: HttpAPIDataSource, offset_pagination: OffsetLimitPaginationConfig
) -> None:
    """Paginated results must be filtered/flattened per page, then concatenated.

    Each mocked page nests its rows under ``items.products``; with
    ``filter=".items"`` and ``flatten_column="products"`` the connector must
    still yield all 12 product rows across the 3 pages.
    """
    # (offset, product_category, product names) for each mocked page.
    page_specs = [
        (0, "sofa", ["p1", "p2", "p3", "p4", "p5"]),
        (5, "kitchen", ["p6", "p7", "p8", "p9", "p10"]),
        (10, "bedroom", ["p11", "p12"]),
    ]
    for offset, category, names in page_specs:
        responses.add(
            responses.GET,
            f"https://jsonplaceholder.typicode.com/comments?super_offset={offset}&super_limit=5",
            json={
                "totalItems": 12,
                "items": {
                    "product_category": category,
                    "products": [{"name": name, "price": 1} for name in names],
                },
            },
        )

    # Point the pagination length check at the nested list of results.
    offset_pagination.data_filter = ".items.products"
    data_source.filter = ".items"
    data_source.flatten_column = "products"
    data_source.http_pagination_config = offset_pagination

    df = connector.get_df(data_source)

    # 12 product rows; 4 columns after flattening. One HTTP call per page.
    assert df.shape == (12, 4)
    assert len(responses.calls) == 3


@responses.activate
def test_get_df_with_page_pagination(
connector: HttpAPIConnector, data_source: HttpAPIDataSource, page_pagination: PageBasedPaginationConfig
Expand Down
16 changes: 7 additions & 9 deletions toucan_connectors/http_api/http_api_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def perform_requests(self, data_source: HttpAPIDataSource, session: "Session") -
pagination_config = pagination_config.get_next_pagination_config(
result=parsed_result, pagination_info=parsed_pagination_info
)
results += parsed_result
results.append(parsed_result)
return results

def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame":
Expand All @@ -177,12 +177,11 @@ def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame":
session = Session()
# Try retrieve dataset
try:
results = pd.DataFrame(
self.perform_requests(
data_source=data_source,
session=session,
)
results = self.perform_requests(
data_source=data_source,
session=session,
)
dfs = [pd.DataFrame(result) for result in results]
except HTTPError as exc:
if exc.response.status_code == TOO_MANY_REQUESTS:
raise HttpAPIConnectorError(
Expand All @@ -192,10 +191,9 @@ def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame":
) from exc
else:
raise

if data_source.flatten_column:
return json_to_table(results, columns=[data_source.flatten_column])
return results
dfs = [json_to_table(df, columns=[data_source.flatten_column]) for df in dfs]
return pd.concat(dfs)

def _render_query(self, data_source):
query = nosql_apply_parameters_to_query(
Expand Down
11 changes: 9 additions & 2 deletions toucan_connectors/http_api/pagination_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ class OffsetLimitPaginationConfig(PaginationConfig):
offset: int = Field(0, **UI_HIDDEN)
limit_name: str = "limit"
limit: int
data_filter: str = Field(
".",
description=(
"Filter to access the received data. Allows to compare its length to the limit value. "
"It must point to a list of results. " + FilterSchemaDescription
),
)

def plan_pagination_updates_to_data_source(self, request_params: dict[str, Any] | None) -> dict[str, Any]:
offset_limit_params = {self.offset_name: self.offset, self.limit_name: self.limit}
Expand All @@ -68,7 +75,7 @@ def plan_pagination_updates_to_data_source(self, request_params: dict[str, Any]
def get_next_pagination_config(
self, result: Any, pagination_info: Any | None
) -> Optional["OffsetLimitPaginationConfig"]:
if len(result) < self.limit:
if len(pagination_info) < self.limit:
return None
else:
return self.model_copy(update={"offset": self.offset + self.limit})
Expand All @@ -77,7 +84,7 @@ def get_error_status_whitelist(self) -> list[str] | None:
return None

def get_pagination_info_filter(self) -> str | None:
return None
return self.data_filter


class PageBasedPaginationConfig(PaginationConfig):
Expand Down

0 comments on commit 76dff20

Please sign in to comment.