Skip to content

Commit

Permalink
fix: merge api results after their transformations (TCTC-9634) (#1807)
Browse files Browse the repository at this point in the history
* fix: merge api results after their transformations

* fix: use `pd.concat` in pandas v2

* fix: use pd.concat to concat dataframes
  • Loading branch information
julien-pinchelimouroux authored Nov 6, 2024
1 parent 70f9769 commit 76dff20
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 11 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

## Unreleased

### Fix

- HTTP API: API results are now correctly merged even if they need to be filtered or flattened.

### Added

- HTTP API: Add `data_filter` offset pagination config field to determine which part of data must be used to compute the data length.

## [7.1.1] 2024-10-28

### Fix
Expand Down
66 changes: 66 additions & 0 deletions tests/http_api/test_http_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,72 @@ def test_get_df_with_offset_pagination(
assert len(responses.calls) == 3


@responses.activate
def test_get_df_with_offset_pagination_and_flatten_option(
    connector: HttpAPIConnector, data_source: HttpAPIDataSource, offset_pagination: OffsetLimitPaginationConfig
) -> None:
    """Paginated results must be filtered/flattened per page, then concatenated.

    Each mocked page nests its rows under ``items.products``; with
    ``filter=".items"`` and ``flatten_column="products"`` the connector must
    still yield all 12 product rows across the 3 pages.
    """
    # (offset, product_category, product names) for each mocked page.
    page_specs = [
        (0, "sofa", ["p1", "p2", "p3", "p4", "p5"]),
        (5, "kitchen", ["p6", "p7", "p8", "p9", "p10"]),
        (10, "bedroom", ["p11", "p12"]),
    ]
    for offset, category, names in page_specs:
        responses.add(
            responses.GET,
            f"https://jsonplaceholder.typicode.com/comments?super_offset={offset}&super_limit=5",
            json={
                "totalItems": 12,
                "items": {
                    "product_category": category,
                    "products": [{"name": name, "price": 1} for name in names],
                },
            },
        )

    # Point the pagination length check at the nested list of results.
    offset_pagination.data_filter = ".items.products"
    data_source.filter = ".items"
    data_source.flatten_column = "products"
    data_source.http_pagination_config = offset_pagination

    df = connector.get_df(data_source)

    # 12 product rows; 4 columns after flattening. One HTTP call per page.
    assert df.shape == (12, 4)
    assert len(responses.calls) == 3


@responses.activate
def test_get_df_with_page_pagination(
connector: HttpAPIConnector, data_source: HttpAPIDataSource, page_pagination: PageBasedPaginationConfig
Expand Down
16 changes: 7 additions & 9 deletions toucan_connectors/http_api/http_api_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def perform_requests(self, data_source: HttpAPIDataSource, session: "Session") -
pagination_config = pagination_config.get_next_pagination_config(
result=parsed_result, pagination_info=parsed_pagination_info
)
results += parsed_result
results.append(parsed_result)
return results

def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame":
Expand All @@ -177,12 +177,11 @@ def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame":
session = Session()
# Try retrieve dataset
try:
results = pd.DataFrame(
self.perform_requests(
data_source=data_source,
session=session,
)
results = self.perform_requests(
data_source=data_source,
session=session,
)
dfs = [pd.DataFrame(result) for result in results]
except HTTPError as exc:
if exc.response.status_code == TOO_MANY_REQUESTS:
raise HttpAPIConnectorError(
Expand All @@ -192,10 +191,9 @@ def _retrieve_data(self, data_source: HttpAPIDataSource) -> "pd.DataFrame":
) from exc
else:
raise

if data_source.flatten_column:
return json_to_table(results, columns=[data_source.flatten_column])
return results
dfs = [json_to_table(df, columns=[data_source.flatten_column]) for df in dfs]
return pd.concat(dfs)

def _render_query(self, data_source):
query = nosql_apply_parameters_to_query(
Expand Down
11 changes: 9 additions & 2 deletions toucan_connectors/http_api/pagination_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ class OffsetLimitPaginationConfig(PaginationConfig):
offset: int = Field(0, **UI_HIDDEN)
limit_name: str = "limit"
limit: int
data_filter: str = Field(
".",
description=(
"Filter to access the received data. Allows to compare its length to the limit value. "
"It must point to a list of results. " + FilterSchemaDescription
),
)

def plan_pagination_updates_to_data_source(self, request_params: dict[str, Any] | None) -> dict[str, Any]:
offset_limit_params = {self.offset_name: self.offset, self.limit_name: self.limit}
Expand All @@ -68,7 +75,7 @@ def plan_pagination_updates_to_data_source(self, request_params: dict[str, Any]
def get_next_pagination_config(
self, result: Any, pagination_info: Any | None
) -> Optional["OffsetLimitPaginationConfig"]:
if len(result) < self.limit:
if len(pagination_info) < self.limit:
return None
else:
return self.model_copy(update={"offset": self.offset + self.limit})
Expand All @@ -77,7 +84,7 @@ def get_error_status_whitelist(self) -> list[str] | None:
return None

def get_pagination_info_filter(self) -> str | None:
return None
return self.data_filter


class PageBasedPaginationConfig(PaginationConfig):
Expand Down

0 comments on commit 76dff20

Please sign in to comment.