Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support AnyResponse #161

Merged
merged 25 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
809e179
support HttpOrBrowserResponse
BurnzZ Jan 16, 2024
2f5c69d
use new AnyResponse instead of HttpOrBrowserResponse
BurnzZ Jan 16, 2024
b4a79b0
use zapi response contents to determine which response to build
BurnzZ Jan 16, 2024
5c83d22
add provider tests for AnyResponse
BurnzZ Jan 17, 2024
202fa1b
fix mypy
BurnzZ Jan 17, 2024
5f1d104
add more test cases
BurnzZ Jan 17, 2024
3cb2290
use new weak_ref in scrapy_poet's Injector to handle more cases
BurnzZ Jan 18, 2024
85f355d
BrowserResponse takes precedence in AnyResponse
BurnzZ Jan 19, 2024
297eb57
add more test cases
BurnzZ Jan 19, 2024
1e19ef3
docs for AnyResponse fulfillment
BurnzZ Jan 19, 2024
137d146
AnyResponse would request HttpResponse if BrowserResponse/HttpRespons…
BurnzZ Jan 19, 2024
9322622
improve tests
BurnzZ Jan 22, 2024
ffa345f
small comments and improvements
BurnzZ Jan 24, 2024
7043c55
add test for item return
BurnzZ Jan 26, 2024
3fec1b8
handle empty itemOptions
BurnzZ Jan 31, 2024
b341976
avoid using ParamParser
BurnzZ Jan 31, 2024
60f5c2b
Merge branch 'main' of ssh://github.com/scrapy-plugins/scrapy-zyte-ap…
BurnzZ Jan 31, 2024
2c5e506
use web-poet>=0.16.0
BurnzZ Jan 31, 2024
6c1e043
temporarily use scrapy-poet master branch
BurnzZ Jan 31, 2024
571ee63
Merge branch 'main' of ssh://github.com/scrapy-plugins/scrapy-zyte-ap…
BurnzZ Feb 2, 2024
a51f961
use browserHtml if no extraction source is provided with item types
BurnzZ Feb 5, 2024
9da17b4
remove cache updates since Injector is already doing it
BurnzZ Feb 7, 2024
d389893
Update tests on instance identity
BurnzZ Feb 8, 2024
df32f14
remove duplicate test cases for AnyResponse
BurnzZ Feb 8, 2024
382dced
use scrapy-poet 0.21.0
BurnzZ Feb 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/reference/inputs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ Inputs

- :class:`web_poet.BrowserResponse`

- :class:`web_poet.AnyResponse`

This reuses either a :class:`web_poet.BrowserResponse` *(which takes priority)*
or a :class:`web_poet.HttpResponse`, whichever is already available.

If neither is available, it uses a :class:`web_poet.HttpResponse`
requested from Zyte API. However, if other item inputs (e.g.
:class:`zyte_common_items.Product`) are also requested, it requests a
:class:`web_poet.BrowserResponse` from Zyte API instead, unless an extraction
source is provided, in which case the response matching that source is used.

- :class:`zyte_common_items.Article`

- :class:`zyte_common_items.ArticleList`
Expand Down
103 changes: 86 additions & 17 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from typing import Any, Callable, Dict, List, Sequence, Set
from weakref import WeakKeyDictionary

from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.utils.defer import maybe_deferred_to_future
from scrapy_poet import AnnotatedResult, PageObjectInputProvider
from web_poet import BrowserHtml, BrowserResponse
from web_poet import (
AnyResponse,
BrowserHtml,
BrowserResponse,
HttpResponse,
HttpResponseHeaders,
)
from zyte_common_items import (
Article,
ArticleList,
Expand Down Expand Up @@ -40,34 +45,43 @@ class ZyteApiProvider(PageObjectInputProvider):
Article,
ArticleList,
ArticleNavigation,
AnyResponse,
JobPosting,
Geolocation,
}

def __init__(self, injector):
super().__init__(injector)
self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()

def is_provided(self, type_: Callable) -> bool:
return super().is_provided(strip_annotated(type_))

def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
if request not in self._cached_instances:
self._cached_instances[request] = {}
self._cached_instances[request].update(mapping)
if request not in self.injector.weak_cache:
self.injector.weak_cache[request] = {}
self.injector.weak_cache[request].update(mapping)

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
"""Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
# TODO what if ``response`` is already from Zyte API and contains something we need
results: List[Any] = []

http_response = None
for cls in list(to_provide):
item = self._cached_instances.get(request, {}).get(cls)
item = self.injector.weak_cache.get(request, {}).get(cls)
if item:
results.append(item)
to_provide.remove(cls)

# BrowserResponse takes precedence over HttpResponse
elif cls == AnyResponse and BrowserResponse not in to_provide:
http_response = self.injector.weak_cache.get(request, {}).get(
HttpResponse
)
if http_response:
any_response = AnyResponse(response=http_response)
results.append(any_response)
self.update_cache(request, {AnyResponse: any_response})
to_provide.remove(cls)

if not to_provide:
return results

Expand All @@ -83,11 +97,10 @@ async def __call__( # noqa: C901
}

zyte_api_meta = crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS")
if html_requested:
zyte_api_meta["browserHtml"] = True

to_provide_stripped: Set[type] = set()
extract_from_seen: Dict[str, str] = {}
item_requested: bool = False

for cls in to_provide:
cls_stripped = strip_annotated(cls)
Expand All @@ -100,6 +113,7 @@ async def __call__( # noqa: C901
kw = item_keywords.get(cls_stripped)
if not kw:
continue
item_requested = True
to_provide_stripped.add(cls_stripped)
zyte_api_meta[kw] = True
if not is_typing_annotated(cls):
Expand All @@ -118,10 +132,32 @@ async def __call__( # noqa: C901
options["extractFrom"] = extract_from.value
break

http_response_needed = (
AnyResponse in to_provide
and BrowserResponse not in to_provide
and BrowserHtml not in to_provide
and not http_response
)

extract_from = None # type: ignore[assignment]
for item_type, kw in item_keywords.items():
options_name = f"{kw}Options"
if item_type not in to_provide_stripped and options_name in zyte_api_meta:
del zyte_api_meta[options_name]
elif zyte_api_meta.get(options_name, {}).get("extractFrom"):
extract_from = zyte_api_meta[options_name]["extractFrom"]

if AnyResponse in to_provide:
if (
item_requested and extract_from != "httpResponseBody"
) or extract_from == "browserHtml":
html_requested = True
elif extract_from == "httpResponseBody" or http_response_needed:
zyte_api_meta["httpResponseBody"] = True
zyte_api_meta["httpResponseHeaders"] = True

if html_requested:
zyte_api_meta["browserHtml"] = True

api_request = Request(
url=request.url,
Expand All @@ -143,14 +179,47 @@ async def __call__( # noqa: C901
if BrowserHtml in to_provide:
results.append(html)
self.update_cache(request, {BrowserHtml: html})

browser_response = None
if BrowserResponse in to_provide:
response = BrowserResponse(
browser_response = BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
results.append(response)
self.update_cache(request, {BrowserResponse: response})
results.append(browser_response)
self.update_cache(request, {BrowserResponse: browser_response})

if AnyResponse in to_provide:
any_response = None # type: ignore[assignment]

if "browserHtml" in api_response.raw_api_response:
any_response = AnyResponse(
response=browser_response
or BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
)
elif (
"httpResponseBody" in api_response.raw_api_response
and "httpResponseHeaders" in api_response.raw_api_response
):
any_response = AnyResponse(
response=HttpResponse(
url=api_response.url,
body=api_response.body,
status=api_response.status,
headers=HttpResponseHeaders.from_bytes_dict(
api_response.headers
),
)
)

if any_response:
results.append(any_response)
self.update_cache(request, {AnyResponse: any_response})

for cls in to_provide:
cls_stripped = strip_annotated(cls)
Expand All @@ -163,7 +232,7 @@ async def __call__( # noqa: C901
if not kw:
continue
assert issubclass(cls_stripped, Item)
item = cls_stripped.from_dict(api_response.raw_api_response[kw])
item = cls_stripped.from_dict(api_response.raw_api_response[kw]) # type: ignore[attr-defined]
if is_typing_annotated(cls):
item = AnnotatedResult(item, cls.__metadata__) # type: ignore[attr-defined]
results.append(item)
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def get_version():
# Sync with [testenv:pinned-provider] @ tox.ini
"provider": [
"andi>=0.6.0",
"scrapy-poet>=0.20.1",
"web-poet>=0.15.1",
# "scrapy-poet>=0.19.0",
"scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@master#egg=scrapy-poet",
"web-poet>=0.16.0",
"zyte-common-items>=0.8.0",
]
},
Expand Down
Loading
Loading