use new weak_cache in scrapy-poet's Injector to handle more cases
BurnzZ committed Jan 18, 2024
1 parent 5f1d104 commit fa80f6a
Showing 4 changed files with 35 additions and 17 deletions.
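For context, this commit replaces the provider-local WeakKeyDictionary with scrapy-poet's new injector-level weak_cache, so instances cached for a request can be shared across providers while still being discarded automatically once the request itself is gone. A minimal sketch of the weak-keyed caching behaviour this relies on (the Request class below is a stand-in; Scrapy's Request is likewise weak-referenceable):

import gc
from weakref import WeakKeyDictionary


class Request:
    """Stand-in for scrapy.Request, which also supports weak references."""

    def __init__(self, url: str):
        self.url = url


# Cache keyed weakly by request: entries live only as long as the request.
weak_cache: "WeakKeyDictionary[Request, dict]" = WeakKeyDictionary()

request = Request("https://example.com")
weak_cache[request] = {"HttpResponse": "...cached instance..."}
assert len(weak_cache) == 1

del request  # drop the last strong reference to the request
gc.collect()  # CPython frees it immediately; collect() helps other runtimes
assert len(weak_cache) == 0  # the cache entry vanished with the request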
scrapy_zyte_api/providers.py (22 changes: 12 additions & 10 deletions)
@@ -1,5 +1,4 @@
 from typing import Any, Callable, Dict, List, Sequence, Set
-from weakref import WeakKeyDictionary

 from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
@@ -51,30 +50,33 @@ class ZyteApiProvider(PageObjectInputProvider):
         JobPosting,
     }

-    def __init__(self, injector):
-        super().__init__(injector)
-        self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()

     def is_provided(self, type_: Callable) -> bool:
         return super().is_provided(strip_annotated(type_))

     def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
-        if request not in self._cached_instances:
-            self._cached_instances[request] = {}
-        self._cached_instances[request].update(mapping)
+        if request not in self.injector.weak_cache:
+            self.injector.weak_cache[request] = {}
+        self.injector.weak_cache[request].update(mapping)

     async def __call__(  # noqa: C901
         self, to_provide: Set[Callable], request: Request, crawler: Crawler
     ) -> Sequence[Any]:
         """Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
         # TODO what if ``response`` is already from Zyte API and contains something we need
         results: List[Any] = []

         for cls in list(to_provide):
-            item = self._cached_instances.get(request, {}).get(cls)
+            item = self.injector.weak_cache.get(request, {}).get(cls)
             if item:
                 results.append(item)
                 to_provide.remove(cls)
+            elif cls == AnyResponse:
+                http_response = self.injector.weak_cache.get(request, {}).get(HttpResponse)
+                if http_response:
+                    any_response = AnyResponse(response=http_response)
+                    results.append(any_response)
+                    self.update_cache(request, {AnyResponse: any_response})
+                    to_provide.remove(cls)

         if not to_provide:
             return results

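To make the new flow concrete, here is a small self-contained sketch (stand-in classes only, not the real scrapy-poet or Scrapy API) of how one provider can satisfy AnyResponse from an HttpResponse that a different provider already stored in the shared injector.weak_cache:

from weakref import WeakKeyDictionary


class Request:
    def __init__(self, url: str):
        self.url = url


class HttpResponse:
    pass


class AnyResponse:
    def __init__(self, response):
        self.response = response


class Injector:
    """Stand-in for scrapy-poet's Injector: one weak cache shared by all providers."""

    def __init__(self):
        self.weak_cache: "WeakKeyDictionary[Request, dict]" = WeakKeyDictionary()


injector = Injector()
request = Request("https://example.com")

# An earlier provider (e.g. HttpResponseProvider) cached its result first:
injector.weak_cache.setdefault(request, {})[HttpResponse] = HttpResponse()

# A ZyteApiProvider-style lookup can now build AnyResponse from the cached
# HttpResponse instead of scheduling a fresh Zyte API request:
http_response = injector.weak_cache.get(request, {}).get(HttpResponse)
if http_response:
    any_response = AnyResponse(response=http_response)
    injector.weak_cache[request][AnyResponse] = any_response

assert isinstance(injector.weak_cache[request][AnyResponse].response, HttpResponse)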
setup.py (3 changes: 2 additions & 1 deletion)
@@ -31,7 +31,8 @@ def get_version():
         # Sync with [testenv:pinned-provider] @ tox.ini
         "provider": [
             "andi>=0.6.0",
-            "scrapy-poet>=0.19.0",
+            # "scrapy-poet>=0.19.0",
+            "scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
             # "web-poet>=0.15.1",
             "web-poet @ git+https://[email protected]/scrapinghub/web-poet@response#egg=web-poet",
             "zyte-common-items>=0.8.0",
tests/test_providers.py (24 changes: 19 additions & 5 deletions)
@@ -551,6 +551,10 @@ def parse_(self, response: DummyResponse, page: SomePage):
     assert type(item["page"].product) == Product


+# The issue here is that HttpResponseProvider runs earlier than ScrapyZyteAPI.
+# HttpResponseProvider doesn't know that it should not run, since ScrapyZyteAPI
+# could provide HttpResponse in any case.
+@pytest.mark.xfail(reason="Not supported yet", raises=AssertionError, strict=True)
 @ensureDeferred
 async def test_provider_any_response_product_extract_from_http_response_2(mockserver):
     @attrs.define
@@ -688,10 +692,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
     params = crawler.engine.downloader.handlers._handlers["http"].params

     assert len(params) == 1
-    assert params[0].keys() == {"url", "HttpResponseBody"}
+    assert params[0].keys() == {
+        "url",
+        "httpResponseBody",
+        "customHttpRequestHeaders",
+        "httpResponseHeaders",
+    }

     assert type(item["page"].response) == AnyResponse
-    assert type(item["page"].response.response) == BrowserResponse
+    assert type(item["page"].response.response) == HttpResponse
     assert type(item["page"].http_response) == HttpResponse


@@ -715,10 +724,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
     params = crawler.engine.downloader.handlers._handlers["http"].params

     assert len(params) == 2
-    assert params[0].keys() == {"url", "HttpResponseBody"}
-    assert params[1].keys() == {"url", "BrowserHtml"}
+    assert params[0].keys() == {
+        "url",
+        "httpResponseBody",
+        "customHttpRequestHeaders",
+        "httpResponseHeaders",
+    }
+    assert params[1].keys() == {"url", "browserHtml"}

     assert type(item["page"].response) == AnyResponse
-    assert type(item["page"].response.response) == BrowserResponse
+    assert type(item["page"].response.response) == HttpResponse
     assert type(item["page"].browser_response) == BrowserResponse
     assert type(item["page"].http_response) == HttpResponse
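The strict xfail added in the first hunk above documents a known gap (provider ordering) instead of hiding it. A minimal illustration of the pattern, as a generic example unrelated to this codebase:

import pytest


# Expected to fail while the behaviour is unsupported; strict=True turns an
# unexpected pass into a suite failure, so the marker must be removed as soon
# as the feature actually lands.
@pytest.mark.xfail(reason="Not supported yet", raises=AssertionError, strict=True)
def test_known_gap():
    assert False  # stands in for the not-yet-supported assertion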
tox.ini (3 changes: 2 additions & 1 deletion)
@@ -89,7 +89,8 @@ deps =
     # scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0
     {[testenv:pinned-scrapy-2x6]deps}
     andi==0.6.0
-    scrapy-poet==0.19.0
+    #scrapy-poet==0.19.0
+    scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet
     #web-poet==0.15.1
     web-poet @ git+https://[email protected]/scrapinghub/web-poet@response#egg=web-poet
     zyte-common-items==0.8.0
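Both setup.py and tox.ini point scrapy-poet at the same weak-cache branch that introduces Injector.weak_cache, mirroring the existing web-poet git pin; the commented-out version pins suggest these are temporary until released versions of both libraries ship the required features.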
