From 809e179b650490c745387dae0fe4b0229199fb41 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 16 Jan 2024 16:57:28 +0800 Subject: [PATCH 01/23] support HttpOrBrowserResponse --- scrapy_zyte_api/providers.py | 52 +++++++++++++++++++++++++++++++++--- setup.py | 3 ++- tox.ini | 3 ++- 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 7fa8eb5e..0361da5e 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -6,7 +6,13 @@ from scrapy.crawler import Crawler from scrapy.utils.defer import maybe_deferred_to_future from scrapy_poet import AnnotatedResult, PageObjectInputProvider -from web_poet import BrowserHtml, BrowserResponse +from web_poet import ( + BrowserHtml, + BrowserResponse, + HttpOrBrowserResponse, + HttpResponse, + HttpResponseHeaders, +) from zyte_common_items import ( Article, ArticleList, @@ -19,6 +25,7 @@ ) from scrapy_zyte_api._annotations import ExtractFrom +from scrapy_zyte_api._params import _ParamParser from scrapy_zyte_api.responses import ZyteAPITextResponse try: @@ -40,6 +47,7 @@ class ZyteApiProvider(PageObjectInputProvider): Article, ArticleList, ArticleNavigation, + HttpOrBrowserResponse, JobPosting, } @@ -82,8 +90,6 @@ async def __call__( # noqa: C901 } zyte_api_meta = crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS") - if html_requested: - zyte_api_meta["browserHtml"] = True to_provide_stripped: Set[type] = set() extract_from_seen: Dict[str, str] = {} @@ -116,6 +122,20 @@ async def __call__( # noqa: C901 options_name = f"{kw}Options" if item_type not in to_provide_stripped and options_name in zyte_api_meta: del zyte_api_meta[options_name] + elif options_name in zyte_api_meta: + extract_from = zyte_api_meta[options_name]["extractFrom"] + + if HttpOrBrowserResponse in to_provide: + if extract_from == "browserHtml": + html_requested = True + elif extract_from == "httpResponseBody": + param_parser = _ParamParser(crawler) + params = param_parser.parse(request) + del params["url"] + zyte_api_meta.update(params) + + if html_requested: + zyte_api_meta["browserHtml"] = True api_request = Request( url=request.url, @@ -146,6 +166,32 @@ async def __call__( # noqa: C901 results.append(response) self.update_cache(request, {BrowserResponse: response}) + if HttpOrBrowserResponse in to_provide: + if extract_from == "browserHtml": + http_or_browser_response = HttpOrBrowserResponse( + response=BrowserResponse( + url=api_response.url, + status=api_response.status, + html=html, + ) + ) + elif extract_from == "httpResponseBody": + http_or_browser_response = HttpOrBrowserResponse( + response=HttpResponse( + url=api_response.url, + body=api_response.body, + status=api_response.status, + headers=HttpResponseHeaders.from_bytes_dict( + api_response.headers + ), + ) + ) + + results.append(http_or_browser_response) + self.update_cache( + request, {HttpOrBrowserResponse: http_or_browser_response} + ) + for cls in to_provide: cls_stripped = strip_annotated(cls) assert isinstance(cls_stripped, type) diff --git a/setup.py b/setup.py index 597e96fe..a5c15648 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ def get_version(): "provider": [ "andi>=0.6.0", "scrapy-poet>=0.19.0", - "web-poet>=0.15.1", + # "web-poet>=0.15.1", + "web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet", "zyte-common-items>=0.8.0", ] }, diff --git a/tox.ini b/tox.ini index d84752f0..3a4d38fe 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,8 @@ deps = 
{[testenv:pinned-scrapy-2x6]deps} andi==0.6.0 scrapy-poet==0.19.0 - web-poet==0.15.1 + #web-poet==0.15.1 + web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet zyte-common-items==0.8.0 [testenv:pinned-extra] From 2f5c69d7cb15bfc1729b3edaf591474d63ad1e89 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 16 Jan 2024 20:26:40 +0800 Subject: [PATCH 02/23] use new AnyResponse instead of HttpOrBrowserResponse --- scrapy_zyte_api/providers.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 0361da5e..5f6ce6a8 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -7,9 +7,9 @@ from scrapy.utils.defer import maybe_deferred_to_future from scrapy_poet import AnnotatedResult, PageObjectInputProvider from web_poet import ( + AnyResponse, BrowserHtml, BrowserResponse, - HttpOrBrowserResponse, HttpResponse, HttpResponseHeaders, ) @@ -47,7 +47,7 @@ class ZyteApiProvider(PageObjectInputProvider): Article, ArticleList, ArticleNavigation, - HttpOrBrowserResponse, + AnyResponse, JobPosting, } @@ -125,7 +125,7 @@ async def __call__( # noqa: C901 elif options_name in zyte_api_meta: extract_from = zyte_api_meta[options_name]["extractFrom"] - if HttpOrBrowserResponse in to_provide: + if AnyResponse in to_provide: if extract_from == "browserHtml": html_requested = True elif extract_from == "httpResponseBody": @@ -166,9 +166,9 @@ async def __call__( # noqa: C901 results.append(response) self.update_cache(request, {BrowserResponse: response}) - if HttpOrBrowserResponse in to_provide: + if AnyResponse in to_provide: if extract_from == "browserHtml": - http_or_browser_response = HttpOrBrowserResponse( + any_response = AnyResponse( response=BrowserResponse( url=api_response.url, status=api_response.status, @@ -176,7 +176,7 @@ async def __call__( # noqa: C901 ) ) elif extract_from == "httpResponseBody": - http_or_browser_response = HttpOrBrowserResponse( + any_response = AnyResponse( response=HttpResponse( url=api_response.url, body=api_response.body, @@ -187,10 +187,8 @@ async def __call__( # noqa: C901 ) ) - results.append(http_or_browser_response) - self.update_cache( - request, {HttpOrBrowserResponse: http_or_browser_response} - ) + results.append(any_response) + self.update_cache(request, {AnyResponse: any_response}) for cls in to_provide: cls_stripped = strip_annotated(cls) From b4a79b0469d3c6b5bf7566a3db962f335cd0389f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 16 Jan 2024 20:44:40 +0800 Subject: [PATCH 03/23] use zapi response contents to determine which response to build --- scrapy_zyte_api/providers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 5f6ce6a8..bfeb67ab 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -167,7 +167,7 @@ async def __call__( # noqa: C901 self.update_cache(request, {BrowserResponse: response}) if AnyResponse in to_provide: - if extract_from == "browserHtml": + if "browserHtml" in api_response.raw_api_response: any_response = AnyResponse( response=BrowserResponse( url=api_response.url, @@ -175,7 +175,10 @@ async def __call__( # noqa: C901 html=html, ) ) - elif extract_from == "httpResponseBody": + elif ( + "httpResponseBody" in api_response.raw_api_response + and "httpResponseHeaders" in api_response.raw_api_response + ): any_response = AnyResponse( response=HttpResponse( 
url=api_response.url, From 5c83d22cd57a748e15192431220c513dfa49c480 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 17 Jan 2024 20:34:29 +0800 Subject: [PATCH 04/23] add provider tests for AnyResponse --- scrapy_zyte_api/providers.py | 27 +++++--- tests/test_providers.py | 124 ++++++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 13 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index bfeb67ab..19371666 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -118,6 +118,7 @@ async def __call__( # noqa: C901 options["extractFrom"] = extract_from.value break + extract_from = None # type: ignore[assignment] for item_type, kw in item_keywords.items(): options_name = f"{kw}Options" if item_type not in to_provide_stripped and options_name in zyte_api_meta: @@ -130,9 +131,9 @@ async def __call__( # noqa: C901 html_requested = True elif extract_from == "httpResponseBody": param_parser = _ParamParser(crawler) - params = param_parser.parse(request) - del params["url"] - zyte_api_meta.update(params) + http_request_params = param_parser.parse(request) + del http_request_params["url"] + zyte_api_meta.update(http_request_params) if html_requested: zyte_api_meta["browserHtml"] = True @@ -157,19 +158,24 @@ async def __call__( # noqa: C901 if BrowserHtml in to_provide: results.append(html) self.update_cache(request, {BrowserHtml: html}) + + browser_response = None if BrowserResponse in to_provide: - response = BrowserResponse( + browser_response = BrowserResponse( url=api_response.url, status=api_response.status, html=html, ) - results.append(response) - self.update_cache(request, {BrowserResponse: response}) + results.append(browser_response) + self.update_cache(request, {BrowserResponse: browser_response}) if AnyResponse in to_provide: + any_response = None + if "browserHtml" in api_response.raw_api_response: any_response = AnyResponse( - response=BrowserResponse( + response=browser_response + or BrowserResponse( url=api_response.url, status=api_response.status, html=html, @@ -190,8 +196,9 @@ async def __call__( # noqa: C901 ) ) - results.append(any_response) - self.update_cache(request, {AnyResponse: any_response}) + if any_response: + results.append(any_response) + self.update_cache(request, {AnyResponse: any_response}) for cls in to_provide: cls_stripped = strip_annotated(cls) @@ -200,7 +207,7 @@ async def __call__( # noqa: C901 if not kw: continue assert issubclass(cls_stripped, Item) - item = cls_stripped.from_dict(api_response.raw_api_response[kw]) + item = cls_stripped.from_dict(api_response.raw_api_response[kw]) # type: ignore[attr-defined] if is_typing_annotated(cls): item = AnnotatedResult(item, cls.__metadata__) # type: ignore[attr-defined] results.append(item) diff --git a/tests/test_providers.py b/tests/test_providers.py index 75c523e0..89a2abf3 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -4,21 +4,32 @@ pytest.importorskip("scrapy_poet") +import asyncio + import attrs from pytest_twisted import ensureDeferred from scrapy import Request, Spider from scrapy_poet import DummyResponse +from scrapy_poet.injection import Injector from scrapy_poet.utils.testing import HtmlResource, crawl_single_item from scrapy_poet.utils.testing import create_scrapy_settings as _create_scrapy_settings -from twisted.internet import reactor +from twisted.internet import defer, reactor from twisted.web.client import Agent, readBody -from web_poet import BrowserHtml, BrowserResponse, ItemPage, field, 
handle_urls +from web_poet import ( + AnyResponse, + BrowserHtml, + BrowserResponse, + HttpResponse, + ItemPage, + field, + handle_urls, +) from zyte_common_items import BasePage, Product from scrapy_zyte_api._annotations import ExtractFrom from scrapy_zyte_api.providers import ZyteApiProvider -from . import SETTINGS +from . import SETTINGS, get_crawler from .mockserver import get_ephemeral_port @@ -263,3 +274,110 @@ def parse_(self, response: DummyResponse, page: AnnotatedProductPage): # type: item, _, _ = await crawl_single_item(AnnotatedZyteAPISpider, HtmlResource, settings) assert item is None assert "Multiple different extractFrom specified for product" in caplog.text + + +@defer.inlineCallbacks +def run_provider(server, to_provide, settings_dict=None, request_meta=None): + class AnyResponseSpider(Spider): + name = "any_response" + + request = Request(server.urljoin("/some-page"), meta=request_meta) + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = server.urljoin("/") + if settings_dict: + settings.update(settings_dict) + crawler = get_crawler(settings, AnyResponseSpider) + yield from crawler.engine.open_spider(crawler.spider) + injector = Injector(crawler) + provider = ZyteApiProvider(injector) + + coro = provider(to_provide, request, crawler) + results = yield defer.Deferred.fromFuture(asyncio.ensure_future(coro)) + + return results + + +@defer.inlineCallbacks +def test_provider_any_response(mockserver): + # Use only one instance of the mockserver for faster tests. + def provide(*args, **kwargs): + return run_provider(mockserver, *args, **kwargs) + + results = yield provide(set()) + assert results == [] + + # Having only AnyResponse without any of the other responses to re-use + # does not result in anything. + results = yield provide( + { + AnyResponse, + } + ) + assert results == [] + + # Same case as above, since no response is available. + results = yield provide({AnyResponse, Product}) + assert len(results) == 1 + assert type(results[0]) == Product + + # AnyResponse should re-use BrowserResponse if available. + results = yield provide({AnyResponse, BrowserResponse}) + assert len(results) == 2 + assert type(results[0]) == BrowserResponse + assert type(results[1]) == AnyResponse + assert id(results[0]) == id(results[1].response) + + # AnyResponse should re-use BrowserHtml if available. + results = yield provide({AnyResponse, BrowserHtml}) + assert len(results) == 2 + assert type(results[0]) == BrowserHtml + assert type(results[1]) == AnyResponse + assert results[0] == results[1].response.html # diff instance due to casting + + results = yield provide({AnyResponse, BrowserResponse, BrowserHtml}) + assert len(results) == 3 + assert type(results[0]) == BrowserHtml + assert type(results[1]) == BrowserResponse + assert type(results[2]) == AnyResponse + assert results[0] == results[1].html # diff instance due to casting + assert results[0] == results[2].response.html + + # NOTES: This is hard to test in this setup and would result in being empty. + # This will be tested in a spider-setup instead so that HttpResponseProvider + # can participate. + # results = yield provide({AnyResponse, HttpResponse}) + # assert results == [] + + # For the following cases, extraction source isn't available in `to_provided` + # but are in the `*.extractFrom` parameter. 
+ + settings_dict = { + "ZYTE_API_PROVIDER_PARAMS": {"productOptions": {"extractFrom": "browserHtml"}} + } + + results = yield provide({AnyResponse, Product}, settings_dict) + assert len(results) == 2 + assert type(results[0]) == AnyResponse + assert type(results[0].response) == BrowserResponse + assert type(results[1]) == Product + + results = yield provide({AnyResponse, BrowserHtml, Product}, settings_dict) + assert len(results) == 3 + assert type(results[0]) == BrowserHtml + assert type(results[1]) == AnyResponse + assert type(results[1].response) == BrowserResponse + assert type(results[2]) == Product + assert results[0] == results[1].response.html # diff instance due to casting + + settings_dict = { + "ZYTE_API_PROVIDER_PARAMS": { + "productOptions": {"extractFrom": "httpResponseBody"} + } + } + request_meta = {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}} + + results = yield provide({AnyResponse, Product}, settings_dict, request_meta) + assert len(results) == 2 + assert type(results[0]) == AnyResponse + assert type(results[0].response) == HttpResponse + assert type(results[1]) == Product From 202fa1bfda65668f4d36c39bf9ec402e3f61cfb8 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 17 Jan 2024 20:38:26 +0800 Subject: [PATCH 05/23] fix mypy --- tests/test_providers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index 89a2abf3..4d8da175 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -332,15 +332,16 @@ def provide(*args, **kwargs): assert len(results) == 2 assert type(results[0]) == BrowserHtml assert type(results[1]) == AnyResponse - assert results[0] == results[1].response.html # diff instance due to casting + # diff instance due to casting + assert results[0] == results[1].response.html # type: ignore[union-attr] results = yield provide({AnyResponse, BrowserResponse, BrowserHtml}) assert len(results) == 3 assert type(results[0]) == BrowserHtml assert type(results[1]) == BrowserResponse assert type(results[2]) == AnyResponse - assert results[0] == results[1].html # diff instance due to casting - assert results[0] == results[2].response.html + assert results[0] == results[1].html + assert results[0] == results[2].response.html # type: ignore[union-attr] # NOTES: This is hard to test in this setup and would result in being empty. # This will be tested in a spider-setup instead so that HttpResponseProvider From 5f1d1045ee58b2f1221fc1eb75c075ee28e2aee0 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 17 Jan 2024 21:52:14 +0800 Subject: [PATCH 06/23] add more test cases --- tests/test_providers.py | 340 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index 4d8da175..9eb4e1bc 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -27,6 +27,7 @@ from zyte_common_items import BasePage, Product from scrapy_zyte_api._annotations import ExtractFrom +from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler from scrapy_zyte_api.providers import ZyteApiProvider from . 
import SETTINGS, get_crawler @@ -382,3 +383,342 @@ def provide(*args, **kwargs): assert type(results[0]) == AnyResponse assert type(results[0].response) == HttpResponse assert type(results[1]) == Product + + +class RecordingHandler(ScrapyZyteAPIDownloadHandler): + """Subclasses the original handler in order to record the Zyte API parameters + used for each downloading request, as well as counting the number of Zyte API + requests. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.params = [] + + def _log_request(self, params): + self.params.append(params) + + +def provider_settings(server): + settings = create_scrapy_settings() + settings["ZYTE_API_URL"] = server.urljoin("/") + settings["ZYTE_API_TRANSPARENT_MODE"] = True + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100} + settings["DOWNLOAD_HANDLERS"]["http"] = RecordingHandler + return settings + + +@ensureDeferred +async def test_provider_any_response_only(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url"} + assert item is None + + +@ensureDeferred +async def test_provider_any_response_product(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + product: Product + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url", "product"} + assert item is None + + +@ensureDeferred +async def test_provider_any_response_product_extract_from_browser_html(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + product: Product + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + settings["ZYTE_API_PROVIDER_PARAMS"] = { + "productOptions": {"extractFrom": "browserHtml"} + } + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url", "product", "browserHtml", "productOptions"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].product) == Product + + +@ensureDeferred +async def test_provider_any_response_product_extract_from_browser_html_2(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + browser_response: BrowserResponse + product: Product + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, 
page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + settings["ZYTE_API_PROVIDER_PARAMS"] = { + "productOptions": {"extractFrom": "browserHtml"} + } + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url", "product", "browserHtml", "productOptions"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].browser_response) == BrowserResponse + assert type(item["page"].product) == Product + + assert id(item["page"].browser_response) == id(item["page"].response.response) + + +@ensureDeferred +async def test_provider_any_response_product_extract_from_http_response(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + product: Product + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + settings["ZYTE_API_PROVIDER_PARAMS"] = { + "productOptions": {"extractFrom": "httpResponseBody"} + } + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == { + "url", + "product", + "httpResponseBody", + "productOptions", + "httpResponseHeaders", + "customHttpRequestHeaders", + } + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == HttpResponse + assert type(item["page"].product) == Product + + +@ensureDeferred +async def test_provider_any_response_product_extract_from_http_response_2(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + http_response: HttpResponse + product: Product + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + settings["ZYTE_API_PROVIDER_PARAMS"] = { + "productOptions": {"extractFrom": "httpResponseBody"} + } + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == { + "url", + "product", + "httpResponseBody", + "productOptions", + "httpResponseHeaders", + "customHttpRequestHeaders", + } + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == HttpResponse + assert type(item["page"].product) == Product + assert type(item["page"].http_response) == HttpResponse + + +@ensureDeferred +async def test_provider_any_response_browser_html(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + html: BrowserHtml + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() 
== {"url", "browserHtml"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].html) == BrowserHtml + + +@ensureDeferred +async def test_provider_any_response_browser_response(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + browser_response: BrowserResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url", "browserHtml"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].browser_response) == BrowserResponse + + +@ensureDeferred +async def test_provider_any_response_browser_html_response(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + browser_response: BrowserResponse + html: BrowserHtml + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url", "browserHtml"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].browser_response) == BrowserResponse + assert type(item["page"].html) == BrowserHtml + + +@ensureDeferred +async def test_provider_any_response_http_response(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + http_response: HttpResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == {"url", "HttpResponseBody"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].http_response) == HttpResponse + + +@ensureDeferred +async def test_provider_any_response_browser_http_response(mockserver): + @attrs.define + class SomePage(BasePage): + response: AnyResponse + browser_response: BrowserResponse + http_response: HttpResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage): + yield {"page": page} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 2 + assert params[0].keys() == {"url", "HttpResponseBody"} + assert params[1].keys() == 
{"url", "BrowserHtml"} + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].browser_response) == BrowserResponse + assert type(item["page"].http_response) == HttpResponse From 3cb22901c762aa2f0503707312a2ebabcefc42bc Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 18 Jan 2024 22:18:23 +0800 Subject: [PATCH 07/23] use new weak_ref in scrapy_poet's Injector to handle more cases --- scrapy_zyte_api/providers.py | 26 +++++++++++++++----------- setup.py | 3 ++- tests/test_providers.py | 24 +++++++++++++++++++----- tox.ini | 3 ++- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 19371666..fb04554f 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,5 +1,4 @@ from typing import Any, Callable, Dict, List, Sequence, Set -from weakref import WeakKeyDictionary from andi.typeutils import is_typing_annotated, strip_annotated from scrapy import Request @@ -51,30 +50,35 @@ class ZyteApiProvider(PageObjectInputProvider): JobPosting, } - def __init__(self, injector): - super().__init__(injector) - self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary() - def is_provided(self, type_: Callable) -> bool: return super().is_provided(strip_annotated(type_)) def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None: - if request not in self._cached_instances: - self._cached_instances[request] = {} - self._cached_instances[request].update(mapping) + if request not in self.injector.weak_cache: + self.injector.weak_cache[request] = {} + self.injector.weak_cache[request].update(mapping) async def __call__( # noqa: C901 self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: """Makes a Zyte API request to provide BrowserResponse and/or item dependencies.""" - # TODO what if ``response`` is already from Zyte API and contains something we need results: List[Any] = [] for cls in list(to_provide): - item = self._cached_instances.get(request, {}).get(cls) + item = self.injector.weak_cache.get(request, {}).get(cls) if item: results.append(item) to_provide.remove(cls) + elif cls == AnyResponse: + http_response = self.injector.weak_cache.get(request, {}).get( + HttpResponse + ) + if http_response: + any_response = AnyResponse(response=http_response) + results.append(any_response) + self.update_cache(request, {AnyResponse: any_response}) + to_provide.remove(cls) + if not to_provide: return results @@ -170,7 +174,7 @@ async def __call__( # noqa: C901 self.update_cache(request, {BrowserResponse: browser_response}) if AnyResponse in to_provide: - any_response = None + any_response = None # type: ignore[assignment] if "browserHtml" in api_response.raw_api_response: any_response = AnyResponse( diff --git a/setup.py b/setup.py index a5c15648..f79c380a 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,8 @@ def get_version(): # Sync with [testenv:pinned-provider] @ tox.ini "provider": [ "andi>=0.6.0", - "scrapy-poet>=0.19.0", + # "scrapy-poet>=0.19.0", + "scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet", # "web-poet>=0.15.1", "web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet", "zyte-common-items>=0.8.0", diff --git a/tests/test_providers.py b/tests/test_providers.py index 9eb4e1bc..b0fd0d6a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -551,6 +551,10 @@ 
def parse_(self, response: DummyResponse, page: SomePage): assert type(item["page"].product) == Product +# The issue here is that HttpResponseProvider runs earlier than ScrapyZyteAPI. +# HttpResponseProvider doesn't know that it should not run since ScrapyZyteAPI +# could provide HttpResponse in anycase. +@pytest.mark.xfail(reason="Not supported yet", raises=AssertionError, strict=True) @ensureDeferred async def test_provider_any_response_product_extract_from_http_response_2(mockserver): @attrs.define @@ -688,10 +692,15 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "HttpResponseBody"} + assert params[0].keys() == { + "url", + "httpResponseBody", + "customHttpRequestHeaders", + "httpResponseHeaders", + } assert type(item["page"].response) == AnyResponse - assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].response.response) == HttpResponse assert type(item["page"].http_response) == HttpResponse @@ -715,10 +724,15 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 2 - assert params[0].keys() == {"url", "HttpResponseBody"} - assert params[1].keys() == {"url", "BrowserHtml"} + assert params[0].keys() == { + "url", + "httpResponseBody", + "customHttpRequestHeaders", + "httpResponseHeaders", + } + assert params[1].keys() == {"url", "browserHtml"} assert type(item["page"].response) == AnyResponse - assert type(item["page"].response.response) == BrowserResponse + assert type(item["page"].response.response) == HttpResponse assert type(item["page"].browser_response) == BrowserResponse assert type(item["page"].http_response) == HttpResponse diff --git a/tox.ini b/tox.ini index 3a4d38fe..d0da523d 100644 --- a/tox.ini +++ b/tox.ini @@ -89,7 +89,8 @@ deps = # scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0 {[testenv:pinned-scrapy-2x6]deps} andi==0.6.0 - scrapy-poet==0.19.0 + #scrapy-poet==0.19.0 + scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet #web-poet==0.15.1 web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet zyte-common-items==0.8.0 From 85f355d3d91a6c4713746f12b9f8dbd96835405e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 19 Jan 2024 15:19:34 +0800 Subject: [PATCH 08/23] BrowserResponse takes precedence in AnyResponse --- scrapy_zyte_api/providers.py | 4 +++- tests/test_providers.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index fb04554f..269e1241 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -69,7 +69,9 @@ async def __call__( # noqa: C901 if item: results.append(item) to_provide.remove(cls) - elif cls == AnyResponse: + + # BrowserResponse takes precedence than HttpResponse + elif cls == AnyResponse and BrowserResponse not in to_provide: http_response = self.injector.weak_cache.get(request, {}).get( HttpResponse ) diff --git a/tests/test_providers.py b/tests/test_providers.py index b0fd0d6a..684ffc57 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -733,6 +733,6 @@ def parse_(self, response: DummyResponse, page: SomePage): assert params[1].keys() == {"url", "browserHtml"} assert type(item["page"].response) == AnyResponse - assert type(item["page"].response.response) == HttpResponse + 
assert type(item["page"].response.response) == BrowserResponse assert type(item["page"].browser_response) == BrowserResponse assert type(item["page"].http_response) == HttpResponse From 297eb570064ce6509811a2cdb802addcac706a72 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 19 Jan 2024 15:47:02 +0800 Subject: [PATCH 09/23] add more test cases --- tests/test_providers.py | 74 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index 684ffc57..f6169614 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -450,7 +450,7 @@ def parse_(self, response: DummyResponse, page: SomePage): assert len(params) == 1 assert params[0].keys() == {"url", "product"} - assert item is None + assert item is None # None represents as an error in crawl_single_item() @ensureDeferred @@ -736,3 +736,75 @@ def parse_(self, response: DummyResponse, page: SomePage): assert type(item["page"].response.response) == BrowserResponse assert type(item["page"].browser_response) == BrowserResponse assert type(item["page"].http_response) == HttpResponse + + +@ensureDeferred +async def test_provider_any_response_http_response_multiple_pages(mockserver): + @attrs.define + class FirstPage(BasePage): + http_response: HttpResponse + + @attrs.define + class SecondPage(BasePage): + http_response: HttpResponse + response: AnyResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): + yield {"page1": page1, "page2": page2} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0].keys() == { + "url", + "httpResponseBody", + "customHttpRequestHeaders", + "httpResponseHeaders", + } + assert type(item["page1"].http_response) == HttpResponse + assert type(item["page2"].http_response) == HttpResponse + assert type(item["page2"].response) == AnyResponse + assert type(item["page2"].response.response) == HttpResponse + + +@ensureDeferred +async def test_provider_any_response_http_browser_response_multiple_pages(mockserver): + @attrs.define + class FirstPage(BasePage): + browser_response: BrowserResponse + + @attrs.define + class SecondPage(BasePage): + http_response: HttpResponse + response: AnyResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): + yield {"page1": page1, "page2": page2} + + settings = provider_settings(mockserver) + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 2 + assert params[0].keys() == { + "url", + "httpResponseBody", + "customHttpRequestHeaders", + "httpResponseHeaders", + } + assert params[1].keys() == {"url", "browserHtml"} + + assert type(item["page1"].browser_response) == BrowserResponse + assert type(item["page2"].http_response) == HttpResponse + assert type(item["page2"].response) == AnyResponse + assert type(item["page2"].response.response) == BrowserResponse From 1e19ef3a97d05d02d03ae804eb993ff8e4962055 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 19 Jan 2024 
17:33:26 +0800 Subject: [PATCH 10/23] docs for AnyResponse fulfillment --- docs/reference/inputs.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst index 3b2d15f9..3f89a4f5 100644 --- a/docs/reference/inputs.rst +++ b/docs/reference/inputs.rst @@ -13,6 +13,12 @@ Inputs - :class:`web_poet.BrowserResponse` +- :class:`web_poet.AnyResponse` + + This re-uses either :class:`web_poet.BrowserResponse` *(takes priority)* + or :class:`web_poet.HttpResponse` if they're available. If neither is + available, then it won't cause either one to be obtained from Zyte API. + - :class:`zyte_common_items.Article` - :class:`zyte_common_items.ArticleList` From 137d146b13d78a8c55253955183f0509f0d9fc76 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 19 Jan 2024 20:46:48 +0800 Subject: [PATCH 11/23] AnyResponse would request HttpResponse if BrowserResponse/HttpResponse doesn't exist --- docs/reference/inputs.rst | 3 ++- scrapy_zyte_api/providers.py | 13 +++++++++++- tests/test_providers.py | 39 +++++++++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst index 3f89a4f5..866b352d 100644 --- a/docs/reference/inputs.rst +++ b/docs/reference/inputs.rst @@ -17,7 +17,8 @@ Inputs This re-uses either :class:`web_poet.BrowserResponse` *(takes priority)* or :class:`web_poet.HttpResponse` if they're available. If neither is - available, then it won't cause either one to be obtained from Zyte API. + available, it would use :class:`web_poet.HttpResponse` requested from Zyte + API. - :class:`zyte_common_items.Article` diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 269e1241..7589287a 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -64,6 +64,7 @@ async def __call__( # noqa: C901 """Makes a Zyte API request to provide BrowserResponse and/or item dependencies.""" results: List[Any] = [] + http_response = None for cls in list(to_provide): item = self.injector.weak_cache.get(request, {}).get(cls) if item: @@ -124,6 +125,13 @@ async def __call__( # noqa: C901 options["extractFrom"] = extract_from.value break + http_response_needed = ( + AnyResponse in to_provide + and BrowserResponse not in to_provide + and BrowserHtml not in to_provide + and not http_response + ) + extract_from = None # type: ignore[assignment] for item_type, kw in item_keywords.items(): options_name = f"{kw}Options" @@ -131,12 +139,15 @@ async def __call__( # noqa: C901 del zyte_api_meta[options_name] elif options_name in zyte_api_meta: extract_from = zyte_api_meta[options_name]["extractFrom"] + elif item_type in to_provide_stripped and http_response_needed: + zyte_api_meta[options_name] = {"extractFrom": "httpResponseBody"} if AnyResponse in to_provide: if extract_from == "browserHtml": html_requested = True - elif extract_from == "httpResponseBody": + elif extract_from == "httpResponseBody" or http_response_needed: param_parser = _ParamParser(crawler) + param_parser._transparent_mode = True http_request_params = param_parser.parse(request) del http_request_params["url"] zyte_api_meta.update(http_request_params) diff --git a/tests/test_providers.py b/tests/test_providers.py index f6169614..b074d505 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -307,19 +307,22 @@ def provide(*args, **kwargs): results = yield provide(set()) assert results == [] - # Having only AnyResponse without any of the other responses to 
re-use - # does not result in anything. + # AnyResponse would use HttpResponse by default, if neither BrowserResponse + # nor HttpResponse is available. results = yield provide( { AnyResponse, } ) - assert results == [] + assert len(results) == 1 + assert type(results[0]) == AnyResponse + assert type(results[0].response) == HttpResponse - # Same case as above, since no response is available. + # Same case as above, HttpResopnse is used by default results = yield provide({AnyResponse, Product}) - assert len(results) == 1 - assert type(results[0]) == Product + assert len(results) == 2 + assert type(results[0]) == AnyResponse + assert type(results[1]) == Product # AnyResponse should re-use BrowserResponse if available. results = yield provide({AnyResponse, BrowserResponse}) @@ -426,8 +429,14 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url"} - assert item is None + assert params[0].keys() == { + "url", + "httpResponseBody", + "httpResponseHeaders", + "customHttpRequestHeaders", + } + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == HttpResponse @ensureDeferred @@ -449,8 +458,18 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "product"} - assert item is None # None represents as an error in crawl_single_item() + assert params[0].keys() == { + "url", + "product", + "productOptions", + "httpResponseBody", + "httpResponseHeaders", + "customHttpRequestHeaders", + } + assert params[0]["productOptions"] == {"extractFrom": "httpResponseBody"} + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == HttpResponse + assert type(item["page"].product) == Product @ensureDeferred From 932262288789a774476ca3d6474caf5d925703dc Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 22 Jan 2024 16:41:23 +0800 Subject: [PATCH 12/23] improve tests --- tests/test_providers.py | 160 +++++++++++++++++++++++----------------- 1 file changed, 92 insertions(+), 68 deletions(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index b074d505..53300631 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -390,8 +390,7 @@ def provide(*args, **kwargs): class RecordingHandler(ScrapyZyteAPIDownloadHandler): """Subclasses the original handler in order to record the Zyte API parameters - used for each downloading request, as well as counting the number of Zyte API - requests. + used for each downloading request. 
""" def __init__(self, *args, **kwargs): @@ -411,6 +410,24 @@ def provider_settings(server): return settings +CUSTOM_HTTP_REQUEST_HEADERS = [ + { + "name": "Accept", + "value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + }, + {"name": "Accept-Language", "value": "en"}, +] + +CUSTOM_HTTP_REQUEST_HEADERS_2 = [ + { + "name": "Accept", + "value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + }, + {"name": "Accept-Language", "value": "en"}, + {"name": "Accept-Encoding", "value": "gzip, deflate, br"}, +] + + @ensureDeferred async def test_provider_any_response_only(mockserver): @attrs.define @@ -429,11 +446,11 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == { - "url", - "httpResponseBody", - "httpResponseHeaders", - "customHttpRequestHeaders", + assert params[0] == { + "url": url, + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == HttpResponse @@ -458,15 +475,14 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == { - "url", - "product", - "productOptions", - "httpResponseBody", - "httpResponseHeaders", - "customHttpRequestHeaders", + assert params[0] == { + "url": url, + "product": True, + "productOptions": {"extractFrom": "httpResponseBody"}, + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } - assert params[0]["productOptions"] == {"extractFrom": "httpResponseBody"} assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == HttpResponse assert type(item["page"].product) == Product @@ -486,15 +502,19 @@ def start_requests(self): def parse_(self, response: DummyResponse, page: SomePage): yield {"page": page} + product_options = {"extractFrom": "browserHtml"} settings = provider_settings(mockserver) - settings["ZYTE_API_PROVIDER_PARAMS"] = { - "productOptions": {"extractFrom": "browserHtml"} - } + settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options} item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "product", "browserHtml", "productOptions"} + assert params[0] == { + "url": url, + "product": True, + "browserHtml": True, + "productOptions": product_options, + } assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == BrowserResponse @@ -516,15 +536,19 @@ def start_requests(self): def parse_(self, response: DummyResponse, page: SomePage): yield {"page": page} + product_options = {"extractFrom": "browserHtml"} settings = provider_settings(mockserver) - settings["ZYTE_API_PROVIDER_PARAMS"] = { - "productOptions": {"extractFrom": "browserHtml"} - } + settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options} item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "product", "browserHtml", "productOptions"} + assert params[0] == { + 
"url": url, + "product": True, + "browserHtml": True, + "productOptions": product_options, + } assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == BrowserResponse @@ -548,21 +572,20 @@ def start_requests(self): def parse_(self, response: DummyResponse, page: SomePage): yield {"page": page} + product_options = {"extractFrom": "httpResponseBody"} settings = provider_settings(mockserver) - settings["ZYTE_API_PROVIDER_PARAMS"] = { - "productOptions": {"extractFrom": "httpResponseBody"} - } + settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options} item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == { - "url", - "product", - "httpResponseBody", - "productOptions", - "httpResponseHeaders", - "customHttpRequestHeaders", + assert params[0] == { + "url": url, + "product": True, + "httpResponseBody": True, + "productOptions": product_options, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse @@ -589,21 +612,20 @@ def start_requests(self): def parse_(self, response: DummyResponse, page: SomePage): yield {"page": page} + product_options = {"extractFrom": "httpResponseBody"} settings = provider_settings(mockserver) - settings["ZYTE_API_PROVIDER_PARAMS"] = { - "productOptions": {"extractFrom": "httpResponseBody"} - } + settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options} item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == { - "url", - "product", - "httpResponseBody", - "productOptions", - "httpResponseHeaders", - "customHttpRequestHeaders", + assert params[0] == { + "url": url, + "product": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + "productOptions": product_options, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse @@ -631,7 +653,7 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "browserHtml"} + assert params[0] == {"url": url, "browserHtml": True} assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == BrowserResponse @@ -657,7 +679,7 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "browserHtml"} + assert params[0] == {"url": url, "browserHtml": True} assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == BrowserResponse @@ -684,7 +706,7 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == {"url", "browserHtml"} + assert params[0] == {"url": url, "browserHtml": True} assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == BrowserResponse @@ -711,11 +733,11 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - 
assert params[0].keys() == { - "url", - "httpResponseBody", - "customHttpRequestHeaders", - "httpResponseHeaders", + assert params[0] == { + "url": url, + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, } assert type(item["page"].response) == AnyResponse @@ -743,19 +765,21 @@ def parse_(self, response: DummyResponse, page: SomePage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 2 - assert params[0].keys() == { - "url", - "httpResponseBody", - "customHttpRequestHeaders", - "httpResponseHeaders", + assert params[0] == { + "url": url, + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, } - assert params[1].keys() == {"url", "browserHtml"} + assert params[1] == {"url": url, "browserHtml": True} assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == BrowserResponse assert type(item["page"].browser_response) == BrowserResponse assert type(item["page"].http_response) == HttpResponse + assert id(item["page"].browser_response) == id(item["page"].response.response) + @ensureDeferred async def test_provider_any_response_http_response_multiple_pages(mockserver): @@ -780,11 +804,11 @@ def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 1 - assert params[0].keys() == { - "url", - "httpResponseBody", - "customHttpRequestHeaders", - "httpResponseHeaders", + assert params[0] == { + "url": url, + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, } assert type(item["page1"].http_response) == HttpResponse assert type(item["page2"].http_response) == HttpResponse @@ -815,13 +839,13 @@ def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): params = crawler.engine.downloader.handlers._handlers["http"].params assert len(params) == 2 - assert params[0].keys() == { - "url", - "httpResponseBody", - "customHttpRequestHeaders", - "httpResponseHeaders", + assert params[0] == { + "url": url, + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, } - assert params[1].keys() == {"url", "browserHtml"} + assert params[1] == {"url": url, "browserHtml": True} assert type(item["page1"].browser_response) == BrowserResponse assert type(item["page2"].http_response) == HttpResponse From ffa345f530e2a23631a792efe54ff4c474fa5946 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 24 Jan 2024 22:45:03 +0800 Subject: [PATCH 13/23] small comments and improvements --- scrapy_zyte_api/providers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 7589287a..d24eda1c 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -71,7 +71,7 @@ async def __call__( # noqa: C901 results.append(item) to_provide.remove(cls) - # BrowserResponse takes precedence than HttpResponse + # BrowserResponse takes precedence over HttpResponse elif cls == AnyResponse and BrowserResponse not in to_provide: http_response = self.injector.weak_cache.get(request, {}).get( HttpResponse @@ -138,7 +138,7 @@ async def __call__( # noqa: C901 if item_type not in to_provide_stripped and options_name in zyte_api_meta: del zyte_api_meta[options_name] elif 
options_name in zyte_api_meta: - extract_from = zyte_api_meta[options_name]["extractFrom"] + extract_from = zyte_api_meta[options_name].get("extractFrom") elif item_type in to_provide_stripped and http_response_needed: zyte_api_meta[options_name] = {"extractFrom": "httpResponseBody"} @@ -152,6 +152,7 @@ async def __call__( # noqa: C901 del http_request_params["url"] zyte_api_meta.update(http_request_params) + # TODO: Map out RequestHeaders similar to httpResponseBody if html_requested: zyte_api_meta["browserHtml"] = True From 7043c554f6e38672a2a58902f7e2eb847ee74d79 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 26 Jan 2024 15:41:07 +0800 Subject: [PATCH 14/23] add test for item return --- tests/test_providers.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index 53300631..f7c52925 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -521,6 +521,38 @@ def parse_(self, response: DummyResponse, page: SomePage): assert type(item["page"].product) == Product +@ensureDeferred +async def test_provider_any_response_product_item_extract_from_browser_html(mockserver): + @attrs.define + class SomePage(ItemPage[Product]): + response: AnyResponse + + class ZyteAPISpider(Spider): + def start_requests(self): + yield Request(self.url, callback=self.parse_) + + def parse_(self, response: DummyResponse, page: SomePage, product: Product): + yield {"page": page, "product": product} + + product_options = {"extractFrom": "browserHtml"} + settings = provider_settings(mockserver) + settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options} + item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) + params = crawler.engine.downloader.handlers._handlers["http"].params + + assert len(params) == 1 + assert params[0] == { + "url": url, + "product": True, + "browserHtml": True, + "productOptions": product_options, + } + + assert type(item["page"].response) == AnyResponse + assert type(item["page"].response.response) == BrowserResponse + assert type(item["product"]) == Product + + @ensureDeferred async def test_provider_any_response_product_extract_from_browser_html_2(mockserver): @attrs.define From 3fec1b8fa5e06e876da14ac5a494970bd8b8bcf3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 31 Jan 2024 16:58:04 +0800 Subject: [PATCH 15/23] handle empty itemOptions --- scrapy_zyte_api/providers.py | 4 ++-- tests/test_providers.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index d24eda1c..f5e3c70c 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -137,8 +137,8 @@ async def __call__( # noqa: C901 options_name = f"{kw}Options" if item_type not in to_provide_stripped and options_name in zyte_api_meta: del zyte_api_meta[options_name] - elif options_name in zyte_api_meta: - extract_from = zyte_api_meta[options_name].get("extractFrom") + elif zyte_api_meta.get(options_name, {}).get("extractFrom"): + extract_from = zyte_api_meta[options_name]["extractFrom"] elif item_type in to_provide_stripped and http_response_needed: zyte_api_meta[options_name] = {"extractFrom": "httpResponseBody"} diff --git a/tests/test_providers.py b/tests/test_providers.py index f7c52925..1d6ac307 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -625,6 +625,41 @@ def parse_(self, response: DummyResponse, page: 
SomePage):
     assert type(item["page"].product) == Product
 
 
+@ensureDeferred
+async def test_provider_any_response_product_options_empty(mockserver):
+    @attrs.define
+    class SomePage(BasePage):
+        response: AnyResponse
+        product: Product
+
+    class ZyteAPISpider(Spider):
+        def start_requests(self):
+            yield Request(self.url, callback=self.parse_)
+
+        def parse_(self, response: DummyResponse, page: SomePage):
+            yield {"page": page}
+
+    product_options = {}
+    settings = provider_settings(mockserver)
+    settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options}
+    item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings)
+    params = crawler.engine.downloader.handlers._handlers["http"].params
+
+    assert len(params) == 1
+    assert params[0] == {
+        "url": url,
+        "product": True,
+        "httpResponseBody": True,
+        "productOptions": {"extractFrom": "httpResponseBody"},
+        "httpResponseHeaders": True,
+        "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS,
+    }
+
+    assert type(item["page"].response) == AnyResponse
+    assert type(item["page"].response.response) == HttpResponse
+    assert type(item["page"].product) == Product
+
+
 # The issue here is that HttpResponseProvider runs earlier than ScrapyZyteAPI.
 # HttpResponseProvider doesn't know that it should not run since ScrapyZyteAPI
 # could provide HttpResponse in any case.

From b3419765e0429d7bebe971e0bdff73ca99932996 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Wed, 31 Jan 2024 18:25:31 +0800
Subject: [PATCH 16/23] avoid using ParamParser

---
 scrapy_zyte_api/providers.py |  9 ++-------
 tests/test_providers.py      | 28 +++++++++-------------------
 2 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
index f5e3c70c..307c3fc5 100644
--- a/scrapy_zyte_api/providers.py
+++ b/scrapy_zyte_api/providers.py
@@ -24,7 +24,6 @@
 )
 
 from scrapy_zyte_api._annotations import ExtractFrom
-from scrapy_zyte_api._params import _ParamParser
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 
 try:
@@ -146,13 +145,9 @@ async def __call__(  # noqa: C901
             if extract_from == "browserHtml":
                 html_requested = True
             elif extract_from == "httpResponseBody" or http_response_needed:
-                param_parser = _ParamParser(crawler)
-                param_parser._transparent_mode = True
-                http_request_params = param_parser.parse(request)
-                del http_request_params["url"]
-                zyte_api_meta.update(http_request_params)
+                zyte_api_meta["httpResponseBody"] = True
+                zyte_api_meta["httpResponseHeaders"] = True
 
-        # TODO: Map out RequestHeaders similar to httpResponseBody
         if html_requested:
             zyte_api_meta["browserHtml"] = True
 
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 1d6ac307..52e4c0b6 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -416,14 +416,6 @@ def provider_settings(server):
         "value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     },
     {"name": "Accept-Language", "value": "en"},
-]
-
-CUSTOM_HTTP_REQUEST_HEADERS_2 = [
-    {
-        "name": "Accept",
-        "value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-    },
-    {"name": "Accept-Language", "value": "en"},
     {"name": "Accept-Encoding", "value": "gzip, deflate, br"},
 ]
 
@@ -450,7 +442,6 @@ def parse_(self, response: DummyResponse, page: SomePage):
         "url": url,
         "httpResponseBody": True,
         "httpResponseHeaders": True,
-        "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS,
     }
     assert type(item["page"].response) == AnyResponse
     assert type(item["page"].response.response) == HttpResponse
@@ 
-481,7 +472,6 @@ def parse_(self, response: DummyResponse, page: SomePage): "productOptions": {"extractFrom": "httpResponseBody"}, "httpResponseBody": True, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse assert type(item["page"].response.response) == HttpResponse @@ -617,7 +607,6 @@ def parse_(self, response: DummyResponse, page: SomePage): "httpResponseBody": True, "productOptions": product_options, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse @@ -639,9 +628,8 @@ def start_requests(self): def parse_(self, response: DummyResponse, page: SomePage): yield {"page": page} - product_options = {} settings = provider_settings(mockserver) - settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": product_options} + settings["ZYTE_API_PROVIDER_PARAMS"] = {"productOptions": {}} item, url, crawler = await crawl_single_item(ZyteAPISpider, HtmlResource, settings) params = crawler.engine.downloader.handlers._handlers["http"].params @@ -652,7 +640,6 @@ def parse_(self, response: DummyResponse, page: SomePage): "httpResponseBody": True, "productOptions": {"extractFrom": "httpResponseBody"}, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse @@ -692,7 +679,6 @@ def parse_(self, response: DummyResponse, page: SomePage): "httpResponseBody": True, "httpResponseHeaders": True, "productOptions": product_options, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse @@ -804,7 +790,8 @@ def parse_(self, response: DummyResponse, page: SomePage): "url": url, "httpResponseBody": True, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, + # This is actually set by HttpResponseProvider + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page"].response) == AnyResponse @@ -836,7 +823,8 @@ def parse_(self, response: DummyResponse, page: SomePage): "url": url, "httpResponseBody": True, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, + # This is actually set by HttpResponseProvider + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert params[1] == {"url": url, "browserHtml": True} @@ -875,7 +863,8 @@ def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): "url": url, "httpResponseBody": True, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, + # This is actually set by HttpResponseProvider + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert type(item["page1"].http_response) == HttpResponse assert type(item["page2"].http_response) == HttpResponse @@ -910,7 +899,8 @@ def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): "url": url, "httpResponseBody": True, "httpResponseHeaders": True, - "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS_2, + # This is actually set by HttpResponseProvider + "customHttpRequestHeaders": CUSTOM_HTTP_REQUEST_HEADERS, } assert params[1] == {"url": url, "browserHtml": True} From 2c5e506d0497bec05c78fa9d07bcf2e0d4d84c36 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 31 Jan 2024 18:32:18 +0800 Subject: [PATCH 17/23] use web-poet>=0.16.0 --- setup.py | 3 +-- tox.ini | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff 
--git a/setup.py b/setup.py index f79c380a..6ed70dcd 100644 --- a/setup.py +++ b/setup.py @@ -33,8 +33,7 @@ def get_version(): "andi>=0.6.0", # "scrapy-poet>=0.19.0", "scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet", - # "web-poet>=0.15.1", - "web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet", + "web-poet>=0.16.0", "zyte-common-items>=0.8.0", ] }, diff --git a/tox.ini b/tox.ini index d0da523d..4a30290d 100644 --- a/tox.ini +++ b/tox.ini @@ -91,8 +91,7 @@ deps = andi==0.6.0 #scrapy-poet==0.19.0 scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet - #web-poet==0.15.1 - web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet + web-poet==0.16.0 zyte-common-items==0.8.0 [testenv:pinned-extra] From 6c1e043966a56b2917ccabdb4ced19884cd1daac Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 31 Jan 2024 18:46:43 +0800 Subject: [PATCH 18/23] temporarily use scrapy-poet master branch --- setup.py | 2 +- tests/test_providers.py | 2 +- tox.ini | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 6ed70dcd..206ba523 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def get_version(): "provider": [ "andi>=0.6.0", # "scrapy-poet>=0.19.0", - "scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet", + "scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet", "web-poet>=0.16.0", "zyte-common-items>=0.8.0", ] diff --git a/tests/test_providers.py b/tests/test_providers.py index 52e4c0b6..a98c1338 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -35,7 +35,7 @@ def create_scrapy_settings(): - settings = _create_scrapy_settings(None) + settings = _create_scrapy_settings() for setting, value in SETTINGS.items(): if setting.endswith("_MIDDLEWARES") and settings[setting]: settings[setting].update(value) diff --git a/tox.ini b/tox.ini index 4a30290d..4428bc3d 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,7 @@ deps = {[testenv:pinned-scrapy-2x6]deps} andi==0.6.0 #scrapy-poet==0.19.0 - scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet + scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet web-poet==0.16.0 zyte-common-items==0.8.0 From a51f96173e82fcead7e30578cb1ee78ecc67cfdf Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 5 Feb 2024 19:35:09 +0800 Subject: [PATCH 19/23] use browserHtml if no extraction source is provided with item types --- docs/reference/inputs.rst | 10 +++++++--- scrapy_zyte_api/providers.py | 8 +++++--- tests/test_providers.py | 14 +++++--------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst index 866b352d..75d8f9d4 100644 --- a/docs/reference/inputs.rst +++ b/docs/reference/inputs.rst @@ -16,9 +16,13 @@ Inputs - :class:`web_poet.AnyResponse` This re-uses either :class:`web_poet.BrowserResponse` *(takes priority)* - or :class:`web_poet.HttpResponse` if they're available. If neither is - available, it would use :class:`web_poet.HttpResponse` requested from Zyte - API. + or :class:`web_poet.HttpResponse` if they're available. + + If neither is available, it would use :class:`web_poet.HttpResponse` + requested from Zyte API. However, if other item inputs (e.g. 
+    :class:`zyte_common_items.Product`) are present, it would request
+    :class:`web_poet.BrowserResponse` from Zyte API, unless the extraction
+    source is explicitly set to ``httpResponseBody``.
 
 -   :class:`zyte_common_items.Article`
 
diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
index 51eaac89..7bbfd959 100644
--- a/scrapy_zyte_api/providers.py
+++ b/scrapy_zyte_api/providers.py
@@ -100,6 +100,7 @@ async def __call__(  # noqa: C901
 
         to_provide_stripped: Set[type] = set()
         extract_from_seen: Dict[str, str] = {}
+        item_requested: bool = False
 
         for cls in to_provide:
             cls_stripped = strip_annotated(cls)
@@ -112,6 +113,7 @@ async def __call__(  # noqa: C901
             kw = item_keywords.get(cls_stripped)
             if not kw:
                 continue
+            item_requested = True
             to_provide_stripped.add(cls_stripped)
             zyte_api_meta[kw] = True
             if not is_typing_annotated(cls):
@@ -144,11 +146,11 @@ async def __call__(  # noqa: C901
                 del zyte_api_meta[options_name]
             elif zyte_api_meta.get(options_name, {}).get("extractFrom"):
                 extract_from = zyte_api_meta[options_name]["extractFrom"]
-            elif item_type in to_provide_stripped and http_response_needed:
-                zyte_api_meta[options_name] = {"extractFrom": "httpResponseBody"}
 
         if AnyResponse in to_provide:
-            if extract_from == "browserHtml":
+            if (
+                item_requested and extract_from != "httpResponseBody"
+            ) or extract_from == "browserHtml":
                 html_requested = True
             elif extract_from == "httpResponseBody" or http_response_needed:
                 zyte_api_meta["httpResponseBody"] = True
diff --git a/tests/test_providers.py b/tests/test_providers.py
index b7813beb..d5a2f3dc 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -367,7 +367,7 @@ def provide(*args, **kwargs):
     assert type(results[0]) == AnyResponse
     assert type(results[0].response) == HttpResponse
 
-    # Same case as above, HttpResopnse is used by default
+    # Same case as above, HttpResponse is used by default
     results = yield provide({AnyResponse, Product})
     assert len(results) == 2
     assert type(results[0]) == AnyResponse
@@ -518,12 +518,10 @@ def parse_(self, response: DummyResponse, page: SomePage):
     assert params[0] == {
         "url": url,
         "product": True,
-        "productOptions": {"extractFrom": "httpResponseBody"},
-        "httpResponseBody": True,
-        "httpResponseHeaders": True,
+        "browserHtml": True,
     }
     assert type(item["page"].response) == AnyResponse
-    assert type(item["page"].response.response) == HttpResponse
+    assert type(item["page"].response.response) == BrowserResponse
     assert type(item["page"].product) == Product
 
 
@@ -686,13 +684,11 @@ def parse_(self, response: DummyResponse, page: SomePage):
     assert params[0] == {
         "url": url,
         "product": True,
-        "httpResponseBody": True,
-        "productOptions": {"extractFrom": "httpResponseBody"},
-        "httpResponseHeaders": True,
+        "browserHtml": True,
     }
 
     assert type(item["page"].response) == AnyResponse
-    assert type(item["page"].response.response) == HttpResponse
+    assert type(item["page"].response.response) == BrowserResponse
     assert type(item["page"].product) == Product
 
 
From 9da17b4b93a2b2dab00b4821622e20e5cb4082a5 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Wed, 7 Feb 2024 19:40:04 +0800
Subject: [PATCH 20/23] remove cache updates since Injector is already doing it

---
 scrapy_zyte_api/providers.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
index 7bbfd959..a69e36ff 100644
--- a/scrapy_zyte_api/providers.py
+++ b/scrapy_zyte_api/providers.py
@@ -53,11 +53,6 @@ class ZyteApiProvider(PageObjectInputProvider):
     def is_provided(self, type_: Callable) -> bool:
return super().is_provided(strip_annotated(type_)) - def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None: - if request not in self.injector.weak_cache: - self.injector.weak_cache[request] = {} - self.injector.weak_cache[request].update(mapping) - async def __call__( # noqa: C901 self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: @@ -79,7 +74,6 @@ async def __call__( # noqa: C901 if http_response: any_response = AnyResponse(response=http_response) results.append(any_response) - self.update_cache(request, {AnyResponse: any_response}) to_provide.remove(cls) if not to_provide: @@ -178,7 +172,6 @@ async def __call__( # noqa: C901 html = None if BrowserHtml in to_provide: results.append(html) - self.update_cache(request, {BrowserHtml: html}) browser_response = None if BrowserResponse in to_provide: @@ -188,7 +181,6 @@ async def __call__( # noqa: C901 html=html, ) results.append(browser_response) - self.update_cache(request, {BrowserResponse: browser_response}) if AnyResponse in to_provide: any_response = None # type: ignore[assignment] @@ -219,7 +211,6 @@ async def __call__( # noqa: C901 if any_response: results.append(any_response) - self.update_cache(request, {AnyResponse: any_response}) for cls in to_provide: cls_stripped = strip_annotated(cls) @@ -236,5 +227,4 @@ async def __call__( # noqa: C901 if is_typing_annotated(cls): item = AnnotatedResult(item, cls.__metadata__) # type: ignore[attr-defined] results.append(item) - self.update_cache(request, {cls: item}) return results From d3898933517ab192630ccde4c7e4d236c9d4d45f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 8 Feb 2024 14:20:56 +0800 Subject: [PATCH 21/23] Update tests on instance identity Co-authored-by: Mikhail Korobov --- tests/test_providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index d5a2f3dc..d8567c05 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -378,7 +378,7 @@ def provide(*args, **kwargs): assert len(results) == 2 assert type(results[0]) == BrowserResponse assert type(results[1]) == AnyResponse - assert id(results[0]) == id(results[1].response) + assert results[1].response is results[0] # AnyResponse should re-use BrowserHtml if available. 
results = yield provide({AnyResponse, BrowserHtml}) From df32f14598238a0166e8ba488c45d1857e497e55 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 8 Feb 2024 15:21:17 +0800 Subject: [PATCH 22/23] remove duplicate test cases for AnyResponse --- tests/test_providers.py | 118 +--------------------------------------- 1 file changed, 2 insertions(+), 116 deletions(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index d8567c05..72652f07 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -4,16 +4,13 @@ pytest.importorskip("scrapy_poet") -import asyncio - import attrs from pytest_twisted import ensureDeferred from scrapy import Request, Spider from scrapy_poet import DummyResponse -from scrapy_poet.injection import Injector from scrapy_poet.utils.testing import HtmlResource, crawl_single_item from scrapy_poet.utils.testing import create_scrapy_settings as _create_scrapy_settings -from twisted.internet import defer, reactor +from twisted.internet import reactor from twisted.web.client import Agent, readBody from web_poet import ( AnyResponse, @@ -30,7 +27,7 @@ from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler from scrapy_zyte_api.providers import ZyteApiProvider -from . import SETTINGS, get_crawler +from . import SETTINGS from .mockserver import get_ephemeral_port @@ -326,117 +323,6 @@ def parse_(self, response: DummyResponse, page: GeoProductPage): # type: ignore assert "Geolocation dependencies must be annotated" in caplog.text -@defer.inlineCallbacks -def run_provider(server, to_provide, settings_dict=None, request_meta=None): - class AnyResponseSpider(Spider): - name = "any_response" - - request = Request(server.urljoin("/some-page"), meta=request_meta) - settings = create_scrapy_settings() - settings["ZYTE_API_URL"] = server.urljoin("/") - if settings_dict: - settings.update(settings_dict) - crawler = get_crawler(settings, AnyResponseSpider) - yield from crawler.engine.open_spider(crawler.spider) - injector = Injector(crawler) - provider = ZyteApiProvider(injector) - - coro = provider(to_provide, request, crawler) - results = yield defer.Deferred.fromFuture(asyncio.ensure_future(coro)) - - return results - - -@defer.inlineCallbacks -def test_provider_any_response(mockserver): - # Use only one instance of the mockserver for faster tests. - def provide(*args, **kwargs): - return run_provider(mockserver, *args, **kwargs) - - results = yield provide(set()) - assert results == [] - - # AnyResponse would use HttpResponse by default, if neither BrowserResponse - # nor HttpResponse is available. - results = yield provide( - { - AnyResponse, - } - ) - assert len(results) == 1 - assert type(results[0]) == AnyResponse - assert type(results[0].response) == HttpResponse - - # Same case as above, HttpResponse is used by default - results = yield provide({AnyResponse, Product}) - assert len(results) == 2 - assert type(results[0]) == AnyResponse - assert type(results[1]) == Product - - # AnyResponse should re-use BrowserResponse if available. - results = yield provide({AnyResponse, BrowserResponse}) - assert len(results) == 2 - assert type(results[0]) == BrowserResponse - assert type(results[1]) == AnyResponse - assert results[1].response is results[0] - - # AnyResponse should re-use BrowserHtml if available. 
- results = yield provide({AnyResponse, BrowserHtml}) - assert len(results) == 2 - assert type(results[0]) == BrowserHtml - assert type(results[1]) == AnyResponse - # diff instance due to casting - assert results[0] == results[1].response.html # type: ignore[union-attr] - - results = yield provide({AnyResponse, BrowserResponse, BrowserHtml}) - assert len(results) == 3 - assert type(results[0]) == BrowserHtml - assert type(results[1]) == BrowserResponse - assert type(results[2]) == AnyResponse - assert results[0] == results[1].html - assert results[0] == results[2].response.html # type: ignore[union-attr] - - # NOTES: This is hard to test in this setup and would result in being empty. - # This will be tested in a spider-setup instead so that HttpResponseProvider - # can participate. - # results = yield provide({AnyResponse, HttpResponse}) - # assert results == [] - - # For the following cases, extraction source isn't available in `to_provided` - # but are in the `*.extractFrom` parameter. - - settings_dict = { - "ZYTE_API_PROVIDER_PARAMS": {"productOptions": {"extractFrom": "browserHtml"}} - } - - results = yield provide({AnyResponse, Product}, settings_dict) - assert len(results) == 2 - assert type(results[0]) == AnyResponse - assert type(results[0].response) == BrowserResponse - assert type(results[1]) == Product - - results = yield provide({AnyResponse, BrowserHtml, Product}, settings_dict) - assert len(results) == 3 - assert type(results[0]) == BrowserHtml - assert type(results[1]) == AnyResponse - assert type(results[1].response) == BrowserResponse - assert type(results[2]) == Product - assert results[0] == results[1].response.html # diff instance due to casting - - settings_dict = { - "ZYTE_API_PROVIDER_PARAMS": { - "productOptions": {"extractFrom": "httpResponseBody"} - } - } - request_meta = {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}} - - results = yield provide({AnyResponse, Product}, settings_dict, request_meta) - assert len(results) == 2 - assert type(results[0]) == AnyResponse - assert type(results[0].response) == HttpResponse - assert type(results[1]) == Product - - class RecordingHandler(ScrapyZyteAPIDownloadHandler): """Subclasses the original handler in order to record the Zyte API parameters used for each downloading request. From 382dced3e11eb67341f58415609c3875e6df1dd7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 8 Feb 2024 16:49:01 +0800 Subject: [PATCH 23/23] use scrapy-poet 0.21.0 --- setup.py | 3 +-- tox.ini | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 206ba523..2e5bb5a4 100644 --- a/setup.py +++ b/setup.py @@ -31,8 +31,7 @@ def get_version(): # Sync with [testenv:pinned-provider] @ tox.ini "provider": [ "andi>=0.6.0", - # "scrapy-poet>=0.19.0", - "scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet", + "scrapy-poet>=0.21.0", "web-poet>=0.16.0", "zyte-common-items>=0.8.0", ] diff --git a/tox.ini b/tox.ini index 4428bc3d..e00d603f 100644 --- a/tox.ini +++ b/tox.ini @@ -89,8 +89,7 @@ deps = # scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0 {[testenv:pinned-scrapy-2x6]deps} andi==0.6.0 - #scrapy-poet==0.19.0 - scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet + scrapy-poet==0.21.0 web-poet==0.16.0 zyte-common-items==0.8.0
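For reference, the sketch below shows how the AnyResponse input wired up by this
patch series is consumed from a spider. It is a minimal sketch, not part of the
patches themselves: it assumes scrapy-poet and ZyteApiProvider are enabled per
the project docs, and the spider name, URL, and yielded field names are
illustrative only. As PATCH 19/23 documents, requesting only AnyResponse makes
the provider ask Zyte API for httpResponseBody/httpResponseHeaders, so
AnyResponse wraps an HttpResponse; requesting an item such as Product alongside
it (without extractFrom set to httpResponseBody) makes it wrap a
BrowserResponse instead.

    import attrs
    from scrapy import Request, Spider
    from scrapy_poet import DummyResponse
    from web_poet import AnyResponse, Injectable


    @attrs.define
    class SomePage(Injectable):
        response: AnyResponse


    class SampleSpider(Spider):
        name = "sample"  # illustrative name

        def start_requests(self):
            # Placeholder URL; any page reachable through Zyte API works.
            yield Request("https://example.com", callback=self.parse_page)

        def parse_page(self, response: DummyResponse, page: SomePage):
            # page.response is an AnyResponse; its .response attribute holds
            # the concrete HttpResponse or BrowserResponse the provider built
            # from the Zyte API result.
            yield {
                "url": str(page.response.url),
                "status": page.response.status,
                "response_type": type(page.response.response).__name__,
            }

Annotating the callback argument as DummyResponse keeps Scrapy from downloading
the page a second time, matching the pattern used throughout the tests above.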