From f01c9f448452d76d5f84ea423702cbb3d330b2f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 15 Jun 2022 12:28:43 +0200
Subject: [PATCH 1/9] Add provider for RequestUrl

---
 scrapy_poet/middleware.py           | 35 ++++++++++++++++-------------
 scrapy_poet/page_input_providers.py | 13 ++++++++++-
 tests/test_injection.py             |  4 ++--
 tests/test_middleware.py            | 27 +++++++++++++++++++++-
 4 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index 9b9978c8..0efd1faa 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -14,7 +14,7 @@
 
 from .api import DummyResponse
 from .overrides import OverridesRegistry
-from .page_input_providers import HttpResponseProvider
+from .page_input_providers import HttpResponseProvider, RequestUrlProvider
 from .injection import Injector
 
 
@@ -22,7 +22,8 @@
 
 DEFAULT_PROVIDERS = {
-    HttpResponseProvider: 500
+    HttpResponseProvider: 500,
+    RequestUrlProvider: 600,
 }
 
 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
@@ -54,6 +55,21 @@ def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> Injectio
     def spider_closed(self, spider: Spider) -> None:
         self.injector.close()
 
+    @inlineCallbacks
+    def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None):
+        # Find out the dependencies
+        final_kwargs = yield from self.injector.build_callback_dependencies(
+            request,
+            response=response,
+        )
+        # Fill the callback arguments with the created instances
+        for arg, value in final_kwargs.items():
+            # Precedence of user callback arguments
+            if arg not in request.cb_kwargs:
+                request.cb_kwargs[arg] = value
+        # TODO: check if all arguments are fulfilled somehow?
+
+    @inlineCallbacks
     def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
         """This method checks if the request is really needed and if its
         download could be skipped by trying to infer if a ``Response``
         is going to be used by the callback or a Page Input.
@@ -70,7 +86,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
         """
         if self.injector.is_scrapy_response_required(request):
             return None
-
+        yield from self._inject_cb_kwargs(request)
         logger.debug(f"Using DummyResponse instead of downloading {request}")
         return DummyResponse(url=request.url, request=request)
 
@@ -89,16 +105,5 @@ def process_response(self, request: Request, response: Response,
         and an injectable attribute, the user-defined ``cb_kwargs`` takes
         precedence.
         """
-        # Find out the dependencies
-        final_kwargs = yield from self.injector.build_callback_dependencies(
-            request,
-            response
-        )
-        # Fill the callback arguments with the created instances
-        for arg, value in final_kwargs.items():
-            # Precedence of user callback arguments
-            if arg not in request.cb_kwargs:
-                request.cb_kwargs[arg] = value
-        # TODO: check if all arguments are fulfilled somehow?
-
+        yield from self._inject_cb_kwargs(request, response)
         return response
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index 0a258d00..cce323f8 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -19,7 +19,7 @@
 from scrapy.utils.request import request_fingerprint
 
 from scrapy_poet.injection_errors import MalformedProvidedClassesError
-from web_poet import HttpResponse, HttpResponseHeaders
+from web_poet import HttpResponse, HttpResponseHeaders, RequestUrl
 
 
 class PageObjectInputProvider:
@@ -197,3 +197,14 @@ def deserialize(self, data: Any) -> Sequence[Any]:
             )
             for response_data in data
         ]
+
+
+class RequestUrlProvider(PageObjectInputProvider):
+    """This class provides ``web_poet.page_inputs.RequestUrl`` instances."""
+
+    provided_classes = {RequestUrl}
+    name = "request_url"
+
+    def __call__(self, to_provide: Set[Callable], request: Request):
+        """Builds a ``RequestUrl`` instance using a Scrapy ``Request``"""
+        return [RequestUrl(url=request.url)]
diff --git a/tests/test_injection.py b/tests/test_injection.py
index 99393e52..6871b1b6 100644
--- a/tests/test_injection.py
+++ b/tests/test_injection.py
@@ -266,11 +266,11 @@ def callback(response: DummyResponse,
 
 class Html(Injectable):
     url = "http://example.com"
-    html = """<html><body>Price: <span class="price">22</span>€</body></html>"""
+    text = """<html><body>Price: <span class="price">22</span>€</body></html>"""
 
     @property
     def selector(self):
-        return parsel.Selector(self.html)
+        return parsel.Selector(self.text)
 
 
 class EurDollarRate(Injectable):
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index ac4ea36a..cc39cc9d 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -25,7 +25,7 @@
     PageObjectInputProvider
 )
 from web_poet import default_registry
-from web_poet.page_inputs import HttpResponse
+from web_poet.page_inputs import HttpResponse, RequestUrl
 
 from scrapy_poet import DummyResponse
 from tests.utils import (HtmlResource, crawl_items,
@@ -310,6 +310,19 @@ def parse(self, response: DummyResponse):
         }
 
 
+class RequestUrlSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, *, url: RequestUrl):
+        return {
+            'response': response,
+            'url': url,
+        }
+
+
 @inlineCallbacks
 def test_skip_downloads(settings):
     item, url, crawler = yield crawl_single_item(
@@ -327,6 +340,18 @@ def test_skip_downloads(settings):
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
 
 
+@inlineCallbacks
+def test_skip_download_request_url(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlSpider, ProductHtml, settings)
+    assert isinstance(item['response'], Response) is True
+    assert isinstance(item['response'], DummyResponse) is True
+    assert isinstance(item['url'], RequestUrl)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
+
+
 @mock.patch("scrapy_poet.injection.SqlitedictCache", spec=SqlitedictCache)
 def test_cache_closed_on_spider_close(mock_sqlitedictcache, settings):
     def get_middleware(settings):

From 69ed8cc7170e14ddbd9288b73dafbdc2802200b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 15 Jun 2022 12:54:42 +0200
Subject: [PATCH 2/9] Fix mypy and docs

---
 docs/conf.py              | 2 +-
 scrapy_poet/middleware.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 2e205d04..213b2ea9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -61,7 +61,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index 0efd1faa..c435d3fa 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -3,7 +3,7 @@
 are executed.
 """
 import logging
-from typing import Optional, Type, TypeVar
+from typing import Generator, Optional, Type, TypeVar
 
 from scrapy import Spider, signals
 from scrapy.crawler import Crawler
@@ -56,7 +56,7 @@ def spider_closed(self, spider: Spider) -> None:
         self.injector.close()
 
     @inlineCallbacks
-    def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None):
+    def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None) -> Generator[None, None, None]:
         # Find out the dependencies
         final_kwargs = yield from self.injector.build_callback_dependencies(
             request,
@@ -70,7 +70,7 @@ def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = Non
         # TODO: check if all arguments are fulfilled somehow?
 
     @inlineCallbacks
-    def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
+    def process_request(self, request: Request, spider: Spider) -> Generator[None, None, Optional[DummyResponse]]:
         """This method checks if the request is really needed and if its
         download could be skipped by trying to infer if a ``Response``
         is going to be used by the callback or a Page Input.
@@ -92,7 +92,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
     @inlineCallbacks
     def process_response(self, request: Request, response: Response,
-                         spider: Spider) -> Response:
+                         spider: Spider) -> Generator[None, None, Response]:
         """This method fills ``request.cb_kwargs`` with instances for
         the required Page Objects found in the callback signature.

From 80f8de1bcab793863b18ac7a13aa6f9982ed4fdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 15 Jun 2022 15:22:30 +0200
Subject: [PATCH 3/9] Revert unneeded, potentially-breaking changes to
 process_request

---
 scrapy_poet/middleware.py | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index c435d3fa..4de7cd05 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -55,22 +55,7 @@ def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> Injectio
     def spider_closed(self, spider: Spider) -> None:
         self.injector.close()
 
-    @inlineCallbacks
-    def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None) -> Generator[None, None, None]:
-        # Find out the dependencies
-        final_kwargs = yield from self.injector.build_callback_dependencies(
-            request,
-            response=response,
-        )
-        # Fill the callback arguments with the created instances
-        for arg, value in final_kwargs.items():
-            # Precedence of user callback arguments
-            if arg not in request.cb_kwargs:
-                request.cb_kwargs[arg] = value
-        # TODO: check if all arguments are fulfilled somehow?
-
-    @inlineCallbacks
-    def process_request(self, request: Request, spider: Spider) -> Generator[None, None, Optional[DummyResponse]]:
+    def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
         """This method checks if the request is really needed and if its
         download could be skipped by trying to infer if a ``Response``
         is going to be used by the callback or a Page Input.
@@ -86,7 +71,7 @@ def process_request(self, request: Request, spider: Spider) -> Generator[None, N
         """
         if self.injector.is_scrapy_response_required(request):
             return None
-        yield from self._inject_cb_kwargs(request)
+
         logger.debug(f"Using DummyResponse instead of downloading {request}")
         return DummyResponse(url=request.url, request=request)
@@ -105,5 +90,16 @@ def process_response(self, request: Request, response: Response,
         and an injectable attribute, the user-defined ``cb_kwargs`` takes
         precedence.
         """
-        yield from self._inject_cb_kwargs(request, response)
+        # Find out the dependencies
+        final_kwargs = yield from self.injector.build_callback_dependencies(
+            request,
+            response=response,
+        )
+        # Fill the callback arguments with the created instances
+        for arg, value in final_kwargs.items():
+            # Precedence of user callback arguments
+            if arg not in request.cb_kwargs:
+                request.cb_kwargs[arg] = value
+        # TODO: check if all arguments are fulfilled somehow?
+
         return response

From 8080117901aeef4cf736b9187409fd8da22fbd35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 15 Jun 2022 15:23:33 +0200
Subject: [PATCH 4/9] Revert unneeded change

---
 scrapy_poet/middleware.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index 4de7cd05..e7af18e3 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -93,7 +93,7 @@ def process_response(self, request: Request, response: Response,
         # Find out the dependencies
         final_kwargs = yield from self.injector.build_callback_dependencies(
             request,
-            response=response,
+            response,
         )
         # Fill the callback arguments with the created instances
         for arg, value in final_kwargs.items():

From 38517845c275523e4dd457e1b4e67d9f4ffbfe8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 15 Jun 2022 15:36:58 +0200
Subject: [PATCH 5/9] Test RequestUrl page object

---
 tests/test_middleware.py | 54 ++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index cc39cc9d..3d2f5669 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -310,19 +310,6 @@ def parse(self, response: DummyResponse):
         }
 
 
-class RequestUrlSpider(scrapy.Spider):
-    url = None
-
-    def start_requests(self):
-        yield Request(url=self.url, callback=self.parse)
-
-    def parse(self, response: DummyResponse, *, url: RequestUrl):
-        return {
-            'response': response,
-            'url': url,
-        }
-
-
 @inlineCallbacks
 def test_skip_downloads(settings):
     item, url, crawler = yield crawl_single_item(
@@ -340,6 +327,19 @@ def test_skip_downloads(settings):
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
 
 
+class RequestUrlSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, url: RequestUrl):
+        return {
+            'response': response,
+            'url': url,
+        }
+
+
 @inlineCallbacks
 def test_skip_download_request_url(settings):
     item, url, crawler = yield crawl_single_item(
@@ -352,6 +352,34 @@ def test_skip_download_request_url(settings):
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
 
 
+@attr.s(auto_attribs=True)
+class RequestUrlPage(ItemPage):
+    url: RequestUrl
+
+    def to_item(self):
+        return {'url': self.url}
+
+
+class RequestUrlPageSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, page: RequestUrlPage):
+        return page.to_item()
+
+
+@inlineCallbacks
+def test_skip_download_request_url_page(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlPageSpider, ProductHtml, settings)
+    assert tuple(item.keys()) == ('url',)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
+
+
 @mock.patch("scrapy_poet.injection.SqlitedictCache", spec=SqlitedictCache)
 def test_cache_closed_on_spider_close(mock_sqlitedictcache, settings):
     def get_middleware(settings):

From afb9be2bafc3269056ba3fe0dba2c57a22cf27d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 15 Jun 2022 15:43:18 +0200
Subject: [PATCH 6/9] Implement a downloader/request_count/skipped stat

---
 scrapy_poet/middleware.py | 1 +
 tests/test_middleware.py  | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index e7af18e3..1131862e 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -73,6 +73,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
             return None
 
         logger.debug(f"Using DummyResponse instead of downloading {request}")
+        self.crawler.stats.inc_value("downloader/request_count/skipped")
         return DummyResponse(url=request.url, request=request)
 
     @inlineCallbacks
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index 3d2f5669..28bb622c 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -317,6 +317,7 @@ def test_skip_downloads(settings):
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is False
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1
+    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 0
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
 
     item, url, crawler = yield crawl_single_item(
@@ -324,6 +325,7 @@ def test_skip_downloads(settings):
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is True
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
@@ -349,6 +351,7 @@ def test_skip_download_request_url(settings):
     assert isinstance(item['url'], RequestUrl)
     assert str(item['url']) == url
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
@@ -377,6 +380,7 @@ def test_skip_download_request_url_page(settings):
     assert tuple(item.keys()) == ('url',)
     assert str(item['url']) == url
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1

From 17befe590f342a101d098bf0e8d82727ba5d064d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Thu, 16 Jun 2022 08:54:47 +0200
Subject: =?UTF-8?q?stat:=20downloader/request=5Fcount/skipped?=
 =?UTF-8?q?=20=E2=86=92=20scrapy=5Fpoet/dummy=5Fresponse=5Fcount?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scrapy_poet/middleware.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index 1131862e..55273256 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -73,7 +73,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
             return None
 
         logger.debug(f"Using DummyResponse instead of downloading {request}")
-        self.crawler.stats.inc_value("downloader/request_count/skipped")
+        self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
         return DummyResponse(url=request.url, request=request)

From 62e6d00470e0a0f523c539143c93b2a84b53ac76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Thu, 16 Jun 2022 08:56:00 +0200
Subject: =?UTF-8?q?stat:=20downloader/request=5Fcount/skipped?=
 =?UTF-8?q?=20=E2=86=92=20scrapy=5Fpoet/dummy=5Fresponse=5Fcount=20(tests)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_middleware.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index 28bb622c..12f6ea1f 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -317,7 +317,7 @@ def test_skip_downloads(settings):
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is False
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1
-    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 0
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
 
     item, url, crawler = yield crawl_single_item(
@@ -325,7 +325,7 @@ def test_skip_downloads(settings):
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is True
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
-    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
@@ -351,7 +351,7 @@ def test_skip_download_request_url(settings):
     assert isinstance(item['url'], RequestUrl)
     assert str(item['url']) == url
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
-    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
@@ -380,7 +380,7 @@ def test_skip_download_request_url_page(settings):
     assert tuple(item.keys()) == ('url',)
     assert str(item['url']) == url
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
-    assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1

From 05f1bfd9b905dd9501dcf08c37f56663c2d8a4ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 17 Jun 2022 11:24:24 +0200
Subject: [PATCH 9/9] Address mypy feedback

---
 scrapy_poet/middleware.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index ac03704f..0543ee3a 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -8,7 +8,7 @@
 from scrapy import Spider, signals
 from scrapy.crawler import Crawler
 from scrapy.http import Request, Response
-from twisted.internet.defer import inlineCallbacks
+from twisted.internet.defer import Deferred, inlineCallbacks
 
 from scrapy.utils.misc import create_instance, load_object
 
@@ -85,7 +85,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
     @inlineCallbacks
     def process_response(self, request: Request, response: Response,
-                         spider: Spider) -> Generator[None, None, Response]:
+                         spider: Spider) -> Generator[Deferred[object], object, Response]:
         """This method fills ``request.cb_kwargs`` with instances for
         the required Page Objects found in the callback signature.
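
A minimal usage sketch of what this series enables, not part of the patches
themselves: a callback can depend on ``RequestUrl``, directly or through a
page object, and annotate its ``response`` as ``DummyResponse`` so that the
actual download is skipped. It assumes scrapy-poet's InjectionMiddleware is
enabled in the project's DOWNLOADER_MIDDLEWARES; the spider name and URL
below are made up for the example.

    import attr
    import scrapy
    from web_poet import ItemPage
    from web_poet.page_inputs import RequestUrl

    from scrapy_poet import DummyResponse


    @attr.s(auto_attribs=True)
    class UrlPage(ItemPage):
        # RequestUrlProvider fills this from the outgoing Request alone,
        # so no response body is needed to build the page object.
        url: RequestUrl

        def to_item(self):
            return {'url': str(self.url)}


    class UrlSpider(scrapy.Spider):
        name = 'url_spider'  # illustrative

        def start_requests(self):
            # Illustrative URL.
            yield scrapy.Request('https://example.com', callback=self.parse)

        # Annotating ``response`` as DummyResponse tells InjectionMiddleware
        # that the real response is not needed: the download is skipped and
        # the scrapy_poet/dummy_response_count stat is incremented instead.
        def parse(self, response: DummyResponse, page: UrlPage):
            return page.to_item()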