diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index fde14af2..0543ee3a 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -3,12 +3,12 @@
 are executed.
 """
 import logging
-from typing import Optional, Type, TypeVar
+from typing import Generator, Optional, Type, TypeVar
 
 from scrapy import Spider, signals
 from scrapy.crawler import Crawler
 from scrapy.http import Request, Response
-from twisted.internet.defer import inlineCallbacks
+from twisted.internet.defer import Deferred, inlineCallbacks
 from scrapy.utils.misc import create_instance, load_object
 
@@ -17,6 +17,7 @@
     HttpClientProvider,
     HttpResponseProvider,
     PageParamsProvider,
+    RequestUrlProvider,
 )
 from .overrides import OverridesRegistry
 from .injection import Injector
@@ -29,6 +30,7 @@
     HttpResponseProvider: 500,
     HttpClientProvider: 600,
     PageParamsProvider: 700,
+    RequestUrlProvider: 800,
 }
 
 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
@@ -78,11 +80,12 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
             return None
 
         logger.debug(f"Using DummyResponse instead of downloading {request}")
+        self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
         return DummyResponse(url=request.url, request=request)
 
     @inlineCallbacks
     def process_response(self, request: Request, response: Response,
-                         spider: Spider) -> Response:
+                         spider: Spider) -> Generator[Deferred[object], object, Response]:
         """This method fills ``request.cb_kwargs`` with instances for
         the required Page Objects found in the callback signature.
 
@@ -98,7 +101,7 @@ def process_response(self, request: Request, response: Response,
         # Find out the dependencies
         final_kwargs = yield from self.injector.build_callback_dependencies(
             request,
-            response
+            response,
         )
         # Fill the callback arguments with the created instances
         for arg, value in final_kwargs.items():
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index f47bec2a..8a3e2ca0 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -21,7 +21,7 @@
 from scrapy_poet.utils import scrapy_response_to_http_response
 from scrapy_poet.injection_errors import MalformedProvidedClassesError
 from scrapy_poet.downloader import create_scrapy_downloader
-from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams
+from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams, RequestUrl
 
 
 class PageObjectInputProvider:
@@ -223,3 +223,14 @@ def __call__(self, to_provide: Set[Callable], request: Request):
         ``scrapy.http.Response`` instance.
         """
         return [PageParams(request.meta.get("page_params", {}))]
+
+
+class RequestUrlProvider(PageObjectInputProvider):
+    """This class provides ``web_poet.page_inputs.RequestUrl`` instances."""
+
+    provided_classes = {RequestUrl}
+    name = "request_url"
+
+    def __call__(self, to_provide: Set[Callable], request: Request):
+        """Builds a ``RequestUrl`` instance using a Scrapy ``Request``"""
+        return [RequestUrl(url=request.url)]
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index ac4ea36a..12f6ea1f 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -25,7 +25,7 @@
     PageObjectInputProvider
 )
 from web_poet import default_registry
-from web_poet.page_inputs import HttpResponse
+from web_poet.page_inputs import HttpResponse, RequestUrl
 from scrapy_poet import DummyResponse
 from tests.utils import (HtmlResource,
                          crawl_items,
@@ -317,6 +317,7 @@ def test_skip_downloads(settings):
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is False
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 0
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
 
     item, url, crawler = yield crawl_single_item(
@@ -324,6 +325,62 @@
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is True
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
+
+
+class RequestUrlSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, url: RequestUrl):
+        return {
+            'response': response,
+            'url': url,
+        }
+
+
+@inlineCallbacks
+def test_skip_download_request_url(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlSpider, ProductHtml, settings)
+    assert isinstance(item['response'], Response) is True
+    assert isinstance(item['response'], DummyResponse) is True
+    assert isinstance(item['url'], RequestUrl)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
+
+
+@attr.s(auto_attribs=True)
+class RequestUrlPage(ItemPage):
+    url: RequestUrl
+
+    def to_item(self):
+        return {'url': self.url}
+
+
+class RequestUrlPageSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, page: RequestUrlPage):
+        return page.to_item()
+
+
+@inlineCallbacks
+def test_skip_download_request_url_page(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlPageSpider, ProductHtml, settings)
+    assert tuple(item.keys()) == ('url',)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1