Add provider for RequestUrl #76

Merged (10 commits, Jun 17, 2022)
docs/conf.py (2 changes: 1 addition & 1 deletion)

@@ -61,7 +61,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.

scrapy_poet/middleware.py (41 changes: 23 additions & 18 deletions)

@@ -3,7 +3,7 @@
are executed.
"""
import logging
-from typing import Optional, Type, TypeVar
+from typing import Generator, Optional, Type, TypeVar

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -14,15 +14,16 @@

from .api import DummyResponse
from .overrides import OverridesRegistry
-from .page_input_providers import HttpResponseProvider
+from .page_input_providers import HttpResponseProvider, RequestUrlProvider
from .injection import Injector


logger = logging.getLogger(__name__)


DEFAULT_PROVIDERS = {
-    HttpResponseProvider: 500
+    HttpResponseProvider: 500,
+    RequestUrlProvider: 600,
}

InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
@@ -54,7 +55,22 @@ def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV:
    def spider_closed(self, spider: Spider) -> None:
        self.injector.close()

-    def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
+    @inlineCallbacks
+    def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None) -> Generator[None, None, None]:
+        # Find out the dependencies
+        final_kwargs = yield from self.injector.build_callback_dependencies(
+            request,
+            response=response,
+        )
+        # Fill the callback arguments with the created instances
+        for arg, value in final_kwargs.items():
+            # Precedence of user callback arguments
+            if arg not in request.cb_kwargs:
+                request.cb_kwargs[arg] = value
+        # TODO: check if all arguments are fulfilled somehow?
+
+    @inlineCallbacks
+    def process_request(self, request: Request, spider: Spider) -> Generator[None, None, Optional[DummyResponse]]:
        """This method checks if the request is really needed and if its
        download could be skipped by trying to infer if a ``Response``
        is going to be used by the callback or a Page Input.
@@ -70,13 +86,13 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
"""
if self.injector.is_scrapy_response_required(request):
return None

yield from self._inject_cb_kwargs(request)
logger.debug(f"Using DummyResponse instead of downloading {request}")
return DummyResponse(url=request.url, request=request)

    @inlineCallbacks
    def process_response(self, request: Request, response: Response,
-                         spider: Spider) -> Response:
+                         spider: Spider) -> Generator[None, None, Response]:
        """This method fills ``request.cb_kwargs`` with instances for
        the required Page Objects found in the callback signature.

@@ -89,16 +105,5 @@ def process_response(self, request: Request, response: Response,
        and an injectable attribute,
        the user-defined ``cb_kwargs`` takes precedence.
        """
-        # Find out the dependencies
-        final_kwargs = yield from self.injector.build_callback_dependencies(
-            request,
-            response
-        )
-        # Fill the callback arguments with the created instances
-        for arg, value in final_kwargs.items():
-            # Precedence of user callback arguments
-            if arg not in request.cb_kwargs:
-                request.cb_kwargs[arg] = value
-        # TODO: check if all arguments are fulfilled somehow?
-
+        yield from self._inject_cb_kwargs(request, response)
        return response
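
Taken together, the middleware changes move dependency resolution into the shared _inject_cb_kwargs helper and call it from process_request as well, so callback arguments such as RequestUrl can be filled even when the download is skipped. A minimal spider sketch of what this enables, modeled on the RequestUrlSpider test added further down (the spider name and start URL are illustrative, not part of this PR):

import scrapy
from web_poet.page_inputs import RequestUrl

from scrapy_poet import DummyResponse


class UrlEchoSpider(scrapy.Spider):
    # Hypothetical example spider; only the injection behavior is from the PR.
    name = "url_echo"
    start_urls = ["http://example.com"]

    def parse(self, response: DummyResponse, *, url: RequestUrl):
        # The DummyResponse annotation tells the middleware the body is not
        # needed, and RequestUrl can be built from the Request alone, so
        # process_request() skips the download and still fills cb_kwargs.
        yield {"url": str(url)}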

scrapy_poet/page_input_providers.py (13 changes: 12 additions & 1 deletion)

@@ -19,7 +19,7 @@
from scrapy.utils.request import request_fingerprint

from scrapy_poet.injection_errors import MalformedProvidedClassesError
-from web_poet import HttpResponse, HttpResponseHeaders
+from web_poet import HttpResponse, HttpResponseHeaders, RequestUrl


class PageObjectInputProvider:
@@ -197,3 +197,14 @@ def deserialize(self, data: Any) -> Sequence[Any]:
            )
            for response_data in data
        ]


+class RequestUrlProvider(PageObjectInputProvider):
+    """This class provides ``web_poet.page_inputs.RequestUrl`` instances."""
+
+    provided_classes = {RequestUrl}
+    name = "request_url"
+
+    def __call__(self, to_provide: Set[Callable], request: Request):
+        """Builds a ``RequestUrl`` instance using a Scrapy ``Request``."""
+        return [RequestUrl(url=request.url)]
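
On the page-object side, RequestUrl can now be declared as a dependency like any other page input. A hedged sketch of a page object that needs only the request URL (the class name and item fields are hypothetical; the attrs-based style follows web-poet's conventions):

import attr
from web_poet import ItemPage, RequestUrl


@attr.define
class RequestUrlPage(ItemPage):
    # The only declared dependency is RequestUrl, so RequestUrlProvider can
    # satisfy it without downloading the page.
    url: RequestUrl

    def to_item(self) -> dict:
        return {"url": str(self.url)}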

tests/test_injection.py (4 changes: 2 additions & 2 deletions)

@@ -266,11 +266,11 @@ def callback(response: DummyResponse,

class Html(Injectable):
    url = "http://example.com"
-    html = """<html><body>Price: <span class="price">22</span>€</body></html>"""
+    text = """<html><body>Price: <span class="price">22</span>€</body></html>"""

    @property
    def selector(self):
-        return parsel.Selector(self.html)
+        return parsel.Selector(self.text)


class EurDollarRate(Injectable):

tests/test_middleware.py (27 changes: 26 additions & 1 deletion)

@@ -25,7 +25,7 @@
    PageObjectInputProvider
)
from web_poet import default_registry
-from web_poet.page_inputs import HttpResponse
+from web_poet.page_inputs import HttpResponse, RequestUrl
from scrapy_poet import DummyResponse
from tests.utils import (HtmlResource,
                         crawl_items,
@@ -310,6 +310,19 @@ def parse(self, response: DummyResponse):
        }


+class RequestUrlSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, *, url: RequestUrl):
+        return {
+            'response': response,
+            'url': url,
+        }


@inlineCallbacks
def test_skip_downloads(settings):
    item, url, crawler = yield crawl_single_item(

@@ -327,6 +340,18 @@ def test_skip_downloads(settings):
    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


+@inlineCallbacks
+def test_skip_download_request_url(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlSpider, ProductHtml, settings)
+    assert isinstance(item['response'], Response) is True
+    assert isinstance(item['response'], DummyResponse) is True
+    assert isinstance(item['url'], RequestUrl)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


@mock.patch("scrapy_poet.injection.SqlitedictCache", spec=SqlitedictCache)
def test_cache_closed_on_spider_close(mock_sqlitedictcache, settings):
    def get_middleware(settings):
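
For completeness: no extra configuration is needed to use the new provider, since it is registered in DEFAULT_PROVIDERS; a project only needs the scrapy-poet middleware enabled. A minimal settings sketch (the priority value 543 follows the scrapy-poet documentation):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    "scrapy_poet.InjectionMiddleware": 543,
}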