Skip to content

Commit

Permalink
Fix request fingerprinting involving items (#185)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Jan 24, 2024
1 parent 3b2a0f7 commit 13280a0
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
3 changes: 3 additions & 0 deletions scrapy_poet/_request_fingerprinter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import Callable, Dict, List, Optional, get_args, get_origin
from weakref import WeakKeyDictionary

from andi import CustomBuilder
from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
Expand All @@ -34,6 +35,8 @@
logger = getLogger(__name__)

def _serialize_dep(cls):
if isinstance(cls, CustomBuilder):
cls = cls.result_class_or_fn
try:
from typing import Annotated
except ImportError:
Expand Down
2 changes: 2 additions & 0 deletions scrapy_poet/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from twisted.web.resource import Resource
from twisted.web.server import NOT_DONE_YET

from scrapy_poet import ScrapyPoetRequestFingerprinter
from scrapy_poet.utils.mockserver import MockServer


Expand Down Expand Up @@ -231,6 +232,7 @@ def create_scrapy_settings(request):
InjectedDependenciesCollectorMiddleware: 542,
"scrapy_poet.InjectionMiddleware": 543,
},
REQUEST_FINGERPRINTER_CLASS=ScrapyPoetRequestFingerprinter,
)
return Settings(s)

Expand Down
32 changes: 32 additions & 0 deletions tests/test_request_fingerprinter.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,3 +512,35 @@ async def parse_page(self, response, page: WebPage):
fingerprinter.fingerprint(request)
fingerprinter.fingerprint(request)
mock.assert_called_once_with(request)


def test_item(settings):
    """Check that a request whose callback depends on an item can be fingerprinted.

    ``MyPage`` (an ``ItemPage[MyItem]``) is registered for example.com and the
    spider callback declares a ``MyItem`` parameter. The fingerprint computed
    for that request must be a non-empty ``bytes`` value.
    """
    from scrapy import Request, Spider
    from web_poet import ItemPage, RulesRegistry

    rules_registry = RulesRegistry()

    class MyItem:
        pass

    @rules_registry.handle_urls("example.com")
    class MyPage(ItemPage[MyItem]):
        pass

    class TestSpider(Spider):
        name = "test"

        def __init__(self, *args, **kwargs):
            # Keep the request on the spider so the test body can fingerprint it.
            self.request = Request("https://example.com", callback=self.parse_page)

        async def parse_page(self, response, a: MyItem):
            pass

    # Expose the locally registered rules through the settings passed to the crawler.
    settings["SCRAPY_POET_RULES"] = rules_registry.get_rules()
    crawler = get_crawler(spider_cls=TestSpider, settings=settings)

    result = crawler.request_fingerprinter.fingerprint(crawler.spider.request)

    assert result
    assert isinstance(result, bytes)

0 comments on commit 13280a0

Please sign in to comment.