From 04b1d54058723222b3f419536044aac6b90dec30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 17:29:35 +0100 Subject: [PATCH 1/4] Handle CustomBuilder during request fingerprinting --- scrapy_poet/_request_fingerprinter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index f6593a16..7073754f 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -13,6 +13,7 @@ from typing import Callable, Dict, List, Optional, get_args, get_origin from weakref import WeakKeyDictionary + from andi import CustomBuilder from scrapy import Request from scrapy.crawler import Crawler from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS @@ -34,6 +35,8 @@ logger = getLogger(__name__) def _serialize_dep(cls): + if isinstance(cls, CustomBuilder): + cls = cls.result_class_or_fn try: from typing import Annotated except ImportError: From 02607b77e02db6ba5b503da4de21a01b1b701ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Jan 2024 12:28:11 +0100 Subject: [PATCH 2/4] Add a test for fingerprinting with item deps --- tests/test_request_fingerprinter.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index b329590d..70ade7e1 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -512,3 +512,32 @@ async def parse_page(self, response, page: WebPage): fingerprinter.fingerprint(request) fingerprinter.fingerprint(request) mock.assert_called_once_with(request) + + +def test_item(): + """Test that fingerprinting works even for items.""" + from scrapy import Request, Spider + from web_poet import ItemPage, handle_urls + + class MyItem: + pass + + @handle_urls("example.com") + class MyPage(ItemPage[MyItem]): + pass + + class TestSpider(Spider): + name = "test" + + def __init__(self, *args, **kwargs): + self.request = Request("https://example.com", callback=self.parse_page) + + async def parse_page(self, response, a: MyItem): + pass + + crawler = get_crawler(spider_cls=TestSpider) + fingerprinter = crawler.request_fingerprinter + + fingerprint = fingerprinter.fingerprint(crawler.spider.request) + assert fingerprint + assert isinstance(fingerprint, bytes) From 774d8d8e0997928a3f572722641c8b52c56eff00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Jan 2024 12:46:23 +0100 Subject: [PATCH 3/4] Do not leak to the default registry from fingerprinting tests --- tests/test_request_fingerprinter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 70ade7e1..31556676 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -517,12 +517,14 @@ async def parse_page(self, response, page: WebPage): def test_item(): """Test that fingerprinting works even for items.""" from scrapy import Request, Spider - from web_poet import ItemPage, handle_urls + from web_poet import ItemPage, RulesRegistry + + registry = RulesRegistry() class MyItem: pass - @handle_urls("example.com") + @registry.handle_urls("example.com") class MyPage(ItemPage[MyItem]): pass @@ -535,7 +537,10 @@ def __init__(self, *args, **kwargs): async def parse_page(self, response, a: MyItem): pass - crawler = get_crawler(spider_cls=TestSpider) + settings = { + "SCRAPY_POET_RULES": registry.get_rules(), + } + crawler = get_crawler(spider_cls=TestSpider, settings=settings) fingerprinter = crawler.request_fingerprinter fingerprint = fingerprinter.fingerprint(crawler.spider.request) From f5300a921d145d4604466ad72d37e4baa5d5b5c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Jan 2024 14:44:20 +0100 Subject: [PATCH 4/4] Fix the test --- scrapy_poet/utils/testing.py | 2 ++ tests/test_request_fingerprinter.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapy_poet/utils/testing.py b/scrapy_poet/utils/testing.py index b334c8cb..6d7ad28c 100644 --- a/scrapy_poet/utils/testing.py +++ b/scrapy_poet/utils/testing.py @@ -15,6 +15,7 @@ from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET +from scrapy_poet import ScrapyPoetRequestFingerprinter from scrapy_poet.utils.mockserver import MockServer @@ -231,6 +232,7 @@ def create_scrapy_settings(request): InjectedDependenciesCollectorMiddleware: 542, "scrapy_poet.InjectionMiddleware": 543, }, + REQUEST_FINGERPRINTER_CLASS=ScrapyPoetRequestFingerprinter, ) return Settings(s) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 31556676..a860225b 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -514,7 +514,7 @@ async def parse_page(self, response, page: WebPage): mock.assert_called_once_with(request) -def test_item(): +def test_item(settings): """Test that fingerprinting works even for items.""" from scrapy import Request, Spider from web_poet import ItemPage, RulesRegistry @@ -537,9 +537,7 @@ def __init__(self, *args, **kwargs): async def parse_page(self, response, a: MyItem): pass - settings = { - "SCRAPY_POET_RULES": registry.get_rules(), - } + settings["SCRAPY_POET_RULES"] = registry.get_rules() crawler = get_crawler(spider_cls=TestSpider, settings=settings) fingerprinter = crawler.request_fingerprinter