From 6191c63b14b50ea41c493c7ce8b65082e11aee75 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 1 Dec 2023 18:31:58 +0400 Subject: [PATCH 01/10] Switch from ItemProvider to custom builders. --- scrapy_poet/downloadermiddlewares.py | 2 - scrapy_poet/injection.py | 46 ++++++++-- scrapy_poet/page_input_providers.py | 122 ++------------------------- tests/test_injection.py | 71 +++++++++++++++- tests/test_providers.py | 30 ++----- tests/test_web_poet_rules.py | 105 ++++++++--------------- 6 files changed, 157 insertions(+), 219 deletions(-) diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py index d7340f1c..118a7bf5 100644 --- a/scrapy_poet/downloadermiddlewares.py +++ b/scrapy_poet/downloadermiddlewares.py @@ -18,7 +18,6 @@ from .page_input_providers import ( HttpClientProvider, HttpResponseProvider, - ItemProvider, PageParamsProvider, RequestUrlProvider, ResponseUrlProvider, @@ -34,7 +33,6 @@ PageParamsProvider: 700, RequestUrlProvider: 800, ResponseUrlProvider: 900, - ItemProvider: 2000, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 92de6a12..dac19111 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -3,7 +3,7 @@ import os import pprint import warnings -from typing import Any, Callable, Dict, List, Mapping, Optional, Set, cast +from typing import Any, Callable, Dict, List, Mapping, Optional, Set, Type, cast import andi from andi.typeutils import issubclass_safe @@ -13,12 +13,12 @@ from scrapy.settings import Settings from scrapy.statscollectors import MemoryStatsCollector, StatsCollector from scrapy.utils.conf import build_component_list -from scrapy.utils.defer import maybeDeferred_coro +from scrapy.utils.defer import deferred_from_coro, maybeDeferred_coro from scrapy.utils.misc import load_object from twisted.internet.defer import inlineCallbacks from web_poet import RulesRegistry from web_poet.page_inputs.http import request_fingerprint -from web_poet.pages import is_injectable +from web_poet.pages import ItemPage, is_injectable from web_poet.serialization.api import deserialize_leaf, load_class, serialize from web_poet.utils import get_fq_class_name @@ -148,8 +148,37 @@ def build_plan(self, request: Request) -> andi.Plan: # Callable[[Callable], Optional[Callable]] but the registry # returns the typing for ``dict.get()`` method. overrides=self.registry.overrides_for(request.url).get, # type: ignore[arg-type] + custom_builder_fn=self._get_item_builder(request), ) + def _get_item_builder( + self, request: Request + ) -> Callable[[Callable], Optional[Callable]]: + """Return a function suitable for passing as ``custom_builder_fn`` to ``andi.plan``. + + The returned function can map an item to a factory for that item based + on the registry. + """ + factory_cache: Dict[Callable, Optional[Callable]] = {} + + def mapping_fn(item_cls: Callable) -> Optional[Callable]: + if item_cls in factory_cache: + return factory_cache.get(item_cls) + page_object_cls: Optional[Type[ItemPage]] = self.registry.page_cls_for_item( + request.url, cast(type, item_cls) + ) + if not page_object_cls: + factory_cache[item_cls] = None + return None + + async def item_factory(page: page_object_cls) -> item_cls: # type: ignore[valid-type] + return await page.to_item() # type: ignore[attr-defined] + + factory_cache[item_cls] = item_factory + return item_factory + + return mapping_fn + @inlineCallbacks def build_instances( self, @@ -170,8 +199,15 @@ def build_instances( # following the andi plan. for cls, kwargs_spec in plan.dependencies: if cls not in instances.keys(): - instances[cls] = cls(**kwargs_spec.kwargs(instances)) - cls_fqn = get_fq_class_name(cast(type, cls)) + result_cls: type = cast(type, cls) + if isinstance(cls, andi.CustomBuilder): + result_cls = cls.result_class_or_fn + instances[result_cls] = yield deferred_from_coro( + cls.factory(**kwargs_spec.kwargs(instances)) + ) + else: + instances[result_cls] = cls(**kwargs_spec.kwargs(instances)) + cls_fqn = get_fq_class_name(result_cls) self.crawler.stats.inc_value(f"poet/injector/{cls_fqn}") return instances diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 672909f0..3198f327 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -8,18 +8,12 @@ different providers in order to acquire data from multiple external sources, for example, from scrapy-playwright or from an API for automatic extraction. """ -import asyncio -from dataclasses import make_dataclass -from inspect import isclass -from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Type, Union +from typing import Any, Callable, ClassVar, Dict, List, Set, Union from warnings import warn -from weakref import WeakKeyDictionary -import andi from scrapy import Request from scrapy.crawler import Crawler from scrapy.http import Response -from scrapy.utils.defer import maybe_deferred_to_future from web_poet import ( HttpClient, HttpResponse, @@ -30,13 +24,9 @@ Stats, ) from web_poet.page_inputs.stats import StatCollector, StatNum -from web_poet.pages import is_injectable from scrapy_poet.downloader import create_scrapy_downloader -from scrapy_poet.injection_errors import ( - MalformedProvidedClassesError, - ProviderDependencyDeadlockError, -) +from scrapy_poet.injection_errors import MalformedProvidedClassesError class PageObjectInputProvider: @@ -229,59 +219,13 @@ def __call__(self, to_provide: Set[Callable], response: Response): class ItemProvider(PageObjectInputProvider): + provided_classes = set() name = "item" - template_deadlock_msg = ( - "Deadlock detected! A loop has been detected to " - "trying to resolve this plan: {plan}" - ) - - allow_prev_instances: bool = True - def __init__(self, injector): super().__init__(injector) - self.registry = self.injector.registry - - # The key that's used here is the ``scrapy.Request`` instance to ensure - # that the cached instances under it are properly garbage collected - # after processing such request. - self._cached_instances = WeakKeyDictionary() - - # This is only used when the reactor is ``AsyncioSelectorReactor`` since - # the ``asyncio.Future`` that it uses doesn't trigger a RecursionError - # unlike Twisted's Deferred. So we use this as a soft-proxy to recursion - # depth to check how many calls to ``self.injector.build_instances`` are - # made. - # Similar to ``_cached_instances`` above, the key is ``scrapy.Request``. - self._build_instances_call_counter = WeakKeyDictionary() - - def provided_classes(self, cls): - """If the item is in any of the ``to_return`` in the rules, then it can - be provided by using the corresponding page object in ``use``. - """ - return isclass(cls) and self.registry.search(to_return=cls) - - def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None: - if request not in self._cached_instances: - self._cached_instances[request] = {} - self._cached_instances[request].update(mapping) - - def get_from_cache(self, request: Request, cls: Callable) -> Optional[Any]: - return self._cached_instances.get(request, {}).get(cls) - - def check_if_deadlock(self, request: Request) -> bool: - """Should only be used when ``AsyncioSelectorReactor`` is the reactor.""" - if request not in self._build_instances_call_counter: - self._build_instances_call_counter[request] = 0 - self._build_instances_call_counter[request] += 1 - - # If there are more than 100 calls to ``injector.build_instances()`` - # for a given request, it might be a deadlock. This limit is large - # enough since the dependency tree for item dependencies needing page - # objects and/or items wouldn't reach this far. - if self._build_instances_call_counter[request] > 100: - return True - return False + msg = "The ItemProvider now does nothing and you should disable it." + warn(msg, DeprecationWarning, stacklevel=2) async def __call__( self, @@ -290,61 +234,7 @@ async def __call__( response: Response, prev_instances: Dict, ) -> List[Any]: - results = [] - for cls in to_provide: - if item := self.get_from_cache(request, cls): - results.append(item) - continue - - page_object_cls = self.registry.page_cls_for_item(request.url, cls) - if not page_object_cls: - warn( - f"Can't find appropriate page object for {cls} item for " - f"url: '{request.url}'. Check the ApplyRules you're using." - ) - continue - - # https://github.com/scrapinghub/andi/issues/23#issuecomment-1331682180 - fake_call_signature = make_dataclass( - "FakeCallSignature", [("page_object", page_object_cls)] - ) - plan = andi.plan( - fake_call_signature, - is_injectable=is_injectable, - externally_provided=self.injector.is_class_provided_by_any_provider, - ) - - try: - deferred_or_future = maybe_deferred_to_future( - self.injector.build_instances( - request, response, plan, prev_instances - ) - ) - # RecursionError NOT raised when ``AsyncioSelectorReactor`` is used. - # Could be related: https://github.com/python/cpython/issues/93837 - - # Need to check before awaiting on the ``asyncio.Future`` - # before it gets stuck on a potential deadlock. - if asyncio.isfuture(deferred_or_future): - if self.check_if_deadlock(request): - raise ProviderDependencyDeadlockError( - self.template_deadlock_msg.format(plan=plan) - ) - - po_instances = await deferred_or_future - except RecursionError: - raise ProviderDependencyDeadlockError( - self.template_deadlock_msg.format(plan=plan) - ) - - page_object = po_instances[page_object_cls] - item = await page_object.to_item() - - self.update_cache(request, po_instances) - self.update_cache(request, {type(item): item}) - - results.append(item) - return results + return [] class ScrapyPoetStatCollector(StatCollector): diff --git a/tests/test_injection.py b/tests/test_injection.py index 9a0f0eba..21f7b65c 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -8,7 +8,7 @@ from scrapy.http import Response from url_matcher import Patterns from url_matcher.util import get_domain -from web_poet import Injectable, ItemPage, RulesRegistry +from web_poet import Injectable, ItemPage, RulesRegistry, field from web_poet.mixins import ResponseShortcutsMixin from web_poet.rules import ApplyRule @@ -267,6 +267,70 @@ def callback( "d": ClsNoProviderRequired, } + @inlineCallbacks + def test_build_callback_dependencies_minimize_provider_calls(self): + """Test that build_callback_dependencies does not call any given + provider more times than it needs when one provided class is requested + directly while another is a page object dependency requested through + an item.""" + + class ExpensiveDependency1: + pass + + class ExpensiveDependency2: + pass + + class ExpensiveProvider(PageObjectInputProvider): + provided_classes = {ExpensiveDependency1, ExpensiveDependency2} + + def __call__(self, to_provide): + if to_provide != self.provided_classes: + raise RuntimeError( + "The expensive dependency provider has been called " + "with a subset of the classes that it provides and " + "that are required for the callback in this test." + ) + return [cls() for cls in to_provide] + + @attr.define + class MyItem(Injectable): + exp: ExpensiveDependency2 + i: int + + @attr.define + class MyPage(ItemPage[MyItem]): + expensive: ExpensiveDependency2 + + @field + def i(self): + return 42 + + @field + def exp(self): + return self.expensive + + def callback( + expensive: ExpensiveDependency1, + item: MyItem, + ): + pass + + providers = { + ExpensiveProvider: 2, + } + injector = get_injector_for_testing(providers) + injector.registry.add_rule(ApplyRule("", use=MyPage, to_return=MyItem)) + response = get_response_for_testing(callback) + + # This would raise RuntimeError if expectations are not met. + kwargs = yield from injector.build_callback_dependencies( + response.request, response + ) + + # Make sure the test does not simply pass because some dependencies were + # not injected at all. + assert set(kwargs.keys()) == {"expensive", "item"} + class Html(Injectable): url = "http://example.com" @@ -343,7 +407,7 @@ class TestInjectorStats: ), ( {"item": TestItem}, - set(), # there must be no stats as ItemProvider is not enabled + set(), ), ), ) @@ -369,8 +433,7 @@ def callback_factory(): def test_po_provided_via_item(self, injector): rules = [ApplyRule(Patterns(include=()), use=TestItemPage, to_return=TestItem)] registry = RulesRegistry(rules=rules) - providers = {"scrapy_poet.page_input_providers.ItemProvider": 10} - injector = get_injector_for_testing(providers, registry=registry) + injector = get_injector_for_testing({}, registry=registry) def callback(response: DummyResponse, item: TestItem): pass diff --git a/tests/test_providers.py b/tests/test_providers.py index 892d9583..4610b5a1 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -2,6 +2,7 @@ from unittest import mock import attr +import pytest import scrapy from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Request, Spider @@ -228,32 +229,13 @@ def test_page_params_provider(settings): assert results[0] == expected_data -def test_item_provider_cache(settings): - """Note that the bulk of the tests for the ``ItemProvider`` alongside the - ``Injector`` is tested in ``tests/test_web_poet_rules.py``. - - We'll only test its caching behavior here if its properly garbage collected. - """ - +def test_item_provider_deprecated(settings): crawler = get_crawler(Spider, settings) injector = Injector(crawler) - provider = ItemProvider(injector) - - assert len(provider._cached_instances) == 0 - - def inside(): - request = Request("https://example.com") - provider.update_cache(request, {Name: Name("test")}) - assert len(provider._cached_instances) == 1 - - cached_instance = provider.get_from_cache(request, Name) - assert isinstance(cached_instance, Name) - - # The cache should be empty after the ``inside`` scope has finished which - # means that the corresponding ``request`` and the contents under it are - # garbage collected. - inside() - assert len(provider._cached_instances) == 0 + msg = "The ItemProvider now does nothing and you should disable it." + with pytest.warns(DeprecationWarning, match=msg): + provider = ItemProvider(injector) + assert len(provider.provided_classes) == 0 def test_stats_provider(settings): diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py index afab22d2..d8086056 100644 --- a/tests/test_web_poet_rules.py +++ b/tests/test_web_poet_rules.py @@ -73,9 +73,6 @@ class PageObjectCounterMixin: For example, a PO could have its ``.to_item()`` method called multiple times to produce the same item. This is extremely wasteful should there be any additional requests used to produce the item. - - ``ItemProvider`` should cache up such instances and prevent them from being - built again. """ instances: Dict[Type, Any] = defaultdict(list) @@ -432,7 +429,7 @@ async def name(self) -> str: ) @inlineCallbacks def test_item_using_asyncio() -> None: - """This ensures that ``ItemProvider`` works properly for page objects using + """This ensures that the injector works properly for page objects using the ``asyncio`` functionalities. """ item, deps = yield crawl_item_and_deps(DelayedProduct) @@ -457,25 +454,6 @@ def name(self) -> str: return "wrong url" -@inlineCallbacks -def test_basic_item_with_page_object_but_different_url() -> None: - """If an item has been requested and a page object can produce it, but the - URL pattern is different, the item won't be produced at all. - - For these cases, a warning should be issued since the user might have written - some incorrect URL Pattern for the ``ApplyRule``. - """ - msg = ( - "Can't find appropriate page object for item for url: " - ) - with warnings.catch_warnings(record=True) as caught_warnings: - item, deps = yield crawl_item_and_deps(ItemWithPageObjectButForDifferentUrl) - assert any(True for w in caught_warnings if msg in str(w.message)) - assert item is None - assert not deps - - @attrs.define class ProductFromParent: name: str @@ -804,10 +782,7 @@ def name(self) -> str: return "replaced product name" -@pytest.mark.xfail( - reason="This currently causes an ``UndeclaredProvidedTypeError`` since the " - "ItemProvide has received a different type of item class from the page object." -) +@pytest.mark.xfail(reason="Need to check the current failure cause") @inlineCallbacks def test_item_to_return_in_handle_urls() -> None: """Even if ``@handle_urls`` could derive the value for the ``to_return`` @@ -863,10 +838,7 @@ def name(self) -> str: return "subclass replaced product name" -@pytest.mark.xfail( - reason="This currently causes an ``UndeclaredProvidedTypeError`` since the " - "ItemProvide has received a different type of item class from the page object." -) +@pytest.mark.xfail(reason="Need to check the current failure cause") @inlineCallbacks def test_item_to_return_in_handle_urls_subclass() -> None: """Same case as with the ``test_item_to_return_in_handle_urls()`` case above @@ -910,10 +882,7 @@ def name(self) -> str: return "standalone product name" -@pytest.mark.xfail( - reason="This currently causes an ``UndeclaredProvidedTypeError`` since the " - "ItemProvide has received a different type of item class from the page object." -) +@pytest.mark.xfail(reason="Need to check the current failure cause") @inlineCallbacks def test_item_to_return_standalone() -> None: """Same case as with ``test_item_to_return_in_handle_urls()`` above but the @@ -1372,7 +1341,7 @@ class EggItem: @handle_urls(URL) @attrs.define -class ChickenDeadlockPage(ItemPage[ChickenItem]): +class ChickenCyclePage(ItemPage[ChickenItem]): other_injected_item: EggItem @field @@ -1386,7 +1355,7 @@ def other(self) -> str: @handle_urls(URL) @attrs.define -class EggDeadlockPage(ItemPage[EggItem]): +class EggCyclePage(ItemPage[EggItem]): other_injected_item: ChickenItem @field @@ -1399,30 +1368,30 @@ def other(self) -> str: @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_a(caplog) -> None: - """Items with page objects which depend on each other resulting in a deadlock +def test_page_object_with_item_dependency_cycle_a(caplog) -> None: + """Items with page objects which depend on each other resulting in a plan cycle should have a corresponding error raised. """ yield crawl_item_and_deps(ChickenItem) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_b(caplog) -> None: +def test_page_object_with_item_dependency_cycle_b(caplog) -> None: yield crawl_item_and_deps(EggItem) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_c(caplog) -> None: - yield crawl_item_and_deps(ChickenDeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_c(caplog) -> None: + yield crawl_item_and_deps(ChickenCyclePage) + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_d(caplog) -> None: - yield crawl_item_and_deps(EggDeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_d(caplog) -> None: + yield crawl_item_and_deps(EggCyclePage) + assert "Cyclic dependency found" in caplog.text @attrs.define @@ -1439,7 +1408,7 @@ class Egg2Item: @handle_urls(URL) @attrs.define -class Chicken2DeadlockPage(ItemPage[Chicken2Item]): +class Chicken2CyclePage(ItemPage[Chicken2Item]): other_injected_item: Egg2Item @field @@ -1453,8 +1422,8 @@ def other(self) -> str: @handle_urls(URL) @attrs.define -class Egg2DeadlockPage(ItemPage[Egg2Item]): - other_injected_page: Chicken2DeadlockPage +class Egg2CyclePage(ItemPage[Egg2Item]): + other_injected_page: Chicken2CyclePage @field def name(self) -> str: @@ -1467,30 +1436,30 @@ async def other(self) -> str: @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_a(caplog) -> None: - """Same with ``test_page_object_with_item_dependency_deadlock()`` but one +def test_page_object_with_item_dependency_cycle_2_a(caplog) -> None: + """Same with ``test_page_object_with_item_dependency_cycle()`` but one of the page objects requires a page object instead of an item. """ yield crawl_item_and_deps(Chicken2Item) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_b(caplog) -> None: +def test_page_object_with_item_dependency_cycle_2_b(caplog) -> None: yield crawl_item_and_deps(Egg2Item) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_c(caplog) -> None: - yield crawl_item_and_deps(Chicken2DeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_2_c(caplog) -> None: + yield crawl_item_and_deps(Chicken2CyclePage) + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_d(caplog) -> None: - yield crawl_item_and_deps(Egg2DeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_2_d(caplog) -> None: + yield crawl_item_and_deps(Egg2CyclePage) + assert "Cyclic dependency found" in caplog.text @attrs.define @@ -1543,7 +1512,7 @@ def test_page_object_returning_item_which_is_also_a_dep_but_no_provider_item( but there's no provider for the original item """ yield crawl_item_and_deps(Mobius) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "AssertionError" in caplog.text @inlineCallbacks @@ -1554,7 +1523,7 @@ def test_page_object_returning_item_which_is_also_a_dep_but_no_provider_po( but tests the PO instead of the item. """ yield crawl_item_and_deps(MobiusPage) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "AssertionError" in caplog.text @attrs.define @@ -1704,10 +1673,10 @@ def test_created_apply_rules() -> None: ), ApplyRule(URL, use=ProductDeepDependencyPage, to_return=MainProductC), ApplyRule(URL, use=ProductDuplicateDeepDependencyPage, to_return=MainProductD), - ApplyRule(URL, use=ChickenDeadlockPage, to_return=ChickenItem), - ApplyRule(URL, use=EggDeadlockPage, to_return=EggItem), - ApplyRule(URL, use=Chicken2DeadlockPage, to_return=Chicken2Item), - ApplyRule(URL, use=Egg2DeadlockPage, to_return=Egg2Item), + ApplyRule(URL, use=ChickenCyclePage, to_return=ChickenItem), + ApplyRule(URL, use=EggCyclePage, to_return=EggItem), + ApplyRule(URL, use=Chicken2CyclePage, to_return=Chicken2Item), + ApplyRule(URL, use=Egg2CyclePage, to_return=Egg2Item), ApplyRule(URL, use=MobiusPage, to_return=Mobius), ApplyRule(URL, use=KangarooPage, to_return=Kangaroo), ApplyRule(Patterns([URL], priority=600), use=JoeyPage, to_return=Kangaroo), From fceec040a255b4ef871b70033b96eb1ed865a0f1 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 13 Dec 2023 17:42:19 +0400 Subject: [PATCH 02/10] Fix the expected exceptions after the andi change. --- tests/test_web_poet_rules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py index d8086056..1ccfe3e2 100644 --- a/tests/test_web_poet_rules.py +++ b/tests/test_web_poet_rules.py @@ -1512,7 +1512,7 @@ def test_page_object_returning_item_which_is_also_a_dep_but_no_provider_item( but there's no provider for the original item """ yield crawl_item_and_deps(Mobius) - assert "AssertionError" in caplog.text + assert "NonProvidableError" in caplog.text @inlineCallbacks @@ -1523,7 +1523,7 @@ def test_page_object_returning_item_which_is_also_a_dep_but_no_provider_po( but tests the PO instead of the item. """ yield crawl_item_and_deps(MobiusPage) - assert "AssertionError" in caplog.text + assert "NonProvidableError" in caplog.text @attrs.define From 67c32c0f47d16081de0bfeab0f5c801ea39ab468 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 13 Dec 2023 18:00:52 +0400 Subject: [PATCH 03/10] Restore test_basic_item_with_page_object_but_different_url(). --- tests/test_web_poet_rules.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py index 1ccfe3e2..271d242a 100644 --- a/tests/test_web_poet_rules.py +++ b/tests/test_web_poet_rules.py @@ -147,6 +147,19 @@ def assert_warning_tokens(caught_warnings, expected_warning_tokens): assert all(results) +@inlineCallbacks +def assert_no_item(page: type) -> None: + # Starting Scrapy 2.7, there's better support for async callbacks. This + # means that errors aren't suppressed. + if is_min_scrapy_version("2.7.0"): + expected_msg = r"parse\(\) missing 1 required keyword-only argument: 'item'" + with pytest.raises(TypeError, match=expected_msg): + yield crawl_item_and_deps(page) + else: + item = yield crawl_item_and_deps(page) + assert item == (None, [{}]) + + @handle_urls(URL) class UrlMatchPage(ItemPage): async def to_item(self) -> dict: @@ -398,16 +411,7 @@ def test_basic_item_but_no_page_object() -> None: assigned to it in any of the given ``ApplyRule``, it would result to an error in the spider callback since """ - - # Starting Scrapy 2.7, there's better support for async callbacks. This - # means that errors aren't suppressed. - if is_min_scrapy_version("2.7.0"): - expected_msg = r"parse\(\) missing 1 required keyword-only argument: 'item'" - with pytest.raises(TypeError, match=expected_msg): - yield crawl_item_and_deps(ItemButNoPageObject) - else: - item = yield crawl_item_and_deps(ItemButNoPageObject) - assert item == (None, [{}]) + yield assert_no_item(ItemButNoPageObject) @attrs.define @@ -454,6 +458,14 @@ def name(self) -> str: return "wrong url" +@inlineCallbacks +def test_basic_item_with_page_object_but_different_url() -> None: + """If an item has been requested and a page object can produce it, but the + URL pattern is different, the item won't be produced at all. + """ + yield assert_no_item(ItemWithPageObjectButForDifferentUrl) + + @attrs.define class ProductFromParent: name: str From 7052ec86c5e0cdbf780c31e854c5ec0add7d0f0f Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 13 Dec 2023 18:18:57 +0400 Subject: [PATCH 04/10] Clarify test_stats(). --- tests/test_injection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_injection.py b/tests/test_injection.py index 7b161dcc..71b5b9e8 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -603,7 +603,7 @@ class TestInjectorStats: ), ( {"item": TestItem}, - set(), + set(), # there must be no stats as TestItem is not in the registry ), ), ) @@ -626,7 +626,7 @@ def callback_factory(): assert set(poet_stats) == expected @inlineCallbacks - def test_po_provided_via_item(self, injector): + def test_po_provided_via_item(self): rules = [ApplyRule(Patterns(include=()), use=TestItemPage, to_return=TestItem)] registry = RulesRegistry(rules=rules) injector = get_injector_for_testing({}, registry=registry) From 8723d08b970c7fd09f33df6c595ea9722f0f9527 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 13 Dec 2023 19:31:41 +0400 Subject: [PATCH 05/10] Fix and remove xfail from tests broken by ItemProvider. --- tests/test_web_poet_rules.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py index b160fc71..0708fe6f 100644 --- a/tests/test_web_poet_rules.py +++ b/tests/test_web_poet_rules.py @@ -790,7 +790,6 @@ def name(self) -> str: return "replaced product name" -@pytest.mark.xfail(reason="Need to check the current failure cause") @inlineCallbacks def test_item_to_return_in_handle_urls() -> None: """Even if ``@handle_urls`` could derive the value for the ``to_return`` @@ -803,14 +802,8 @@ class vs the class that is actually returned. Using the ``to_return`` """ item, deps = yield crawl_item_and_deps(ReplacedProduct) assert item == Product(name="replaced product name") - assert_deps(deps, {"page": ReplacedProductPage}) - + assert_deps(deps, {"item": Product}) -@inlineCallbacks -def test_item_to_return_in_handle_urls_other() -> None: - """Remaining tests for ``test_item_to_return_in_handle_urls()`` which are - not expected to be xfail. - """ # Requesting the underlying item class from the PO should still work. item, deps = yield crawl_item_and_deps(Product) assert item == Product(name="product name") @@ -846,7 +839,6 @@ def name(self) -> str: return "subclass replaced product name" -@pytest.mark.xfail(reason="Need to check the current failure cause") @inlineCallbacks def test_item_to_return_in_handle_urls_subclass() -> None: """Same case as with the ``test_item_to_return_in_handle_urls()`` case above @@ -854,14 +846,8 @@ def test_item_to_return_in_handle_urls_subclass() -> None: """ item, deps = yield crawl_item_and_deps(SubclassReplacedProduct) assert item == ParentReplacedProduct(name="subclass replaced product name") - assert_deps(deps, {"page": SubclassReplacedProductPage}) - + assert_deps(deps, {"item": ParentReplacedProduct}) -@inlineCallbacks -def test_item_to_return_in_handle_urls_subclass_others() -> None: - """Remaining tests for ``test_item_to_return_in_handle_urls_subclass()`` - which are not expected to be xfail. - """ # Requesting the underlying item class from the parent PO should still work. item, deps = yield crawl_item_and_deps(ParentReplacedProduct) assert item == ParentReplacedProduct(name="parent replaced product name") @@ -890,7 +876,6 @@ def name(self) -> str: return "standalone product name" -@pytest.mark.xfail(reason="Need to check the current failure cause") @inlineCallbacks def test_item_to_return_standalone() -> None: """Same case as with ``test_item_to_return_in_handle_urls()`` above but the @@ -898,14 +883,7 @@ def test_item_to_return_standalone() -> None: """ item, deps = yield crawl_item_and_deps(StandaloneProduct) assert item == {"name": "standalone product name"} - assert_deps(deps, {"page": StandaloneProductPage}) - - -@inlineCallbacks -def test_item_to_return_standalone_others() -> None: - """Remaining tests for ``test_item_to_return_standalone()`` - which are not expected to be xfail. - """ + assert_deps(deps, {"item": dict}) # calling the actual page object should still work item, deps = yield crawl_item_and_deps(StandaloneProductPage) From e04992609d9818fee5ac6f7670a008d688ba51e2 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 25 Dec 2023 14:05:42 +0400 Subject: [PATCH 06/10] Update the test_page_object_returning_item_which_is_also_a_dep_2() expectation. --- tests/test_web_poet_rules.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py index 0708fe6f..cefd7441 100644 --- a/tests/test_web_poet_rules.py +++ b/tests/test_web_poet_rules.py @@ -1558,11 +1558,14 @@ def test_page_object_returning_item_which_is_also_a_dep_2() -> None: assert item == Kangaroo(name="(modified by Joey) data from KangarooProvider") assert_deps(deps, {"item": Kangaroo}) - # calling the actual page objects should still work + # both page objects are called item, deps = yield crawl_item_and_deps(KangarooPage, override_settings=settings) - assert item == Kangaroo(name="(modified) data from KangarooProvider") + assert item == Kangaroo( + name="(modified) (modified by Joey) data from KangarooProvider" + ) assert_deps(deps, {"page": KangarooPage}) + # calling the actual page object should still work item, deps = yield crawl_item_and_deps(JoeyPage, override_settings=settings) assert item == Kangaroo(name="(modified by Joey) data from KangarooProvider") assert_deps(deps, {"page": JoeyPage}) From 7c8c7f9d8b16d867a80076116981fa3f1e7876be Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 25 Dec 2023 14:09:34 +0400 Subject: [PATCH 07/10] Remove prev_instances code. --- scrapy_poet/commands.py | 5 ++--- scrapy_poet/injection.py | 20 ++------------------ scrapy_poet/page_input_providers.py | 9 +-------- 3 files changed, 5 insertions(+), 29 deletions(-) diff --git a/scrapy_poet/commands.py b/scrapy_poet/commands.py index 033f7718..e64ee544 100644 --- a/scrapy_poet/commands.py +++ b/scrapy_poet/commands.py @@ -1,7 +1,7 @@ import datetime import logging from pathlib import Path -from typing import Dict, Optional, Type +from typing import Optional, Type import andi import scrapy @@ -38,10 +38,9 @@ def build_instances_from_providers( request: Request, response: Response, plan: andi.Plan, - prev_instances: Optional[Dict] = None, ): instances = yield super().build_instances_from_providers( - request, response, plan, prev_instances + request, response, plan ) if request.meta.get("savefixture", False): saved_dependencies.extend(instances.values()) diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index b92c5ce3..d8288be8 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -184,7 +184,6 @@ def build_instances( request: Request, response: Response, plan: andi.Plan, - prev_instances: Optional[Dict] = None, ): """Build the instances dict from a plan including external dependencies.""" # First we build the external dependencies using the providers @@ -192,7 +191,6 @@ def build_instances( request, response, plan, - prev_instances, ) # All the remaining dependencies are internal so they can be built just # following the andi plan. @@ -217,10 +215,9 @@ def build_instances_from_providers( request: Request, response: Response, plan: andi.Plan, - prev_instances: Optional[Dict] = None, ): """Build dependencies handled by registered providers""" - instances: Dict[Callable, Any] = prev_instances or {} + instances: Dict[Callable, Any] = {} scrapy_provided_dependencies = self.available_dependencies_for_providers( request, response ) @@ -230,22 +227,11 @@ def build_instances_from_providers( provided_classes = { cls for cls in dependencies_set if provider.is_provided(cls) } - - # ignore already provided types if provider doesn't need to use them - if not provider.allow_prev_instances: - provided_classes -= instances.keys() + provided_classes -= instances.keys() # ignore already provided types if not provided_classes: continue - # If dependency instances were already made by previously invoked - # providers, don't try to build them again since it may result in - # incorrect values (e.g. PO modifying an item > 2 times). - required_deps = set(plan.dependencies[-1][1].values()) - built_deps = set(instances.keys()) - if required_deps and required_deps == built_deps: - continue - objs, fingerprint = [], None cache_hit = False if self.cache: @@ -281,8 +267,6 @@ def build_instances_from_providers( externally_provided=scrapy_provided_dependencies, full_final_kwargs=False, ).final_kwargs(scrapy_provided_dependencies) - if provider.allow_prev_instances: - kwargs.update({"prev_instances": instances}) try: # Invoke the provider to get the data objs = yield maybeDeferred_coro( diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 9d0e39bb..d5d70fd0 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -8,7 +8,7 @@ different providers in order to acquire data from multiple external sources, for example, from scrapy-playwright or from an API for automatic extraction. """ -from typing import Any, Callable, ClassVar, Dict, FrozenSet, List, Set, Union +from typing import Any, Callable, ClassVar, FrozenSet, List, Set, Union from warnings import warn from scrapy import Request @@ -98,12 +98,6 @@ def __call__(self, to_provide, response: Response): provided_classes: Union[Set[Callable], Callable[[Callable], bool]] name: ClassVar[str] = "" # It must be a unique name. Used by the cache mechanism - # If set to True, the Injector will not skip the Provider when the dependency has - # been built. Instead, the Injector will pass the previously built instances (by - # the other providers) to the Provider. The Provider can then choose to modify - # these previous instances before returning them to the Injector. - allow_prev_instances: bool = False - def is_provided(self, type_: Callable) -> bool: """ Return ``True`` if the given type is provided by this provider based @@ -257,7 +251,6 @@ async def __call__( to_provide: Set[Callable], request: Request, response: Response, - prev_instances: Dict, ) -> List[Any]: return [] From 2ea32a0d04ccfda4cafe1d0490f33031704437f7 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 25 Dec 2023 17:23:31 +0400 Subject: [PATCH 08/10] Address feedback. --- scrapy_poet/injection.py | 7 ++----- tests/test_injection.py | 10 +++++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index d8288be8..9642b0ea 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -1,3 +1,4 @@ +import functools import inspect import logging import os @@ -158,22 +159,18 @@ def _get_item_builder( The returned function can map an item to a factory for that item based on the registry. """ - factory_cache: Dict[Callable, Optional[Callable]] = {} + @functools.cache # to minimize the registry queries def mapping_fn(item_cls: Callable) -> Optional[Callable]: - if item_cls in factory_cache: - return factory_cache.get(item_cls) page_object_cls: Optional[Type[ItemPage]] = self.registry.page_cls_for_item( request.url, cast(type, item_cls) ) if not page_object_cls: - factory_cache[item_cls] = None return None async def item_factory(page: page_object_cls) -> item_cls: # type: ignore[valid-type] return await page.to_item() # type: ignore[attr-defined] - factory_cache[item_cls] = item_factory return item_factory return mapping_fn diff --git a/tests/test_injection.py b/tests/test_injection.py index 71b5b9e8..65690173 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -479,12 +479,16 @@ class ExpensiveDependency2: class ExpensiveProvider(PageObjectInputProvider): provided_classes = {ExpensiveDependency1, ExpensiveDependency2} + def __init__(self, injector): + super().__init__(injector) + self.call_count = 0 + def __call__(self, to_provide): - if to_provide != self.provided_classes: + self.call_count += 1 + if self.call_count > 1: raise RuntimeError( "The expensive dependency provider has been called " - "with a subset of the classes that it provides and " - "that are required for the callback in this test." + "more than once." ) return [cls() for cls in to_provide] From 51ecf0701bb853e5de1cf0fb72d6f37c31c89231 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Dec 2023 16:54:42 +0400 Subject: [PATCH 09/10] Bump the andi version. --- setup.py | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e2726246..0db70547 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ package_data={"scrapy_poet": ["VERSION"]}, python_requires=">=3.8", install_requires=[ - "andi >= 0.5.0", + "andi >= 0.6.0", "attrs >= 21.3.0", "parsel >= 1.5.0", "scrapy >= 2.6.0", diff --git a/tox.ini b/tox.ini index b3830667..9b499072 100644 --- a/tox.ini +++ b/tox.ini @@ -17,7 +17,7 @@ commands = [pinned] deps = {[testenv]deps} - andi==0.5.0 + andi==0.6.0 attrs==21.3.0 parsel==1.5.0 sqlitedict==1.5.0 From de6105c57b59ca59f78b4d50518ce3fa5edfd8ae Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Dec 2023 17:12:24 +0400 Subject: [PATCH 10/10] Restore Python 3.8 support. --- scrapy_poet/injection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 9642b0ea..d662ed0d 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -160,7 +160,7 @@ def _get_item_builder( on the registry. """ - @functools.cache # to minimize the registry queries + @functools.lru_cache(maxsize=None) # to minimize the registry queries def mapping_fn(item_cls: Callable) -> Optional[Callable]: page_object_cls: Optional[Type[ItemPage]] = self.registry.page_cls_for_item( request.url, cast(type, item_cls)