diff --git a/.flake8 b/.flake8 index 939f4afa..22d193ba 100644 --- a/.flake8 +++ b/.flake8 @@ -46,6 +46,7 @@ per-file-ignores = web_poet/testing/pytest.py:D102 tests/po_lib_to_return/__init__.py:D102 tests/test_testing.py:D102 + tests/test_fields.py:D102 # the suggestion makes the code worse tests/test_serialization.py:B028 diff --git a/tests/test_fields.py b/tests/test_fields.py index f84a2851..e202212c 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -1,6 +1,7 @@ import asyncio import random -from typing import Optional +from collections import defaultdict +from typing import DefaultDict, Optional import attrs import pytest @@ -23,9 +24,13 @@ HttpResponse, Injectable, ItemPage, + Returns, + SelectFields, + WebPage, field, item_from_fields, item_from_fields_sync, + item_from_select_fields, ) from web_poet.fields import FieldInfo, get_fields_dict @@ -439,7 +444,6 @@ def field_foo_cached(self): @pytest.mark.asyncio async def test_field_with_handle_urls() -> None: - page = ProductPage() assert page.name == "name" assert page.price == 12.99 @@ -630,3 +634,626 @@ def x(self) -> int: assert info["x"] == FieldInfo(name="x", meta=None, out=None, disabled=True) assert info["y"] == FieldInfo(name="y", meta=None, out=None, disabled=False) assert info["z"] == FieldInfo(name="z", meta=None, out=None, disabled=True) + + +@attrs.define +class BigItem: + x: int + y: Optional[int] = None + z: Optional[int] = None + + +@attrs.define +class SmallItem: + y: Optional[int] = None + + +@attrs.define +class BigPage(WebPage[BigItem]): + call_counter: DefaultDict = attrs.field(factory=lambda: defaultdict(int)) + + @field + def x(self): + self.call_counter["x"] += 1 + return 1 + + @field(disabled=False) + def y(self): + self.call_counter["y"] += 1 + return 2 + + @field(disabled=True) + def z(self): + self.call_counter["z"] += 1 + return 3 + + +@pytest.mark.asyncio +async def test_select_fields() -> None: + # Required fields from the item cls which are not included raise an 
TypeError + expected_type_error_msg = ( + r"__init__\(\) missing 1 required positional argument: 'x'" + ) + response = HttpResponse("https://example.com", b"") + + # When SelectFields isn't set + page = BigPage(response) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 1, "y": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "y": 2} + + # If no field selection directive is given but SelectFields is set, it would + # use the default fields that are not disabled. + page = BigPage(response, select_fields=SelectFields(None)) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 1, "y": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "y": 2} + + # Same case as above but given an empty dict + page = BigPage(response, select_fields=SelectFields({})) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 1, "y": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "y": 2} + + # Select all fields + page = BigPage(response, select_fields=SelectFields({"*": True})) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "y": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 2, "y": 2, "z": 2} + + # Don't select all fields; but in this case a TypeError is raised since + # required fields aren't supplied to the item + page = BigPage(response, select_fields=SelectFields({"*": False})) + assert page.fields_to_ignore == ["x", "y", "z"] + with pytest.raises(TypeError, match=expected_type_error_msg): + 
await page.to_item() + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {} + + # Exclude all but one (which is the required field in the item) + page = BigPage(response, select_fields=SelectFields({"*": False, "x": True})) + assert page.fields_to_ignore == ["y", "z"] + assert await page.to_item() == BigItem(x=1, y=None, z=None) + assert page.call_counter == {"x": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=None, z=None) + assert page.call_counter == {"x": 2} + + # Include all fields but one + page = BigPage(response, select_fields=SelectFields({"*": True, "y": False})) + assert page.fields_to_ignore == ["y"] + assert await page.to_item() == BigItem(x=1, y=None, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=None, z=3) + assert page.call_counter == {"x": 2, "z": 2} + + # overlapping directives on the same field should be okay + page = BigPage( + response, + select_fields=SelectFields({"*": True, "x": True, "y": True, "z": True}), + ) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "y": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 2, "y": 2, "z": 2} + + # Excluding a required field throws an error + page = BigPage(response, select_fields=SelectFields({"x": False})) + assert page.fields_to_ignore == ["x", "z"] + with pytest.raises(TypeError, match=expected_type_error_msg): + await page.to_item() + assert page.call_counter == {"y": 1} + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"y": 2} + + # Boolean-like values are not supported. + expected_non_boolean_value_error_msg = ( + "SelectField only allows boolean values as keys. 
" + "Got: {'x': 0, 'y': 0, 'z': 1}" + ) + page = BigPage( + response, + select_fields=SelectFields({"x": 0, "y": 0, "z": 1}), # type: ignore[dict-item] + ) + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + page.fields_to_ignore + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + await page.to_item() + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {} + + # If an invalid SelectFields value was passed to `select_fields` parameter + expected_invalid_instance_value_error_msg = ( + r"The select_fields.fields parameter is expecting a Mapping. " + r'Got SelectFields\(fields="not the instance it\'s expecting"\).' + ) + page = BigPage( + response, + select_fields="not the instance it's expecting", # type: ignore[arg-type] + ) + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + page.fields_to_ignore + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + await page.to_item() + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {} + + # If the item class doesn't have a field, it would error out. + fields = {"x": True, "not_existing": True} + expected_value_error_msg = ( + r"The fields {'not_existing'} is not available in which has SelectFields\(fields={'x': True, " + r"'not_existing': True}\)." 
+ ) + page = BigPage(response, select_fields=SelectFields(fields)) + with pytest.raises(ValueError, match=expected_value_error_msg): + page.fields_to_ignore + with pytest.raises(ValueError, match=expected_value_error_msg): + await page.to_item() + with pytest.raises(ValueError, match=expected_value_error_msg): + await item_from_select_fields(page) + + +@attrs.define +class SmallPage(BigPage, Returns[SmallItem]): + pass + + +@attrs.define +class SmallPageSkip(BigPage, Returns[SmallItem], skip_nonitem_fields=True): + pass + + +@pytest.mark.asyncio +async def test_select_fields_small_item() -> None: + expected_type_error_msg = r"__init__\(\) got an unexpected keyword argument 'x'" + response = HttpResponse("https://example.com", b"") + + # Giving excess fields to a small item results in errors + page = SmallPage(response) + assert page.fields_to_ignore == ["z"] + with pytest.raises(TypeError, match=expected_type_error_msg): + await page.to_item() + assert page.call_counter == {"x": 1, "y": 1} + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"x": 2, "y": 2} + + # The error should go away after unselecting the excess field + page = SmallPage(response, select_fields=SelectFields({"x": False})) + assert page.fields_to_ignore == ["x", "z"] + assert await page.to_item() == SmallItem(y=2) + assert page.call_counter == {"y": 1} + assert await item_from_select_fields(page) == SmallItem(y=2) + assert page.call_counter == {"y": 2} + + # If the page object uses skip_nonitem_fields=True, it should work without + # any problems + page2 = SmallPageSkip(response) + assert page2.fields_to_ignore == ["z"] + assert await page2.to_item() == SmallItem(y=2) + assert page2.call_counter == {"y": 1} + assert await item_from_select_fields(page2) == SmallItem(y=2) + assert page2.call_counter == {"y": 2} + + # Declaring "x" as a field to ignore works the same but it's added to the + # ``.fields_to_ignore`` + 
page2 = SmallPageSkip(response, select_fields=SelectFields({"x": False})) + assert page2.fields_to_ignore == ["x", "z"] + assert await page2.to_item() == SmallItem(y=2) + assert page2.call_counter == {"y": 1} + assert await item_from_select_fields(page2) == SmallItem(y=2) + assert page2.call_counter == {"y": 2} + + +@attrs.define +class BigToItemOnlyPage(WebPage[BigItem]): + async def to_item(self) -> BigItem: + return BigItem(x=1, y=2) + + +@pytest.mark.asyncio +async def test_select_fields_but_to_item_only() -> None: + """Same as ``test_select_fields()`` but the page object overrides the + ``.to_item()`` method and doesn't use the ``@field`` decorators at all. + + For the different scenarios in this test, these are consistent: + - ``.fields_to_ignore`` only lists fields explicitly unselected by name. + - ``.to_item()`` is unaffected by the passed ``SelectFields`` since it + doesn't take it into account as it simply returns the item instance. + """ + # Required fields from the item cls which are not included raise a TypeError + expected_type_error_msg = ( + r"__init__\(\) missing 1 required positional argument: 'x'" + ) + response = HttpResponse("https://example.com", b"") + + # When SelectFields isn't set, it should simply extract the non-disabled + # fields. + page = BigToItemOnlyPage(response) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + + # If no field selection directive is given but SelectFields is set, it would + # use the default fields that are not disabled.
+ page = BigToItemOnlyPage(response, select_fields=SelectFields(None)) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + + # Same case as above but given an empty dict + page = BigToItemOnlyPage(response, select_fields=SelectFields({})) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + + # Select all fields + page = BigToItemOnlyPage(response, select_fields=SelectFields({"*": True})) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + + # Don't select all fields; but in this case a TypeError is raised since + # required fields aren't supplied to the item + page = BigToItemOnlyPage(response, select_fields=SelectFields({"*": False})) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + + # Exclude all but one (which is the required field in the item) + page = BigToItemOnlyPage( + response, select_fields=SelectFields({"*": False, "x": True}) + ) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=None, z=None) + + # Include all fields but one + page = BigToItemOnlyPage( + response, select_fields=SelectFields({"*": True, "y": False}) + ) + assert page.fields_to_ignore == ["y"] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=None, z=None) + + # overlapping directives on the same field should be okay + page = BigToItemOnlyPage( + response, + select_fields=SelectFields({"*": True, "x": True, 
"y": True, "z": True}), + ) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + + # Excluding a required field throws an error + page = BigToItemOnlyPage(response, select_fields=SelectFields({"x": False})) + assert page.fields_to_ignore == ["x"] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + + # Boolean-like values are not supported. + expected_non_boolean_value_error_msg = ( + "SelectField only allows boolean values as keys. " + "Got: {'x': 0, 'y': 0, 'z': 1}" + ) + page = BigToItemOnlyPage( + response, + select_fields=SelectFields({"x": 0, "y": 0, "z": 1}), # type: ignore[dict-item] + ) + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + page.fields_to_ignore + assert await page.to_item() == BigItem(x=1, y=2, z=None) + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + await item_from_select_fields(page) + + # If an invalid SelectFields value was passed to `select_fields` parameter + expected_invalid_instance_value_error_msg = ( + r"The select_fields.fields parameter is expecting a Mapping. " + r'Got SelectFields\(fields="not the instance it\'s expecting"\).' + ) + page = BigToItemOnlyPage( + response, + select_fields="not the instance it's expecting", # type: ignore[arg-type] + ) + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + page.fields_to_ignore + assert await page.to_item() == BigItem(x=1, y=2, z=None) + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + await item_from_select_fields(page) + + # If the item class doesn't have a field, it would error out. 
+ fields = {"x": True, "not_existing": True} + expected_value_error_msg = ( + r"The fields {'not_existing'} is not available in which has SelectFields\(fields={'x': True, " + r"'not_existing': True}\)." + ) + page = BigToItemOnlyPage(response, select_fields=SelectFields(fields)) + with pytest.raises(ValueError, match=expected_value_error_msg): + page.fields_to_ignore + assert await page.to_item() == BigItem(x=1, y=2, z=None) + with pytest.raises(ValueError, match=expected_value_error_msg): + await item_from_select_fields(page) + + +@attrs.define +class SmallToItemOnlyPage(BigToItemOnlyPage, Returns[SmallItem]): + pass + + +@attrs.define +class SmallToItemOnlyPageSkip( + BigToItemOnlyPage, Returns[SmallItem], skip_nonitem_fields=True +): + pass + + +@pytest.mark.asyncio +async def test_select_fields_but_to_item_only_small_item() -> None: + expected_type_error_msg = r"__init__\(\) got an unexpected keyword argument 'x'" + response = HttpResponse("https://example.com", b"") + + # Giving excess fields to a small item results in errors; except + # ``.to_item()`` since it doesn't call ``item_from_fields()`` or + # ``super().to_item()`` + page = SmallToItemOnlyPage(response) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + + # The error should go away after unselecting the excess field + page = SmallToItemOnlyPage( + response, select_fields=SelectFields({"x": False, "z": False}) + ) + assert page.fields_to_ignore == ["x", "z"] + assert await page.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page) == SmallItem(y=2) + + # If the page object uses skip_nonitem_fields=True, it should work without + # any problems + page2 = SmallToItemOnlyPageSkip(response) + assert page2.fields_to_ignore == [] + assert await page2.to_item() == BigItem(x=1, y=2, z=None) + assert await 
item_from_select_fields(page2) == SmallItem(y=2) + + # Declaring "x" as a field to ignore works the same but it's added to the + # ``.fields_to_ignore`` + page2 = SmallToItemOnlyPageSkip(response, select_fields=SelectFields({"x": False})) + assert page2.fields_to_ignore == ["x"] + assert await page2.to_item() == BigItem(x=1, y=2, z=None) + assert await item_from_select_fields(page2) == SmallItem(y=2) + + +@attrs.define +class BigUnreliablePage(WebPage[BigItem]): + call_counter: DefaultDict = attrs.field(factory=lambda: defaultdict(int)) + + @field + def x(self): + self.call_counter["x"] += 1 + return 1 + + @field(disabled=True) + def z(self): + self.call_counter["z"] += 1 + return 3 + + async def to_item(self) -> BigItem: + return BigItem(x=self.x, y=2, z=self.z) + + +@pytest.mark.asyncio +async def test_select_fields_but_unreliable() -> None: + """This is essentially a combination of ``test_select_fields()`` and + ``test_select_fields_but_to_item_only()`` where the ``.to_item()`` method + is overridden and the ``@field`` decorators are only partially used. + + For this test, the ``.to_item()`` method is implemented incorrectly: it + doesn't check ``.fields_to_ignore`` to determine which fields to + skip. + """ + # Required fields from the item cls which are not included raise a TypeError + expected_type_error_msg = ( + r"__init__\(\) missing 1 required positional argument: 'x'" + ) + response = HttpResponse("https://example.com", b"") + + # When SelectFields isn't set + page = BigUnreliablePage(response) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "z": 2} + + # If no field selection directive is given but SelectFields is set, it would + # use the default fields that are not disabled.
+ page = BigUnreliablePage(response, select_fields=SelectFields(None)) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "z": 2} + + # Same case as above but given an empty dict + page = BigUnreliablePage(response, select_fields=SelectFields({})) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "z": 2} + + # Select all fields + page = BigUnreliablePage(response, select_fields=SelectFields({"*": True})) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 2, "z": 2} + + # Don't select all fields; but in this case a TypeError is raised since + # required fields aren't supplied to the item + page = BigUnreliablePage(response, select_fields=SelectFields({"*": False})) + assert page.fields_to_ignore == ["x", "z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"x": 2, "z": 2} + + # Exclude all but one (which is the required field in the item) + page = BigUnreliablePage( + response, select_fields=SelectFields({"*": False, "x": True}) + ) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=None, z=None) + assert page.call_counter == {"x": 2, 
"z": 2} + + # Include all fields but one + page = BigUnreliablePage( + response, select_fields=SelectFields({"*": True, "z": False}) + ) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=None) + assert page.call_counter == {"x": 2, "z": 2} + + # overlapping directives on the same field should be okay + page = BigUnreliablePage( + response, + select_fields=SelectFields({"*": True, "x": True, "y": True, "z": True}), + ) + assert page.fields_to_ignore == [] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 2, "z": 2} + + # Excluding a required field throws an error + page = BigUnreliablePage(response, select_fields=SelectFields({"x": False})) + assert page.fields_to_ignore == ["x", "z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"x": 2, "z": 2} + + # Boolean-like values are not supported. + expected_non_boolean_value_error_msg = ( + "SelectField only allows boolean values as keys. 
" + "Got: {'x': 0, 'y': 0, 'z': 1}" + ) + page = BigUnreliablePage( + response, + select_fields=SelectFields({"x": 0, "y": 0, "z": 1}), # type: ignore[dict-item] + ) + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + page.fields_to_ignore + assert await page.to_item() == BigItem(x=1, y=2, z=3) + with pytest.raises(ValueError, match=expected_non_boolean_value_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"x": 1, "z": 1} + + # If an invalid SelectFields value was passed to `select_fields` parameter + expected_invalid_instance_value_error_msg = ( + r"The select_fields.fields parameter is expecting a Mapping. " + r'Got SelectFields\(fields="not the instance it\'s expecting"\).' + ) + page = BigUnreliablePage( + response, + select_fields="not the instance it's expecting", # type: ignore[arg-type] + ) + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + page.fields_to_ignore + assert await page.to_item() == BigItem(x=1, y=2, z=3) + with pytest.raises(ValueError, match=expected_invalid_instance_value_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"x": 1, "z": 1} + + # If the item class doesn't have a field, it would error out. + fields = {"x": True, "not_existing": True} + expected_value_error_msg = ( + r"The fields {'not_existing'} is not available in which has SelectFields\(fields={'x': True, " + r"'not_existing': True}\)." 
+ ) + page = BigUnreliablePage(response, select_fields=SelectFields(fields)) + with pytest.raises(ValueError, match=expected_value_error_msg): + page.fields_to_ignore + assert await page.to_item() == BigItem(x=1, y=2, z=3) + with pytest.raises(ValueError, match=expected_value_error_msg): + await item_from_select_fields(page) + + +@attrs.define +class SmallUnreliablePage(BigUnreliablePage, Returns[SmallItem]): + pass + + +@attrs.define +class SmallUnreliablePageSkip( + BigUnreliablePage, Returns[SmallItem], skip_nonitem_fields=True +): + pass + + +@pytest.mark.asyncio +async def test_select_fields_but_unreliable_small_item() -> None: + expected_type_error_msg = r"__init__\(\) got an unexpected keyword argument 'x'" + response = HttpResponse("https://example.com", b"") + + # Giving excess fields to a small item results in errors; except + # ``.to_item()`` since it's not calling ``item_from_fields()`` nor + # ``super().to_item()`` + page = SmallUnreliablePage(response) + assert page.fields_to_ignore == ["z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + with pytest.raises(TypeError, match=expected_type_error_msg): + await item_from_select_fields(page) + assert page.call_counter == {"x": 2, "z": 2} + + # The error should go away after unselecting the excess field + page = SmallUnreliablePage(response, select_fields=SelectFields({"x": False})) + assert page.fields_to_ignore == ["x", "z"] + assert await page.to_item() == BigItem(x=1, y=2, z=3) + assert page.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page) == SmallItem(y=2) + assert page.call_counter == {"x": 2, "z": 2} + + # If the page object uses skip_nonitem_fields=True, it should work without + # any problems + page2 = SmallUnreliablePageSkip(response) + assert page2.fields_to_ignore == ["z"] + assert await page2.to_item() == BigItem(x=1, y=2, z=3) + assert page2.call_counter == {"x": 1, "z": 1} + assert await 
item_from_select_fields(page2) == SmallItem(y=2) + assert page2.call_counter == {"x": 2, "z": 2} + + # Declaring "x" as a field to ignore works the same but it's added to the + # ``.fields_to_ignore`` + page2 = SmallUnreliablePageSkip(response, select_fields=SelectFields({"x": False})) + assert page2.fields_to_ignore == ["x", "z"] + assert await page2.to_item() == BigItem(x=1, y=2, z=3) + assert page2.call_counter == {"x": 1, "z": 1} + assert await item_from_select_fields(page2) == SmallItem(y=2) + assert page2.call_counter == {"x": 2, "z": 2} diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 3c8b0112..3e434588 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,4 +1,10 @@ -from .fields import field, item_from_fields, item_from_fields_sync +from .fields import ( + SelectFields, + field, + item_from_fields, + item_from_fields_sync, + item_from_select_fields, +) from .page_inputs import ( BrowserHtml, HttpClient, diff --git a/web_poet/fields.py b/web_poet/fields.py index ca92d4ac..0869d8e3 100644 --- a/web_poet/fields.py +++ b/web_poet/fields.py @@ -2,14 +2,17 @@ ``web_poet.fields`` is a module with helpers for putting extraction logic into separate Page Object methods / properties. """ +from __future__ import annotations + import inspect from contextlib import suppress from functools import update_wrapper, wraps -from typing import Callable, Dict, List, Optional, Type, TypeVar +from typing import Any, Callable, Dict, List, Mapping, Optional, Type, TypeVar import attrs from itemadapter import ItemAdapter +import web_poet from web_poet.utils import cached_method, ensure_awaitable _FIELDS_INFO_ATTRIBUTE_READ = "_web_poet_fields_info" @@ -190,12 +193,17 @@ async def item_from_fields( to ``item_cls.__init__``, possibly causing exceptions if ``item_cls.__init__`` doesn't support them. 
""" - item_dict = item_from_fields_sync(obj, item_cls=dict, skip_nonitem_fields=False) - field_names = list(item_dict.keys()) + adapter = ItemAdapter( + item_from_fields_sync( + obj, item_cls=item_cls, skip_nonitem_fields=skip_nonitem_fields + ) + ) + fields_to_ignore = getattr(obj, "fields_to_ignore", []) + field_names = [f for f in adapter if f not in fields_to_ignore] if skip_nonitem_fields: field_names = _without_unsupported_field_names(item_cls, field_names) return item_cls( - **{name: await ensure_awaitable(item_dict[name]) for name in field_names} + **{name: await ensure_awaitable(adapter[name]) for name in field_names} ) @@ -203,7 +211,12 @@ def item_from_fields_sync( obj, item_cls: Type[T] = dict, *, skip_nonitem_fields: bool = False # type: ignore[assignment] ) -> T: """Synchronous version of :func:`item_from_fields`.""" - field_names = list(get_fields_dict(obj)) + fields_to_ignore = getattr(obj, "fields_to_ignore", []) + field_names = [ + f + for f in get_fields_dict(obj, include_disabled=True) + if f not in fields_to_ignore + ] if skip_nonitem_fields: field_names = _without_unsupported_field_names(item_cls, field_names) return item_cls(**{name: getattr(obj, name) for name in field_names}) @@ -216,3 +229,95 @@ def _without_unsupported_field_names( if item_field_names is None: # item_cls doesn't define field names upfront return field_names[:] return list(set(field_names) & set(item_field_names)) + + +@attrs.define +class SelectFields: + """This is used as a dependency in :class:`~.ItemPage` to control which + fields to populate its returned item class. + + You can also use this to enable some fields that were disabled by default + via the ``@field(disabled=True)`` decorator. 
+ + Some usage examples: + + * ``SelectFields({"name": True})`` — select one field + * ``SelectFields({"name": False})`` — unselect one field + * ``SelectFields({"*": True})`` — select all fields + * ``SelectFields({"*": True, "name": False})`` — select all fields but one + * ``SelectFields({"*": False, "name": True})`` — unselect all fields but one + + """ + + #: Fields that the page object would use to populate the + #: :meth:`~.Returns.item_cls` it returns. It's a mapping of field names to + #: boolean values where ``True`` would indicate it being included when using + #: :meth:`~.ItemPage.to_item()` and :func:`~.item_from_select_fields`. + fields: Mapping[str, bool] = attrs.field(converter=lambda x: x or {}) + + +def _validate_select_fields(page: web_poet.ItemPage) -> None: + fields = page.select_fields.fields + + if fields is None or len(fields) == 0: + return None + elif not isinstance(fields, Mapping): + raise ValueError( + f"The select_fields.fields parameter is expecting a Mapping. " + f"Got {page.select_fields}." + ) + + page_obj_fields = get_fields_dict(page, include_disabled=True) + + unknown_fields = set(fields) - set(page_obj_fields.keys()).union({"*"}) + fields_in_item = inspect.signature(page.item_cls).parameters.keys() + unselected_fields = {k for k, v in fields.items() if v is False} + + # Only raise an error if a field is selected but it's not present in the + # item class. It doesn't raise an error even if the page object doesn't + # have the field, it simply ignores it. + fields_not_in_item = unknown_fields - fields_in_item - unselected_fields + if fields_not_in_item: + raise ValueError( + f"The fields {fields_not_in_item} is not available in {page.item_cls} " + f"which has {page.select_fields}." + ) + + if any([not isinstance(v, bool) for v in page.select_fields.fields.values()]): + raise ValueError( + f"SelectField only allows boolean values as keys. 
" + f"Got: {page.select_fields.fields}" + ) + + +async def item_from_select_fields(page: web_poet.ItemPage) -> Any: + """Returns an item produced by the given page object instance. + + This ensures that the fields specified inside the :class:`~.SelectFields` + instance are taken into account alongside any fields that are disabled by + default (i.e. ``@field(disabled=True)``. This is done by calling the + :meth:`~.ItemPage.to_item` method and simply dropping any field that should + not be included. + """ + + _validate_select_fields(page) + + item = await ensure_awaitable(page.to_item()) + fields = page.select_fields.fields or {} + fields_to_ignore = page.fields_to_ignore + + kwargs = {} + for k, v in ItemAdapter(item).items(): + if k in fields_to_ignore or ( + fields.get("*") is False and fields.get(k) is not True + ): + continue + kwargs[k] = v + + if page._get_skip_nonitem_fields(): + field_names = _without_unsupported_field_names( + page.item_cls, list(kwargs.keys()) + ) + kwargs = {k: v for k, v in kwargs.items() if k in field_names} + + return page.item_cls(**kwargs) diff --git a/web_poet/pages.py b/web_poet/pages.py index fed2fae4..823d97d8 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -1,10 +1,16 @@ import abc -import typing +from typing import Any, Generic, List, Type, TypeVar -import attr +import attrs from web_poet._typing import get_item_cls -from web_poet.fields import FieldsMixin, item_from_fields +from web_poet.fields import ( + FieldsMixin, + SelectFields, + _validate_select_fields, + get_fields_dict, + item_from_fields, +) from web_poet.mixins import ResponseShortcutsMixin from web_poet.page_inputs import HttpResponse from web_poet.utils import _create_deprecated_class @@ -31,21 +37,21 @@ class Injectable(abc.ABC, FieldsMixin): Injectable.register(type(None)) -def is_injectable(cls: typing.Any) -> bool: +def is_injectable(cls: Any) -> bool: """Return True if ``cls`` is a class which inherits from :class:`~.Injectable`.""" return 
isinstance(cls, type) and issubclass(cls, Injectable) -ItemT = typing.TypeVar("ItemT") +ItemT = TypeVar("ItemT") -class Returns(typing.Generic[ItemT]): +class Returns(Generic[ItemT]): """Inherit from this generic mixin to change the item class used by :class:`~.ItemPage`""" @property - def item_cls(self) -> typing.Type[ItemT]: + def item_cls(self) -> Type[ItemT]: """Item class""" return get_item_cls(self.__class__, default=dict) @@ -53,11 +59,17 @@ def item_cls(self) -> typing.Type[ItemT]: _NOT_SET = object() +@attrs.define class ItemPage(Injectable, Returns[ItemT]): """Base Page Object, with a default :meth:`to_item` implementation which supports web-poet fields. """ + select_fields: SelectFields = attrs.field( + converter=lambda x: SelectFields(x) if not isinstance(x, SelectFields) else x, + kw_only=True, + default=None, + ) _skip_nonitem_fields = _NOT_SET def _get_skip_nonitem_fields(self) -> bool: @@ -80,8 +92,43 @@ async def to_item(self) -> ItemT: skip_nonitem_fields=self._get_skip_nonitem_fields(), ) - -@attr.s(auto_attribs=True) + @property + def fields_to_ignore(self) -> List[str]: + """Returns a list of field names which should **NOT** populate the + designated :meth:`~.Returns.item_cls`. + + This takes into account the fields inside the :class:`~.SelectFields` + instance as well as fields that are marked as disabled by default (i.e. + ``@field(disabled=True)``).
+ """ + _validate_select_fields(self) + + fields = self.select_fields.fields + page_obj_fields = get_fields_dict(self, include_disabled=True) + + fields_to_ignore = [] + for name, field_info in page_obj_fields.items(): + if fields.get("*") is True and fields.get(name) is not False: + continue + if ( + field_info.disabled is True + or fields.get(name) is False + or (fields.get("*") is False and fields.get(name) is not True) + ): + fields_to_ignore.append(name) + + for name in fields: + if ( + fields.get(name) is False + and name != "*" + and name not in fields_to_ignore + ): + fields_to_ignore.append(name) + + return fields_to_ignore + + +@attrs.define class WebPage(ItemPage[ItemT], ResponseShortcutsMixin): """Base Page Object which requires :class:`~.HttpResponse` and provides XPath / CSS shortcuts.