diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index bd7b8516..54562d9e 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -54,8 +54,9 @@ a generic HTTP Request: :class:`~.HttpRequest`. Here's an example: ).encode("utf-8"), ) - print(request.url) # https://www.api.example.com/product-pagination/ - print(request.method) # POST + print(request.url) # https://www.api.example.com/product-pagination/ + print(type(request.url)) # <class 'web_poet.page_inputs.http.RequestUrl'> + print(request.method) # POST print(type(request.headers) # print(request.headers) # @@ -67,7 +68,8 @@ a generic HTTP Request: :class:`~.HttpRequest`. Here's an example: There are a few things to take note here: - * ``url`` and ``method`` are simply **strings**. + * ``method`` is simply a **string**. + * ``url`` is represented by the :class:`~.RequestUrl` class. * ``headers`` is represented by the :class:`~.HttpRequestHeaders` class which resembles a ``dict``-like interface. It supports case-insensitive header-key lookups as well as multi-key storage. @@ -90,8 +92,9 @@ it's perfectly fine to define them as: request = web_poet.HttpRequest("https://api.example.com/product-info?id=123") - print(request.url) # https://api.example.com/product-info?id=123 - print(request.method) # GET + print(request.url) # https://api.example.com/product-info?id=123 + print(type(request.url)) # <class 'web_poet.page_inputs.http.RequestUrl'> + print(request.method) # GET print(type(request.headers) # print(request.headers) # @@ -141,8 +144,8 @@ Let's check out an example to see its internals: headers={"Content-Type": "application/json;charset=UTF-8"} ) - print(response.url) # https://www.api.example.com/product-pagination/ - print(type(response.url)) # <class 'str'> + print(response.url) # https://www.api.example.com/product-pagination/ + print(type(response.url)) # <class 'web_poet.page_inputs.http.ResponseUrl'> print(response.body) # b'{"data": "value \xf0\x9f\x91\x8d"}' print(type(response.body)) # @@ -174,7 +177,8 @@ methods.
Here are the key take aways from the example above: - * The ``url`` and ``status`` are simply **string** and **int** respectively. + * ``status`` is simply an **int**. + * ``url`` is represented by the :class:`~.ResponseUrl` class. * ``headers`` is represented by the :class:`~.HttpResponseHeaders` class. It's similar to :class:`~.HttpRequestHeaders` where it inherits from :external:py:class:`multidict.CIMultiDict`, granting it case-insensitive diff --git a/docs/intro/from-ground-up.rst b/docs/intro/from-ground-up.rst index d5781650..83e8133e 100644 --- a/docs/intro/from-ground-up.rst +++ b/docs/intro/from-ground-up.rst @@ -503,7 +503,7 @@ For example, a very basic Page Object could look like this: def to_item(self) -> dict: return { - 'url': self.response.url, + 'url': str(self.response.url), 'title': self.response.css("h1::text").get() } diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 65934a10..a87a9867 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -3,8 +3,9 @@ import aiohttp.web_response import pytest import requests - import parsel + +from web_poet import RequestUrl, ResponseUrl from web_poet.page_inputs import ( HttpRequest, HttpResponse, @@ -72,7 +73,7 @@ def test_http_defaults(cls, body_cls): http_body = body_cls(b"content") obj = cls("url", body=http_body) - assert obj.url == "url" + assert str(obj.url) == "url" assert obj.body == b"content" assert not obj.headers assert obj.headers.get("user-agent") is None @@ -164,7 +165,8 @@ def test_http_headers_init_dict(cls, headers_cls): def test_http_request_init_minimal(): req = HttpRequest("url") - assert req.url == "url" + assert str(req.url) == "url" + assert isinstance(req.url, RequestUrl) assert req.method == "GET" assert isinstance(req.method, str) assert not req.headers @@ -189,12 +191,20 @@ def test_http_request_init_full(): http_body = HttpRequestBody(b"body") req_2 = HttpRequest("url", method="POST", headers=http_headers, body=http_body) - assert 
req_1.url == req_2.url + assert str(req_1.url) == str(req_2.url) assert req_1.method == req_2.method assert req_1.headers == req_2.headers assert req_1.body == req_2.body +def test_http_request_init_with_response_url(): + resp = HttpResponse("url", b"") + assert isinstance(resp.url, ResponseUrl) + req = HttpRequest(resp.url) + assert isinstance(req.url, RequestUrl) + assert str(req.url) == str(resp.url) + + def test_http_response_headers_from_bytes_dict(): raw_headers = { b"Content-Length": [b"316"], diff --git a/tests/test_requests.py b/tests/test_requests.py index 9e6fef57..2d92ff7f 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -17,7 +17,7 @@ def async_mock(): """Workaround since python 3.7 doesn't ship with asyncmock.""" async def async_test(req): - return HttpResponse(req.url, body=b"") + return HttpResponse(str(req.url), body=b"") mock.MagicMock.__await__ = lambda x: async_test(x).__await__() @@ -37,7 +37,7 @@ async def test_perform_request_from_httpclient(async_mock): response = await client.get(url) # The async downloader implementation should return the HttpResponse - assert response.url == url + assert str(response.url) == str(url) assert isinstance(response, HttpResponse) @@ -47,15 +47,15 @@ async def test_http_client_single_requests(async_mock): with mock.patch("web_poet.page_inputs.client.HttpRequest") as mock_request: response = await client.request("url") - response.url == "url" + str(response.url) == "url" response = await client.get("url-get", headers={"X-Headers": "123"}) - response.url == "url-get" + str(response.url) == "url-get" response = await client.post( "url-post", headers={"X-Headers": "123"}, body=b"body value" ) - response.url == "url-post" + str(response.url) == "url-post" assert mock_request.call_args_list == [ mock.call( @@ -162,7 +162,7 @@ async def test_http_client_execute(async_mock): response = await client.execute(request) assert isinstance(response, HttpResponse) - assert response.url == "url-1" + assert 
str(response.url) == "url-1" @pytest.mark.asyncio diff --git a/tests/test_url.py b/tests/test_url.py new file mode 100644 index 00000000..fba26a9c --- /dev/null +++ b/tests/test_url.py @@ -0,0 +1,51 @@ +import pytest + +from web_poet._base import _Url +from web_poet import RequestUrl, ResponseUrl + + +def test_url_base_class(): + url_str = "http://example.com" + url = _Url(url_str) + assert str(url) == url_str + assert repr(url) == "_Url('http://example.com')" + + +def test_url_init_validation(): + with pytest.raises(TypeError): + _Url(123) + + +def test_url_subclasses(): + url_str = "http://example.com" + + class MyUrl(_Url): + pass + + class MyUrl2(_Url): + pass + + url = MyUrl(url_str) + assert str(url) == url_str + assert url._url == url_str + assert repr(url) == "MyUrl('http://example.com')" + + url2 = MyUrl2(url) + assert str(url2) == str(url) + + +@pytest.mark.parametrize('url_cls', [_Url, RequestUrl, ResponseUrl]) +def test_str_equality(url_cls): + url_str = "http://example.com#foo" + url = url_cls(url_str) + assert url != url_str + assert str(url) == url_str + + +def test_url_classes_eq(): + url_str = "http://example.com#foo" + request_url = RequestUrl(url_str) + response_url = ResponseUrl(url_str) + + assert request_url != response_url + assert str(request_url) == str(response_url) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index a3a2259e..34a26b92 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -10,6 +10,8 @@ HttpRequestBody, HttpResponseBody, PageParams, + RequestUrl, + ResponseUrl, ) from .overrides import PageObjectRegistry, consume_modules, OverrideRule diff --git a/web_poet/_base.py b/web_poet/_base.py index 00ca0940..53382896 100644 --- a/web_poet/_base.py +++ b/web_poet/_base.py @@ -4,7 +4,7 @@ """ -from typing import Type, TypeVar, List, Dict +from typing import Type, TypeVar, List, Dict, Union from multidict import CIMultiDict @@ -32,3 +32,19 @@ def from_name_value_pairs(cls: Type[T_headers], arg: List[Dict]) -> 
T_headers: <_HttpHeaders('Content-Encoding': 'gzip', 'content-length': '648')> """ return cls([(pair["name"], pair["value"]) for pair in arg]) + + +class _Url: + """ Base URL class. + """ + def __init__(self, url: Union[str, '_Url']): + if not isinstance(url, (str, _Url)): + raise TypeError(f"`url` must be a str or an instance of _Url, " + f"got {url.__class__} instance instead") + self._url = str(url) + + def __str__(self) -> str: + return self._url + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self._url!r})" diff --git a/web_poet/mixins.py b/web_poet/mixins.py index faf6c0f6..53f4b8dc 100644 --- a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -50,8 +50,8 @@ class ResponseShortcutsMixin(SelectableMixin): @property def url(self): - """Shortcut to HTML Response's URL.""" - return self.response.url + """Shortcut to HTML Response's URL, as a string.""" + return str(self.response.url) @property def html(self): diff --git a/web_poet/page_inputs/__init__.py b/web_poet/page_inputs/__init__.py index f0f0803b..db08f9ce 100644 --- a/web_poet/page_inputs/__init__.py +++ b/web_poet/page_inputs/__init__.py @@ -7,5 +7,7 @@ HttpResponseHeaders, HttpRequestBody, HttpResponseBody, + RequestUrl, + ResponseUrl ) from .browser import BrowserHtml diff --git a/web_poet/page_inputs/client.py b/web_poet/page_inputs/client.py index 6e12b6e6..63ef4ac0 100644 --- a/web_poet/page_inputs/client.py +++ b/web_poet/page_inputs/client.py @@ -22,6 +22,7 @@ ) from web_poet.exceptions import RequestBackendError, HttpResponseError from web_poet.utils import as_list +from web_poet._base import _Url logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def _handle_status( async def request( self, - url: str, + url: Union[str, _Url], *, method: str = "GET", headers: Optional[_Headers] = None, @@ -115,7 +116,7 @@ async def request( async def get( self, - url: str, + url: Union[str, _Url], *, headers: Optional[_Headers] = None, allow_status: List[_Status] = None, @@ -132,7 +133,7 
@@ async def get( async def post( self, - url: str, + url: Union[str, _Url], *, headers: Optional[_Headers] = None, body: Optional[_Body] = None, diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index 68c3071b..2c613369 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -9,7 +9,7 @@ http_content_type_encoding ) -from web_poet._base import _HttpHeaders +from web_poet._base import _HttpHeaders, _Url from web_poet.utils import memoizemethod_noargs from web_poet.mixins import SelectableMixin @@ -18,6 +18,16 @@ _AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]] +class ResponseUrl(_Url): + """ URL of the response """ + pass + + +class RequestUrl(_Url): + """ URL of the request """ + pass + + class HttpRequestBody(bytes): """A container for holding the raw HTTP request body in bytes format.""" @@ -152,7 +162,7 @@ class HttpRequest: **web-poet** like :class:`~.HttpClient`. """ - url: str = attrs.field() + url: RequestUrl = attrs.field(converter=RequestUrl) method: str = attrs.field(default="GET", kw_only=True) headers: HttpRequestHeaders = attrs.field( factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True @@ -185,7 +195,7 @@ class HttpResponse(SelectableMixin): is auto-detected from headers and body content. """ - url: str = attrs.field() + url: ResponseUrl = attrs.field(converter=ResponseUrl) body: HttpResponseBody = attrs.field(converter=HttpResponseBody) status: Optional[int] = attrs.field(default=None, kw_only=True) headers: HttpResponseHeaders = attrs.field(factory=HttpResponseHeaders,