diff --git a/CHANGES/1150.feature.rst b/CHANGES/1150.feature.rst new file mode 100644 index 00000000..fbcb1f32 --- /dev/null +++ b/CHANGES/1150.feature.rst @@ -0,0 +1 @@ +Added :attr:`~yarl.URL.path_safe` to be able to fetch the path without ``%2F`` and ``%25`` decoded -- by :user:`bdraco`. diff --git a/docs/api.rst b/docs/api.rst index b9e950b5..b552e4f4 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -270,6 +270,23 @@ There are two kinds of properties: *decoded* and *encoded* (with >>> URL('http://example.com').path '/' + .. warning:: + + In many situations it is important to distinguish between path separators + (a literal ``/``) and other forward slashes (a literal ``%2F``). Use + :attr:`URL.path_safe` for these cases. + +.. attribute:: URL.path_safe + + Similar to :attr:`URL.path` except it doesn't decode ``%2F`` or ``%25``. + This allows distinguishing between path separators (``/``) and encoded + slashes (``%2F``). + + Note that ``%25`` is also not decoded to avoid issues with double unquoting + of values. For example, you can unquote the value with + ``URL.path_safe.replace("%2F", "/").replace("%25", "%")`` to get the same + result as :attr:`URL.path`. If the ``%25`` was unquoted, it would be + impossible to tell the difference between ``%2F`` and ``%252F``. .. 
attribute:: URL.path_qs diff --git a/tests/test_url.py b/tests/test_url.py index 45d70902..1dbdd80a 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1,5 +1,5 @@ from enum import Enum -from urllib.parse import SplitResult +from urllib.parse import SplitResult, quote, unquote import pytest @@ -352,6 +352,42 @@ def test_path_with_2F(): assert url.path == "/foo/bar/baz" +def test_path_safe_with_2F(): + """Path safe should not decode %2F, otherwise it may look like a path separator.""" + + url = URL("http://example.com/foo/bar%2fbaz") + assert url.path_safe == "/foo/bar%2Fbaz" + + +def test_path_safe_with_25(): + """Path safe should not decode %25, otherwise it is prone to double unquoting.""" + + url = URL("http://example.com/foo/bar%252Fbaz") + assert url.path_safe == "/foo/bar%252Fbaz" + unquoted = url.path_safe.replace("%2F", "/").replace("%25", "%") + assert unquoted == "/foo/bar%2Fbaz" + + +@pytest.mark.parametrize( + "original_path", + [ + "m+@bar/baz", + "m%2B@bar/baz", + "m%252B@bar/baz", + "m%2F@bar/baz", + ], +) +def test_path_safe_only_round_trips(original_path: str) -> None: + """Path safe can round trip with documented decode method.""" + encoded_once = quote(original_path, safe="") + encoded_twice = quote(encoded_once, safe="") + + url = URL(f"http://example.com/{encoded_twice}") + unquoted = url.path_safe.replace("%2F", "/").replace("%25", "%") + assert unquoted == f"/{encoded_once}" + assert unquote(unquoted) == f"/{original_path}" + + def test_raw_path_for_empty_url(): url = URL() assert "" == url.raw_path diff --git a/yarl/_url.py b/yarl/_url.py index f506eac6..26da688a 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -226,6 +226,7 @@ class URL: _UNQUOTER = _Unquoter() _PATH_UNQUOTER = _Unquoter(unsafe="+") + _PATH_SAFE_UNQUOTER = _Unquoter(ignore="/%", unsafe="+") _QS_UNQUOTER = _Unquoter(qs=True) _val: SplitResult @@ -710,6 +711,17 @@ def path(self) -> str: """ return self._PATH_UNQUOTER(self.raw_path) + @cached_property + def 
path_safe(self) -> str: + """Decoded path of URL. + + / for absolute URLs without path part. + + / (%2F) and % (%25) are not decoded + + """ + return self._PATH_SAFE_UNQUOTER(self.raw_path) + @cached_property def _parsed_query(self) -> List[Tuple[str, str]]: """Parse query part of URL."""