From 741d95f428514448e4dc25e1c0dec724e47c6fb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 8 Aug 2023 11:51:36 +0200 Subject: [PATCH] Only prefix absolute URLs with / (#34) --- src/protego.py | 14 +++++--------- tests/test_protego.py | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/protego.py b/src/protego.py index 3ee76a6..3b2c0fe 100644 --- a/src/protego.py +++ b/src/protego.py @@ -49,13 +49,6 @@ def _is_valid_directive_field(field): ) -def _enforce_path(pattern): - if pattern.startswith("/"): - return pattern - - return "/" + pattern - - class _URLPattern(object): """Internal class which represents a URL pattern.""" @@ -179,6 +172,9 @@ def _quote_path(self, path): return path or "/" def _quote_pattern(self, pattern): + if pattern.startswith("https://") or pattern.startswith("http://"): + pattern = "/" + pattern + # Corner case for query only (e.g. '/abc?') and param only (e.g. '/abc;') URLs. # Save the last character otherwise, urlparse will kill it. last_char = "" @@ -444,11 +440,11 @@ def _parse_robotstxt(self, content): elif field in _ALLOW_DIRECTIVE: for rule_set in current_rule_sets: - rule_set.allow(_enforce_path(value)) + rule_set.allow(value) elif field in _DISALLOW_DIRECTIVE: for rule_set in current_rule_sets: - rule_set.disallow(_enforce_path(value)) + rule_set.disallow(value) elif field in _SITEMAP_DIRECTIVE: self._sitemap_list.append(value) diff --git a/tests/test_protego.py b/tests/test_protego.py index df12384..3571d11 100644 --- a/tests/test_protego.py +++ b/tests/test_protego.py @@ -1,7 +1,8 @@ -# encoding=utf-8 from datetime import time from unittest import TestCase +import pytest + from protego import Protego, _RuleSet @@ -1139,3 +1140,16 @@ def test_parse_time_period(self): start_time, end_time = rs._parse_time_period("0500 0600", separator=" ") self.assertEqual(start_time, time(5, 0)) self.assertEqual(end_time, time(6, 0)) + + +@pytest.mark.parametrize( + "allow,disallow,url,allowed", + [ + ("*/p", "/", "http://example.com/page", True), + ("/page", "*/*.htm", "https://example.com/page.htm", False), + ], +) +def test_leading_asterisk(allow, disallow, url, allowed): + content = f"User-Agent: *\n" f"allow: {allow}\n" f"disallow: {disallow}\n" + rp = Protego.parse(content) + assert rp.can_fetch(url, "*") == allowed