Commit: showing 1 changed file with 61 additions and 20 deletions.
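The diff below reworks tests for two helpers, `fetch_robots_txt` and `is_ai_usage_allowed`, whose implementations are not part of this commit. As orientation, here is a minimal sketch of the behavior the tests imply; the URL-validation heuristic, the list of AI user agents, and all function bodies are assumptions inferred from the assertions, not the project's actual code:

    import requests
    from urllib.parse import urlparse
    from urllib.robotparser import RobotFileParser

    # Assumed set of AI crawler user agents -- the real project may check different ones.
    AI_USER_AGENTS = ("GPTBot", "CCBot", "Google-Extended", "Bytespider")


    def fetch_robots_txt(url: str) -> str | None:
        """Return the text of a robots.txt file, or None if it cannot be fetched."""
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        return response.text


    def is_ai_usage_allowed(url: str) -> bool:
        """Return False only if the site's robots.txt forbids one of the known AI crawlers."""
        parsed = urlparse(url)
        # Assumed validation: the tests expect a ValueError for a host without a dot
        # (e.g. "https://malformed-url/robots.txt").
        if not parsed.scheme or "." not in parsed.netloc:
            raise ValueError(f"Malformed URL: {url}")
        robots_txt = fetch_robots_txt(url)
        if robots_txt is None:
            # No reachable robots.txt -> nothing forbids AI usage.
            return True
        parser = RobotFileParser()
        parser.parse(robots_txt.splitlines())
        return all(parser.can_fetch(agent, "/") for agent in AI_USER_AGENTS)

With a sketch like this in mind, the mocked tests read naturally: the 404 mock drives the `None`/`True` branches, and the golem.de mock drives the `False` branch.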
@@ -6,19 +6,22 @@
 class MockResponseURLNotFound:
     """Mocks a ``requests.Response`` object for an unavailable URL."""

     status_code = 404
     reason = "Not Found"

     @staticmethod
     def get():
         return None


 class MockResponseAIScrapersForbidden:
     """Mocks a ``requests.Response`` for a website whose ``robots.txt`` file forbids AI scraping.
     (This real-world example was found on golem.de.)"""

     status_code = 200
     reason = "OK"
-    text = (
-        """
+    text = """
 User-agent: Twitterbot
 Disallow: /mail.php
 Disallow: /search.php

@@ -56,11 +59,12 @@ class MockResponseAIScrapersForbidden:
 # golem.de may, in its discretion, permit certain automated access to certain golem.de pages.
 # If you would like to apply for permission to crawl golem.de, collect or use data, please email [email protected]
 """
-    )

     @staticmethod
     def get(*args, **kwargs):
         return MockResponseAIScrapersForbidden()


 @pytest.mark.parametrize(
     "test_input,expected",
     [
@@ -74,73 +78,110 @@ def get(*args, **kwargs):
 def test_fetch_robots_txt_from_wlo(test_input: str, expected: str | None):
     assert isinstance(fetch_robots_txt(test_input), str)


 def test_fetch_robots_txt(monkeypatch):
     """Mocks a ``requests.Response`` for a robots.txt file that forbids AI scrapers."""

     def mock_get(*args, **kwargs):
         return MockResponseAIScrapersForbidden()

     monkeypatch.setattr(requests, "get", mock_get)
     result = fetch_robots_txt("https://www.golem.de/robots.txt")
     assert isinstance(result, str)


 def test_fetch_robots_txt_from_an_unreachable_website_with_warning(monkeypatch):
     """Mocks a ``requests.Response`` for a website that's unreachable."""

     # see: https://docs.pytest.org/en/stable/how-to/monkeypatch.html#monkeypatching-returned-objects-building-mock-classes
     def mock_get(*args, **kwargs):
         return MockResponseURLNotFound()

     monkeypatch.setattr(requests, "get", mock_get)
     result = fetch_robots_txt("https://fake.url")
     assert result is None


 def test_if_ai_usage_is_allowed_on_malformed_url(monkeypatch):
     """Mocks a ``requests.Response`` for a malformed URL."""

     def mock_get(*args, **kwargs):
         return MockResponseURLNotFound()

     monkeypatch.setattr(requests, "get", mock_get)
     with pytest.raises(ValueError):
         # if the provided URL is malformed, we expect the function to raise a ValueError
         is_ai_usage_allowed("https://malformed-url/robots.txt")


 def test_if_ai_usage_is_allowed_on_website_without_robots_txt(monkeypatch):
     """Mocks a ``requests.Response`` for an (available) website that has no ``robots.txt``."""

     def mock_get(*args, **kwargs):
         return MockResponseURLNotFound()

     monkeypatch.setattr(requests, "get", mock_get)
     ai_usage_allowed = is_ai_usage_allowed(
         url="https://www.this-domain-does-not-exist.dev/robots.txt",
     )
     assert ai_usage_allowed is True


 def test_if_ai_usage_is_allowed_with_robots_txt_that_forbids_ai_scraping(monkeypatch):
     """Mocks a robots.txt file that explicitly forbids several AI scrapers from crawling the website."""

     def mock_get(*args, **kwargs):
         return MockResponseAIScrapersForbidden()

     monkeypatch.setattr(requests, "get", mock_get)
     ai_usage_allowed: bool = is_ai_usage_allowed(
         url="https://www.golem.de/robots.txt",
     )
     assert ai_usage_allowed is False


 # To run these tests on demand, comment out the ``pytest.mark.skip`` decorator.
-@pytest.mark.skip(reason="These tests cause HTTP requests and should only be run on-demand within your IDE. "
-                         "They are flaky by nature and could break without notice!")
+@pytest.mark.skip(
+    reason="These tests fire HTTP requests and should only be run on-demand within your IDE for debugging purposes. "
+    "They are flaky by nature and could break without notice; therefore, they are skipped in the CI/CD pipelines!"
+)
 @pytest.mark.parametrize(
     "test_input,expected",
     [
-        pytest.param("https://www.zum.de/robots.txt", True,
-                     id="ZUM.de does not forbid AI scrapers. Last checked on: 2024-12-05"),
-        pytest.param("https://www.dilertube.de/robots.txt", True,
-                     id="DiLerTube does not forbid AI scrapers. Last checked on: 2024-12-05"),
-        pytest.param("https://www.lehrer-online.de/robots.txt", True,
-                     id="Lehrer-Online does not forbid AI scrapers. Last checked on: 2024-12-05"),
-        pytest.param("https://www.scienceinschool.org/robots.txt", True,
-                     id="Science in School does not forbid AI scrapers. Last checked on: 2024-12-05"),
-        pytest.param("https://www.leifiphysik.de/robots.txt", False,
-                     id="Leifi-Physik forbids (a lot) of AI scrapers. Last checked on: 2024-12-05"),
-        pytest.param("https://www.golem.de/robots.txt", False,
-                     id="Golem.de forbids several AI scrapers. Last checked on: 2024-12-05"),
-        pytest.param("https://taz.de/robots.txt", False,
-                     id="taz.de forbids several AI scrapers (GPTBot, Bytespider). Last checked on: 2024-12-05"),
-    ]
+        pytest.param(
+            "https://www.zum.de/robots.txt", True, id="ZUM.de does not forbid AI scrapers. Last checked on: 2024-12-05"
+        ),
+        pytest.param(
+            "https://www.dilertube.de/robots.txt",
+            True,
+            id="DiLerTube does not forbid AI scrapers. Last checked on: 2024-12-05",
+        ),
+        pytest.param(
+            "https://www.lehrer-online.de/robots.txt",
+            True,
+            id="Lehrer-Online does not forbid AI scrapers. Last checked on: 2024-12-05",
+        ),
+        pytest.param(
+            "https://www.scienceinschool.org/robots.txt",
+            True,
+            id="Science in School does not forbid AI scrapers. Last checked on: 2024-12-05",
+        ),
+        pytest.param(
+            "https://www.leifiphysik.de/robots.txt",
+            False,
+            id="Leifi-Physik forbids (a lot of) AI scrapers. Last checked on: 2024-12-05",
+        ),
+        pytest.param(
+            "https://www.golem.de/robots.txt",
+            False,
+            id="Golem.de forbids several AI scrapers. Last checked on: 2024-12-05",
+        ),
+        pytest.param(
+            "https://taz.de/robots.txt",
+            False,
+            id="taz.de forbids several AI scrapers (GPTBot, Bytespider). Last checked on: 2024-12-05",
+        ),
+    ],
 )
 def test_if_ai_usage_is_allowed_with_live_examples(test_input: str, expected: bool):
     """This test is flaky by nature as it uses third-party ``robots.txt`` files, which might change without notice."""
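When the live checks are actually needed, one way to run them (the test-file path here is an assumption, not part of this commit) is to comment out the ``skip`` marker and then select the test by name; ``-k`` filters by test name and ``-rA`` prints a summary of every outcome:

    pytest tests/test_robots_txt.py -k test_if_ai_usage_is_allowed_with_live_examples -rA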