Commit

style: code formatting
Criamos committed Dec 5, 2024
1 parent aa43f0a commit b60e2de
Showing 1 changed file with 61 additions and 20 deletions.
81 changes: 61 additions & 20 deletions tests/test_robots_txt.py
@@ -6,19 +6,22 @@

class MockResponseURLNotFound:
"""Mocks a ``requests.Response`` object for an unavailable URL."""

status_code = 404
reason = "Not Found"

@staticmethod
def get():
return None


class MockResponseAIScrapersForbidden:
"""Mocks a ``requests.Response`` for a website with a ``robots.txt``-file that forbids AI scraping.
(This real example was found on golem.de)"""

status_code = 200
reason = "OK"
-text = (
-"""
+text = """
User-agent: Twitterbot
Disallow: /mail.php
Disallow: /search.php
@@ -56,11 +59,12 @@ class MockResponseAIScrapersForbidden:
# golem.de may, in its discretion, permit certain automated access to certain golem.de pages.
# If you would like to apply for permission to crawl golem.de, collect or use data, please email [email protected]
"""
-)

@staticmethod
def get(*args, **kwargs):
return MockResponseAIScrapersForbidden()


@pytest.mark.parametrize(
"test_input,expected",
[
@@ -74,73 +78,110 @@ def get(*args, **kwargs):
def test_fetch_robots_txt_from_wlo(test_input: str, expected: str | None):
assert isinstance(fetch_robots_txt(test_input), str)


def test_fetch_robots_txt(monkeypatch):
"""Mocks a ``requests.Response`` for a robots.txt file that forbids AI scrapers."""

def mock_get(*args, **kwargs):
return MockResponseAIScrapersForbidden()

monkeypatch.setattr(requests, "get", mock_get)
result = fetch_robots_txt("https://www.golem.de/robots.txt")
assert isinstance(result, str)


def test_fetch_robots_txt_from_an_unreachable_website_with_warning(monkeypatch):
"""Mocks a ``requests.Response`` for a website that's unreachable."""

# see: https://docs.pytest.org/en/stable/how-to/monkeypatch.html#monkeypatching-returned-objects-building-mock-classes
def mock_get(*args, **kwargs):
return MockResponseURLNotFound()

monkeypatch.setattr(requests, "get", mock_get)
result = fetch_robots_txt("https://fake.url")
assert result is None


def test_if_ai_usage_is_allowed_on_malformed_url(monkeypatch):
"""Mocks a ``requests.Response`` for a malformed URL."""

def mock_get(*args, **kwargs):
return MockResponseURLNotFound()

monkeypatch.setattr(requests, "get", mock_get)
with pytest.raises(ValueError):
# if the provided URL is malformed, we expect the function to raise a ValueError
is_ai_usage_allowed("https://malformed-url/robots.txt")


def test_if_ai_usage_is_allowed_on_website_without_robots_txt(monkeypatch):
"""Mocks a ``requests.Response`` for a (available) website that has no ``robots.txt``"""

def mock_get(*args, **kwargs):
return MockResponseURLNotFound()

monkeypatch.setattr(requests, "get", mock_get)
ai_usage_allowed = is_ai_usage_allowed(
url="https://www.this-domain-does-not-exist.dev/robots.txt",
)
assert ai_usage_allowed is True


def test_if_ai_usage_is_allowed_with_robots_txt_that_forbids_ai_scraping(monkeypatch):
"""Mocks a robots.txt file that explicitly forbids several AI scrapers from crawling the website."""

def mock_get(*args, **kwargs):
return MockResponseAIScrapersForbidden()

monkeypatch.setattr(requests, "get", mock_get)
ai_usage_allowed: bool = is_ai_usage_allowed(
url="https://www.golem.de/robots.txt",
)
assert ai_usage_allowed is False


# to run these tests, just comment out the ``pytest.mark.skip`` decorator
@pytest.mark.skip(reason="These tests cause HTTP requests and should only be run on-demand within your IDE. "
"They are flaky by nature and could break without notice!")
@pytest.mark.skip(
reason="These tests fire HTTP requests and should only be run on-demand within your IDE for debugging purposes. "
"They are flaky by nature and could break without notice, therefore they are skipped in the CI/CD pipelines!"
)
@pytest.mark.parametrize(
"test_input,expected",
[
pytest.param("https://www.zum.de/robots.txt", True,
id="ZUM.de does not forbid AI scrapers. Last checked on: 2024-12-05"),
pytest.param("https://www.dilertube.de/robots.txt", True,
id="DiLerTube does not forbid AI scrapers. Last checked on: 2024-12-05"),
pytest.param("https://www.lehrer-online.de/robots.txt", True,
id="Lehrer-Online does not forbid AI scrapers. Last checked on: 2024-12-05"),
pytest.param("https://www.scienceinschool.org/robots.txt", True,
id="Science in School does not forbid AI scrapers. Last checked on: 2024-12-05"),
pytest.param("https://www.leifiphysik.de/robots.txt", False,
id="Leifi-Physik forbids (a lot) of AI scrapers. Last checked on: 2024-12-05"),
pytest.param("https://www.golem.de/robots.txt", False,
id="Golem.de forbids several AI scrapers. Last checked on: 2024-12-05"),
pytest.param("https://taz.de/robots.txt", False,
id="taz.de forbids several AI scrapers (GPTBot, Bytespider). Last checked on: 2024-12-05"),
]
+pytest.param(
+"https://www.zum.de/robots.txt", True, id="ZUM.de does not forbid AI scrapers. Last checked on: 2024-12-05"
+),
+pytest.param(
+"https://www.dilertube.de/robots.txt",
+True,
+id="DiLerTube does not forbid AI scrapers. Last checked on: 2024-12-05",
+),
+pytest.param(
+"https://www.lehrer-online.de/robots.txt",
+True,
+id="Lehrer-Online does not forbid AI scrapers. Last checked on: 2024-12-05",
+),
+pytest.param(
+"https://www.scienceinschool.org/robots.txt",
+True,
+id="Science in School does not forbid AI scrapers. Last checked on: 2024-12-05",
+),
+pytest.param(
+"https://www.leifiphysik.de/robots.txt",
+False,
+id="Leifi-Physik forbids (a lot) of AI scrapers. Last checked on: 2024-12-05",
+),
+pytest.param(
+"https://www.golem.de/robots.txt",
+False,
+id="Golem.de forbids several AI scrapers. Last checked on: 2024-12-05",
+),
+pytest.param(
+"https://taz.de/robots.txt",
+False,
+id="taz.de forbids several AI scrapers (GPTBot, Bytespider). Last checked on: 2024-12-05",
+),
+],
)
def test_if_ai_usage_is_allowed_with_live_examples(test_input: str, expected: bool):
"""This test is flaky by nature as it uses third-party ``robots.txt``-files, which might change without notice,
