From c8219ec8aa90265659d801be66cd8ad5c56d5f4f Mon Sep 17 00:00:00 2001
From: Taylor Madore
Date: Mon, 22 Jan 2024 07:38:48 -0500
Subject: [PATCH 1/4] always format request datetime with microseconds

datetime.isoformat omits the microseconds component when it is zero,
which otherwise has to be handled with multiple format strings when
parsing with strptime
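
For illustration, the asymmetry being worked around (an interpreter
session, not part of the change itself):

    >>> from datetime import datetime
    >>> datetime(2024, 1, 1).isoformat()
    '2024-01-01T00:00:00'
    >>> datetime(2024, 1, 1, microsecond=1).isoformat()
    '2024-01-01T00:00:00.000001'
    >>> datetime(2024, 1, 1).isoformat(timespec="microseconds")
    '2024-01-01T00:00:00.000000'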

Signed-off-by: Taylor Madore
---
 cachito/web/api_v1.py |  2 +-
 cachito/web/models.py | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cachito/web/api_v1.py b/cachito/web/api_v1.py
index 5343c50e6..9f550b2b5 100644
--- a/cachito/web/api_v1.py
+++ b/cachito/web/api_v1.py
@@ -978,7 +978,7 @@ def get_request_metrics():
             "id": state.request_id,
             "final_state": RequestStateMapping(state.state).name,
             "final_state_reason": state.state_reason,
-            "finished": state.updated.isoformat(),
+            "finished": state.updated.isoformat(timespec="microseconds"),
             "duration": state.duration,
             "time_in_queue": state.time_in_queue,
         }
diff --git a/cachito/web/models.py b/cachito/web/models.py
index 4d1dfa18f..00699e691 100644
--- a/cachito/web/models.py
+++ b/cachito/web/models.py
@@ -406,12 +406,16 @@ def to_json(self, verbose=True):
         if self.user:
             user = self.user.username
 
+        created = None
+        if self.created is not None:
+            created = self.created.isoformat(timespec="microseconds")
+
         env_vars_json = OrderedDict()
         for env_var in self.environment_variables:
             env_vars_json[env_var.name] = env_var.value
         rv = {
             "id": self.id,
-            "created": None if self.created is None else self.created.isoformat(),
+            "created": created,
             "repo": self.repo,
             "ref": self.ref,
             "pkg_managers": pkg_managers,
@@ -428,7 +432,7 @@ def _state_to_json(state):
         return {
             "state": RequestStateMapping(state.state).name,
             "state_reason": state.state_reason,
-            "updated": state.updated.isoformat(),
+            "updated": state.updated.isoformat(timespec="microseconds"),
         }
 
     def _error_to_json(error):
@@ -776,7 +780,7 @@ def to_json(self):
             "origin": self.origin,
             "error_type": self.error_type,
             "message": self.message,
-            "occurred": self.occurred.isoformat(),
+            "occurred": self.occurred.isoformat(timespec="microseconds"),
         }
 
     @classmethod

From 7f26ec4bf8cab37db6e82d874dce3ef88c35e482 Mon Sep 17 00:00:00 2001
From: Taylor Madore
Date: Tue, 23 Jan 2024 07:32:39 -0500
Subject: [PATCH 2/4] add minimum and default archive age to config

cachito_archives_default_age_days will be the default age in days
after which source archives are purged

cachito_archives_minimum_age_days will be the minimum age in days that
source archives must reach before they may be purged

STONEBLD-1990

Signed-off-by: Taylor Madore
---
 cachito/workers/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cachito/workers/config.py b/cachito/workers/config.py
index 3f0dc7d2c..3f6a5ab6a 100644
--- a/cachito/workers/config.py
+++ b/cachito/workers/config.py
@@ -25,6 +25,8 @@ class Config(object):
     broker_transport_options = {"max_retries": 10}
     # Refer to README.md for information on all the Cachito configuration options
     cachito_api_timeout = 60
+    cachito_archives_default_age_days = 730
+    cachito_archives_minimum_age_days = 365
     cachito_auth_type: Optional[str] = None
     cachito_default_environment_variables = {
         "gomod": {"GOSUMDB": {"value": "off", "kind": "literal"}},

From 2ed37fa703548af4c0076e753510d391a398af56 Mon Sep 17 00:00:00 2001
From: Taylor Madore
Date: Thu, 11 Jan 2024 15:08:44 -0500
Subject: [PATCH 3/4] add typer and ratelimit to requirements

typer will be used for a script to prune stale cachito source archives
and ratelimit will be used to prevent that script from potentially
overwhelming the API with requests, since pruning is a background
maintenance activity

STONEBLD-1990

Signed-off-by: Taylor Madore
---
 requirements.in  | 2 ++
 requirements.txt | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/requirements.in b/requirements.in
index cba257701..67ded7c8d 100644
--- a/requirements.in
+++ b/requirements.in
@@ -14,6 +14,7 @@ packaging
 pyarn
 pydantic<2
 pyyaml
+ratelimit
 requests_kerberos>=0.13.0
 requests
 semver
@@ -24,3 +25,4 @@ opentelemetry-instrumentation-requests
 opentelemetry-instrumentation-sqlalchemy
 opentelemetry-exporter-jaeger
 opentelemetry-exporter-otlp-proto-http
+typer
diff --git a/requirements.txt b/requirements.txt
index e2dba0a69..db56a8285 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -286,6 +286,7 @@ click==8.1.7 \
     #   click-plugins
     #   click-repl
     #   commoncode
+    #   typer
 click-didyoumean==0.3.0 \
     --hash=sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667 \
     --hash=sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035
@@ -829,6 +830,9 @@ pyyaml==6.0.1 \
     # via
     #   -r requirements.in
     #   saneyaml
+ratelimit==2.2.1 \
+    --hash=sha256:af8a9b64b821529aca09ebaf6d8d279100d766f19e90b5059ac6a718ca6dee42
+    # via -r requirements.in
 requests==2.31.0 \
     --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
     --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1
@@ -870,12 +874,17 @@ text-unidecode==1.3 \
 thrift==0.16.0 \
     --hash=sha256:2b5b6488fcded21f9d312aa23c9ff6a0195d0f6ae26ddbd5ad9e3e25dfc14408
     # via opentelemetry-exporter-jaeger-thrift
+typer==0.9.0 \
+    --hash=sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2 \
+    --hash=sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee
+    # via -r requirements.in
 typing-extensions==4.9.0 \
     --hash=sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783 \
     --hash=sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd
     # via
     #   opentelemetry-sdk
     #   pydantic
+    #   typer
 tzdata==2023.3 \
     --hash=sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a \
     --hash=sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda

From b683ca230f4c54116c2b4923f316424062ef750e Mon Sep 17 00:00:00 2001
From: Taylor Madore
Date: Wed, 17 Jan 2024 14:31:57 -0500
Subject: [PATCH 4/4] add script for pruning stale source archives

The script will iterate over stored source archives and attempt to
locate the most recent request for them via the API. Archives
determined to be stale will be deleted.
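
Illustrative invocations of the new cachito-prune-archives entry point
(the date and limit below are hypothetical examples):

    # list stale archives; nothing is ever deleted by "list"
    cachito-prune-archives list

    # delete at most 100 archives older than the given date; without
    # --execute this only lists what would be removed
    cachito-prune-archives delete --older-than 2022-01-01 --limit 100 --execute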

STONEBLD-1990

Signed-off-by: Taylor Madore
---
 cachito/workers/prune_archives.py         | 233 +++++++++++++
 setup.py                                  |   1 +
 tests/test_workers/test_prune_archives.py | 403 ++++++++++++++++++++++
 3 files changed, 637 insertions(+)
 create mode 100644 cachito/workers/prune_archives.py
 create mode 100644 tests/test_workers/test_prune_archives.py

diff --git a/cachito/workers/prune_archives.py b/cachito/workers/prune_archives.py
new file mode 100644
index 000000000..3d0eef69f
--- /dev/null
+++ b/cachito/workers/prune_archives.py
@@ -0,0 +1,233 @@
+import logging
+import re
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from itertools import islice
+from pathlib import Path
+from typing import Annotated, Any, Generator, NamedTuple, Optional
+
+import requests
+import typer
+from ratelimit import limits, sleep_and_retry
+
+from cachito.errors import NetworkError
+from cachito.workers.config import get_worker_config
+from cachito.workers.requests import get_requests_session
+
+app = typer.Typer()
+config = get_worker_config()
+log = logging.getLogger(__name__)
+session = get_requests_session()
+
+ARCHIVE_DIR = Path(config.cachito_sources_dir)
+ARCHIVE_PATTERN = re.compile(r"^[a-f0-9]{40}(-with-submodules)?\.tar\.gz$")
+DEFAULT_AGE_DATETIME = datetime.now(timezone.utc) - timedelta(
+    days=config.cachito_archives_default_age_days
+)
+MINIMUM_AGE_DATETIME = datetime.now(timezone.utc) - timedelta(
+    days=config.cachito_archives_minimum_age_days
+)
+LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
+
+
+@dataclass(frozen=True)
+class _ParsedArchive:
+    """A source archive parsed from the filesystem."""
+
+    path: Path
+    repo_name: str
+    ref: str
+
+    @classmethod
+    def from_path(cls, path: Path) -> "_ParsedArchive":
+        repo_name = path.parent.relative_to(ARCHIVE_DIR).as_posix()
+        ref = path.name[:40]
+        return cls(path, repo_name, ref)
+
+
+class _ResolvedArchive(NamedTuple):
+    """A source archive matched to the most recent request for it."""
+
+    path: Path
+    created: datetime
+    latest_request_id: int
+
+
+@app.callback()
+def configure_logging(verbose: bool = False):
+    """Configure logging for the app."""
+    log_level = logging.DEBUG if verbose else logging.INFO
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter(LOG_FORMAT))
+    log.setLevel(log_level)
+    log.addHandler(handler)
+
+
+def _get_latest_request(archive: _ParsedArchive) -> Optional[dict[str, Any]]:
+    """
+    Find the latest request matching the _ParsedArchive via the API.
+
+    Return None if no matching request is found.
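+
+    A 404 response is treated as "no matching request"; any other HTTP error
+    or a connection failure is raised as NetworkError instead.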
+    """
+    url = f"{config.cachito_api_url.rstrip('/')}/requests/latest"
+    params = {
+        "repo_name": archive.repo_name,
+        "ref": archive.ref,
+    }
+
+    try:
+        response = session.get(url, params=params, timeout=config.cachito_api_timeout)
+        response.raise_for_status()
+    except requests.HTTPError:
+        if response.status_code == 404:
+            return None
+        log.error(
+            "The request to %s failed with the status code %d and the following text: %s",
+            url,
+            response.status_code,
+            response.text,
+        )
+        raise NetworkError("Failed to query the cachito API")
+    except requests.RequestException:
+        msg = f"The connection failed when querying {url}"
+        log.exception(msg)
+        raise NetworkError(msg)
+
+    return response.json()
+
+
+def _get_parsed_source_archives(archive_dir: Path) -> Generator[_ParsedArchive, None, None]:
+    """Return a _ParsedArchive for each source archive in ARCHIVE_DIR."""
+
+    def is_valid_archive_filename(filename: str) -> bool:
+        """Archive filename should match <ref>-<(optional)with-submodules>.tar.gz."""
+        return re.match(ARCHIVE_PATTERN, filename) is not None
+
+    for path in archive_dir.rglob("*.tar.gz"):
+        if path.is_file() and is_valid_archive_filename(path.name):
+            yield _ParsedArchive.from_path(path)
+        else:
+            log.debug("%s does not appear to be a source archive.", path)
+
+
+def _resolve_source_archive(parsed_archive: _ParsedArchive) -> Optional[_ResolvedArchive]:
+    """Return a _ResolvedArchive if a matching request is found via the API."""
+    latest_request = _get_latest_request(parsed_archive)
+    if latest_request is None:
+        log.debug("Archive %s could not be resolved via the API.", parsed_archive.path)
+        return None
+
+    return _ResolvedArchive(
+        parsed_archive.path,
+        datetime.strptime(latest_request["created"], "%Y-%m-%dT%H:%M:%S.%f").replace(
+            tzinfo=timezone.utc
+        ),
+        latest_request["id"],
+    )
+
+
+def _get_stale_archives(
+    older_than: datetime, api_calls_per_second: int
+) -> Generator[_ResolvedArchive, None, None]:
+    """
+    Return a Generator of _ResolvedArchives that are all stale.
+
+    The API requests are rate-limited so that this background maintenance
+    task does not overwhelm the API.
+    """
+
+    @sleep_and_retry
+    @limits(calls=api_calls_per_second, period=1)
+    def resolve_source_archive_ratelimited(archive: _ParsedArchive) -> Optional[_ResolvedArchive]:
+        return _resolve_source_archive(archive)
+
+    for parsed_archive in _get_parsed_source_archives(ARCHIVE_DIR):
+        resolved_archive = resolve_source_archive_ratelimited(parsed_archive)
+        if resolved_archive and resolved_archive.created < older_than:
+            yield resolved_archive
+
+
+def _process_stale_archives(
+    older_than: datetime,
+    api_calls_per_second: int,
+    delete: bool = False,
+    limit: Optional[int] = None,
+) -> None:
+    """List stale source archives up to the limit, optionally deleting them."""
+    for archive in islice(_get_stale_archives(older_than, api_calls_per_second), limit):
+        log.info(
+            f"Archive {archive.path} is stale. The most recent request_id="
+            f"{archive.latest_request_id} at {archive.created}"
+        )
+        if delete:
+            log.info(f"Deleting {archive.path}")
+            archive.path.unlink()
+
+
+def _validate_older_than(older_than: Optional[datetime]) -> datetime:
+    """Ensure that the value of the --older-than CLI option is not more recent than the minimum."""
+    older_than_utc = (
+        DEFAULT_AGE_DATETIME if older_than is None else older_than.astimezone(timezone.utc)
+    )
+    if older_than_utc > MINIMUM_AGE_DATETIME:
+        raise typer.BadParameter(f"cannot be more recent than {MINIMUM_AGE_DATETIME}")
+    return older_than_utc
+
+
+@app.command("delete")
+def delete_archives(
+    older_than: Annotated[
+        Optional[datetime],
+        typer.Option(
+            callback=_validate_older_than,
+            formats=["%Y-%m-%d"],
+            help="Deletes archives that are older than the specified date. YYYY-MM-DD",
+        ),
+    ] = None,
+    api_calls_per_second: Annotated[
+        int, typer.Option(min=1, max=5, help="The API requests-per-second limit.")
+    ] = 2,
+    limit: Annotated[
+        Optional[int], typer.Option(min=1, help="The maximum number of stale archives to process.")
+    ] = None,
+    execute: Annotated[bool, typer.Option(help="Actual deletion will only occur if True.")] = False,
+):
+    """
+    List and delete stale source archives.
+
+    Actual deletion will not occur unless the --execute option is included.
+    """
+    # Needed to keep mypy happy. See the _validate_older_than callback
+    if older_than is None:
+        raise typer.BadParameter("--older-than cannot be None")
+
+    _process_stale_archives(older_than, api_calls_per_second, delete=execute, limit=limit)
+
+
+@app.command("list")
+def list_archives(
+    older_than: Annotated[
+        Optional[datetime],
+        typer.Option(
+            callback=_validate_older_than,
+            formats=["%Y-%m-%d"],
+            help="Lists archives that are older than the specified date. YYYY-MM-DD",
+        ),
+    ] = None,
+    api_calls_per_second: Annotated[
+        int, typer.Option(min=1, max=5, help="The API requests-per-second limit.")
+    ] = 2,
+    limit: Annotated[
+        Optional[int], typer.Option(min=1, help="The maximum number of stale archives to process.")
+    ] = None,
+):
+    """List stale source archives."""
+    # Needed to keep mypy happy. See the _validate_older_than callback
+    if older_than is None:
+        raise typer.BadParameter("--older-than cannot be None")
+
+    _process_stale_archives(older_than, api_calls_per_second, delete=False, limit=limit)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/setup.py b/setup.py
index d29857d2c..55b93d700 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@
         "console_scripts": [
             "cachito=cachito.web.manage:cli",
             "cachito-cleanup=cachito.workers.cleanup_job:main",
+            "cachito-prune-archives=cachito.workers.prune_archives:app",
             "cachito-update-nexus-scripts=cachito.workers.nexus:create_or_update_scripts",
         ]
     },
diff --git a/tests/test_workers/test_prune_archives.py b/tests/test_workers/test_prune_archives.py
new file mode 100644
index 000000000..bb1495669
--- /dev/null
+++ b/tests/test_workers/test_prune_archives.py
@@ -0,0 +1,403 @@
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+from unittest import mock
+
+import pytest
+import requests
+from typer.testing import CliRunner
+
+from cachito.errors import NetworkError
+from cachito.workers.prune_archives import (
+    _get_latest_request,
+    _get_parsed_source_archives,
+    _get_stale_archives,
+    _ParsedArchive,
+    _process_stale_archives,
+    _resolve_source_archive,
+    _ResolvedArchive,
+    app,
+)
+
+runner = CliRunner()
+
+LATEST_REQUEST_DATA = [
+    {
+        "id": 1,
+        "created": "2024-01-01T00:00:00.000000",
+    },
+    None,
+    {
+        "id": 3,
+        "created": "2024-01-03T00:00:00.000000",
+    },
+    {
+        "id": 4,
+        "created": "2024-01-03T00:00:00.000000",
+    },
+    {
+        "id": 5,
+        "created": "2024-01-05T00:00:00.000000",
+    },
+]
+
+
+@pytest.fixture()
+def archive_paths(tmp_path: Path) -> list[Path]:
+    paths = [
+        "my-org/not-an-archive.txt",
+        "not-a-ref.tar.gz",
+        "?ccccccccccccccccccccccccccccccccccccccc.tar.gz",
+        "not-a-ref-with-submodules.tar.gz",
+        "my-org/bar/cccccccccccccccccccccccccccccccccccccccc.tar.gz",
+        "my-org/foo/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.tar.gz",
+        "my-org/foo/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.tar.gz",
+        "my-org/foo/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-with-submodules.tar.gz",
+        "nested/my-org/baz/dddddddddddddddddddddddddddddddddddddddd.tar.gz",
+    ]
+
+    return [tmp_path / path for path in paths]
+
+
+@pytest.fixture()
+def archive_dir(tmp_path: Path, archive_paths: list[Path]) -> None:
+    with mock.patch("cachito.workers.prune_archives.ARCHIVE_DIR", tmp_path):
+        for path in archive_paths:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.touch()
+
+        yield tmp_path
+
+
+@pytest.fixture()
+def parsed_archives(archive_dir: Path):
+    paths = [
+        "my-org/bar/cccccccccccccccccccccccccccccccccccccccc.tar.gz",
+        "my-org/foo/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.tar.gz",
+        "my-org/foo/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.tar.gz",
+        "my-org/foo/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-with-submodules.tar.gz",
+        "nested/my-org/baz/dddddddddddddddddddddddddddddddddddddddd.tar.gz",
+    ]
+
+    return [_ParsedArchive.from_path(archive_dir / path) for path in paths]
+
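+
+# NOTE: the fixtures below line up with LATEST_REQUEST_DATA entry by entry;
+# a None entry means the API had no matching request for that archive, and
+# stale_archives is the subset of resolved archives that the tests expect to
+# be older than the staleness cutoff.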
+
+@pytest.fixture()
+def resolved_archives(archive_dir: Path):
+    return [
+        _ResolvedArchive(
+            Path(archive_dir, "my-org/bar/cccccccccccccccccccccccccccccccccccccccc.tar.gz"),
+            datetime(2024, 1, 1, tzinfo=timezone.utc),
+            1,
+        ),
+        None,
+        _ResolvedArchive(
+            Path(archive_dir, "my-org/foo/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.tar.gz"),
+            datetime(2024, 1, 3, tzinfo=timezone.utc),
+            3,
+        ),
+        _ResolvedArchive(
+            Path(
+                archive_dir,
+                "my-org/foo/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-with-submodules.tar.gz",
+            ),
+            datetime(2024, 1, 3, tzinfo=timezone.utc),
+            4,
+        ),
+        _ResolvedArchive(
+            Path(archive_dir, "nested/my-org/baz/dddddddddddddddddddddddddddddddddddddddd.tar.gz"),
+            datetime(2024, 1, 5, tzinfo=timezone.utc),
+            5,
+        ),
+    ]
+
+
+@pytest.fixture()
+def stale_archives(archive_dir: Path):
+    return [
+        _ResolvedArchive(
+            Path(archive_dir, "my-org/bar/cccccccccccccccccccccccccccccccccccccccc.tar.gz"),
+            datetime(2024, 1, 1, tzinfo=timezone.utc),
+            1,
+        ),
+    ]
+
+
+@mock.patch("cachito.workers.prune_archives.session.get")
+def test_get_latest_request_not_found(mock_get_latest: mock.Mock):
+    """Tests that get_latest_request returns None when the API responds with a 404."""
+    mock_response = mock.MagicMock(status_code=404)
+    mock_response.raise_for_status.side_effect = [requests.HTTPError()]
+    mock_get_latest.return_value = mock_response
+    result = _get_latest_request(mock.Mock())
+    assert result is None
+
+
+@mock.patch("cachito.workers.prune_archives.session.get")
+def test_get_latest_request_timeout(mock_get_latest: mock.Mock):
+    """Tests that get_latest_request raises NetworkError on a failed connection."""
+    mock_get_latest.side_effect = requests.ConnectionError()
+    expected = "The connection failed when querying"
+    with pytest.raises(NetworkError, match=expected):
+        _get_latest_request(mock.Mock())
+
+
+@mock.patch("cachito.workers.prune_archives.session.get")
+def test_get_latest_request_http_error(mock_get_latest: mock.Mock):
+    """Tests that get_latest_request raises NetworkError for HTTP errors other than 404."""
+    mock_response = mock.MagicMock(status_code=500)
+    mock_response.raise_for_status.side_effect = [requests.HTTPError()]
+    mock_get_latest.return_value = mock_response
+    expected = "Failed to query the cachito API"
+    with pytest.raises(NetworkError, match=expected):
+        _get_latest_request(mock.Mock())
+
+
+@mock.patch("pathlib.Path.rglob")
+def test_get_parsed_source_archives(
+    mock_rglob: mock.Mock,
+    archive_dir: Path,
+    archive_paths: list[Path],
+    parsed_archives: list[_ParsedArchive],
+):
+    """Tests finding source archives on the filesystem and parsing them into _ParsedArchives."""
+    mock_rglob.return_value = archive_paths
+    result = _get_parsed_source_archives(archive_dir)
+    assert list(result) == parsed_archives
+
+
+@mock.patch("cachito.workers.prune_archives._get_latest_request")
+def test_resolve_source_archive(mock_request: mock.Mock):
+    """Tests resolving a ParsedArchive with request data from the API."""
+    path = Path("my-org/my-project/ce60002604554992203f2afe17f23724f674b411.tar.gz")
+    repo_name = path.parent.as_posix()
+    ref = path.name[:40]
+    created = datetime(2024, 1, 1, tzinfo=timezone.utc)
+    latest_request_id = 1
+
+    mock_request.return_value = {
+        "created": datetime.strftime(created, format="%Y-%m-%dT%H:%M:%S.%f"),
+        "id": latest_request_id,
+    }
+    parsed_archive = _ParsedArchive(path, repo_name, ref)
+    expected_resolved_archive = _ResolvedArchive(path, created, latest_request_id)
+
+    resolved_archive = _resolve_source_archive(parsed_archive)
+    assert resolved_archive == expected_resolved_archive
+
+
+@mock.patch("cachito.workers.prune_archives._get_latest_request")
+def test_resolve_source_archive_not_found(mock_request: mock.Mock):
+    """Tests when we cannot resolve a ParsedArchive with request data from the API."""
+    path = Path("my-org/my-project/ce60002604554992203f2afe17f23724f674b411.tar.gz")
+    repo_name = path.parent.as_posix()
+    ref = path.name[:40]
+
+    mock_request.return_value = None
+    parsed_archive = _ParsedArchive(path, repo_name, ref)
+
+    resolved_archive = _resolve_source_archive(parsed_archive)
+    assert resolved_archive is None
+
+
+@mock.patch("cachito.workers.prune_archives._resolve_source_archive")
+@mock.patch("cachito.workers.prune_archives._get_parsed_source_archives")
+def test_get_stale_archives(
+    mock_get_parsed_archives: mock.Mock,
+    mock_resolve_archive: mock.Mock,
+    parsed_archives: list[_ParsedArchive],
+    resolved_archives: list[_ResolvedArchive],
+    stale_archives: list[_ResolvedArchive],
+):
+    """Tests that get_stale_archives returns a list of _ResolvedArchives that are stale."""
+    older_than = datetime(2024, 1, 3, 0, 0, 0, 0, tzinfo=timezone.utc)
+    api_calls_per_second = 100
+    mock_get_parsed_archives.return_value = parsed_archives
+    mock_resolve_archive.side_effect = resolved_archives
+
+    result = _get_stale_archives(older_than, api_calls_per_second)
+    assert list(result) == stale_archives
+
+
+@pytest.mark.parametrize("limit, expected_deletions", [(None, 2), (1, 1)])
+@mock.patch("pathlib.Path.unlink")
+@mock.patch("cachito.workers.prune_archives._get_stale_archives")
+def test_process_stale_archives_limit(
+    mock_get_stale: mock.Mock, mock_unlink: mock.Mock, limit: Optional[int], expected_deletions: int
+):
+    """Tests that _process_stale_archives adheres to the `--limit` CLI option."""
+    archives = [
+        _ResolvedArchive(Path("aaa"), datetime.now(), 1),
+        _ResolvedArchive(Path("bbb"), datetime.now(), 2),
+    ]
+    mock_get_stale.return_value = archives
+    older_than = datetime(2024, 1, 1)
+    api_calls_per_second = 100
+
+    _process_stale_archives(older_than, api_calls_per_second, delete=True, limit=limit)
+    mock_get_stale.assert_called_once_with(older_than, api_calls_per_second)
+    assert mock_unlink.call_count == expected_deletions
+
+
+@mock.patch(
+    "cachito.workers.prune_archives.MINIMUM_AGE_DATETIME", datetime(2024, 1, 3, tzinfo=timezone.utc)
+)
+@mock.patch("cachito.workers.prune_archives._get_latest_request")
+@mock.patch("pathlib.Path.rglob")
+def test_process_stale_archives_delete_e2e(
+    mock_rglob: mock.Mock,
+    mock_get_latest: mock.Mock,
+    archive_dir: Path,
+    archive_paths: list[Path],
+    stale_archives: list[_ResolvedArchive],
+):
+    """Tests _process_stale_archives e2e and ensures that only stale archives are deleted."""
+    mock_rglob.return_value = archive_paths
+    mock_get_latest.side_effect = LATEST_REQUEST_DATA
+    _process_stale_archives(
+        datetime(2024, 1, 3, tzinfo=timezone.utc), api_calls_per_second=100, delete=True, limit=None
+    )
+
+    deleted_paths = {archive.path for archive in stale_archives}
+    all_paths = {archive_dir / path for path in archive_paths}
+    remaining_paths = all_paths - deleted_paths
+
+    # Ensure that stale paths have been deleted and all others remain
+    for path in deleted_paths:
+        assert not path.exists()
+    for path in remaining_paths:
+        assert path.exists()
+
+
+@mock.patch(
+    "cachito.workers.prune_archives.MINIMUM_AGE_DATETIME", datetime(2024, 1, 3, tzinfo=timezone.utc)
+)
+@mock.patch("cachito.workers.prune_archives._get_latest_request")
+@mock.patch("pathlib.Path.rglob")
+def test_process_stale_archives_list_only_e2e(
+    mock_rglob: mock.Mock,
+    mock_get_latest: mock.Mock,
+    archive_dir: Path,
+    archive_paths: list[Path],
+    stale_archives: list[_ResolvedArchive],
+):
+    """Tests _process_stale_archives e2e and ensures no archives are deleted if delete=False."""
+    mock_rglob.return_value = archive_paths
+    mock_get_latest.side_effect = LATEST_REQUEST_DATA
+    _process_stale_archives(
+        datetime(2024, 1, 3, tzinfo=timezone.utc),
+        api_calls_per_second=100,
+        delete=False,
+        limit=None,
+    )
+
+    # Ensure that all archive paths still exist
+    for path in (archive_dir / path for path in archive_paths):
+        assert path.exists()
+
+
+def test_prune_archives_app_no_command():
+    """Tests invoking the app without a command."""
+    result = runner.invoke(app, [])
+    assert result.exit_code == 2
+    assert "Error: Missing command." in result.stdout
+
+
+@pytest.mark.parametrize("command", ["list", "delete"])
+def test_prune_archives_app_invalid_date(command: str):
+    """Tests that the CLI rejects an --older-than date more recent than the minimum age."""
+    today = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d")
+    result = runner.invoke(app, [command, "--older-than", today])
+    assert result.exit_code == 2
+    assert "'--older-than': cannot be more recent than" in result.stdout
+
+
+@mock.patch(
+    "cachito.workers.prune_archives.MINIMUM_AGE_DATETIME", datetime(2024, 1, 3, tzinfo=timezone.utc)
+)
+@mock.patch(
+    "cachito.workers.prune_archives.DEFAULT_AGE_DATETIME", datetime(2024, 1, 1, tzinfo=timezone.utc)
+)
+@mock.patch("cachito.workers.prune_archives._process_stale_archives")
+def test_prune_archives_app_list(mock_process_stale: mock.Mock):
+    """Tests the `list` CLI command with no options."""
+    result = runner.invoke(app, ["list"])
+    assert result.exit_code == 0
+    mock_process_stale.assert_called_once_with(
+        datetime(2024, 1, 1, tzinfo=timezone.utc), 2, delete=False, limit=None
+    )
+
+
+@mock.patch(
+    "cachito.workers.prune_archives.MINIMUM_AGE_DATETIME", datetime(2024, 1, 3, tzinfo=timezone.utc)
+)
+@mock.patch(
+    "cachito.workers.prune_archives.DEFAULT_AGE_DATETIME", datetime(2024, 1, 1, tzinfo=timezone.utc)
+)
+@mock.patch("cachito.workers.prune_archives._process_stale_archives")
+def test_prune_archives_app_list_with_options(mock_process_stale: mock.Mock):
+    """Tests the `list` CLI command with all options set to non-defaults."""
+    older_than_local = datetime(2024, 1, 2)
+    older_than_utc = older_than_local.astimezone(timezone.utc)
+
+    result = runner.invoke(
+        app,
+        [
+            "list",
+            "--older-than",
+            older_than_local.strftime("%Y-%m-%d"),
+            "--api-calls-per-second",
+            "1",
+            "--limit",
+            "4",
+        ],
+    )
+
+    assert result.exit_code == 0
+    mock_process_stale.assert_called_once_with(older_than_utc, 1, delete=False, limit=4)
+
+
+@mock.patch(
+    "cachito.workers.prune_archives.MINIMUM_AGE_DATETIME", datetime(2024, 1, 3, tzinfo=timezone.utc)
+)
+@mock.patch(
+    "cachito.workers.prune_archives.DEFAULT_AGE_DATETIME", datetime(2024, 1, 1, tzinfo=timezone.utc)
+)
+@mock.patch("cachito.workers.prune_archives._process_stale_archives")
+def test_prune_archives_app_delete(mock_process_stale: mock.Mock):
+    """Tests the `delete` CLI command with no options."""
+    result = runner.invoke(app, ["delete"])
+    assert result.exit_code == 0
+    mock_process_stale.assert_called_once_with(
+        datetime(2024, 1, 1, tzinfo=timezone.utc), 2, delete=False, limit=None
+    )
+
+
+@mock.patch(
+    "cachito.workers.prune_archives.MINIMUM_AGE_DATETIME", datetime(2024, 1, 3, tzinfo=timezone.utc)
+)
+@mock.patch(
+    "cachito.workers.prune_archives.DEFAULT_AGE_DATETIME", datetime(2024, 1, 1, tzinfo=timezone.utc)
+)
+@mock.patch("cachito.workers.prune_archives._process_stale_archives")
+def test_prune_archives_app_delete_with_options(mock_process_stale: mock.Mock):
+    """Tests the `delete` CLI command with all options set to non-defaults."""
+    older_than_local = datetime(2024, 1, 2)
+    older_than_utc = older_than_local.astimezone(timezone.utc)
+
+    result = runner.invoke(
+        app,
+        [
+            "delete",
+            "--older-than",
+            older_than_local.strftime("%Y-%m-%d"),
+            "--api-calls-per-second",
+            "1",
+            "--limit",
+            "4",
+            "--execute",
+        ],
+    )
+
+    assert result.exit_code == 0
+    mock_process_stale.assert_called_once_with(older_than_utc, 1, delete=True, limit=4)