From ebbe298e299c387943705726e70fb38b8358f71c Mon Sep 17 00:00:00 2001
From: Frank Sachsenheim
Date: Fri, 13 Sep 2024 15:36:56 +0200
Subject: [PATCH] integration-tests: Ignore files larger than 10MB

---
Note: line breaks were restored from a whitespace-collapsed copy of this
patch. Leading indentation of the hunk lines is reconstructed and should be
confirmed against the target file (or apply with --ignore-whitespace).

 integration-tests/fetch-corpora.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/integration-tests/fetch-corpora.py b/integration-tests/fetch-corpora.py
index a64e958f..2d4c3f30 100755
--- a/integration-tests/fetch-corpora.py
+++ b/integration-tests/fetch-corpora.py
@@ -54,6 +54,7 @@ class Archive(NamedTuple):
 
 
 CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"
+FILE_SIZE_LIMIT: Final = 10 * 1024 ** 2
 
 ARCHIVE_DESCRIPTIONS: Final = (
     Archive(
@@ -286,6 +287,9 @@ def _filter(member: tarfile.TarInfo, path: str) -> tarfile.TarInfo | None:
         if member is None:
             return None
 
+        if member.size > FILE_SIZE_LIMIT:
+            return None
+
         member_path = member.name
         root_folder = archive_description.archive_documents_root
         if member_path.endswith(".xml") and member_path.startswith(root_folder):