From 854f3a2c9ad425023de4ac131d85799f81959afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 13 Jan 2025 09:24:14 +0000 Subject: [PATCH] [DOP-22143] Implement FileSizeFilter --- docs/file/file_filters/file_size_filter.rst | 9 ++ docs/file/file_filters/index.rst | 1 + onetl/file/filter/__init__.py | 1 + onetl/file/filter/file_size.py | 96 +++++++++++++++++++ setup.cfg | 4 +- .../test_filter/test_file_size_filter.py | 47 +++++++++ 6 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 docs/file/file_filters/file_size_filter.rst create mode 100644 onetl/file/filter/file_size.py create mode 100644 tests/tests_unit/test_file/test_filter/test_file_size_filter.py diff --git a/docs/file/file_filters/file_size_filter.rst b/docs/file/file_filters/file_size_filter.rst new file mode 100644 index 000000000..98bdfd55f --- /dev/null +++ b/docs/file/file_filters/file_size_filter.rst @@ -0,0 +1,9 @@ +.. _file-size-filter: + +FileSizeFilter +============== + +.. currentmodule:: onetl.file.filter.file_size + +.. autoclass:: FileSizeFilter + :members: match diff --git a/docs/file/file_filters/index.rst b/docs/file/file_filters/index.rst index a76fedf3c..575a31667 100644 --- a/docs/file/file_filters/index.rst +++ b/docs/file/file_filters/index.rst @@ -10,6 +10,7 @@ File Filters glob regexp exclude_dir + file_size_filter .. 
# SPDX-FileCopyrightText: 2021-2024 MTS PJSC
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from onetl.base.path_protocol import PathWithStatsProtocol

try:
    from pydantic.v1 import ByteSize, root_validator
except (ImportError, AttributeError):
    from pydantic import root_validator, ByteSize  # type: ignore[no-redef, assignment]

from onetl.base import BaseFileFilter, PathProtocol
from onetl.impl import FrozenModel


class FileSizeFilter(BaseFileFilter, FrozenModel):
    """Filter files matching a specified size.

    If file size doesn't match boundaries, it will be excluded.
    Both boundaries are inclusive.
    Doesn't affect directories or paths without defined size.

    .. versionadded:: 0.13.0

    .. note::

        SI unit prefixes mean that ``1Kb`` == ``1 kilobyte`` == ``1000 bytes``.
        If you need ``1024 bytes``, use ``1 Kib`` == ``1 kibibyte``.

    Parameters
    ----------

    min : :obj:`int` or :obj:`str`, optional

        Minimal allowed file size. ``None`` means no limit. Value can be set in a string form.

        If file size is less than this value, it will be excluded.

    max : :obj:`int` or :obj:`str`, optional

        Maximum allowed file size. ``None`` means no limit. Value can be set in a string form.

        If file size is greater than this value, it will be excluded.

    Raises
    ------
    ValueError
        If neither ``min`` nor ``max`` is set, or if ``min`` is greater than ``max``.

    Examples
    --------

    Specify min and max file sizes:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(min="1Kib", max="100Mib")

    Specify only min file size:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(min="1Kib")

    Specify only max file size:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(max="100Mib")
    """

    min: ByteSize | None = None
    max: ByteSize | None = None

    # skip_on_failure: when a field failed to parse it is missing from ``values``,
    # so running this check would add a misleading "Either min or max must be specified"
    # error on top of the real parsing error.
    @root_validator(skip_on_failure=True)
    def _validate(cls, values):
        """Check that at least one boundary is set and that ``min`` does not exceed ``max``."""
        min_value = values.get("min")
        max_value = values.get("max")

        if min_value is None and max_value is None:
            raise ValueError("Either min or max must be specified")

        # Compare with None explicitly: a zero boundary is falsy, but it is still a real limit.
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Min size cannot be greater than max size")

        return values

    def match(self, path: PathProtocol) -> bool:
        """Return ``False`` if ``path`` is a file with size outside of boundaries, ``True`` otherwise.

        Paths which are not files, or which do not provide stats, always match.
        """
        # The filter applies only to files which can report their size.
        if not (path.is_file() and isinstance(path, PathWithStatsProtocol)):
            return True

        file_size = path.stat().st_size

        if self.min is not None and file_size < self.min:
            return False

        if self.max is not None and file_size > self.max:
            return False

        return True
import pytest

from onetl.file.filter import FileSizeFilter
from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat


def _file_of_size(size: int) -> RemoteFile:
    """Build a remote file whose stats report the given size in bytes."""
    return RemoteFile(path="file.csv", stats=RemotePathStat(st_size=size, st_mtime=50))


def test_file_size_filter_invalid():
    """Filter without boundaries, or with min above max, is rejected."""
    with pytest.raises(ValueError, match="Either min or max must be specified"):
        FileSizeFilter()

    with pytest.raises(ValueError, match="Min size cannot be greater than max size"):
        FileSizeFilter(min="10Kb", max="1Kb")


@pytest.mark.parametrize(
    "raw_size, expected",
    [
        ("10", 10),
        ("10B", 10),
        ("10b", 10),
        ("10Kb", 10_000),
        ("10Kib", 10 * 1024),
        ("10Mb", 10_000_000),
        ("10Mib", 10 * 1024 * 1024),
        ("10Gb", 10_000_000_000),
        ("10Gib", 10 * 1024 * 1024 * 1024),
    ],
)
def test_file_size_filter_parse_units(raw_size: str, expected: int):
    """String sizes are parsed into bytes, for both SI and binary units."""
    assert FileSizeFilter(min=raw_size).min == expected
    assert FileSizeFilter(max=raw_size).max == expected


@pytest.mark.parametrize(
    "matched, path",
    [
        (False, _file_of_size(1024)),
        (True, _file_of_size(10 * 1024)),
        (True, _file_of_size(15 * 1024)),
        (True, _file_of_size(20 * 1024)),
        (False, _file_of_size(30 * 1024)),
        (True, RemoteDirectory("some")),
    ],
)
def test_file_size_filter_match(matched, path):
    """Both boundaries are inclusive, and directories always match."""
    file_filter = FileSizeFilter(min="10Kib", max="20Kib")

    assert file_filter.match(path) == matched


@pytest.mark.parametrize(
    "bounds, size, matched",
    [
        ({"min": "10Kib"}, 1024, False),
        ({"min": "10Kib"}, 10 * 1024, True),
        ({"min": "10Kib"}, 30 * 1024, True),
        ({"max": "20Kib"}, 1024, True),
        ({"max": "20Kib"}, 20 * 1024, True),
        ({"max": "20Kib"}, 30 * 1024, False),
    ],
)
def test_file_size_filter_match_single_bound(bounds, size, matched):
    """A boundary which is not set does not limit the file size."""
    file_filter = FileSizeFilter(**bounds)

    assert file_filter.match(_file_of_size(size)) == matched