Skip to content

Commit

Permalink
[DOP-22143] Implement FileSizeFilter
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfinus committed Jan 13, 2025
1 parent 7542169 commit 854f3a2
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 1 deletion.
9 changes: 9 additions & 0 deletions docs/file/file_filters/file_size_filter.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
.. _file-size-filter:

FileSizeFilter
==============

.. currentmodule:: onetl.file.filter.file_size

.. autoclass:: FileSizeFilter
:members: match
1 change: 1 addition & 0 deletions docs/file/file_filters/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ File Filters
glob
regexp
exclude_dir
file_size_filter

.. toctree::
:maxdepth: 1
Expand Down
1 change: 1 addition & 0 deletions onetl/file/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
from onetl.file.filter.exclude_dir import ExcludeDir
from onetl.file.filter.file_hwm import FileHWMFilter
from onetl.file.filter.file_size import FileSizeFilter
from onetl.file.filter.glob import Glob
from onetl.file.filter.match_all_filters import match_all_filters
from onetl.file.filter.regexp import Regexp
96 changes: 96 additions & 0 deletions onetl/file/filter/file_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# SPDX-FileCopyrightText: 2021-2024 MTS PJSC
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from onetl.base.path_protocol import PathWithStatsProtocol

try:
from pydantic.v1 import ByteSize, root_validator
except (ImportError, AttributeError):
from pydantic import root_validator, ByteSize # type: ignore[no-redef, assignment]

from onetl.base import BaseFileFilter, PathProtocol
from onetl.impl import FrozenModel


class FileSizeFilter(BaseFileFilter, FrozenModel):
    """Filter files by size.

    A file is excluded if its size is less than ``min`` or greater than ``max``
    (a size exactly equal to a boundary is accepted).
    Doesn't affect directories or paths without defined size.

    .. versionadded:: 0.13.0

    .. note::

        SI unit prefixes are used, which means that ``1Kb`` == ``1 kilobyte`` == ``1000 bytes``.
        If you need ``1024 bytes``, use ``1 Kib`` == ``1 kibibyte``.

    Parameters
    ----------

    min : :obj:`int` or :obj:`str`, optional

        Minimal allowed file size. If file size is less than this value, it will be excluded.
        ``None`` means no limit. Value can be set in a string form, like ``"1Kib"``.

    max : :obj:`int` or :obj:`str`, optional

        Maximum allowed file size. If file size is greater than this value, it will be excluded.
        ``None`` means no limit. Value can be set in a string form, like ``"100Mib"``.

    Raises
    ------
    ValueError
        If neither ``min`` nor ``max`` is set, or if ``min`` is greater than ``max``.

    Examples
    --------

    Specify min and max file sizes:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(min="1Kib", max="100Mib")

    Specify only min file size:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(min="1Kib")

    Specify only max file size:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(max="100Mib")
    """

    # Lower boundary in bytes; ``None`` disables the lower check.
    min: ByteSize | None = None
    # Upper boundary in bytes; ``None`` disables the upper check.
    max: ByteSize | None = None

    # skip_on_failure: if ``min`` or ``max`` could not be parsed, the failed field
    # is missing from ``values``; running this validator anyway would add a
    # misleading "Either min or max must be specified" error on top of the real one.
    @root_validator(skip_on_failure=True)
    def _validate(cls, values):
        """Check that at least one boundary is set and that ``min <= max``."""
        min_value = values.get("min")
        max_value = values.get("max")

        if min_value is None and max_value is None:
            raise ValueError("Either min or max must be specified")

        # Compare against None explicitly: a boundary of 0 is falsy, so a
        # truthiness test would silently accept e.g. min="10Kb", max=0.
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Min size cannot be greater than max size")

        return values

    def match(self, path: PathProtocol) -> bool:
        """Return ``False`` if ``path`` is a file whose size is outside the boundaries.

        Directories, and paths which do not provide stats, always return ``True``.
        """
        if path.is_file() and isinstance(path, PathWithStatsProtocol):
            file_size = path.stat().st_size

            if self.min is not None and file_size < self.min:
                return False

            if self.max is not None and file_size > self.max:
                return False

        return True
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,9 @@ ignore =
# WPS412 Found `__init__.py` module with logic
WPS412,
# WPS413 Found bad magic module function: __getattr__
WPS413
WPS413,
# WPS338 Found incorrect order of methods in a class
WPS338

# http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores
per-file-ignores =
Expand Down
47 changes: 47 additions & 0 deletions tests/tests_unit/test_file/test_filter/test_file_size_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest

from onetl.file.filter import FileSizeFilter
from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat


def test_file_size_filter_invalid():
    """FileSizeFilter rejects a filter with no boundaries and a filter with min > max."""
    no_bounds_error = "Either min or max must be specified"
    with pytest.raises(ValueError, match=no_bounds_error):
        FileSizeFilter()

    inverted_bounds_error = "Min size cannot be greater than max size"
    with pytest.raises(ValueError, match=inverted_bounds_error):
        FileSizeFilter(min="10Kb", max="1Kb")


@pytest.mark.parametrize(
    ("raw_size", "expected"),
    [
        # no unit or plain bytes
        ("10", 10),
        ("10B", 10),
        ("10b", 10),
        # decimal (SI) vs binary (IEC) prefixes
        ("10Kb", 10_000),
        ("10Kib", 10 * 1024),
        ("10Mb", 10_000_000),
        ("10Mib", 10 * 1024 * 1024),
        ("10Gb", 10_000_000_000),
        ("10Gib", 10 * 1024 * 1024 * 1024),
    ],
)
def test_file_size_filter_parse_units(raw_size: str, expected: int):
    """String sizes are parsed to the same number of bytes for both ``min`` and ``max``.

    The argument is named ``raw_size`` rather than ``input`` so that it does not
    shadow the ``input`` builtin.
    """
    assert FileSizeFilter(min=raw_size).min == expected
    assert FileSizeFilter(max=raw_size).max == expected


@pytest.mark.parametrize(
    ("matched", "path"),
    [
        # below the lower boundary
        (False, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=1024, st_mtime=50))),
        # exactly on the lower boundary
        (True, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=10 * 1024, st_mtime=50))),
        # between the boundaries
        (True, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=15 * 1024, st_mtime=50))),
        # exactly on the upper boundary
        (True, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=20 * 1024, st_mtime=50))),
        # above the upper boundary
        (False, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=30 * 1024, st_mtime=50))),
        # directories are never filtered out by size
        (True, RemoteDirectory("some")),
    ],
)
def test_file_size_filter_match(matched, path):
    """Check ``match`` against a 10Kib..20Kib filter for files of several sizes and a directory."""
    size_filter = FileSizeFilter(min="10Kib", max="20Kib")
    actual = size_filter.match(path)

    assert actual == matched

0 comments on commit 854f3a2

Please sign in to comment.