-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[DOP-22143] Implement FileSizeFilter
- Loading branch information
Showing
6 changed files
with
157 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
.. _file-size-filter: | ||
|
||
FileSizeFilter | ||
============== | ||
|
||
.. currentmodule:: onetl.file.filter.file_size | ||
|
||
.. autoclass:: FileSizeFilter | ||
:members: match |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ File Filters | |
glob | ||
regexp | ||
exclude_dir | ||
file_size_filter | ||
|
||
.. toctree:: | ||
:maxdepth: 1 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# SPDX-FileCopyrightText: 2021-2024 MTS PJSC | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from __future__ import annotations | ||
|
||
from onetl.base.path_protocol import PathWithStatsProtocol | ||
|
||
try: | ||
from pydantic.v1 import ByteSize, root_validator | ||
except (ImportError, AttributeError): | ||
from pydantic import root_validator, ByteSize # type: ignore[no-redef, assignment] | ||
|
||
from onetl.base import BaseFileFilter, PathProtocol | ||
from onetl.impl import FrozenModel | ||
|
||
|
||
class FileSizeFilter(BaseFileFilter, FrozenModel):
    """Filter files matching a specified size.

    If file size doesn't match boundaries, it will be excluded.
    Doesn't affect directories or paths without defined size.

    .. versionadded:: 0.13.0

    .. note::

        SI unit prefixes mean that ``1Kb`` == ``1 kilobyte`` == ``1000 bytes``.
        If you need ``1024 bytes``, use ``1 Kib`` == ``1 kibibyte``.

    Parameters
    ----------

    min : :obj:`int`, optional

        Minimal allowed file size. ``None`` means no limit. Value can be set in a string form.

        If file size is less than this value, it will be excluded.
        The boundary itself is allowed.

    max : :obj:`int`, optional

        Maximum allowed file size. ``None`` means no limit. Value can be set in a string form.

        If file size is greater than this value, it will be excluded.
        The boundary itself is allowed.

    Raises
    ------

    ValueError

        If neither ``min`` nor ``max`` is specified, or if ``min`` is greater than ``max``.

    Examples
    --------

    Specify min and max file sizes:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(min="1Kib", max="100Mib")

    Specify only min file size:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(min="1Kib")

    Specify only max file size:

    .. code:: python

        from onetl.file.filter import FileSizeFilter

        file_size = FileSizeFilter(max="100Mib")
    """

    # NOTE(review): with ``from __future__ import annotations`` pydantic evaluates these
    # annotation strings at class creation; ``X | None`` presumably fails on Python < 3.10.
    # Confirm the supported Python versions, or switch to ``Optional[ByteSize]``.
    min: ByteSize | None = None
    max: ByteSize | None = None

    # skip_on_failure: if ``min``/``max`` failed to parse, they are missing from ``values``,
    # and running this check would add a misleading "Either min or max must be specified" error.
    @root_validator(skip_on_failure=True)
    def _validate(cls, values):
        """Check that at least one bound is set and that the bounds are not inverted."""
        min_value = values.get("min")
        max_value = values.get("max")

        if min_value is None and max_value is None:
            raise ValueError("Either min or max must be specified")

        # Compare against None explicitly: a bound of 0 is falsy but still a real bound.
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Min size cannot be greater than max size")

        return values

    def match(self, path: PathProtocol) -> bool:
        """Return ``False`` only for files whose size is outside of ``[min, max]``.

        Directories and paths which do not provide stats always match.
        """
        if not (path.is_file() and isinstance(path, PathWithStatsProtocol)):
            # Nothing to compare against, so the path is not excluded by this filter.
            return True

        file_size = path.stat().st_size

        if self.min is not None and file_size < self.min:
            return False

        if self.max is not None and file_size > self.max:
            return False

        return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 47 additions & 0 deletions
47
tests/tests_unit/test_file/test_filter/test_file_size_filter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import pytest | ||
|
||
from onetl.file.filter import FileSizeFilter | ||
from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat | ||
|
||
|
||
def test_file_size_filter_invalid():
    """Constructor rejects a filter without bounds and a filter with min above max."""
    # Neither bound is given.
    missing_bounds_error = "Either min or max must be specified"
    with pytest.raises(ValueError, match=missing_bounds_error):
        FileSizeFilter()

    # Lower bound (10 000 bytes) is above the upper bound (1 000 bytes).
    inverted_bounds_error = "Min size cannot be greater than max size"
    with pytest.raises(ValueError, match=inverted_bounds_error):
        FileSizeFilter(min="10Kb", max="1Kb")
|
||
|
||
@pytest.mark.parametrize(
    "raw_size, expected",
    [
        ("10", 10),
        ("10B", 10),
        ("10b", 10),
        ("10Kb", 10_000),
        ("10Kib", 10 * 1024),
        ("10Mb", 10_000_000),
        ("10Mib", 10 * 1024 * 1024),
        ("10Gb", 10_000_000_000),
        ("10Gib", 10 * 1024 * 1024 * 1024),
    ],
)
def test_file_size_filter_parse_units(raw_size: str, expected: int):
    """String sizes are parsed to bytes: SI units are powers of 1000, ``*ib`` units powers of 1024."""
    # The argument is named ``raw_size`` rather than ``input`` to avoid shadowing the builtin.
    assert FileSizeFilter(min=raw_size).min == expected
    assert FileSizeFilter(max=raw_size).max == expected
|
||
|
||
def _csv_file_of_kib(kib):
    """Build a remote ``file.csv`` whose reported size is ``kib * 1024`` bytes."""
    stats = RemotePathStat(st_size=kib * 1024, st_mtime=50)
    return RemoteFile(path="file.csv", stats=stats)


@pytest.mark.parametrize(
    "matched, path",
    [
        # below the lower bound
        (False, _csv_file_of_kib(1)),
        # exactly on the lower bound, inside the range, exactly on the upper bound
        (True, _csv_file_of_kib(10)),
        (True, _csv_file_of_kib(15)),
        (True, _csv_file_of_kib(20)),
        # above the upper bound
        (False, _csv_file_of_kib(30)),
        # directories are never filtered out by size
        (True, RemoteDirectory("some")),
    ],
)
def test_file_size_filter_match(matched, path):
    """Files match only when their size lies within the 10Kib..20Kib range, bounds included."""
    size_filter = FileSizeFilter(min="10Kib", max="20Kib")
    outcome = size_filter.match(path)

    assert outcome == matched