-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[DOP-22144] Introduce TotalFilesSize limit
- Loading branch information
Showing
11 changed files
with
230 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Introduce ``TotalFilesSize(...)`` limit. Now users can set ``FileDownloader`` / ``FileMover`` to stop downloading/moving files after reaching a certain amount of data. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ File Limits | |
:caption: File limits | ||
|
||
max_files_count | ||
total_files_size | ||
|
||
.. toctree:: | ||
:maxdepth: 1 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
.. _total-files-size-limit: | ||
|
||
TotalFilesSize | ||
============== | ||
|
||
.. currentmodule:: onetl.file.limit.total_files_size | ||
|
||
.. autoclass:: TotalFilesSize | ||
:members: reset, stops_at, is_reached |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# SPDX-FileCopyrightText: 2021-2024 MTS PJSC | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from __future__ import annotations | ||
|
||
import logging | ||
|
||
from onetl.base.path_protocol import PathWithStatsProtocol | ||
|
||
try: | ||
from pydantic.v1 import ByteSize, validator | ||
except (ImportError, AttributeError): | ||
from pydantic import ByteSize, validator # type: ignore[no-redef, assignment] | ||
|
||
from onetl.base import BaseFileLimit, PathProtocol | ||
from onetl.impl import FrozenModel | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
class TotalFilesSize(BaseFileLimit, FrozenModel):
    """Limits the total size of files handled by :ref:`file-downloader` or :ref:`file-mover`.

    Sum of downloaded/moved files should be less or equal to specified size.
    Sizes of the files passed to ``stops_at`` are accumulated; once the accumulated
    size is greater than or equal to ``limit``, ``stops_at`` returns ``True``
    for every path it is asked about.

    This doesn't apply to directories or files with no size information.

    .. versionadded:: 0.13.0

    .. note::

        SI unit prefixes means that ``1KB`` == ``1 kilobyte`` == ``1000 bytes``.
        If you need ``1024 bytes``, use ``1 KiB`` == ``1 kibibyte``.

    Parameters
    ----------

    limit : int or str

        Maximum total size of handled files. Either a number of bytes,
        or a string with a number and an optional unit, like ``"10KB"`` or ``"1GiB"``.
        Should be a positive number.

    Examples
    --------

    Create limit which allows to download/move files with total size up to 1GiB, but not higher:

    .. code:: python

        from onetl.file.limit import TotalFilesSize

        limit = TotalFilesSize("1GiB")
    """

    limit: ByteSize

    # Running total of file sizes accumulated by ``stops_at`` since creation or the last ``reset``.
    _handled: int = 0

    def __init__(self, limit: int | str):
        # this is only to allow passing limit as positional argument
        super().__init__(limit=limit)  # type: ignore

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}("{self.limit.human_readable()}")'

    @validator("limit")
    def _limit_cannot_be_negative(cls, value):
        # zero is rejected too - a limit which is reached before any file is handled is useless
        if value <= 0:
            raise ValueError("Limit should be positive number")
        return value

    def reset(self) -> TotalFilesSize:
        # drop the accumulated size; returns the limit itself to allow call chaining
        self._handled = 0
        return self

    def stops_at(self, path: PathProtocol) -> bool:
        # once the limit is reached, every path is rejected, whatever its type or size
        if self.is_reached:
            return True

        if not path.is_file():
            # directories count does not matter
            return False

        if not isinstance(path, PathWithStatsProtocol):
            # no size information available, so the path cannot contribute to the total
            return False

        # the size is added BEFORE the check, so the file which makes the total
        # reach (or exceed) the limit is already reported as a stopping point
        self._handled += path.stat().st_size
        return self.is_reached

    @property
    def is_reached(self) -> bool:
        # note ">=": accumulating exactly ``limit`` bytes already counts as reached
        return self._handled >= self.limit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
14 changes: 14 additions & 0 deletions
14
tests/tests_unit/test_file/test_limit/test_max_files_count.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
81 changes: 81 additions & 0 deletions
81
tests/tests_unit/test_file/test_limit/test_total_files_size.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import pytest | ||
|
||
from onetl.file.limit import TotalFilesSize | ||
from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat | ||
|
||
|
||
def test_total_files_size_invalid():
    """Zero, negative and unparsable limits are rejected with ``ValueError``."""
    for non_positive in (0, -1):
        with pytest.raises(ValueError, match="Limit should be positive number"):
            TotalFilesSize(non_positive)

    # a string which is neither a number nor a number with a unit
    with pytest.raises(ValueError, match="could not parse value and unit from byte string"):
        TotalFilesSize("wtf")
|
||
|
||
def test_total_files_size_repr():
    """``repr()`` shows the class name and the limit in human-readable form."""
    limit = TotalFilesSize("10KiB")
    rendered = repr(limit)
    assert rendered == 'TotalFilesSize("10.0KiB")'
|
||
|
||
@pytest.mark.parametrize(
    ["raw_limit", "expected"],
    [
        ("10", 10),
        ("10B", 10),
        ("10KB", 10_000),
        ("10KiB", 10 * 1024),
        ("10MB", 10_000_000),
        ("10MiB", 10 * 1024 * 1024),
        ("10GB", 10_000_000_000),
        ("10GiB", 10 * 1024 * 1024 * 1024),
    ],
)
def test_total_files_size_parse_units(raw_limit: str, expected: int):
    """String limits are parsed to bytes, for both SI (KB) and binary (KiB) units.

    The argument is named ``raw_limit`` instead of ``input`` to avoid shadowing the builtin.
    """
    # same value with a lowercase "b" suffix should be parsed to the same number of bytes
    assert TotalFilesSize(raw_limit.replace("B", "b")).limit == expected
    assert TotalFilesSize(raw_limit).limit == expected
|
||
|
||
def test_total_files_size():
    """Walk a 30KiB limit through files and directories, then reset it and reuse it."""
    kib = 1024

    def make_file(name: str, size: int) -> RemoteFile:
        return RemoteFile(path=name, stats=RemotePathStat(st_size=size, st_mtime=50))

    limit = TotalFilesSize("30KiB")
    assert not limit.is_reached

    directory = RemoteDirectory("some")
    file1 = make_file("file1.csv", 10 * kib)
    file2 = make_file("file2.csv", 10 * kib)
    file3 = make_file("nested/file3.csv", 20 * kib)
    file4 = make_file("nested/file4.csv", 20 * kib)

    # 10KiB + 10KiB is still below the limit; directories are not checked by limit
    for path in (file1, file2, directory):
        assert not limit.stops_at(path)
        assert not limit.is_reached

    # limit is reached - all checks are True, input does not matter
    for path in (file3, file4, directory):
        assert limit.stops_at(path)
        assert limit.is_reached

    # reset internal state
    limit.reset()

    # limit does not remember each file, so if duplicates are present, they can affect the result:
    # the same 10KiB file is counted every time it is passed
    for _ in range(2):
        assert not limit.stops_at(file1)
        assert not limit.is_reached

    # third pass of the same file brings the total to exactly 30KiB
    assert limit.stops_at(file1)
    assert limit.is_reached