-
Notifications
You must be signed in to change notification settings - Fork 104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
file: support exporting files as a symlink #819
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
BinaryIO, | ||
Callable, | ||
ClassVar, | ||
Literal, | ||
Optional, | ||
TypeVar, | ||
Union, | ||
|
@@ -2415,19 +2416,30 @@ def setup(self, **kwargs) -> "Self": | |
def export_files( | ||
self, | ||
output: str, | ||
signal="file", | ||
signal: str = "file", | ||
placement: FileExportPlacement = "fullpath", | ||
use_cache: bool = True, | ||
link_type: Literal["copy", "symlink"] = "copy", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another thing I forgot to mention - if files fail to symlink, it will always fall back to |
||
) -> None: | ||
"""Method that exports all files from chain to some folder.""" | ||
"""Export files from a specified signal to a directory. | ||
|
||
Args: | ||
output: Path to the target directory for exporting files. | ||
signal: Name of the signal to export files from. | ||
placement: The method to use for naming exported files. | ||
The possible values are: "filename", "etag", "fullpath", and "checksum". | ||
use_cache: If `True`, cache the files before exporting. | ||
link_type: Method to use for exporting files. | ||
Falls back to `'copy'` if symlinking fails. | ||
""" | ||
if placement == "filename" and ( | ||
self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() | ||
!= self._query.count() | ||
): | ||
raise ValueError("Files with the same name found") | ||
|
||
for file in self.collect(signal): | ||
file.export(output, placement, use_cache) # type: ignore[union-attr] | ||
file.export(output, placement, use_cache, link_type=link_type) # type: ignore[union-attr] | ||
|
||
def shuffle(self) -> "Self": | ||
"""Shuffle the rows of the chain deterministically.""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
import errno | ||
import hashlib | ||
import io | ||
import json | ||
|
@@ -236,11 +237,26 @@ | |
with open(destination, mode="wb") as f: | ||
f.write(self.read()) | ||
|
||
def _symlink_to(self, destination: str): | ||
if self.location: | ||
raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported") | ||
|
||
if self._caching_enabled: | ||
self.ensure_cached() | ||
source = self.get_local_path() | ||
assert source, "File was not cached" | ||
elif self.source.startswith("file://"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
source = self.get_path() | ||
else: | ||
raise OSError(errno.EXDEV, "can't link across filesystems") | ||
return os.symlink(source, destination) | ||
|
||
def export( | ||
self, | ||
output: str, | ||
placement: ExportPlacement = "fullpath", | ||
use_cache: bool = True, | ||
link_type: Literal["copy", "symlink"] = "copy", | ||
) -> None: | ||
"""Export file to new location.""" | ||
if use_cache: | ||
|
@@ -249,6 +265,13 @@ | |
dst_dir = os.path.dirname(dst) | ||
os.makedirs(dst_dir, exist_ok=True) | ||
|
||
if link_type == "symlink": | ||
try: | ||
return self._symlink_to(dst) | ||
except OSError as exc: | ||
if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS): | ||
raise | ||
Comment on lines
+268
to
+273
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What should we do in case of overwrite? Looks like |
||
|
||
self.save(dst) | ||
|
||
def _set_stream( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import json | ||
from pathlib import Path | ||
from unittest.mock import Mock | ||
|
||
import pytest | ||
|
@@ -379,3 +380,18 @@ def test_get_local_path(tmp_path, catalog): | |
assert file.get_local_path() is None | ||
file.ensure_cached() | ||
assert file.get_local_path() is not None | ||
|
||
|
||
@pytest.mark.parametrize("use_cache", (True, False)) | ||
def test_export_with_symlink(tmp_path, catalog, use_cache): | ||
path = tmp_path / "myfile.txt" | ||
path.write_text("some text") | ||
|
||
file = File(path=path.name, source=tmp_path.as_uri()) | ||
file._set_stream(catalog, use_cache) | ||
|
||
file.export(tmp_path / "dir", link_type="symlink", use_cache=use_cache) | ||
assert (tmp_path / "dir" / "myfile.txt").is_symlink() | ||
|
||
dst = Path(file.get_local_path()) if use_cache else path | ||
assert (tmp_path / "dir" / "myfile.txt").resolve() == dst | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using -- WindowsPath('C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/popen-gw2/default_working_dir0/.datachain/cache/29/89fe8cd7eda0a1e2e28bf837e137f5bdd8d20c506348f67f5671f5a036d529')
++ WindowsPath('//?/C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/popen-gw2/default_working_dir0/.datachain/cache/29/89fe8cd7eda0a1e2e28bf837e137f5bdd8d20c506348f67f5671f5a036d529') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we update / add docs while we do this (or as a followup)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done in 2d9547f