Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --preserve-tree option to dandi download #1467

Merged
merged 1 commit into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions dandi/cli/cmd_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@
default="all",
show_default=True,
)
@click.option(
"--preserve-tree",
is_flag=True,
help=(
"When downloading only part of a Dandiset, also download"
" `dandiset.yaml` and do not strip leading directories from asset"
" paths. Implies `--download all`."
),
)
@click.option(
"--sync", is_flag=True, help="Delete local assets that do not exist on the server"
)
Expand Down Expand Up @@ -138,6 +147,7 @@ def download(
sync: bool,
dandi_instance: str,
path_type: PathType,
preserve_tree: bool,
) -> None:
# We need to import the download module rather than the download function
# so that the tests can properly patch the function with a mock.
Expand Down Expand Up @@ -171,8 +181,9 @@ def download(
format=format,
jobs=jobs[0],
jobs_per_zarr=jobs[1],
get_metadata="dandiset.yaml" in download_types,
get_assets="assets" in download_types,
get_metadata="dandiset.yaml" in download_types or preserve_tree,
get_assets="assets" in download_types or preserve_tree,
preserve_tree=preserve_tree,
sync=sync,
path_type=path_type,
# develop_debug=develop_debug
Expand Down
7 changes: 7 additions & 0 deletions dandi/cli/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_download_defaults(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -41,6 +42,7 @@ def test_download_all_types(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -59,6 +61,7 @@ def test_download_metadata_only(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=False,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -77,6 +80,7 @@ def test_download_assets_only(mocker):
jobs_per_zarr=None,
get_metadata=False,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand Down Expand Up @@ -110,6 +114,7 @@ def test_download_gui_instance_in_dandiset(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -135,6 +140,7 @@ def test_download_api_instance_in_dandiset(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -160,6 +166,7 @@ def test_download_url_instance_match(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand Down
42 changes: 31 additions & 11 deletions dandi/dandiarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,17 @@
yield (client, dandiset, assets)

@abstractmethod
def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
"""
Returns the path (relative to the base download directory) at which the
asset ``asset`` (assumed to have been returned by this object's
`get_assets()` method) should be downloaded
`get_assets()` method) should be downloaded.

If ``preserve_tree`` is `True`, then the download is being performed
with ``--download tree`` option, and the method's return value should
be adjusted accordingly.

:meta private:
"""
Expand Down Expand Up @@ -231,7 +237,9 @@
assert d is not None
yield from d.get_assets(order=order)

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
return asset.path.lstrip("/")

def is_under_download_path(self, path: str) -> bool:
Expand All @@ -242,13 +250,17 @@
class SingleAssetURL(ParsedDandiURL):
"""Superclass for parsed URLs that refer to a single asset"""

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return posixpath.basename(asset.path.lstrip("/"))
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
path = asset.path.lstrip("/")
if preserve_tree:
return path
else:
return posixpath.basename(path)

def is_under_download_path(self, path: str) -> bool:
raise TypeError(
f"{type(self).__name__}.is_under_download_path() should not be called"
)
return False

Check warning on line 263 in dandi/dandiarchive.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiarchive.py#L263

Added line #L263 was not covered by tests


@dataclass
Expand All @@ -257,8 +269,14 @@

path: str

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return multiasset_target(self.path, asset.path.lstrip("/"))
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
path = asset.path.lstrip("/")
if preserve_tree:
return path
else:
return multiasset_target(self.path, path)

def is_under_download_path(self, path: str) -> bool:
prefix = posixpath.dirname(self.path.strip("/"))
Expand Down Expand Up @@ -487,7 +505,9 @@
if strict and not any_assets:
raise NotFoundError(f"No assets found matching glob {self.path!r}")

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
return asset.path.lstrip("/")

def is_under_download_path(self, path: str) -> bool:
Expand Down
28 changes: 19 additions & 9 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def download(
jobs_per_zarr: int | None = None,
get_metadata: bool = True,
get_assets: bool = True,
preserve_tree: bool = False,
sync: bool = False,
path_type: PathType = PathType.EXACT,
) -> None:
Expand Down Expand Up @@ -141,6 +142,7 @@ def download(
existing=existing,
get_metadata=get_metadata,
get_assets=get_assets,
preserve_tree=preserve_tree,
jobs_per_zarr=jobs_per_zarr,
on_error="yield" if format is DownloadFormat.PYOUT else "raise",
**kw,
Expand Down Expand Up @@ -201,6 +203,7 @@ class Downloader:
existing: DownloadExisting
get_metadata: bool
get_assets: bool
preserve_tree: bool
jobs_per_zarr: int | None
on_error: Literal["raise", "yield"]
#: which will be set .gen to assets. Purpose is to make it possible to get
Expand All @@ -214,19 +217,24 @@ def __post_init__(self, output_dir: str | Path) -> None:
# TODO: if we are ALREADY in a dandiset - we can validate that it is
# the same dandiset and use that dandiset path as the one to download
# under
if isinstance(self.url, DandisetURL):
if isinstance(self.url, DandisetURL) or (
self.preserve_tree and self.url.dandiset_id is not None
):
assert self.url.dandiset_id is not None
self.output_prefix = Path(self.url.dandiset_id)
else:
self.output_prefix = Path()
self.output_path = Path(output_dir, self.output_prefix)

def is_dandiset_yaml(self) -> bool:
return isinstance(self.url, AssetItemURL) and self.url.path == "dandiset.yaml"

def download_generator(self) -> Iterator[dict]:
"""
A generator for downloads of files, folders, or entire dandiset from
DANDI (as identified by URL)

This function is a generator which would yield records on ongoing
This function is a generator which yields records on ongoing
activities. Activities include traversal of the remote resource (DANDI
archive), download of individual assets while yielding records (TODO:
schema) while validating their checksums "on the fly", etc.
Expand All @@ -235,10 +243,8 @@ def download_generator(self) -> Iterator[dict]:
with self.url.navigate(strict=True) as (client, dandiset, assets):
if (
isinstance(self.url, DandisetURL)
or (
isinstance(self.url, AssetItemURL)
and self.url.path == "dandiset.yaml"
)
or self.is_dandiset_yaml()
or self.preserve_tree
) and self.get_metadata:
assert dandiset is not None
for resp in _populate_dandiset_yaml(
Expand All @@ -248,7 +254,7 @@ def download_generator(self) -> Iterator[dict]:
"path": str(self.output_prefix / dandiset_metadata_file),
**resp,
}
if isinstance(self.url, AssetItemURL):
if self.is_dandiset_yaml():
return

# TODO: do analysis of assets for early detection of needed renames
Expand All @@ -262,7 +268,9 @@ def download_generator(self) -> Iterator[dict]:
assets = self.assets_it.feed(assets)
lock = Lock()
for asset in assets:
path = self.url.get_asset_download_path(asset)
path = self.url.get_asset_download_path(
asset, preserve_tree=self.preserve_tree
)
self.asset_download_paths.add(path)
download_path = Path(self.output_path, path)
path = str(self.output_prefix / path)
Expand Down Expand Up @@ -995,7 +1003,9 @@ class DownloadProgress:
@dataclass
class ProgressCombiner:
zarr_size: int
file_qty: int | None = None # set to specific known value whenever full sweep is complete
file_qty: int | None = (
None # set to specific known value whenever full sweep is complete
)
files: dict[str, DownloadProgress] = field(default_factory=dict)
#: Total size of all files that were not skipped and did not error out
#: during download
Expand Down
45 changes: 44 additions & 1 deletion dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,28 @@ def test_download_folder(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
assert (tmp_path / "subdir2" / "coconut.txt").read_text() == "Coconut\n"


def test_download_folder_preserve_tree(
text_dandiset: SampleDandiset, tmp_path: Path
) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/",
tmp_path,
preserve_tree=True,
)
assert list_paths(tmp_path, dirs=True) == [
tmp_path / dandiset_id,
tmp_path / dandiset_id / "dandiset.yaml",
tmp_path / dandiset_id / "subdir2",
tmp_path / dandiset_id / "subdir2" / "banana.txt",
tmp_path / dandiset_id / "subdir2" / "coconut.txt",
]
assert (tmp_path / dandiset_id / "subdir2" / "banana.txt").read_text() == "Banana\n"
assert (
tmp_path / dandiset_id / "subdir2" / "coconut.txt"
).read_text() == "Coconut\n"


def test_download_item(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
Expand All @@ -182,6 +204,26 @@ def test_download_item(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
assert (tmp_path / "coconut.txt").read_text() == "Coconut\n"


def test_download_item_preserve_tree(
text_dandiset: SampleDandiset, tmp_path: Path
) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/coconut.txt",
tmp_path,
preserve_tree=True,
)
assert list_paths(tmp_path, dirs=True) == [
tmp_path / dandiset_id,
tmp_path / dandiset_id / "dandiset.yaml",
tmp_path / dandiset_id / "subdir2",
tmp_path / dandiset_id / "subdir2" / "coconut.txt",
]
assert (
tmp_path / dandiset_id / "subdir2" / "coconut.txt"
).read_text() == "Coconut\n"


def test_download_dandiset_yaml(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
Expand Down Expand Up @@ -330,6 +372,7 @@ def test_download_metadata404(text_dandiset: SampleDandiset, tmp_path: Path) ->
existing=DownloadExisting.ERROR,
get_metadata=True,
get_assets=True,
preserve_tree=False,
jobs_per_zarr=None,
on_error="raise",
).download_generator()
Expand Down Expand Up @@ -985,4 +1028,4 @@ def test_pyouthelper_time_remaining_1339():
# once done, dont print ETA
assert len(done) == 2
else:
assert done[-1] == f"ETA: {10-i} seconds<"
assert done[-1] == f"ETA: {10 - i} seconds<"
6 changes: 6 additions & 0 deletions docs/source/cmdline/download.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ Options

Whether to interpret asset paths in URLs as exact matches or glob patterns

.. option:: --preserve-tree

When downloading only part of a Dandiset, also download
:file:`dandiset.yaml` and do not strip leading directories from asset
paths. Implies ``--download all``.

.. option:: --sync

Delete local assets that do not exist on the server after downloading
Loading