Skip to content

Commit

Permalink
✨ Allow setting cache destinations via cache_key in cloud_to_local (#861)
Browse files Browse the repository at this point in the history
Allow setting cache destinations via cache_key in cloud_to_local
  • Loading branch information
Koncopd authored Sep 20, 2024
1 parent d35f12d commit fa23386
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 16 deletions.
75 changes: 75 additions & 0 deletions docs/hub-prod/test-cloud-sync.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"outputs": [],
"source": [
"from lamindb_setup import init, settings\n",
"from upath import UPath\n",
"import time\n",
"import os"
]
Expand Down Expand Up @@ -101,6 +102,80 @@
")"
]
},
{
"cell_type": "markdown",
"id": "02399123",
"metadata": {},
"source": [
"Test `cloud_to_local_no_update` paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51ce7f38",
"metadata": {},
"outputs": [],
"source": [
"test_local_path = UPath(\"./some/local/path\")\n",
"assert settings.storage.cloud_to_local_no_update(test_local_path) == test_local_path\n",
"assert settings.storage.cloud_to_local_no_update(test_local_path.as_posix()) == test_local_path\n",
"assert settings.storage.cloud_to_local_no_update(test_local_path, cache_key=\"some/cache/key\") == test_local_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f264a96e",
"metadata": {},
"outputs": [],
"source": [
"assert settings.storage.cloud_to_local_no_update(dir_sync) == settings.storage.cache_dir / f\"lamindb-ci/{instance_name}/dir_sync\"\n",
"assert settings.storage.cloud_to_local_no_update(dir_sync, cache_key=\"dir_cache/key\") == settings.storage.cache_dir / \"dir_cache/key\""
]
},
{
"cell_type": "markdown",
"id": "0b79f2f7",
"metadata": {},
"source": [
"Test `cloud_to_local` with `cache_key`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c406abb5",
"metadata": {},
"outputs": [],
"source": [
"dir_sync_local = settings.storage.cloud_to_local(dir_sync, cache_key=\"dir_cache/key\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5cf38fa",
"metadata": {},
"outputs": [],
"source": [
"assert dir_sync_local == settings.storage.cache_dir / \"dir_cache/key\"\n",
"assert dir_sync_local.is_dir()\n",
"assert num_files(dir_sync_local) == 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fbf5c69",
"metadata": {},
"outputs": [],
"source": [
"for file in dir_sync_local.iterdir():\n",
" file.unlink()\n",
"dir_sync_local.rmdir()"
]
},
{
"cell_type": "markdown",
"id": "574c3f95",
Expand Down
38 changes: 22 additions & 16 deletions lamindb_setup/core/_settings_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,28 +372,34 @@ def is_on_hub(self) -> bool:
else:
return True

def key_to_filepath(self, filekey: Path | UPath | str) -> UPath:
"""Cloud or local filepath from filekey."""
return self.root / filekey

def cloud_to_local(self, filepath: Path | UPath, **kwargs) -> UPath:
"""Local (cache) filepath from filepath."""
local_filepath = self.cloud_to_local_no_update(filepath) # type: ignore
def cloud_to_local(
self, filepath: UPathStr, cache_key: UPathStr | None = None, **kwargs
) -> UPath:
"""Local (or local cache) filepath from filepath."""
# cache_key is ignored in cloud_to_local_no_update if filepath is local
local_filepath = self.cloud_to_local_no_update(filepath, cache_key)
if isinstance(filepath, UPath) and not isinstance(filepath, LocalPathClasses):
local_filepath.parent.mkdir(parents=True, exist_ok=True)
filepath.synchronize(local_filepath, **kwargs)
return local_filepath

# conversion to Path via cloud_to_local() would trigger download
# of remote file to cache if there already is one
# in pure write operations that update the cloud, we don't want this
# hence, we manually construct the local file path
# without synchronizing (previously from the `.parts` attribute, now by joining onto the cache directory)
def cloud_to_local_no_update(self, filepath: UPath) -> UPath:
def cloud_to_local_no_update(
self, filepath: UPathStr, cache_key: UPathStr | None = None
) -> UPath:
# cache_key is ignored if filepath is local
if isinstance(filepath, UPath) and not isinstance(filepath, LocalPathClasses):
return self.cache_dir.joinpath(filepath._url.netloc, *filepath.parts[1:]) # type: ignore
return filepath
# Path / UPath discards protocol from UPath if present
local_filepath = self.cache_dir / (
filepath if cache_key is None else cache_key
)
else:
local_filepath = filepath
return UPath(local_filepath)

def key_to_filepath(self, filekey: UPathStr) -> UPath:
"""Cloud or local filepath from filekey."""
return self.root / filekey

def local_filepath(self, filekey: Path | UPath | str) -> UPath:
def local_filepath(self, filekey: UPathStr) -> UPath:
"""Local (cache) filepath from filekey: `local(filepath(...))`."""
return self.cloud_to_local(self.key_to_filepath(filekey))

0 comments on commit fa23386

Please sign in to comment.