From f36309fa7efbfbebe633a35d5b83f65dc6fafe11 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Mon, 12 Aug 2024 13:25:05 -0400 Subject: [PATCH] Env vars for force overwrite functions (#437) * Env vars for force overwrite * change history * Defaults for force_overwrite_to_cloud * overwrite to tests * make sure times not equal in tests * update test * more snoozing * sleep cloud * sleepier * update timing * Sleep after delete * try force gc --- HISTORY.md | 1 + cloudpathlib/cloudpath.py | 55 ++++++++++++------- docs/docs/caching.ipynb | 6 ++- tests/test_caching.py | 111 +++++++++++++++++++++++++++++++++++++- 4 files changed, 151 insertions(+), 22 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 046ab12d..9cf6a661 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ - fix: use native `exists()` method in `GSClient`. (PR [#420](https://github.com/drivendataorg/cloudpathlib/pull/420)) - Enhancement: lazy instantiation of default client (PR [#432](https://github.com/drivendataorg/cloudpathlib/issues/432), Issue [#428](https://github.com/drivendataorg/cloudpathlib/issues/428)) - Adds existence check before downloading in `download_to` (Issue [#430](https://github.com/drivendataorg/cloudpathlib/issues/430), PR [#432](https://github.com/drivendataorg/cloudpathlib/pull/432)) +- Add env vars `CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD` and `CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD`. (Issue [#393](https://github.com/drivendataorg/cloudpathlib/issues/393), PR [#437](https://github.com/drivendataorg/cloudpathlib/pull/437)) ## v0.18.1 (2024-02-26) diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py index 8bad810d..fc90db41 100644 --- a/cloudpathlib/cloudpath.py +++ b/cloudpathlib/cloudpath.py @@ -315,7 +315,7 @@ def __eq__(self, other: Any) -> bool: def __fspath__(self) -> str: if self.is_file(): - self._refresh_cache(force_overwrite_from_cloud=False) + self._refresh_cache() return str(self._local) def __lt__(self, other: Any) -> bool: @@ -549,8 +549,8 @@ def open( encoding: Optional[str] = None, errors: Optional[str] = None, newline: Optional[str] = None, - force_overwrite_from_cloud: bool = False, # extra kwarg not in pathlib - force_overwrite_to_cloud: bool = False, # extra kwarg not in pathlib + force_overwrite_from_cloud: Optional[bool] = None, # extra kwarg not in pathlib + force_overwrite_to_cloud: Optional[bool] = None, # extra kwarg not in pathlib ) -> IO[Any]: # if trying to call open on a directory that exists if self.exists() and not self.is_file(): @@ -931,7 +931,7 @@ def rmtree(self) -> None: def upload_from( self, source: Union[str, os.PathLike], - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ) -> Self: """Upload a file or directory to the cloud path.""" source = Path(source) @@ -956,24 +956,24 @@ def upload_from( def copy( self, destination: Self, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ) -> Self: ... @overload def copy( self, destination: Path, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ) -> Path: ... @overload def copy( self, destination: str, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ) -> Union[Path, "CloudPath"]: ... - def copy(self, destination, force_overwrite_to_cloud=False): + def copy(self, destination, force_overwrite_to_cloud=None): """Copy self to destination folder of file, if self is a file.""" if not self.exists() or not self.is_file(): raise ValueError( @@ -992,6 +992,11 @@ def copy(self, destination, force_overwrite_to_cloud=False): if destination.exists() and destination.is_dir(): destination = destination / self.name + if force_overwrite_to_cloud is None: + force_overwrite_to_cloud = os.environ.get( + "CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD", "False" + ).lower() in ["1", "true"] + if ( not force_overwrite_to_cloud and destination.exists() @@ -1019,7 +1024,7 @@ def copy(self, destination, force_overwrite_to_cloud=False): def copytree( self, destination: Self, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ignore: Optional[Callable[[str, Iterable[str]], Container[str]]] = None, ) -> Self: ... @@ -1027,7 +1032,7 @@ def copytree( def copytree( self, destination: Path, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ignore: Optional[Callable[[str, Iterable[str]], Container[str]]] = None, ) -> Path: ... @@ -1035,11 +1040,11 @@ def copytree( def copytree( self, destination: str, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ignore: Optional[Callable[[str, Iterable[str]], Container[str]]] = None, ) -> Union[Path, "CloudPath"]: ... - def copytree(self, destination, force_overwrite_to_cloud=False, ignore=None): + def copytree(self, destination, force_overwrite_to_cloud=None, ignore=None): """Copy self to a directory, if self is a directory.""" if not self.is_dir(): raise CloudPathNotADirectoryError( @@ -1112,7 +1117,7 @@ def _new_cloudpath(self, path: Union[str, os.PathLike]) -> Self: return self.client.CloudPath(path) - def _refresh_cache(self, force_overwrite_from_cloud: bool = False) -> None: + def _refresh_cache(self, force_overwrite_from_cloud: Optional[bool] = None) -> None: try: stats = self.stat() except NoStatError: @@ -1120,11 +1125,16 @@ def _refresh_cache(self, force_overwrite_from_cloud: bool = False) -> None: # new files that will be uploaded return + if force_overwrite_from_cloud is None: + force_overwrite_from_cloud = os.environ.get( + "CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD", "False" + ).lower() in ["1", "true"] + # if not exist or cloud newer if ( - not self._local.exists() + force_overwrite_from_cloud + or not self._local.exists() or (self._local.stat().st_mtime < stats.st_mtime) - or force_overwrite_from_cloud ): # ensure there is a home for the file self._local.parent.mkdir(parents=True, exist_ok=True) @@ -1138,7 +1148,7 @@ def _refresh_cache(self, force_overwrite_from_cloud: bool = False) -> None: f"Local file ({self._local}) for cloud path ({self}) has been changed by your code, but " f"is being requested for download from cloud. Either (1) push your changes to the cloud, " f"(2) remove the local file, or (3) pass `force_overwrite_from_cloud=True` to " - f"overwrite." + f"overwrite; or set env var CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD=1." ) # if local newer but not dirty, it was updated @@ -1148,12 +1158,12 @@ def _refresh_cache(self, force_overwrite_from_cloud: bool = False) -> None: f"Local file ({self._local}) for cloud path ({self}) is newer on disk, but " f"is being requested for download from cloud. Either (1) push your changes to the cloud, " f"(2) remove the local file, or (3) pass `force_overwrite_from_cloud=True` to " - f"overwrite." + f"overwrite; or set env var CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD=1." ) def _upload_local_to_cloud( self, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ) -> Self: """Uploads cache file at self._local to the cloud""" # We should never try to be syncing entire directories; we should only @@ -1178,11 +1188,16 @@ def _upload_local_to_cloud( def _upload_file_to_cloud( self, local_path: Path, - force_overwrite_to_cloud: bool = False, + force_overwrite_to_cloud: Optional[bool] = None, ) -> Self: """Uploads file at `local_path` to the cloud if there is not a newer file already there. """ + if force_overwrite_to_cloud is None: + force_overwrite_to_cloud = os.environ.get( + "CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD", "False" + ).lower() in ["1", "true"] + if force_overwrite_to_cloud: # If we are overwriting no need to perform any checks, so we can save time self.client._upload_file( @@ -1210,7 +1225,7 @@ def _upload_file_to_cloud( f"Local file ({self._local}) for cloud path ({self}) is newer in the cloud disk, but " f"is being requested to be uploaded to the cloud. Either (1) redownload changes from the cloud or " f"(2) pass `force_overwrite_to_cloud=True` to " - f"overwrite." + f"overwrite; or set env var CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD=1." ) # =========== pydantic integration special methods =============== diff --git a/docs/docs/caching.ipynb b/docs/docs/caching.ipynb index c6d627a9..a1b98951 100644 --- a/docs/docs/caching.ipynb +++ b/docs/docs/caching.ipynb @@ -398,10 +398,14 @@ "\n", "The `CloudPath.open` method supports a `force_overwrite_from_cloud` kwarg to force overwriting your local version.\n", "\n", + "You can make overwriting the cache with the cloud copy the default by setting the environment variable `CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD=1` or `CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD=True`.\n", + "\n", "`OverwriteNewerCloudError`\n", "This exception is raised if we are asked to upload a file, but the one on the cloud is newer than our local version. This likely means that a separate process has updated the cloud version, and we don't want to overwrite and lose that new data in the cloud.\n", "\n", "The `CloudPath.open` method supports a `force_overwrite_to_cloud` kwarg to force overwriting the cloud version.\n", + "\n", + "You can make overwriting the cloud copy with the local one being uploaded by setting the environment variable `CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD=1` or `CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD=True`.\n", "\n" ] }, @@ -773,7 +777,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.1" }, "vscode": { "interpreter": { diff --git a/tests/test_caching.py b/tests/test_caching.py index af1aebcf..9384aac3 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -1,3 +1,4 @@ +import gc import os from time import sleep from pathlib import Path @@ -5,7 +6,11 @@ import pytest from cloudpathlib.enums import FileCacheMode -from cloudpathlib.exceptions import InvalidConfigurationException +from cloudpathlib.exceptions import ( + InvalidConfigurationException, + OverwriteNewerCloudError, + OverwriteNewerLocalError, +) from tests.conftest import CloudProviderTestRig @@ -344,6 +349,107 @@ def test_environment_variable_local_cache_dir(rig: CloudProviderTestRig, tmpdir) os.environ["CLOUDPATHLIB_LOCAL_CACHE_DIR"] = original_env_setting +def test_environment_variables_force_overwrite_from(rig: CloudProviderTestRig, tmpdir): + # environment instantiation + original_env_setting = os.environ.get("CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD", "") + + try: + # explicitly false overwrite + os.environ["CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD"] = "False" + + p = rig.create_cloud_path("dir_0/file0_0.txt") + p._refresh_cache() # dl to cache + p._local.touch() # update mod time + + with pytest.raises(OverwriteNewerLocalError): + p._refresh_cache() + + for val in ["1", "True", "TRUE"]: + os.environ["CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD"] = val + + p = rig.create_cloud_path("dir_0/file0_0.txt") + + orig_mod_time = p.stat().st_mtime + + p._refresh_cache() # dl to cache + p._local.touch() # update mod time + + new_mod_time = p._local.stat().st_mtime + + p._refresh_cache() + assert p._local.stat().st_mtime == orig_mod_time + assert p._local.stat().st_mtime < new_mod_time + + finally: + os.environ["CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD"] = original_env_setting + + +def test_environment_variables_force_overwrite_to(rig: CloudProviderTestRig, tmpdir): + # environment instantiation + original_env_setting = os.environ.get("CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD", "") + + try: + # explicitly false overwrite + os.environ["CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD"] = "False" + + p = rig.create_cloud_path("dir_0/file0_0.txt") + + new_local = Path((tmpdir / "new_content.txt").strpath) + new_local.write_text("hello") + new_also_cloud = rig.create_cloud_path("dir_0/another_cloud_file.txt") + new_also_cloud.write_text("newer") + + # make cloud newer than local or other cloud file + os.utime(new_local, (new_local.stat().st_mtime - 2, new_local.stat().st_mtime - 2)) + + p.write_text("updated") + + with pytest.raises(OverwriteNewerCloudError): + p._upload_file_to_cloud(new_local) + + with pytest.raises(OverwriteNewerCloudError): + # copy short-circuits upload if same client, so we test separately + + # raises if destination is newer + new_also_cloud.write_text("newest") + sleep(0.01) + p.copy(new_also_cloud) + + for val in ["1", "True", "TRUE"]: + os.environ["CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD"] = val + + p = rig.create_cloud_path("dir_0/file0_0.txt") + + new_local.write_text("updated") + + # make cloud newer than local + os.utime(new_local, (new_local.stat().st_mtime - 2, new_local.stat().st_mtime - 2)) + + p.write_text("updated") + + orig_cloud_mod_time = p.stat().st_mtime + + assert p.stat().st_mtime >= new_local.stat().st_mtime + + # would raise if not set + sleep(1.01) # give time so not equal when rounded + p._upload_file_to_cloud(new_local) + assert p.stat().st_mtime > orig_cloud_mod_time # cloud now overwritten + + new_also_cloud = rig.create_cloud_path("dir_0/another_cloud_file.txt") + sleep(1.01) # give time so not equal when rounded + new_also_cloud.write_text("newer") + + new_cloud_mod_time = new_also_cloud.stat().st_mtime + + assert p.stat().st_mtime < new_cloud_mod_time # would raise if not set + p.copy(new_also_cloud) + assert new_also_cloud.stat().st_mtime >= new_cloud_mod_time + + finally: + os.environ["CLOUDPATHLIB_FORCE_OVERWRITE_TO_CLOUD"] = original_env_setting + + def test_manual_cache_clearing(rig: CloudProviderTestRig): # use client that we can delete rather than default client = rig.client_class(**rig.required_client_kwargs) @@ -395,6 +501,9 @@ def test_manual_cache_clearing(rig: CloudProviderTestRig): del cp del client + gc.collect() # force gc before asserting + sleep(0.5) # give time to delete + assert not local_cache_path.exists() assert not client_cache_folder.exists()