From 8207b3dbd0a9eab07e73adfdf91c9006341d5e81 Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Mon, 9 Sep 2024 11:20:00 -0400
Subject: [PATCH] use variable for client schemes, allowing override (#467) (#471)

* use variable for client schemes, allowing override

This change makes the default client implementations more flexible so that
their scheme can be customized. This is useful when a subclass wants to
target, e.g., an S3-compatible API [1] under a custom scheme while the
default s3:// access remains available.

[1] https://cloudpathlib.drivendata.org/stable/authentication/#accessing-custom-s3-compatible-object-stores

The tests have been updated to include a new s3-like rig which uses the new
scheme override functionality.

* use single cloud_prefix

* tests: switch to lighter-weight custom scheme tests

* update HISTORY file for custom scheme support

* update custom scheme tests to utilize pytest fixture

This isolates the implementation in response to PR feedback:
https://github.com/drivendataorg/cloudpathlib/pull/467#discussion_r1742723302

Co-authored-by: Aaron Taylor
---
 HISTORY.md                         |  4 +++
 cloudpathlib/azure/azblobclient.py | 14 +++++---
 cloudpathlib/gs/gsclient.py        | 16 ++++++---
 cloudpathlib/s3/s3client.py        | 18 +++++++---
 tests/test_client.py               | 53 +++++++++++++++++++++++++++++-
 5 files changed, 90 insertions(+), 15 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 1a68ce12..0deb48db 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,9 @@
 # cloudpathlib Changelog
 
+## UNRELEASED
+
+- Added support for custom schemes in CloudPath and Client subclasses. (Issue [#466](https://github.com/drivendataorg/cloudpathlib/issues/466), PR [#467](https://github.com/drivendataorg/cloudpathlib/pull/467))
+
 ## v0.19.0 (2024-08-29)
 
 - Fixed an error that occurred when loading and dumping `CloudPath` objects using pickle multiple times. (Issue [#450](https://github.com/drivendataorg/cloudpathlib/issues/450), PR [#454](https://github.com/drivendataorg/cloudpathlib/pull/454), thanks to [@kujenga](https://github.com/kujenga))

diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py
index 98189378..289eb236 100644
--- a/cloudpathlib/azure/azblobclient.py
+++ b/cloudpathlib/azure/azblobclient.py
@@ -276,12 +276,14 @@ def _list_dir(
     ) -> Iterable[Tuple[AzureBlobPath, bool]]:
         if not cloud_path.container:
             for container in self.service_client.list_containers():
-                yield self.CloudPath(f"az://{container.name}"), True
+                yield self.CloudPath(f"{cloud_path.cloud_prefix}{container.name}"), True
 
                 if not recursive:
                     continue
 
-                yield from self._list_dir(self.CloudPath(f"az://{container.name}"), recursive=True)
+                yield from self._list_dir(
+                    self.CloudPath(f"{cloud_path.cloud_prefix}{container.name}"), recursive=True
+                )
             return
 
         container_client = self.service_client.get_container_client(cloud_path.container)
@@ -295,7 +297,9 @@ def _list_dir(
             paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive)
 
             for path in paths:
-                yield self.CloudPath(f"az://{cloud_path.container}/{path.name}"), path.is_directory
+                yield self.CloudPath(
+                    f"{cloud_path.cloud_prefix}{cloud_path.container}/{path.name}"
+                ), path.is_directory
 
         else:
             if not recursive:
@@ -306,7 +310,9 @@ def _list_dir(
             for blob in blobs:
                 # walk_blobs returns folders with a trailing slash
                 blob_path = blob.name.rstrip("/")
-                blob_cloud_path = self.CloudPath(f"az://{cloud_path.container}/{blob_path}")
+                blob_cloud_path = self.CloudPath(
+                    f"{cloud_path.cloud_prefix}{cloud_path.container}/{blob_path}"
+                )
 
                 yield blob_cloud_path, (
                     isinstance(blob, BlobPrefix)
diff --git a/cloudpathlib/gs/gsclient.py b/cloudpathlib/gs/gsclient.py
index 56d0e2c5..edd5b88a 100644
--- a/cloudpathlib/gs/gsclient.py
+++ b/cloudpathlib/gs/gsclient.py
@@ -183,7 +183,8 @@ def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPat
                 )
 
             yield from (
-                (self.CloudPath(f"gs://{str(b)}"), True) for b in self.client.list_buckets()
+                (self.CloudPath(f"{cloud_path.cloud_prefix}{str(b)}"), True)
+                for b in self.client.list_buckets()
             )
 
             return
@@ -200,11 +201,16 @@ def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPat
                     # if we haven't surfaced this directory already
                     if parent not in yielded_dirs and str(parent) != ".":
                         yield (
-                            self.CloudPath(f"gs://{cloud_path.bucket}/{prefix}{parent}"),
+                            self.CloudPath(
+                                f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{prefix}{parent}"
+                            ),
                             True,  # is a directory
                         )
                         yielded_dirs.add(parent)
 
-                yield (self.CloudPath(f"gs://{cloud_path.bucket}/{o.name}"), False)  # is a file
+                yield (
+                    self.CloudPath(f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{o.name}"),
+                    False,
+                )  # is a file
         else:
             iterator = bucket.list_blobs(delimiter="/", prefix=prefix)
@@ -212,13 +218,13 @@ def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPat
             # see: https://github.com/googleapis/python-storage/issues/863
             for file in iterator:
                 yield (
-                    self.CloudPath(f"gs://{cloud_path.bucket}/{file.name}"),
+                    self.CloudPath(f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{file.name}"),
                     False,  # is a file
                 )
 
             for directory in iterator.prefixes:
                 yield (
-                    self.CloudPath(f"gs://{cloud_path.bucket}/{directory}"),
+                    self.CloudPath(f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{directory}"),
                     True,  # is a directory
                 )
 
diff --git a/cloudpathlib/s3/s3client.py b/cloudpathlib/s3/s3client.py
index ac2ecaa6..db130e82 100644
--- a/cloudpathlib/s3/s3client.py
+++ b/cloudpathlib/s3/s3client.py
@@ -217,7 +217,7 @@ def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Pat
                 )
 
             yield from (
-                (self.CloudPath(f"s3://{b['Name']}"), True)
+                (self.CloudPath(f"{cloud_path.cloud_prefix}{b['Name']}"), True)
                 for b in self.client.list_buckets().get("Buckets", [])
             )
             return
@@ -241,7 +241,9 @@ def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Pat
                 canonical = result_prefix.get("Prefix").rstrip("/")  # keep a canonical form
                 if canonical not in yielded_dirs:
                     yield (
-                        self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
+                        self.CloudPath(
+                            f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{canonical}"
+                        ),
                         True,
                     )
                     yielded_dirs.add(canonical)
@@ -254,7 +256,9 @@ def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Pat
                     parent_canonical = prefix + str(parent).rstrip("/")
                     if parent_canonical not in yielded_dirs and str(parent) != ".":
                         yield (
-                            self.CloudPath(f"s3://{cloud_path.bucket}/{parent_canonical}"),
+                            self.CloudPath(
+                                f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{parent_canonical}"
+                            ),
                             True,
                         )
                         yielded_dirs.add(parent_canonical)
@@ -267,7 +271,9 @@ def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Pat
                 # s3 fake directories have 0 size and end with "/"
                 if result_key.get("Key").endswith("/") and result_key.get("Size") == 0:
                     yield (
-                        self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
+                        self.CloudPath(
+                            f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{canonical}"
+                        ),
                         True,
                     )
                     yielded_dirs.add(canonical)
@@ -275,7 +281,9 @@ def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Pat
                 # yield object as file
                 else:
                     yield (
-                        self.CloudPath(f"s3://{cloud_path.bucket}/{result_key.get('Key')}"),
+                        self.CloudPath(
+                            f"{cloud_path.cloud_prefix}{cloud_path.bucket}/{result_key.get('Key')}"
+                        ),
                         False,
                     )
 
diff --git a/tests/test_client.py b/tests/test_client.py
index a665a5a6..fd58535b 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -1,11 +1,16 @@
 import mimetypes
 import os
-from pathlib import Path
 import random
 import string
+from pathlib import Path
+
+import pytest
 
 from cloudpathlib import CloudPath
+from cloudpathlib.client import register_client_class
+from cloudpathlib.cloudpath import implementation_registry, register_path_class
 from cloudpathlib.s3.s3client import S3Client
+from cloudpathlib.s3.s3path import S3Path
 
 
 def test_default_client_instantiation(rig):
@@ -116,3 +121,49 @@ def my_content_type(path):
 
     for suffix, content_type in mimes:
         _test_write_content_type(suffix, content_type, rig)
+
+
+@pytest.fixture
+def custom_s3_path():
+    # A fixture isolates these classes as they modify the global registry of
+    # implementations.
+    @register_path_class("mys3")
+    class MyS3Path(S3Path):
+        cloud_prefix: str = "mys3://"
+
+    @register_client_class("mys3")
+    class MyS3Client(S3Client):
+        pass
+
+    yield (MyS3Path, MyS3Client)
+
+    # cleanup after use
+    implementation_registry.pop("mys3")
+
+
+def test_custom_mys3path_instantiation(custom_s3_path):
+    CustomPath, _ = custom_s3_path
+
+    path = CustomPath("mys3://bucket/dir/file.txt")
+    assert isinstance(path, CustomPath)
+    assert path.cloud_prefix == "mys3://"
+    assert path.bucket == "bucket"
+    assert path.key == "dir/file.txt"
+
+
+def test_custom_mys3client_instantiation(custom_s3_path):
+    _, CustomClient = custom_s3_path
+
+    client = CustomClient()
+    assert isinstance(client, CustomClient)
+    assert client.CloudPath("mys3://bucket/dir/file.txt").cloud_prefix == "mys3://"
+
+
+def test_custom_mys3client_default_client(custom_s3_path):
+    _, CustomClient = custom_s3_path
+
+    CustomClient().set_as_default_client()
+
+    path = CloudPath("mys3://bucket/dir/file.txt")
+    assert isinstance(path.client, CustomClient)
+    assert path.cloud_prefix == "mys3://"
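
For downstream users, the practical effect of this patch is that a second S3-flavored implementation can be registered under its own scheme while plain s3:// paths keep using the stock `S3Client`. The sketch below is a minimal illustration of that pattern, mirroring the test fixture above; the `minio` scheme, class names, and endpoint URL are hypothetical, and it assumes `S3Client`'s `endpoint_url` option for S3-compatible stores as described in the linked authentication docs.

```python
from cloudpathlib import CloudPath, S3Client, S3Path
from cloudpathlib.client import register_client_class
from cloudpathlib.cloudpath import register_path_class


# Register a second S3-flavored implementation under its own scheme; the
# stock s3:// implementation stays untouched.
@register_path_class("minio")
class MinIOPath(S3Path):
    cloud_prefix: str = "minio://"


@register_client_class("minio")
class MinIOClient(S3Client):
    pass


# Point the custom scheme at an S3-compatible endpoint (hypothetical URL)
# and make it the default client for minio:// paths.
MinIOClient(endpoint_url="https://minio.example.com").set_as_default_client()

path = CloudPath("minio://bucket/dir/file.txt")
assert isinstance(path.client, MinIOClient)
assert path.cloud_prefix == "minio://"
```

Because `register_path_class`/`register_client_class` key the global implementation registry by scheme, the tests pop the "mys3" entry after each use so one test's registration cannot leak into another.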