From ee24ee60021a59aded236c94676285ed662d1e66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Kub=C3=ADk?= Date: Thu, 20 Jun 2024 12:45:01 +0200 Subject: [PATCH 1/4] perf(az): optimize non-recursive _list_dir --- cloudpathlib/azure/azblobclient.py | 68 +++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index f161a02d..19d8b558 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -1,7 +1,7 @@ from datetime import datetime, timedelta import mimetypes import os -from pathlib import Path, PurePosixPath +from pathlib import Path from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union @@ -20,6 +20,7 @@ BlobProperties, ContentSettings, generate_blob_sas, + BlobPrefix, ) except ModuleNotFoundError: implementation_registry["azure"].dependencies_loaded = False @@ -181,17 +182,14 @@ def _exists(self, cloud_path: AzureBlobPath) -> bool: def _list_dir( self, cloud_path: AzureBlobPath, recursive: bool = False ) -> Iterable[Tuple[AzureBlobPath, bool]]: - # shortcut if listing all available containers if not cloud_path.container: - if recursive: - raise NotImplementedError( - "Cannot recursively list all containers and contents; you can get all the containers then recursively list each separately." - ) - - yield from ( - (self.CloudPath(f"az://{c.name}"), True) - for c in self.service_client.list_containers() - ) + for container in self.service_client.list_containers(): + yield self.CloudPath(f"az://{container.name}"), True + + if not recursive: + continue + + yield from self._list_dir(self.CloudPath(f"az://{container.name}"), recursive=True) return container_client = self.service_client.get_container_client(cloud_path.container) @@ -200,30 +198,30 @@ def _list_dir( if prefix and not prefix.endswith("/"): prefix += "/" - yielded_dirs = set() - - # NOTE: Not recursive may be slower than necessary since it just filters - # the recursive implementation - for o in container_client.list_blobs(name_starts_with=prefix): - # get directory from this path - for parent in PurePosixPath(o.name[len(prefix) :]).parents: - # if we haven't surfaced this directory already - if parent not in yielded_dirs and str(parent) != ".": - # skip if not recursive and this is beyond our depth - if not recursive and "/" in str(parent): - continue - - yield ( - self.CloudPath(f"az://{cloud_path.container}/{prefix}{parent}"), - True, # is a directory - ) - yielded_dirs.add(parent) - - # skip file if not recursive and this is beyond our depth - if not recursive and "/" in o.name[len(prefix) :]: - continue - - yield (self.CloudPath(f"az://{cloud_path.container}/{o.name}"), False) # is a file + if not recursive: + blobs = container_client.walk_blobs(name_starts_with=prefix) + else: + blobs = container_client.list_blobs(name_starts_with=prefix) + + def is_dir(blob: Union[BlobProperties, BlobPrefix]) -> bool: + # walk_blobs returns a BlobPrefix object for directories + # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobprefix?view=azure-python + if isinstance(blob, BlobPrefix): + return True + + # content_type and content_md5 are never None for files in Azure Blob Storage + # if they are None, then the given path is a directory + # https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers + return ( + blob.content_settings.content_md5 is None + and blob.content_settings.content_type is None + ) + + for blob in blobs: + # walk_blobs returns folders with a trailing slash + blob_path = blob.name.rstrip("/") + blob_cloud_path = self.CloudPath(f"az://{cloud_path.container}/{blob_path}") + yield blob_cloud_path, is_dir(blob) def _move_file( self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True From 125148089d511b50cf903fa23bb22401cda0cfd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Kub=C3=ADk?= Date: Thu, 20 Jun 2024 12:44:24 +0200 Subject: [PATCH 2/4] fix(az): correctly identify files and directories --- cloudpathlib/azure/azblobclient.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index 19d8b558..2e84441e 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -2,8 +2,7 @@ import mimetypes import os from pathlib import Path -from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union - +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union, cast from ..client import Client, register_client_class from ..cloudpath import implementation_registry @@ -156,7 +155,20 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]: return "dir" try: - self._get_metadata(cloud_path) + # the result of get_blob_properties is a BlobProperties object (dict mixin) + metadata = cast(BlobProperties, self._get_metadata(cloud_path)) + + # content_type and content_md5 are never None for files in Azure Blob Storage + # if they are None, then the given path is a directory + # https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers + is_folder = ( + metadata.content_settings.content_type is None + and metadata.content_settings.content_md5 is None + ) + + if is_folder: + return "dir" + return "file" except ResourceNotFoundError: prefix = cloud_path.blob From 987493e90f45a24dfbd6872d36bc299d053fc1ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Kub=C3=ADk?= Date: Tue, 2 Jul 2024 17:39:00 +0200 Subject: [PATCH 3/4] test(az_rig): match list_blobs/walk_blobs/metadata API functionality --- tests/mock_clients/mock_azureblob.py | 61 ++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/tests/mock_clients/mock_azureblob.py b/tests/mock_clients/mock_azureblob.py index b07aeb0a..d5b278de 100644 --- a/tests/mock_clients/mock_azureblob.py +++ b/tests/mock_clients/mock_azureblob.py @@ -1,4 +1,4 @@ -from collections import namedtuple +from collections import namedtuple, defaultdict from datetime import datetime from pathlib import Path, PurePosixPath import shutil @@ -25,7 +25,12 @@ def __init__(self, *args, **kwargs): self.tmp_path = Path(self.tmp.name) / "test_case_copy" shutil.copytree(TEST_ASSETS, self.tmp_path / test_dir) - self.metadata_cache = {} + self.metadata_cache = defaultdict( + lambda: { + "content_type": "application/octet-stream", + "content_md5": "some_md5_hash", + } + ) @classmethod def from_connection_string(cls, *args, **kwargs): @@ -77,15 +82,22 @@ def url(self): def get_blob_properties(self): path = self.root / self.key - if path.exists() and path.is_file(): + if path.exists(): + # replicates the behavior of the Azure Blob API + # files always have content_type and content_md5 + # directories never have content_type and content_md5 + # https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers + if path.is_file(): + metadata = self.service_client.metadata_cache[path] + else: + metadata = {"content_type": None, "content_md5": None} + return BlobProperties( **{ "name": self.key, "Last-Modified": datetime.fromtimestamp(path.stat().st_mtime), "ETag": "etag", - "content_type": self.service_client.metadata_cache.get( - self.root / self.key, None - ), + **metadata, } ) else: @@ -114,9 +126,13 @@ def upload_blob(self, data, overwrite, content_settings=None): path.write_bytes(data.read()) if content_settings is not None: - self.service_client.metadata_cache[self.root / self.key] = ( - content_settings.content_type + # content_type and content_md5 are never None for files in Azure Blob Storage + # https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers + content_settings.content_type = ( + content_settings.content_type or "application/octet-stream" ) + content_settings.content_md5 = content_settings.content_md5 or "some_md5_hash" + self.service_client.metadata_cache[path].update(content_settings) class MockStorageStreamDownloader: @@ -148,24 +164,31 @@ def exists(self): def list_blobs(self, name_starts_with=None): return mock_item_paged(self.root, name_starts_with) + def walk_blobs(self, name_starts_with=None): + return mock_item_paged(self.root, name_starts_with, recursive=False) + def delete_blobs(self, *blobs): for blob in blobs: (self.root / blob).unlink() delete_empty_parents_up_to_root(path=self.root / blob, root=self.root) -def mock_item_paged(root, name_starts_with=None): - items = [] - +def mock_item_paged(root, name_starts_with=None, recursive=True): if not name_starts_with: name_starts_with = "" - for f in root.glob("**/*"): - if ( - (not f.name.startswith(".")) - and f.is_file() - and (root / name_starts_with) in [f, *f.parents] - ): - items.append((PurePosixPath(f), f)) + + if recursive: + items = [ + (PurePosixPath(f), f) + for f in root.glob("**/*") + if ( + (not f.name.startswith(".")) + and f.is_file() + and (root / name_starts_with) in [f, *f.parents] + ) + ] + else: + items = [(PurePosixPath(f), f) for f in (root / name_starts_with).iterdir()] for mocked, local in items: # BlobProperties @@ -175,5 +198,7 @@ def mock_item_paged(root, name_starts_with=None): "name": str(mocked.relative_to(PurePosixPath(root))), "Last-Modified": datetime.fromtimestamp(local.stat().st_mtime), "ETag": "etag", + "content_type": "application/octet-stream" if local.is_file() else None, + "content_md5": "some_md5_hash" if not local.is_file() else None, } ) From 2783e38d9db3b0a7f92d63d32c4e22408e73fc1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Kub=C3=ADk?= Date: Tue, 2 Jul 2024 21:32:43 +0200 Subject: [PATCH 4/4] chore: add HISTORY.md entry --- HISTORY.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 046ab12d..d7a412c5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,8 @@ - fix: use native `exists()` method in `GSClient`. (PR [#420](https://github.com/drivendataorg/cloudpathlib/pull/420)) - Enhancement: lazy instantiation of default client (PR [#432](https://github.com/drivendataorg/cloudpathlib/issues/432), Issue [#428](https://github.com/drivendataorg/cloudpathlib/issues/428)) - Adds existence check before downloading in `download_to` (Issue [#430](https://github.com/drivendataorg/cloudpathlib/issues/430), PR [#432](https://github.com/drivendataorg/cloudpathlib/pull/432)) +- perf(az): optimize non-recursive _list_dir (Issue [#446](https://github.com/drivendataorg/cloudpathlib/issues/446), PR [#447](https://github.com/drivendataorg/cloudpathlib/pull/447)) +- fix(az): improve detection of path types (Issue [#444](https://github.com/drivendataorg/cloudpathlib/issues/444), PR [#447](https://github.com/drivendataorg/cloudpathlib/pull/447)) ## v0.18.1 (2024-02-26)