-
Notifications
You must be signed in to change notification settings - Fork 59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
perf(az): optimize non-recursive _list_dir
#447
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,8 @@ | ||
from datetime import datetime, timedelta | ||
import mimetypes | ||
import os | ||
from pathlib import Path, PurePosixPath | ||
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union | ||
|
||
from pathlib import Path | ||
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union, cast | ||
|
||
from ..client import Client, register_client_class | ||
from ..cloudpath import implementation_registry | ||
|
@@ -20,6 +19,7 @@ | |
BlobProperties, | ||
ContentSettings, | ||
generate_blob_sas, | ||
BlobPrefix, | ||
) | ||
except ModuleNotFoundError: | ||
implementation_registry["azure"].dependencies_loaded = False | ||
|
@@ -155,7 +155,20 @@ | |
return "dir" | ||
|
||
try: | ||
self._get_metadata(cloud_path) | ||
# the result of get_blob_properties is a BlobProperties object (dict mixin) | ||
metadata = cast(BlobProperties, self._get_metadata(cloud_path)) | ||
|
||
# content_type and content_md5 are never None for files in Azure Blob Storage | ||
# if they are None, then the given path is a directory | ||
# https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers | ||
is_folder = ( | ||
metadata.content_settings.content_type is None | ||
and metadata.content_settings.content_md5 is None | ||
) | ||
|
||
if is_folder: | ||
return "dir" | ||
|
||
return "file" | ||
except ResourceNotFoundError: | ||
prefix = cloud_path.blob | ||
|
@@ -181,17 +194,14 @@ | |
def _list_dir( | ||
self, cloud_path: AzureBlobPath, recursive: bool = False | ||
) -> Iterable[Tuple[AzureBlobPath, bool]]: | ||
# shortcut if listing all available containers | ||
if not cloud_path.container: | ||
if recursive: | ||
raise NotImplementedError( | ||
"Cannot recursively list all containers and contents; you can get all the containers then recursively list each separately." | ||
) | ||
|
||
yield from ( | ||
(self.CloudPath(f"az://{c.name}"), True) | ||
for c in self.service_client.list_containers() | ||
) | ||
for container in self.service_client.list_containers(): | ||
yield self.CloudPath(f"az://{container.name}"), True | ||
|
||
if not recursive: | ||
continue | ||
|
||
yield from self._list_dir(self.CloudPath(f"az://{container.name}"), recursive=True) | ||
return | ||
|
||
container_client = self.service_client.get_container_client(cloud_path.container) | ||
|
@@ -200,30 +210,30 @@ | |
if prefix and not prefix.endswith("/"): | ||
prefix += "/" | ||
|
||
yielded_dirs = set() | ||
|
||
# NOTE: Not recursive may be slower than necessary since it just filters | ||
# the recursive implementation | ||
for o in container_client.list_blobs(name_starts_with=prefix): | ||
# get directory from this path | ||
for parent in PurePosixPath(o.name[len(prefix) :]).parents: | ||
# if we haven't surfaced this directory already | ||
if parent not in yielded_dirs and str(parent) != ".": | ||
# skip if not recursive and this is beyond our depth | ||
if not recursive and "/" in str(parent): | ||
continue | ||
|
||
yield ( | ||
self.CloudPath(f"az://{cloud_path.container}/{prefix}{parent}"), | ||
True, # is a directory | ||
) | ||
yielded_dirs.add(parent) | ||
|
||
# skip file if not recursive and this is beyond our depth | ||
if not recursive and "/" in o.name[len(prefix) :]: | ||
continue | ||
|
||
yield (self.CloudPath(f"az://{cloud_path.container}/{o.name}"), False) # is a file | ||
if not recursive: | ||
blobs = container_client.walk_blobs(name_starts_with=prefix) | ||
M0dEx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
blobs = container_client.list_blobs(name_starts_with=prefix) | ||
|
||
def is_dir(blob: Union[BlobProperties, BlobPrefix]) -> bool: | ||
# walk_blobs returns a BlobPrefix object for directories | ||
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobprefix?view=azure-python | ||
if isinstance(blob, BlobPrefix): | ||
return True | ||
|
||
# content_type and content_md5 are never None for files in Azure Blob Storage | ||
# if they are None, then the given path is a directory | ||
# https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers | ||
return ( | ||
blob.content_settings.content_md5 is None | ||
and blob.content_settings.content_type is None | ||
) | ||
|
||
for blob in blobs: | ||
# walk_blobs returns folders with a trailing slash | ||
blob_path = blob.name.rstrip("/") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it problematic to keep the trailing slashes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did it to be consistent with the current behaviour, but we can simply not remove the trailing slashes. |
||
blob_cloud_path = self.CloudPath(f"az://{cloud_path.container}/{blob_path}") | ||
yield blob_cloud_path, is_dir(blob) | ||
|
||
def _move_file( | ||
self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you point to a more specific reference? I don't see the link you sent supporting the claim that this is definitive for folders.
It does seem like
x-ms-resource-type
is useful to answer the question directly, but only for certain configurations so could be included as an optimization. It may be the case that all the scenarios where those vars are none are also ones wherex-ms-resource-type
is available.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree the reference is sort of vague. I was mainly going of off these two statements:
Using this, coupled with our observations from real-life usage, we concluded that these parameters are always
None
for folders and neverNone
for blobs, even if they are empty.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does your configuration have the
x-ms-resource-type
header for these folder blobs?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That header does not seem to be returned within
BlobProperties
, nor can I see it in the in-browser Azure storage explorer.