From 7a18af097f8b9fee77f70bbfc5e8d975df02e607 Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Wed, 17 Jul 2024 12:03:23 -0700
Subject: [PATCH 01/15] minimal ADLS gen2 support

---
 CONTRIBUTING.md | 9 ++
 HISTORY.md | 1 +
 cloudpathlib/azure/azblobclient.py | 147 +++++++++++++++++++++--------
 pyproject.toml | 2 +-
 tests/conftest.py | 127 +++++++++++++++----------
 5 files changed, 197 insertions(+), 89 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c309ea2f..19a44945 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -81,6 +81,15 @@ Finally, you may want to run your tests against live servers to ensure that the
 make test-live-cloud
 ```

+#### Azure live backend tests
+
+For Azure, you can test both against Azure Blob Storage backends and Azure Data Lake Storage Gen2 backends. To run these tests, you need to set connection strings for both of the backends by setting the following environment variables (in your `.env` file for local development). If `AZURE_STORAGE_GEN2_CONNECTION_STRING` is not set, only the blob storage backend will be tested.
+
+```bash
+AZURE_STORAGE_CONNECTION_STRING=your_connection_string
+AZURE_STORAGE_GEN2_CONNECTION_STRING=your_connection_string
+```
+
 You can copy `.env.example` to `.env` and fill in the credentials and bucket/container names for the providers you want to test against. **Note that the live tests will create and delete files on the cloud provider.**

 You can also skip providers you do not have accounts for by commenting them out in the `rig` and `s3_like_rig` variables defined at the end of `tests/conftest.py`.
diff --git a/HISTORY.md b/HISTORY.md
index 0ec4abf1..603fec73 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -15,6 +15,7 @@
 - Changed `LocalClient` so that client instances using the default storage access the default local storage directory through the `get_default_storage_dir` rather than having an explicit reference to the path set at instantiation. This means that calling `get_default_storage_dir` will reset the local storage for all clients using the default local storage, whether the client has already been instantiated or is instantiated after resetting. This fixes unintuitive behavior where `reset_local_storage` did not reset local storage when using the default client. (Issue [#414](https://github.com/drivendataorg/cloudpathlib/issues/414))
 - Added a new `local_storage_dir` property to `LocalClient`. This will return the current local storage directory used by that client instance.
+- Add Azure Data Lake Storage Gen2 support (Issue [#161](https://github.com/drivendataorg/cloudpathlib/issues/161), PR [#450](https://github.com/drivendataorg/cloudpathlib/pull/450)), thanks to [@M0dEx](https://github.com/M0dEx) for PR [#447](https://github.com/drivendataorg/cloudpathlib/pull/447) and PR [#449](https://github.com/drivendataorg/cloudpathlib/pull/449)

 ## v0.18.1 (2024-02-26)

diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py
index f161a02d..5a008572 100644
--- a/cloudpathlib/azure/azblobclient.py
+++ b/cloudpathlib/azure/azblobclient.py
@@ -1,7 +1,7 @@
 from datetime import datetime, timedelta
 import mimetypes
 import os
-from pathlib import Path, PurePosixPath
+from pathlib import Path

 from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union

@@ -15,12 +15,15 @@
 try:
     from azure.core.exceptions import ResourceNotFoundError
     from azure.storage.blob import (
+        BlobPrefix,
         BlobSasPermissions,
         BlobServiceClient,
         BlobProperties,
         ContentSettings,
         generate_blob_sas,
     )
+
+    from azure.storage.filedatalake import DataLakeServiceClient, FileProperties
 except ModuleNotFoundError:
     implementation_registry["azure"].dependencies_loaded = False

@@ -39,6 +42,7 @@ def __init__(
         credential: Optional[Any] = None,
         connection_string: Optional[str] = None,
         blob_service_client: Optional["BlobServiceClient"] = None,
+        data_lake_client: Optional["DataLakeServiceClient"] = None,
         file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
         local_cache_dir: Optional[Union[str, os.PathLike]] = None,
         content_type_method: Optional[Callable] = mimetypes.guess_type,
@@ -76,6 +80,10 @@
                 https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python#copy-your-credentials-from-the-azure-portal).
             blob_service_client (Optional[BlobServiceClient]): Instantiated [`BlobServiceClient`](
                 https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python).
+            data_lake_client (Optional[DataLakeServiceClient]): Instantiated [`DataLakeServiceClient`](
+                https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.datalakeserviceclient).
+                If None and `blob_service_client` is passed, one will be created based on it.
+                Otherwise, one will be created based on the passed credential, account_url, connection_string, or the AZURE_STORAGE_CONNECTION_STRING env var.
             file_cache_mode (Optional[Union[str, FileCacheMode]]): How often to clear the file cache; see
                 [the caching docs](https://cloudpathlib.drivendata.org/stable/caching/) for more information
                 about the options in cloudpathlib.enums.FileCacheMode.
@@ -94,27 +102,73 @@ def __init__( if connection_string is None: connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None) + self.blob_service_client = None + self.data_lake_client = None + if blob_service_client is not None: self.service_client = blob_service_client + + # create from blob service client if not passed + if data_lake_client is None: + self.data_lake_client = DataLakeServiceClient( + account_url=f"https://{self.service_client.account_name}.dfs.core.windows.net", + credential=blob_service_client.credential, + ) + + if data_lake_client is not None: + self.data_lake_client = data_lake_client + elif connection_string is not None: self.service_client = BlobServiceClient.from_connection_string( conn_str=connection_string, credential=credential ) + self.data_lake_client = DataLakeServiceClient.from_connection_string( + conn_str=connection_string, credential=credential + ) elif account_url is not None: self.service_client = BlobServiceClient(account_url=account_url, credential=credential) + self.data_lake_client = DataLakeServiceClient( + account_url=account_url, credential=credential + ) else: raise MissingCredentialsError( "AzureBlobClient does not support anonymous instantiation. " "Credentials are required; see docs for options." ) - def _get_metadata(self, cloud_path: AzureBlobPath) -> Union["BlobProperties", Dict[str, Any]]: - blob = self.service_client.get_blob_client( - container=cloud_path.container, blob=cloud_path.blob - ) - properties = blob.get_blob_properties() + self.hns_cache: Dict[str, bool] = {} + + def _check_hns(self, cloud_path: AzureBlobPath) -> bool: + if cloud_path.container not in self.hns_cache: + hns_enabled: bool = self.service_client.get_account_information().get( + "is_hns_enabled", False + ) # type: ignore + self.hns_cache[cloud_path.container] = hns_enabled + + return self.hns_cache[cloud_path.container] + + def _get_metadata( + self, cloud_path: AzureBlobPath + ) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]: + if self._check_hns(cloud_path): + # works on both files and directories + fsc = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore + + if fsc is not None: + properties = fsc.get_file_client(cloud_path.blob).get_file_properties() + + # no content settings on directory + properties["content_type"] = properties.get( + "content_settings", {"content_type": None} + ).get("content_type") - properties["content_type"] = properties.content_settings.content_type + else: + blob = self.service_client.get_blob_client( + container=cloud_path.container, blob=cloud_path.blob + ) + properties = blob.get_blob_properties() + + properties["content_type"] = properties.content_settings.content_type return properties @@ -155,8 +209,17 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]: return "dir" try: - self._get_metadata(cloud_path) - return "file" + meta = self._get_metadata(cloud_path) + + # if hns, has is_directory property; else if not hns, _get_metadata will raise if not a file + return ( + "dir" + if meta.get("is_directory", False) + or meta.get("metadata", {}).get("hdi_isfolder", False) + else "file" + ) + + # thrown if not HNS and file does not exist _or_ is dir; check if is dir instead except ResourceNotFoundError: prefix = cloud_path.blob if prefix and not prefix.endswith("/"): @@ -181,17 +244,14 @@ def _exists(self, cloud_path: AzureBlobPath) -> bool: def _list_dir( self, cloud_path: AzureBlobPath, recursive: bool = False ) -> Iterable[Tuple[AzureBlobPath, bool]]: - # 
shortcut if listing all available containers if not cloud_path.container: - if recursive: - raise NotImplementedError( - "Cannot recursively list all containers and contents; you can get all the containers then recursively list each separately." - ) + for container in self.service_client.list_containers(): + yield self.CloudPath(f"az://{container.name}"), True - yield from ( - (self.CloudPath(f"az://{c.name}"), True) - for c in self.service_client.list_containers() - ) + if not recursive: + continue + + yield from self._list_dir(self.CloudPath(f"az://{container.name}"), recursive=True) return container_client = self.service_client.get_container_client(cloud_path.container) @@ -200,30 +260,24 @@ def _list_dir( if prefix and not prefix.endswith("/"): prefix += "/" - yielded_dirs = set() - - # NOTE: Not recursive may be slower than necessary since it just filters - # the recursive implementation - for o in container_client.list_blobs(name_starts_with=prefix): - # get directory from this path - for parent in PurePosixPath(o.name[len(prefix) :]).parents: - # if we haven't surfaced this directory already - if parent not in yielded_dirs and str(parent) != ".": - # skip if not recursive and this is beyond our depth - if not recursive and "/" in str(parent): - continue + if self._check_hns(cloud_path): + file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore + paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive) - yield ( - self.CloudPath(f"az://{cloud_path.container}/{prefix}{parent}"), - True, # is a directory - ) - yielded_dirs.add(parent) + for path in paths: + yield self.CloudPath(f"az://{cloud_path.container}/{path.name}"), path.is_directory - # skip file if not recursive and this is beyond our depth - if not recursive and "/" in o.name[len(prefix) :]: - continue + else: + if not recursive: + blobs = container_client.walk_blobs(name_starts_with=prefix) + else: + blobs = container_client.list_blobs(name_starts_with=prefix) - yield (self.CloudPath(f"az://{cloud_path.container}/{o.name}"), False) # is a file + for blob in blobs: + # walk_blobs returns folders with a trailing slash + blob_path = blob.name.rstrip("/") + blob_cloud_path = self.CloudPath(f"az://{cloud_path.container}/{blob_path}") + yield blob_cloud_path, isinstance(blob, BlobPrefix) def _move_file( self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True @@ -253,6 +307,10 @@ def _move_file( def _remove(self, cloud_path: AzureBlobPath, missing_ok: bool = True) -> None: file_or_dir = self._is_file_or_dir(cloud_path) if file_or_dir == "dir": + if self._check_hns(cloud_path): + _hns_rmtree(self.data_lake_client, cloud_path.container, cloud_path.blob) + return + blobs = [ b.blob for b, is_dir in self._list_dir(cloud_path, recursive=True) if not is_dir ] @@ -313,4 +371,15 @@ def _generate_presigned_url( return url +def _hns_rmtree(data_lake_client, container, directory): + """Stateless implementation so can be used in test suite cleanup as well. + + If hierarchical namespace is enabled, delete the directory and all its contents. + (The non-HNS version is implemented in `_remove`, but will leave empty folders in HNS). 
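+
+    Example (a sketch; assumes an already authenticated `DataLakeServiceClient`, e.g., the
+    `data_lake_client` attribute of an instantiated `AzureBlobClient`):
+
+        _hns_rmtree(client.data_lake_client, "container", "dir/to/remove")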
+ """ + file_system_client = data_lake_client.get_file_system_client(container) + directory_client = file_system_client.get_directory_client(directory) + directory_client.delete_directory() + + AzureBlobClient.AzureBlobPath = AzureBlobClient.CloudPath # type: ignore diff --git a/pyproject.toml b/pyproject.toml index 63974887..c7f6dcdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ ] [project.optional-dependencies] -azure = ["azure-storage-blob>=12"] +azure = ["azure-storage-blob>=12", "azure-storage-file-datalake>=12"] gs = ["google-cloud-storage"] s3 = ["boto3>=1.34.0"] all = ["cloudpathlib[azure]", "cloudpathlib[gs]", "cloudpathlib[s3]"] diff --git a/tests/conftest.py b/tests/conftest.py index bf680ece..274c5425 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,9 @@ from typing import Dict, Optional from azure.storage.blob import BlobServiceClient +from azure.storage.filedatalake import ( + DataLakeServiceClient, +) import boto3 import botocore from dotenv import find_dotenv, load_dotenv @@ -26,6 +29,7 @@ LocalS3Path, ) import cloudpathlib.azure.azblobclient +from cloudpathlib.azure.azblobclient import _hns_rmtree import cloudpathlib.s3.s3client from .mock_clients.mock_azureblob import mocked_client_class_factory, DEFAULT_CONTAINER_NAME from .mock_clients.mock_gs import ( @@ -109,55 +113,69 @@ def create_test_dir_name(request) -> str: return test_dir -@fixture() -def azure_rig(request, monkeypatch, assets_dir): - drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME) - test_dir = create_test_dir_name(request) +def azure_rig_factory(conn_str_env_var="AZURE_STORAGE_CONNECTION_STRING"): + @fixture() + def azure_rig(request, monkeypatch, assets_dir): + drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME) + test_dir = create_test_dir_name(request) - live_server = os.getenv("USE_LIVE_CLOUD") == "1" + live_server = os.getenv("USE_LIVE_CLOUD") == "1" - if live_server: - # Set up test assets - blob_service_client = BlobServiceClient.from_connection_string( - os.getenv("AZURE_STORAGE_CONNECTION_STRING") - ) - test_files = [ - f for f in assets_dir.glob("**/*") if f.is_file() and f.name not in UPLOAD_IGNORE_LIST - ] - for test_file in test_files: - blob_client = blob_service_client.get_blob_client( - container=drive, - blob=str(f"{test_dir}/{PurePosixPath(test_file.relative_to(assets_dir))}"), + if live_server: + # Set up test assets + blob_service_client = BlobServiceClient.from_connection_string( + os.getenv(conn_str_env_var) ) - blob_client.upload_blob(test_file.read_bytes(), overwrite=True) - else: - monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "") - # Mock cloud SDK - monkeypatch.setattr( - cloudpathlib.azure.azblobclient, - "BlobServiceClient", - mocked_client_class_factory(test_dir), + data_lake_service_client = DataLakeServiceClient.from_connection_string( + os.getenv(conn_str_env_var) + ) + test_files = [ + f + for f in assets_dir.glob("**/*") + if f.is_file() and f.name not in UPLOAD_IGNORE_LIST + ] + for test_file in test_files: + blob_client = blob_service_client.get_blob_client( + container=drive, + blob=str(f"{test_dir}/{PurePosixPath(test_file.relative_to(assets_dir))}"), + ) + blob_client.upload_blob(test_file.read_bytes(), overwrite=True) + else: + monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "") + # Mock cloud SDK + monkeypatch.setattr( + cloudpathlib.azure.azblobclient, + "BlobServiceClient", + mocked_client_class_factory(test_dir), + ) + + rig = CloudProviderTestRig( + 
path_class=AzureBlobPath, + client_class=AzureBlobClient, + drive=drive, + test_dir=test_dir, + live_server=live_server, ) - rig = CloudProviderTestRig( - path_class=AzureBlobPath, - client_class=AzureBlobClient, - drive=drive, - test_dir=test_dir, - live_server=live_server, - ) + rig.client_class().set_as_default_client() # set default client - rig.client_class().set_as_default_client() # set default client + yield rig - yield rig + rig.client_class._default_client = None # reset default client - rig.client_class._default_client = None # reset default client + if live_server: + if blob_service_client.get_account_information().get("is_hns_enabled", False): + _hns_rmtree(data_lake_service_client, drive, test_dir) - if live_server: - # Clean up test dir - container_client = blob_service_client.get_container_client(drive) - to_delete = container_client.list_blobs(name_starts_with=test_dir) - container_client.delete_blobs(*to_delete) + else: + # Clean up test dir + container_client = blob_service_client.get_container_client(drive) + to_delete = container_client.list_blobs(name_starts_with=test_dir) + to_delete = sorted(to_delete, key=lambda b: len(b.name.split("/")), reverse=True) + + container_client.delete_blobs(*to_delete) + + return azure_rig @fixture() @@ -420,16 +438,27 @@ def local_s3_rig(request, monkeypatch, assets_dir): rig.client_class.reset_default_storage_dir() # reset local storage directory +azure_rig = azure_rig_factory("AZURE_STORAGE_CONNECTION_STRING") + +# create azure fixtures for both blob and gen2 storage depending on which live services are configured in +# the environment variables +azure_fixtures = [azure_rig] + +# explicitly test gen2 if configured +if os.getenv("AZURE_STORAGE_GEN2_CONNECTION_STRING"): + azure_gen2_rig = azure_rig_factory("AZURE_STORAGE_GEN2_CONNECTION_STRING") + azure_fixtures.append(azure_gen2_rig) + rig = fixture_union( "rig", - [ - azure_rig, - gs_rig, - s3_rig, - custom_s3_rig, - local_azure_rig, - local_s3_rig, - local_gs_rig, + azure_fixtures + + [ + # gs_rig, + # s3_rig, + # custom_s3_rig, + # local_azure_rig, + # local_s3_rig, + # local_gs_rig, ], ) @@ -438,6 +467,6 @@ def local_s3_rig(request, monkeypatch, assets_dir): "s3_like_rig", [ s3_rig, - custom_s3_rig, + # custom_s3_rig, ], ) From 690f08c2fa95d1db8018b5b6a73dfdf6042f3a6c Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Wed, 17 Jul 2024 12:08:35 -0700 Subject: [PATCH 02/15] add rigs back --- tests/conftest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 274c5425..14539e89 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -453,12 +453,12 @@ def local_s3_rig(request, monkeypatch, assets_dir): "rig", azure_fixtures + [ - # gs_rig, - # s3_rig, - # custom_s3_rig, - # local_azure_rig, - # local_s3_rig, - # local_gs_rig, + gs_rig, + s3_rig, + custom_s3_rig, + local_azure_rig, + local_s3_rig, + local_gs_rig, ], ) @@ -467,6 +467,6 @@ def local_s3_rig(request, monkeypatch, assets_dir): "s3_like_rig", [ s3_rig, - # custom_s3_rig, + custom_s3_rig, ], ) From 69ba1fc062eeecd6808158a2471776e2d59bf7e6 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sat, 27 Jul 2024 22:25:23 -0700 Subject: [PATCH 03/15] Make mocked tests work with adls --- docs/docs/authentication.md | 7 ++++ tests/conftest.py | 55 ++++++++++++++++++---------- tests/mock_clients/mock_adls_gen2.py | 52 ++++++++++++++++++++++++++ tests/mock_clients/mock_azureblob.py | 37 +++++++++++++------ tests/test_azure_specific.py | 10 ++--- 
 tests/test_cloudpath_manipulation.py | 4 +-
 6 files changed, 126 insertions(+), 39 deletions(-)
 create mode 100644 tests/mock_clients/mock_adls_gen2.py

diff --git a/docs/docs/authentication.md b/docs/docs/authentication.md
index 76c7b1b3..0557c196 100644
--- a/docs/docs/authentication.md
+++ b/docs/docs/authentication.md
@@ -211,6 +211,13 @@ client.set_as_default_client()
 cp3 = CloudPath("s3://cloudpathlib-test-bucket/")
 ```

+## Accessing Azure DataLake Storage Gen2 (ADLS Gen2) storage with hierarchical namespace enabled
+
+Some Azure storage accounts are configured with "hierarchical namespace" enabled. This means that the storage account is backed by the Azure DataLake Storage Gen2 product rather than Azure Blob Storage. For many operations, the two are the same and one can use the Azure Blob Storage API. However, for some operations, a developer will need to use the Azure DataLake Storage API. The `AzureBlobClient` class implemented in cloudpathlib is designed to detect if hierarchical namespace is enabled and use the Azure DataLake Storage API where it is necessary or provides a performance improvement. Usually, a user of cloudpathlib will not need to know if hierarchical namespace is enabled and the storage account is backed by Azure DataLake Storage Gen2 or Azure Blob Storage.
+
+If needed, the Azure SDK provided `DataLakeServiceClient` object can be accessed via `AzureBlobClient.data_lake_client`. The Azure SDK provided `BlobServiceClient` object can be accessed via `AzureBlobClient.service_client`.
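+
+For example, a minimal sketch (the connection string value is a placeholder for your account's credentials):
+
+```python
+from cloudpathlib import AzureBlobClient
+
+client = AzureBlobClient(connection_string="<your_connection_string>")
+
+client.service_client    # azure.storage.blob.BlobServiceClient
+client.data_lake_client  # azure.storage.filedatalake.DataLakeServiceClient
+```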
+
+
 ## Pickling `CloudPath` objects

 You can pickle and unpickle `CloudPath` objects normally, for example:
diff --git a/tests/conftest.py b/tests/conftest.py
index 14539e89..cc5adcf0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -32,6 +32,7 @@
 import cloudpathlib.azure.azblobclient
 from cloudpathlib.azure.azblobclient import _hns_rmtree
 import cloudpathlib.s3.s3client
 from .mock_clients.mock_azureblob import mocked_client_class_factory, DEFAULT_CONTAINER_NAME
+from .mock_clients.mock_adls_gen2 import mocked_adls_factory
 from .mock_clients.mock_gs import (
     mocked_client_class_factory as mocked_gsclient_class_factory,
     DEFAULT_GS_BUCKET_NAME,
@@ -113,7 +114,9 @@ def create_test_dir_name(request) -> str:
     return test_dir

-def azure_rig_factory(conn_str_env_var="AZURE_STORAGE_CONNECTION_STRING"):
+def azure_rig_factory(conn_str_env_var):
+    adls_gen2 = conn_str_env_var == "AZURE_STORAGE_GEN2_CONNECTION_STRING"
+
     @fixture()
     def azure_rig(request, monkeypatch, assets_dir):
         drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME)
@@ -142,11 +145,21 @@ def azure_rig(request, monkeypatch, assets_dir):
             blob_client.upload_blob(test_file.read_bytes(), overwrite=True)
         else:
             monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "")
-            # Mock cloud SDK
+            monkeypatch.setenv("AZURE_STORAGE_GEN2_CONNECTION_STRING", "")
+
+            # need shared client so both blob and adls APIs can point to same temp directory
+            shared_client = mocked_client_class_factory(test_dir, adls_gen2=adls_gen2)()
+
             monkeypatch.setattr(
                 cloudpathlib.azure.azblobclient,
                 "BlobServiceClient",
-                mocked_client_class_factory(test_dir),
+                shared_client,
+            )
+
+            monkeypatch.setattr(
+                cloudpathlib.azure.azblobclient,
+                "DataLakeServiceClient",
+                mocked_adls_factory(test_dir, shared_client),
             )

         rig = CloudProviderTestRig(
@@ -452,26 +460,28 @@ def local_s3_rig(request, monkeypatch, assets_dir):

 azure_rig = azure_rig_factory("AZURE_STORAGE_CONNECTION_STRING")
+azure_gen2_rig = azure_rig_factory("AZURE_STORAGE_GEN2_CONNECTION_STRING")

-# create azure fixtures for both blob and gen2 storage depending on which live services are configured in
-# the environment variables
-azure_fixtures = [azure_rig]
-
-# explicitly test gen2 if configured
-if os.getenv("AZURE_STORAGE_GEN2_CONNECTION_STRING"):
-    azure_gen2_rig = azure_rig_factory("AZURE_STORAGE_GEN2_CONNECTION_STRING")
-    azure_fixtures.append(azure_gen2_rig)
+# create azure fixtures for both blob and gen2 storage
+azure_rigs = fixture_union(
+    "azure_rigs",
+    [
+        azure_rig,  # azure_rig0
+        azure_gen2_rig,  # azure_rig1
+    ],
+)

 rig = fixture_union(
     "rig",
-    azure_fixtures
-    + [
-        gs_rig,
-        s3_rig,
-        custom_s3_rig,
-        local_azure_rig,
-        local_s3_rig,
-        local_gs_rig,
+    [
+        azure_rig,  # azure_rig0
+        azure_gen2_rig,  # azure_rig1
+        # gs_rig,
+        # s3_rig,
+        # custom_s3_rig,
+        # local_azure_rig,
+        # local_s3_rig,
+        # local_gs_rig,
     ],
 )

@@ -438,6 +467,6 @@
     "s3_like_rig",
     [
         s3_rig,
-        custom_s3_rig,
+        # custom_s3_rig,
     ],
 )
diff --git a/tests/mock_clients/mock_adls_gen2.py b/tests/mock_clients/mock_adls_gen2.py
new file mode 100644
index 00000000..26f1314b
--- /dev/null
+++ b/tests/mock_clients/mock_adls_gen2.py
@@ -0,0 +1,52 @@
+from azure.storage.filedatalake import FileProperties
+
+from .mock_azureblob import mocked_client_class_factory
+
+
+def mocked_adls_factory(test_dir, blob_service_client):
+    """Just wrap and use `MockBlobClient` where needed to mock ADLS Gen2"""
+
+    class MockedDataLakeServiceClient:
+        def __init__(self, blob_service_client):
+            self.blob_service_client = blob_service_client
+
+        @classmethod
+        def from_connection_string(cls, *args, **kwargs):
+            return cls(mocked_client_class_factory(test_dir, adls_gen2=True)())
+
+        def get_file_system_client(self, file_system):
+            return MockedFileSystemClient(self.blob_service_client)
+
+    return MockedDataLakeServiceClient
+
+
+class MockedFileSystemClient:
+    def __init__(self, blob_service_client):
+        self.blob_service_client = blob_service_client
+
+    def get_file_client(self, key):
+        return MockedFileClient(key, self.blob_service_client)
+
+
+class MockedFileClient:
+    def __init__(self, key, blob_service_client) -> None:
+        self.key = key
+        self.blob_service_client = blob_service_client
+
+    def get_file_properties(self):
+        path = self.blob_service_client.tmp_path / self.key
+
+        if path.exists() and path.is_dir():
+            return FileProperties(
+                **{
+                    "name": self.path.name,
+                    "size": 0,
+                    "etag": "etag",
+                    "last_modified": self.path.stat().st_mtime,
+                    "metadata": {"hdi_isfolder": True},
+                }
+            )
+
+        # fallback to blob properties for files
+        else:
+            return self.blob_service_client.get_blob_client("", self.key).get_blob_properties()
diff --git a/tests/mock_clients/mock_azureblob.py b/tests/mock_clients/mock_azureblob.py
index b07aeb0a..3633ca7a 100644
--- a/tests/mock_clients/mock_azureblob.py
+++ b/tests/mock_clients/mock_azureblob.py
@@ -17,15 +17,18 @@
 DEFAULT_CONTAINER_NAME = "container"

-def mocked_client_class_factory(test_dir: str):
+def mocked_client_class_factory(test_dir: str, adls_gen2: bool = False, tmp_dir: Path = None):
+    """If tmp_dir is not None, use that one so that it can be shared with a MockedDataLakeServiceClient."""
+
     class MockBlobServiceClient:
         def __init__(self, *args, **kwargs):
             # copy test assets for reference in tests without affecting assets
-            self.tmp = TemporaryDirectory()
+            self.tmp = TemporaryDirectory() if not tmp_dir else tmp_dir
             self.tmp_path = Path(self.tmp.name) / "test_case_copy"
             shutil.copytree(TEST_ASSETS, self.tmp_path / test_dir)

             self.metadata_cache = {}
+
self.adls_gen2 = adls_gen2 @classmethod def from_connection_string(cls, *args, **kwargs): @@ -61,6 +64,9 @@ def list_containers(self): Container = namedtuple("Container", "name") return [Container(name=DEFAULT_CONTAINER_NAME)] + def get_account_information(self): + return {"is_hns_enabled": self.adls_gen2} + return MockBlobServiceClient @@ -86,6 +92,7 @@ def get_blob_properties(self): "content_type": self.service_client.metadata_cache.get( self.root / self.key, None ), + "metadata": dict(), } ) else: @@ -148,24 +155,30 @@ def exists(self): def list_blobs(self, name_starts_with=None): return mock_item_paged(self.root, name_starts_with) + def walk_blobs(self, name_starts_with=None): + return mock_item_paged(self.root, name_starts_with, recursive=False) + def delete_blobs(self, *blobs): for blob in blobs: (self.root / blob).unlink() delete_empty_parents_up_to_root(path=self.root / blob, root=self.root) -def mock_item_paged(root, name_starts_with=None): +def mock_item_paged(root, name_starts_with=None, recursive=True): items = [] - if not name_starts_with: - name_starts_with = "" - for f in root.glob("**/*"): - if ( - (not f.name.startswith(".")) - and f.is_file() - and (root / name_starts_with) in [f, *f.parents] - ): - items.append((PurePosixPath(f), f)) + if recursive: + items = [ + (PurePosixPath(f), f) + for f in root.glob("**/*") + if ( + (not f.name.startswith(".")) + and f.is_file() + and (root / name_starts_with) in [f, *f.parents] + ) + ] + else: + items = [(PurePosixPath(f), f) for f in (root / name_starts_with).iterdir()] for mocked, local in items: # BlobProperties diff --git a/tests/test_azure_specific.py b/tests/test_azure_specific.py index f229ce61..247934bc 100644 --- a/tests/test_azure_specific.py +++ b/tests/test_azure_specific.py @@ -32,8 +32,8 @@ def test_azureblobpath_nocreds(client_class, monkeypatch): client_class() -def test_as_url(azure_rig): - p: AzureBlobPath = azure_rig.create_cloud_path("dir_0/file0_0.txt") +def test_as_url(azure_rigs): + p: AzureBlobPath = azure_rigs.create_cloud_path("dir_0/file0_0.txt") public_url = str(p.as_url()) public_parts = urlparse(public_url) @@ -50,8 +50,8 @@ def test_as_url(azure_rig): assert "sig" in query_params -def test_partial_download(azure_rig, monkeypatch): - p: AzureBlobPath = azure_rig.create_cloud_path("dir_0/file0_0.txt") +def test_partial_download(azure_rigs, monkeypatch): + p: AzureBlobPath = azure_rigs.create_cloud_path("dir_0/file0_0.txt") # no partial after successful download p.read_text() # downloads @@ -69,7 +69,7 @@ def _patched(self, buffer): buffer.write(b"partial") raise Exception("boom") - if azure_rig.live_server: + if azure_rigs.live_server: m.setattr(StorageStreamDownloader, "readinto", _patched) else: m.setattr(MockStorageStreamDownloader, "readinto", _patched) diff --git a/tests/test_cloudpath_manipulation.py b/tests/test_cloudpath_manipulation.py index a6aad166..ef202c39 100644 --- a/tests/test_cloudpath_manipulation.py +++ b/tests/test_cloudpath_manipulation.py @@ -43,7 +43,7 @@ def test_no_op_actions(rig): assert path.is_absolute() -def test_relative_to(rig, azure_rig, gs_rig): +def test_relative_to(rig, azure_rigs, gs_rig): assert rig.create_cloud_path("bucket/path/to/file.txt").relative_to( rig.create_cloud_path("bucket/path") ) == PurePosixPath("to/file.txt") @@ -59,7 +59,7 @@ def test_relative_to(rig, azure_rig, gs_rig): with pytest.raises(ValueError): assert rig.create_cloud_path("a/b/c/d.file").relative_to(PurePosixPath("/a/b/c")) - other_rig = azure_rig if rig.cloud_prefix != 
azure_rig.cloud_prefix else gs_rig + other_rig = azure_rigs if rig.cloud_prefix != azure_rigs.cloud_prefix else gs_rig path = CloudPath(f"{rig.cloud_prefix}bucket/path/to/file.txt") other_cloud_path = CloudPath(f"{other_rig.cloud_prefix}bucket/path") with pytest.raises(ValueError): From e9cc822543db7ae6966a6ebc0ab1ea30345c958b Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sat, 27 Jul 2024 23:03:18 -0700 Subject: [PATCH 04/15] add rigs back; make explicit no dirs --- cloudpathlib/azure/azblobclient.py | 7 ++++++- tests/conftest.py | 14 +++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index 5a008572..498fe385 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -277,7 +277,12 @@ def _list_dir( # walk_blobs returns folders with a trailing slash blob_path = blob.name.rstrip("/") blob_cloud_path = self.CloudPath(f"az://{cloud_path.container}/{blob_path}") - yield blob_cloud_path, isinstance(blob, BlobPrefix) + + yield blob_cloud_path, ( + isinstance(blob, BlobPrefix) + if not recursive + else False # no folders from list_blobs in non-hns storage accounts + ) def _move_file( self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True diff --git a/tests/conftest.py b/tests/conftest.py index cc5adcf0..e6eb1775 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -468,12 +468,12 @@ def local_s3_rig(request, monkeypatch, assets_dir): [ azure_rig, # azure_rig0 azure_gen2_rig, # azure_rig1 - # gs_rig, - # s3_rig, - # custom_s3_rig, - # local_azure_rig, - # local_s3_rig, - # local_gs_rig, + gs_rig, + s3_rig, + custom_s3_rig, + local_azure_rig, + local_s3_rig, + local_gs_rig, ], ) @@ -482,6 +482,6 @@ def local_s3_rig(request, monkeypatch, assets_dir): "s3_like_rig", [ s3_rig, - # custom_s3_rig, + custom_s3_rig, ], ) From 064ed93727ccd43d4ca0d44f288daedeb63364b5 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sun, 28 Jul 2024 11:12:07 -0700 Subject: [PATCH 05/15] Update testing and hns key --- CONTRIBUTING.md | 2 +- cloudpathlib/azure/azblobclient.py | 9 +- tests/conftest.py | 132 +++++++++++++------------- tests/test_client.py | 4 +- tests/test_cloudpath_instantiation.py | 2 +- tests/test_cloudpath_manipulation.py | 2 +- 6 files changed, 79 insertions(+), 72 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 19a44945..54c98962 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,7 +83,7 @@ make test-live-cloud #### Azure live backend tests -For Azure, you can test both against Azure Blob Storage backends and Azure Data Lake Storage Gen2 backends. To run these tests, you need to set connection strings for both of the backends by setting the following environment variables (in your `.env` file for local development). If `AZURE_STORAGE_GEN2_CONNECTION_STRING` is not set, only the blob storage backend will be tested. +For Azure, you can test both against Azure Blob Storage backends and Azure Data Lake Storage Gen2 backends. To run these tests, you need to set connection strings for both of the backends by setting the following environment variables (in your `.env` file for local development). If `AZURE_STORAGE_GEN2_CONNECTION_STRING` is not set, only the blob storage backend will be tested. To set up a storage account with ADLS Gen2, go through the normal creation flow for a storage account in the Azure portal and select "Enable Hierarchical Namespace" in the "Advanced" tab of the settings when configuring the account. 
```bash AZURE_STORAGE_CONNECTION_STRING=your_connection_string diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index 498fe385..8bb25d0c 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -139,18 +139,21 @@ def __init__( self.hns_cache: Dict[str, bool] = {} def _check_hns(self, cloud_path: AzureBlobPath) -> bool: - if cloud_path.container not in self.hns_cache: + hns_key = self.service_client.account_name + "__" + cloud_path.container + + if hns_key not in self.hns_cache: hns_enabled: bool = self.service_client.get_account_information().get( "is_hns_enabled", False ) # type: ignore - self.hns_cache[cloud_path.container] = hns_enabled + self.hns_cache[hns_key] = hns_enabled - return self.hns_cache[cloud_path.container] + return self.hns_cache[hns_key] def _get_metadata( self, cloud_path: AzureBlobPath ) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]: if self._check_hns(cloud_path): + # works on both files and directories fsc = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore diff --git a/tests/conftest.py b/tests/conftest.py index e6eb1775..0c5a1134 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -114,81 +114,88 @@ def create_test_dir_name(request) -> str: return test_dir -def azure_rig_factory(conn_str_env_var): - adls_gen2 = conn_str_env_var == "AZURE_STORAGE_GEN2_CONNECTION_STRING" - - @fixture() - def azure_rig(request, monkeypatch, assets_dir): - drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME) - test_dir = create_test_dir_name(request) +def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir): + drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME) + test_dir = create_test_dir_name(request) - live_server = os.getenv("USE_LIVE_CLOUD") == "1" + live_server = os.getenv("USE_LIVE_CLOUD") == "1" - if live_server: - # Set up test assets - blob_service_client = BlobServiceClient.from_connection_string( - os.getenv(conn_str_env_var) - ) - data_lake_service_client = DataLakeServiceClient.from_connection_string( - os.getenv(conn_str_env_var) + if live_server: + # Set up test assets + blob_service_client = BlobServiceClient.from_connection_string( + os.getenv(conn_str_env_var) + ) + data_lake_service_client = DataLakeServiceClient.from_connection_string( + os.getenv(conn_str_env_var) + ) + test_files = [ + f + for f in assets_dir.glob("**/*") + if f.is_file() and f.name not in UPLOAD_IGNORE_LIST + ] + for test_file in test_files: + blob_client = blob_service_client.get_blob_client( + container=drive, + blob=str(f"{test_dir}/{PurePosixPath(test_file.relative_to(assets_dir))}"), ) - test_files = [ - f - for f in assets_dir.glob("**/*") - if f.is_file() and f.name not in UPLOAD_IGNORE_LIST - ] - for test_file in test_files: - blob_client = blob_service_client.get_blob_client( - container=drive, - blob=str(f"{test_dir}/{PurePosixPath(test_file.relative_to(assets_dir))}"), - ) - blob_client.upload_blob(test_file.read_bytes(), overwrite=True) - else: - monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "") - monkeypatch.setenv("AZURE_STORAGE_GEN2_CONNECTION_STRING", "") - - # need shared client so both blob and adls APIs can point to same temp directory - shared_client = mocked_client_class_factory(test_dir, adls_gen2=adls_gen2)() + blob_client.upload_blob(test_file.read_bytes(), overwrite=True) + else: + monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "") + 
monkeypatch.setenv("AZURE_STORAGE_GEN2_CONNECTION_STRING", "") - monkeypatch.setattr( - cloudpathlib.azure.azblobclient, - "BlobServiceClient", - shared_client, - ) + # need shared client so both blob and adls APIs can point to same temp directory + shared_client = mocked_client_class_factory(test_dir, adls_gen2=adls_gen2)() - monkeypatch.setattr( - cloudpathlib.azure.azblobclient, - "DataLakeServiceClient", - mocked_adls_factory(test_dir, shared_client), - ) + monkeypatch.setattr( + cloudpathlib.azure.azblobclient, + "BlobServiceClient", + shared_client, + ) - rig = CloudProviderTestRig( - path_class=AzureBlobPath, - client_class=AzureBlobClient, - drive=drive, - test_dir=test_dir, - live_server=live_server, + monkeypatch.setattr( + cloudpathlib.azure.azblobclient, + "DataLakeServiceClient", + mocked_adls_factory(test_dir, shared_client), ) - rig.client_class().set_as_default_client() # set default client + rig = CloudProviderTestRig( + path_class=AzureBlobPath, + client_class=AzureBlobClient, + drive=drive, + test_dir=test_dir, + live_server=live_server, + required_client_kwargs=dict(connection_string=os.getenv(conn_str_env_var)), # switch on/off adls gen2 + ) + + rig.client_class(connection_string=os.getenv(conn_str_env_var)).set_as_default_client() # set default client + + # add flag for adls gen2 rig to skip some tests + rig.is_adls_gen2 = adls_gen2 - yield rig + yield rig - rig.client_class._default_client = None # reset default client + rig.client_class._default_client = None # reset default client - if live_server: - if blob_service_client.get_account_information().get("is_hns_enabled", False): - _hns_rmtree(data_lake_service_client, drive, test_dir) + if live_server: + if blob_service_client.get_account_information().get("is_hns_enabled", False): + _hns_rmtree(data_lake_service_client, drive, test_dir) - else: - # Clean up test dir - container_client = blob_service_client.get_container_client(drive) - to_delete = container_client.list_blobs(name_starts_with=test_dir) - to_delete = sorted(to_delete, key=lambda b: len(b.name.split("/")), reverse=True) + else: + # Clean up test dir + container_client = blob_service_client.get_container_client(drive) + to_delete = container_client.list_blobs(name_starts_with=test_dir) + to_delete = sorted(to_delete, key=lambda b: len(b.name.split("/")), reverse=True) - container_client.delete_blobs(*to_delete) + container_client.delete_blobs(*to_delete) - return azure_rig + +@fixture() +def azure_rig(request, monkeypatch, assets_dir): + yield from _azure_fixture("AZURE_STORAGE_CONNECTION_STRING", False, request, monkeypatch, assets_dir) + +@fixture() +def azure_gen2_rig(request, monkeypatch, assets_dir): + yield from _azure_fixture("AZURE_STORAGE_GEN2_CONNECTION_STRING", True, request, monkeypatch, assets_dir) @fixture() @@ -451,9 +458,6 @@ def local_s3_rig(request, monkeypatch, assets_dir): rig.client_class.reset_default_storage_dir() # reset local storage directory -azure_rig = azure_rig_factory("AZURE_STORAGE_CONNECTION_STRING") -azure_gen2_rig = azure_rig_factory("AZURE_STORAGE_GEN2_CONNECTION_STRING") - # create azure fixtures for both blob and gen2 storage azure_rigs = fixture_union( "azure_rigs", diff --git a/tests/test_client.py b/tests/test_client.py index 00b6f270..c4761b0e 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -9,7 +9,7 @@ def test_default_client_instantiation(rig): - if not getattr(rig, "is_custom_s3", False): + if not getattr(rig, "is_custom_s3", False) and not (getattr(rig, "is_adls_gen2", False)): # Skip 
resetting the default client for custom S3 endpoint, but keep the other tests, # since they're still useful. rig.client_class._default_client = None @@ -43,7 +43,7 @@ def test_default_client_instantiation(rig): def test_different_clients(rig): p = rig.create_cloud_path("dir_0/file0_0.txt") - new_client = rig.client_class() + new_client = rig.client_class(**rig.required_client_kwargs) p2 = new_client.CloudPath(f"{rig.cloud_prefix}{rig.drive}/{rig.test_dir}/dir_0/file0_0.txt") assert p.client is not p2.client diff --git a/tests/test_cloudpath_instantiation.py b/tests/test_cloudpath_instantiation.py index 64951495..de139593 100644 --- a/tests/test_cloudpath_instantiation.py +++ b/tests/test_cloudpath_instantiation.py @@ -77,7 +77,7 @@ def test_instantiation_errors(rig): def test_idempotency(rig): rig.client_class._default_client = None - client = rig.client_class() + client = rig.client_class(**rig.required_client_kwargs) p = client.CloudPath(f"{rig.cloud_prefix}{rig.drive}/{rig.test_dir}/dir_0/file0_0.txt") p2 = CloudPath(p) diff --git a/tests/test_cloudpath_manipulation.py b/tests/test_cloudpath_manipulation.py index ef202c39..a3756a81 100644 --- a/tests/test_cloudpath_manipulation.py +++ b/tests/test_cloudpath_manipulation.py @@ -120,7 +120,7 @@ def test_joins(rig): def test_with_segments(rig): assert rig.create_cloud_path("a/b/c/d").with_segments( "x", "y", "z" - ) == rig.client_class().CloudPath(f"{rig.cloud_prefix}x/y/z") + ) == rig.client_class(**rig.required_client_kwargs).CloudPath(f"{rig.cloud_prefix}x/y/z") def test_is_junction(rig): From c14238dadd7787f0e88466ee5bcb3ab06a97de6a Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sun, 28 Jul 2024 11:12:36 -0700 Subject: [PATCH 06/15] format --- tests/conftest.py | 25 +++++++++++++++---------- tests/test_cloudpath_manipulation.py | 6 +++--- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0c5a1134..0f301318 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -122,16 +122,12 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir if live_server: # Set up test assets - blob_service_client = BlobServiceClient.from_connection_string( - os.getenv(conn_str_env_var) - ) + blob_service_client = BlobServiceClient.from_connection_string(os.getenv(conn_str_env_var)) data_lake_service_client = DataLakeServiceClient.from_connection_string( os.getenv(conn_str_env_var) ) test_files = [ - f - for f in assets_dir.glob("**/*") - if f.is_file() and f.name not in UPLOAD_IGNORE_LIST + f for f in assets_dir.glob("**/*") if f.is_file() and f.name not in UPLOAD_IGNORE_LIST ] for test_file in test_files: blob_client = blob_service_client.get_blob_client( @@ -164,10 +160,14 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir drive=drive, test_dir=test_dir, live_server=live_server, - required_client_kwargs=dict(connection_string=os.getenv(conn_str_env_var)), # switch on/off adls gen2 + required_client_kwargs=dict( + connection_string=os.getenv(conn_str_env_var) + ), # switch on/off adls gen2 ) - rig.client_class(connection_string=os.getenv(conn_str_env_var)).set_as_default_client() # set default client + rig.client_class( + connection_string=os.getenv(conn_str_env_var) + ).set_as_default_client() # set default client # add flag for adls gen2 rig to skip some tests rig.is_adls_gen2 = adls_gen2 @@ -191,11 +191,16 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir @fixture() def azure_rig(request, monkeypatch, 
assets_dir): - yield from _azure_fixture("AZURE_STORAGE_CONNECTION_STRING", False, request, monkeypatch, assets_dir) + yield from _azure_fixture( + "AZURE_STORAGE_CONNECTION_STRING", False, request, monkeypatch, assets_dir + ) + @fixture() def azure_gen2_rig(request, monkeypatch, assets_dir): - yield from _azure_fixture("AZURE_STORAGE_GEN2_CONNECTION_STRING", True, request, monkeypatch, assets_dir) + yield from _azure_fixture( + "AZURE_STORAGE_GEN2_CONNECTION_STRING", True, request, monkeypatch, assets_dir + ) @fixture() diff --git a/tests/test_cloudpath_manipulation.py b/tests/test_cloudpath_manipulation.py index a3756a81..aaf4098c 100644 --- a/tests/test_cloudpath_manipulation.py +++ b/tests/test_cloudpath_manipulation.py @@ -118,9 +118,9 @@ def test_joins(rig): def test_with_segments(rig): - assert rig.create_cloud_path("a/b/c/d").with_segments( - "x", "y", "z" - ) == rig.client_class(**rig.required_client_kwargs).CloudPath(f"{rig.cloud_prefix}x/y/z") + assert rig.create_cloud_path("a/b/c/d").with_segments("x", "y", "z") == rig.client_class( + **rig.required_client_kwargs + ).CloudPath(f"{rig.cloud_prefix}x/y/z") def test_is_junction(rig): From c5cbb0b5ccf42f3b64edefa5e5b853e49884997c Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Fri, 2 Aug 2024 10:05:54 -0700 Subject: [PATCH 07/15] update mocked tests --- tests/conftest.py | 33 ++++---- tests/mock_clients/mock_adls_gen2.py | 96 ++++++++++++++++------ tests/mock_clients/mock_azureblob.py | 117 ++++++++++++++++----------- tests/test_client.py | 2 +- 4 files changed, 161 insertions(+), 87 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0f301318..251a608b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ import os from pathlib import Path, PurePosixPath import shutil +from tempfile import TemporaryDirectory from typing import Dict, Optional from azure.storage.blob import BlobServiceClient @@ -31,8 +32,8 @@ import cloudpathlib.azure.azblobclient from cloudpathlib.azure.azblobclient import _hns_rmtree import cloudpathlib.s3.s3client -from .mock_clients.mock_azureblob import mocked_client_class_factory, DEFAULT_CONTAINER_NAME -from .mock_clients.mock_adls_gen2 import mocked_adls_factory +from .mock_clients.mock_azureblob import MockBlobServiceClient, DEFAULT_CONTAINER_NAME +from .mock_clients.mock_adls_gen2 import MockedDataLakeServiceClient from .mock_clients.mock_gs import ( mocked_client_class_factory as mocked_gsclient_class_factory, DEFAULT_GS_BUCKET_NAME, @@ -120,6 +121,9 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir live_server = os.getenv("USE_LIVE_CLOUD") == "1" + connection_kwargs = dict() + tmpdir = TemporaryDirectory() + if live_server: # Set up test assets blob_service_client = BlobServiceClient.from_connection_string(os.getenv(conn_str_env_var)) @@ -135,23 +139,25 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir blob=str(f"{test_dir}/{PurePosixPath(test_file.relative_to(assets_dir))}"), ) blob_client.upload_blob(test_file.read_bytes(), overwrite=True) + + connection_kwargs["connection_string"] = os.getenv(conn_str_env_var) else: - monkeypatch.setenv("AZURE_STORAGE_CONNECTION_STRING", "") + # pass key mocked params to clients via connection string + monkeypatch.setenv( + "AZURE_STORAGE_CONNECTION_STRING", f"{Path(tmpdir.name) / test_dir};{adls_gen2}" + ) monkeypatch.setenv("AZURE_STORAGE_GEN2_CONNECTION_STRING", "") - # need shared client so both blob and adls APIs can point to same temp directory - shared_client 
= mocked_client_class_factory(test_dir, adls_gen2=adls_gen2)()
-
             monkeypatch.setattr(
                 cloudpathlib.azure.azblobclient,
                 "BlobServiceClient",
-                shared_client,
+                MockBlobServiceClient,
             )

             monkeypatch.setattr(
                 cloudpathlib.azure.azblobclient,
                 "DataLakeServiceClient",
-                mocked_adls_factory(test_dir, shared_client),
+                MockedDataLakeServiceClient,
             )

     rig = CloudProviderTestRig(
@@ -160,14 +166,10 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir
         drive=drive,
         test_dir=test_dir,
         live_server=live_server,
-        required_client_kwargs=dict(
-            connection_string=os.getenv(conn_str_env_var)
-        ),  # switch on/off adls gen2
+        required_client_kwargs=connection_kwargs,
     )

-    rig.client_class(
-        connection_string=os.getenv(conn_str_env_var)
-    ).set_as_default_client()  # set default client
+    rig.client_class(**connection_kwargs).set_as_default_client()  # set default client

     # add flag for adls gen2 rig to skip some tests
     rig.is_adls_gen2 = adls_gen2
@@ -188,6 +190,9 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir

             container_client.delete_blobs(*to_delete)

+    else:
+        tmpdir.cleanup()
+

 @fixture()
 def azure_rig(request, monkeypatch, assets_dir):
diff --git a/tests/mock_clients/mock_adls_gen2.py b/tests/mock_clients/mock_adls_gen2.py
index 26f1314b..1d9c10a2 100644
--- a/tests/mock_clients/mock_adls_gen2.py
+++ b/tests/mock_clients/mock_adls_gen2.py
@@ -1,52 +1,96 @@
+from datetime import datetime
+from pathlib import Path
+from shutil import rmtree
+from azure.core.exceptions import ResourceNotFoundError
 from azure.storage.filedatalake import FileProperties

-from .mock_azureblob import mocked_client_class_factory
+from tests.mock_clients.mock_azureblob import _JsonCache


-def mocked_adls_factory(test_dir, blob_service_client):
-    """Just wrap and use `MockBlobClient` where needed to mock ADLS Gen2"""
+class MockedDataLakeServiceClient:
+    def __init__(self, test_dir, adls):
+        # root is parent of the test specific directory
+        self.root = test_dir.parent
+        self.test_dir = test_dir
+        self.adls = adls
+        self.metadata_cache = _JsonCache(self.root / ".metadata")

-    class MockedDataLakeServiceClient:
-        def __init__(self, blob_service_client):
-            self.blob_service_client = blob_service_client
+    @classmethod
+    def from_connection_string(cls, conn_str, credential):
+        # configured in conftest.py
+        test_dir, adls = conn_str.split(";")
+        adls = adls == "True"
+        test_dir = Path(test_dir)
+        return cls(test_dir, adls)

-        @classmethod
-        def from_connection_string(cls, *args, **kwargs):
-            return cls(mocked_client_class_factory(test_dir, adls_gen2=True)())
-
-        def get_file_system_client(self, file_system):
-            return MockedFileSystemClient(self.blob_service_client)
-
-    return MockedDataLakeServiceClient
+    def get_file_system_client(self, file_system):
+        return MockedFileSystemClient(self.root, self.metadata_cache)


 class MockedFileSystemClient:
-    def __init__(self, blob_service_client):
-        self.blob_service_client = blob_service_client
+    def __init__(self, root, metadata_cache):
+        self.root = root
+        self.metadata_cache = metadata_cache

     def get_file_client(self, key):
-        return MockedFileClient(key, self.blob_service_client)
+        return MockedFileClient(key, self.root, self.metadata_cache)
+
+    def get_directory_client(self, key):
+        return MockedDirClient(key, self.root)
+
+    def get_paths(self, path, recursive=False):
+        yield from (
+            MockedFileClient(
+                f.relative_to(self.root), self.root, self.metadata_cache
+            ).get_file_properties()
+            for f in (self.root / path).glob("**/*" if recursive
else "*") + ) class MockedFileClient: - def __init__(self, key, blob_service_client) -> None: + def __init__(self, key, root, metadata_cache) -> None: self.key = key - self.blob_service_client = blob_service_client + self.root = root + self.metadata_cache = metadata_cache def get_file_properties(self): - path = self.blob_service_client.tmp_path / self.key + path = self.root / self.key if path.exists() and path.is_dir(): - return FileProperties( + fp = FileProperties( **{ - "name": self.path.name, + "name": self.key, "size": 0, - "etag": "etag", - "last_modified": self.path.stat().st_mtime, + "ETag": "etag", + "Last-Modified": datetime.fromtimestamp(path.stat().st_mtime), "metadata": {"hdi_isfolder": True}, } ) + fp["is_directory"] = True # not part of object def, but still in API responses... + return fp + + elif path.exists(): + fp = FileProperties( + **{ + "name": self.key, + "size": path.stat().st_size, + "ETag": "etag", + "Last-Modified": datetime.fromtimestamp(path.stat().st_mtime), + "metadata": {"hdi_isfolder": False}, + "Content-Type": self.metadata_cache.get(self.root / self.key, None), + } + ) - # fallback to blob properties for files + fp["is_directory"] = False + return fp else: - return self.blob_service_client.get_blob_client("", self.key).get_blob_properties() + raise ResourceNotFoundError + + +class MockedDirClient: + def __init__(self, key, root) -> None: + self.key = key + self.root = root + + def delete_directory(self): + rmtree(self.root / self.key) diff --git a/tests/mock_clients/mock_azureblob.py b/tests/mock_clients/mock_azureblob.py index 3633ca7a..1afb8a39 100644 --- a/tests/mock_clients/mock_azureblob.py +++ b/tests/mock_clients/mock_azureblob.py @@ -1,8 +1,8 @@ from collections import namedtuple from datetime import datetime +import json from pathlib import Path, PurePosixPath import shutil -from tempfile import TemporaryDirectory from azure.storage.blob import BlobProperties @@ -17,57 +17,82 @@ DEFAULT_CONTAINER_NAME = "container" -def mocked_client_class_factory(test_dir: str, adls_gen2: bool = False, tmp_dir: Path = None): - """If tmp_dir is not None, use that one so that it can be shared with a MockedDataLakeServiceClient.""" - - class MockBlobServiceClient: - def __init__(self, *args, **kwargs): - # copy test assets for reference in tests without affecting assets - self.tmp = TemporaryDirectory() if not tmp_dir else tmp_dir - self.tmp_path = Path(self.tmp.name) / "test_case_copy" - shutil.copytree(TEST_ASSETS, self.tmp_path / test_dir) - - self.metadata_cache = {} - self.adls_gen2 = adls_gen2 - - @classmethod - def from_connection_string(cls, *args, **kwargs): - return cls() - - @property - def account_name(self) -> str: - """Returns well-known account name used by Azurite - See: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio%2Cblob-storage#well-known-storage-account-and-key - """ - return "devstoreaccount1" - - @property - def credential(self): - """Returns well-known account key used by Azurite - See: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio%2Cblob-storage#well-known-storage-account-and-key - """ - return SharedKeyCredentialPolicy( - self.account_name, - "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", - ) +class _JsonCache: + def __init__(self, path: Path): + self.path = path + + # initialize to empty + with self.path.open("w") as f: + json.dump({}, f) + + def __getitem__(self, key): + with self.path.open("r") as 
f:
+            return json.load(f)[str(key)]
+
+    def __setitem__(self, key, value):
+        with self.path.open("r") as f:
+            data = json.load(f)
+
+        with self.path.open("w") as f:
+            data[str(key)] = value
+            json.dump(data, f)
+
+    def get(self, key, default=None):
+        try:
+            return self[key]
+        except KeyError:
+            return default
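+
+# A sketch of the intended cache usage (keys are paths; values must be JSON-serializable):
+#
+#     cache = _JsonCache(Path("/tmp/.metadata"))
+#     cache[Path("/tmp/file.txt")] = "text/plain"
+#     cache.get(Path("/tmp/file.txt"))  # -> "text/plain"
+#     cache.get(Path("/tmp/missing"))   # -> None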
@@ from datetime import datetime -from pathlib import Path +from pathlib import Path, PurePosixPath from shutil import rmtree from azure.core.exceptions import ResourceNotFoundError from azure.storage.filedatalake import FileProperties @@ -41,7 +41,7 @@ def get_directory_client(self, key): def get_paths(self, path, recursive=False): yield from ( MockedFileClient( - f.relative_to(self.root), self.root, self.metadata_cache + PurePosixPath(f.relative_to(self.root)), self.root, self.metadata_cache ).get_file_properties() for f in (self.root / path).glob("**/*" if recursive else "*") ) From cfe1f3fbfed26db7105c2c739618bd103245fc06 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Fri, 2 Aug 2024 14:59:53 -0500 Subject: [PATCH 09/15] set gen2 var in CI --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d85a275..edd56b28 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -102,6 +102,7 @@ jobs: env: LIVE_AZURE_CONTAINER: ${{ secrets.LIVE_AZURE_CONTAINER }} AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }} + AZURE_STORAGE_GEN2_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_GEN2_CONNECTION_STRING }} LIVE_GS_BUCKET: ${{ secrets.LIVE_GS_BUCKET }} LIVE_S3_BUCKET: ${{ secrets.LIVE_S3_BUCKET }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} From 29284d6df43bf40d50e80e327062122d4394e107 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sun, 11 Aug 2024 08:14:10 -0700 Subject: [PATCH 10/15] new adls fucntionality; better tests and instantiation --- .env.example | 4 + .gitignore | 2 +- cloudpathlib/azure/azblobclient.py | 108 +++++++++++++++++++------ cloudpathlib/azure/azblobpath.py | 16 +++- tests/conftest.py | 1 + tests/mock_clients/mock_adls_gen2.py | 16 +++- tests/test_azure_specific.py | 116 ++++++++++++++++++++++++++- tests/test_cloudpath_file_io.py | 8 +- tests/test_local.py | 9 ++- 9 files changed, 245 insertions(+), 35 deletions(-) diff --git a/.env.example b/.env.example index be1da87d..7e6fbfaf 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,10 @@ AWS_SECRET_ACCESS_KEY=your_secret_access_key AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=your_account_name;AccountKey=your_account_key;EndpointSuffix=core.windows.net +# if testing with ADLS Gen2 storage, set credentials for that account here +AZURE_STORAGE_GEN2_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=your_account_name;AccountKey=your_account_key;EndpointSuffix=core.windows.net + + GOOGLE_APPLICATION_CREDENTIALS=.gscreds.json # or GCP_PROJECT_ID=your_project_id diff --git a/.gitignore b/.gitignore index d542b01d..59c8c813 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ docs/docs/changelog.md docs/docs/contributing.md # perf output -perf-results.csv +perf-*.csv ## GitHub Python .gitignore ## # https://github.com/github/gitignore/blob/master/Python.gitignore diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index 8bb25d0c..480bc348 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -14,6 +14,7 @@ try: from azure.core.exceptions import ResourceNotFoundError + from azure.core.credentials import AzureNamedKeyCredential from azure.storage.blob import ( BlobPrefix, BlobSasPermissions, @@ -54,12 +55,13 @@ def __init__( - Environment variable `""AZURE_STORAGE_CONNECTION_STRING"` containing connecting string with account credentials. 
           See [Azure Storage SDK documentation](
           https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python#copy-your-credentials-from-the-azure-portal).
-        - Account URL via `account_url`, authenticated either with an embedded SAS token, or with
-          credentials passed to `credentials`.
         - Connection string via `connection_string`, authenticated either with an embedded SAS
           token or with credentials passed to `credentials`.
+        - Account URL via `account_url`, authenticated either with an embedded SAS token, or with
+          credentials passed to `credentials`.
         - Instantiated and already authenticated [`BlobServiceClient`](
-          https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python).
+          https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python) or
+          [`DataLakeServiceClient`](https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.datalakeserviceclient).
 
         If multiple methods are used, priority order is reverse of list above (later in list takes
         priority). If no methods are used, a [`MissingCredentialsError`][cloudpathlib.exceptions.MissingCredentialsError]
@@ -102,8 +104,7 @@ def __init__(
         if connection_string is None:
             connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)
 
-        self.blob_service_client = None
-        self.data_lake_client = None
+        self.data_lake_client = None  # only needs to end up being set if HNS is enabled
 
         if blob_service_client is not None:
             self.service_client = blob_service_client
@@ -111,13 +112,27 @@ def __init__(
 
             # create from blob service client if not passed
             if data_lake_client is None:
                 self.data_lake_client = DataLakeServiceClient(
-                    account_url=f"https://{self.service_client.account_name}.dfs.core.windows.net",
-                    credential=blob_service_client.credential,
+                    account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
+                    credential=AzureNamedKeyCredential(
+                        blob_service_client.credential.account_name,
+                        blob_service_client.credential.account_key,
+                    ),
                 )
+            else:
+                self.data_lake_client = data_lake_client
 
-        if data_lake_client is not None:
+        elif data_lake_client is not None:
             self.data_lake_client = data_lake_client
 
+            if blob_service_client is None:
+                self.service_client = BlobServiceClient(
+                    account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
+                    credential=AzureNamedKeyCredential(
+                        data_lake_client.credential.account_name,
+                        data_lake_client.credential.account_key,
+                    ),
+                )
+
         elif connection_string is not None:
             self.service_client = BlobServiceClient.from_connection_string(
                 conn_str=connection_string, credential=credential
             )
@@ -126,33 +141,45 @@ def __init__(
                 conn_str=connection_string, credential=credential
             )
         elif account_url is not None:
-            self.service_client = BlobServiceClient(account_url=account_url, credential=credential)
-            self.data_lake_client = DataLakeServiceClient(
-                account_url=account_url, credential=credential
-            )
+            if ".dfs." in account_url:
+                self.service_client = BlobServiceClient(
+                    account_url=account_url.replace(".dfs.", ".blob."), credential=credential
+                )
+                self.data_lake_client = DataLakeServiceClient(
+                    account_url=account_url, credential=credential
+                )
+            elif ".blob." in account_url:
+                self.service_client = BlobServiceClient(
+                    account_url=account_url, credential=credential
+                )
+                self.data_lake_client = DataLakeServiceClient(
+                    account_url=account_url.replace(".blob.", ".dfs."), credential=credential
+                )
+            else:
+                # assume default to blob; HNS not supported
+                self.service_client = BlobServiceClient(
+                    account_url=account_url, credential=credential
+                )
+
         else:
             raise MissingCredentialsError(
                 "AzureBlobClient does not support anonymous instantiation. "
                 "Credentials are required; see docs for options."
             )
 
-        self.hns_cache: Dict[str, bool] = {}
+        self._hns_enabled = None
 
-    def _check_hns(self, cloud_path: AzureBlobPath) -> bool:
-        hns_key = self.service_client.account_name + "__" + cloud_path.container
+    def _check_hns(self) -> Optional[bool]:
+        if not self._hns_enabled:
+            account_info = self.service_client.get_account_information()  # type: ignore
+            self._hns_enabled = account_info.get("is_hns_enabled", False)  # type: ignore
 
-        if hns_key not in self.hns_cache:
-            hns_enabled: bool = self.service_client.get_account_information().get(
-                "is_hns_enabled", False
-            )  # type: ignore
-            self.hns_cache[hns_key] = hns_enabled
-
-        return self.hns_cache[hns_key]
+        return self._hns_enabled
 
     def _get_metadata(
         self, cloud_path: AzureBlobPath
     ) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]:
-        if self._check_hns(cloud_path):
+        if self._check_hns():
 
             # works on both files and directories
             fsc = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
@@ -263,7 +290,7 @@ def _list_dir(
         if prefix and not prefix.endswith("/"):
             prefix += "/"
 
-        if self._check_hns(cloud_path):
+        if self._check_hns():
             file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
 
             paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive)
@@ -300,6 +327,16 @@ def _move_file(
                 metadata=dict(last_modified=str(datetime.utcnow().timestamp()))
             )
 
+        # we can use rename API same account and container on adls gen2
+        elif remove_src and (src.client is dst.client) and self._check_hns():
+            fsc = self.data_lake_client.get_file_system_client(src.container)  # type: ignore
+
+            if src.is_dir():
+                fsc.get_directory_client(src.blob).rename_directory(f"{dst.container}/{dst.blob}")
+            else:
+                dst.parent.mkdir(parents=True, exist_ok=True)
+                fsc.get_file_client(src.blob).rename_file(f"{dst.container}/{dst.blob}")
+
         else:
             target = self.service_client.get_blob_client(container=dst.container, blob=dst.blob)
@@ -312,10 +349,31 @@ def _move_file(
 
         return dst
 
+    def _mkdir(
+        self, cloud_path: AzureBlobPath, parents: bool = False, exist_ok: bool = False
+    ) -> None:
+        if self._check_hns():
+            file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
+            directory_client = file_system_client.get_directory_client(cloud_path.blob)
+
+            if not exist_ok and directory_client.exists():
+                raise FileExistsError(f"Directory already exists: {cloud_path}")
+
+            if not parents:
+                if not self._exists(cloud_path.parent):
+                    raise FileNotFoundError(
+                        f"Parent directory does not exist ({cloud_path.parent}). To create parent directories, use `parents=True`."
+                    )
+
+            directory_client.create_directory()
+        else:
+            # consistent with mkdir's no-op behavior on other backends, where directories are not supported
+            pass
+
     def _remove(self, cloud_path: AzureBlobPath, missing_ok: bool = True) -> None:
         file_or_dir = self._is_file_or_dir(cloud_path)
         if file_or_dir == "dir":
-            if self._check_hns(cloud_path):
+            if self._check_hns():
                 _hns_rmtree(self.data_lake_client, cloud_path.container, cloud_path.blob)
 
                 return

diff --git a/cloudpathlib/azure/azblobpath.py b/cloudpathlib/azure/azblobpath.py
index e6777ab7..265cfd81 100644
--- a/cloudpathlib/azure/azblobpath.py
+++ b/cloudpathlib/azure/azblobpath.py
@@ -3,6 +3,8 @@
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING
 
+from cloudpathlib.exceptions import CloudPathIsADirectoryError
+
 try:
     from azure.core.exceptions import ResourceNotFoundError
 except ImportError:
@@ -44,8 +46,7 @@ def is_file(self) -> bool:
         return self.client._is_file_or_dir(self) == "file"
 
     def mkdir(self, parents=False, exist_ok=False):
-        # not possible to make empty directory on blob storage
-        pass
+        self.client._mkdir(self, parents=parents, exist_ok=exist_ok)
 
     def touch(self, exist_ok: bool = True):
         if self.exists():
@@ -84,6 +85,17 @@ def stat(self):
             )
         )
 
+    def replace(self, target: "AzureBlobPath") -> "AzureBlobPath":
+        try:
+            return super().replace(target)
+
+        # we can rename directories on ADLS Gen2
+        except CloudPathIsADirectoryError:
+            if self.client._check_hns():
+                return self.client._move_file(self, target)
+            else:
+                raise
+
     @property
     def container(self) -> str:
         return self._no_prefix.split("/", 1)[0]

diff --git a/tests/conftest.py b/tests/conftest.py
index 251a608b..301ffe87 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -173,6 +173,7 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir
 
     # add flag for adls gen2 rig to skip some tests
     rig.is_adls_gen2 = adls_gen2
+    rig.connection_string = os.getenv(conn_str_env_var)  # used for client instantiation tests
 
     yield rig

diff --git a/tests/mock_clients/mock_adls_gen2.py b/tests/mock_clients/mock_adls_gen2.py
index f1940e50..aefdb735 100644
--- a/tests/mock_clients/mock_adls_gen2.py
+++ b/tests/mock_clients/mock_adls_gen2.py
@@ -4,7 +4,7 @@
 from azure.core.exceptions import ResourceNotFoundError
 from azure.storage.filedatalake import FileProperties
 
-from tests.mock_clients.mock_azureblob import _JsonCache
+from tests.mock_clients.mock_azureblob import _JsonCache, DEFAULT_CONTAINER_NAME
 
 
 class MockedDataLakeServiceClient:
@@ -86,6 +86,10 @@ def get_file_properties(self):
         else:
             raise ResourceNotFoundError
 
+    def rename_file(self, new_name):
+        new_path = self.root / new_name[len(DEFAULT_CONTAINER_NAME + "/") :]
+        (self.root / self.key).rename(new_path)
+
 
 class MockedDirClient:
     def __init__(self, key, root) -> None:
@@ -94,3 +98,13 @@ def __init__(self, key, root) -> None:
 
     def delete_directory(self):
         rmtree(self.root / self.key)
+
+    def exists(self):
+        return (self.root / self.key).exists()
+
+    def create_directory(self):
+        (self.root / self.key).mkdir(parents=True, exist_ok=True)
+
+    def rename_directory(self, new_name):
+        new_path = self.root / new_name[len(DEFAULT_CONTAINER_NAME + "/") :]
+        (self.root / self.key).rename(new_path)

diff --git a/tests/test_azure_specific.py b/tests/test_azure_specific.py
index 247934bc..474525e2 100644
--- a/tests/test_azure_specific.py
+++ b/tests/test_azure_specific.py
@@ -1,11 +1,21 @@
 import os
 
-from azure.storage.blob import StorageStreamDownloader
+from azure.core.credentials import AzureNamedKeyCredential
+from azure.storage.blob import (
+    BlobServiceClient,
+    StorageStreamDownloader,
+)
+
+from azure.storage.filedatalake import DataLakeServiceClient
 import pytest
 
 from urllib.parse import urlparse, parse_qs
 
 from cloudpathlib import AzureBlobClient, AzureBlobPath
-from cloudpathlib.exceptions import MissingCredentialsError
+from cloudpathlib.exceptions import (
+    CloudPathIsADirectoryError,
+    DirectoryNotEmptyError,
+    MissingCredentialsError,
+)
 from cloudpathlib.local import LocalAzureBlobClient, LocalAzureBlobPath
 
 from .mock_clients.mock_azureblob import MockStorageStreamDownloader
@@ -79,3 +89,105 @@ def _patched(self, buffer):
 
     assert not p._local.exists()
     assert not p.client._partial_filename(p._local).exists()
+
+
+def test_client_instantiation(azure_rigs, monkeypatch):
+    # don't use creds from env vars for these tests
+    monkeypatch.delenv("AZURE_STORAGE_CONNECTION_STRING")
+
+    if not azure_rigs.live_server:
+        return
+
+    bsc = BlobServiceClient.from_connection_string(azure_rigs.connection_string)
+    dlsc = DataLakeServiceClient.from_connection_string(azure_rigs.connection_string)
+
+    def _check_access(az_client, gen2=False):
+        """Check API access by listing."""
+        assert len(list(az_client.service_client.list_containers())) > 0
+
+        if gen2:
+            assert len(list(az_client.data_lake_client.list_file_systems())) > 0
+
+    # test just BlobServiceClient passed
+    cl = azure_rigs.client_class(blob_service_client=bsc)
+    _check_access(cl, gen2=azure_rigs.is_adls_gen2)
+
+    cl = azure_rigs.client_class(data_lake_client=dlsc)
+    _check_access(cl, gen2=azure_rigs.is_adls_gen2)
+
+    cl = azure_rigs.client_class(blob_service_client=bsc, data_lake_client=dlsc)
+    _check_access(cl, gen2=azure_rigs.is_adls_gen2)
+
+    cl = azure_rigs.client_class(
+        account_url=bsc.url,
+        credential=AzureNamedKeyCredential(
+            bsc.credential.account_name, bsc.credential.account_key
+        ),
+    )
+    _check_access(cl, gen2=azure_rigs.is_adls_gen2)
+
+    cl = azure_rigs.client_class(
+        account_url=dlsc.url,
+        credential=AzureNamedKeyCredential(
+            bsc.credential.account_name, bsc.credential.account_key
+        ),
+    )
+    _check_access(cl, gen2=azure_rigs.is_adls_gen2)
+
+
+def test_adls_gen2_mkdir(azure_gen2_rig):
+    """Since directories can be created on gen2, we should test that mkdir, rmdir, rmtree, and
+    unlink all work as expected.
+    """
+    p = azure_gen2_rig.create_cloud_path("new_dir")
+
+    # mkdir
+    p.mkdir()
+    assert p.exists() and p.is_dir()
+    # rmdir on an empty directory does not throw
+    p.rmdir()
+
+    # mkdir
+    p.mkdir()
+    p.mkdir(exist_ok=True)  # ensure this does not raise
+
+    with pytest.raises(FileExistsError):
+        p.mkdir(exist_ok=False)
+
+    # touch file
+    (p / "file.txt").write_text("content")
+    # rmdir throws - not empty
+    with pytest.raises(DirectoryNotEmptyError):
+        p.rmdir()
+
+    # rmtree works
+    p.rmtree()
+    assert not p.exists()
+
+    # mkdir
+    p2 = p / "nested"
+
+    with pytest.raises(FileNotFoundError):
+        p2.mkdir()
+
+    p2.mkdir(parents=True)
+    assert p2.exists()
+
+    with pytest.raises(CloudPathIsADirectoryError):
+        p2.unlink()
+
+
+def test_adls_gen2_rename(azure_gen2_rig):
+    # rename file
+    p = azure_gen2_rig.create_cloud_path("file.txt")
+    p.write_text("content")
+    p2 = p.rename(azure_gen2_rig.create_cloud_path("file2.txt"))
+    assert not p.exists()
+    assert p2.exists()
+
+    # rename dir
+    p = azure_gen2_rig.create_cloud_path("dir")
+    p.mkdir()
+    p2 = p.rename(azure_gen2_rig.create_cloud_path("dir2"))
+    assert not p.exists()
+    assert p2.exists()

diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py
index 34c9c913..7dc5b149 100644
--- a/tests/test_cloudpath_file_io.py
+++ b/tests/test_cloudpath_file_io.py
@@ -40,8 +40,9 @@ def test_file_discovery(rig):
     with pytest.raises(CloudPathIsADirectoryError):
         p3.unlink()
 
-    with pytest.raises(CloudPathIsADirectoryError):
-        p3.rename(rig.create_cloud_path("dir_2/"))
+    if not getattr(rig, "is_adls_gen2", False):
+        with pytest.raises(CloudPathIsADirectoryError):
+            p3.rename(rig.create_cloud_path("dir_2/"))
 
     with pytest.raises(DirectoryNotEmptyError):
         p3.rmdir()
@@ -360,7 +361,8 @@ def test_file_read_writes(rig, tmp_path):
     assert datetime.fromtimestamp(p.stat().st_mtime) > before_touch
 
     # no-op
-    p.mkdir()
+    if not getattr(rig, "is_adls_gen2", False):
+        p.mkdir()
 
     assert p.etag is not None

diff --git a/tests/test_local.py b/tests/test_local.py
index a983fdd3..15f1b6f9 100644
--- a/tests/test_local.py
+++ b/tests/test_local.py
@@ -38,7 +38,14 @@ def test_interface(cloud_class, local_class):
             assert type(cloud_attr) is type(local_attr)
 
             if callable(cloud_attr):
-                assert signature(cloud_attr).parameters == signature(local_attr).parameters
+                # does not check type annotations, which can differ even when semantically the same (e.g., Self vs. AzureBlobPath)
+                assert all(
+                    a.name == b.name
+                    for a, b in zip(
+                        signature(cloud_attr).parameters.values(),
+                        signature(local_attr).parameters.values(),
+                    )
+                )
 
 
 @pytest.mark.parametrize("client_class", [LocalAzureBlobClient, LocalGSClient, LocalS3Client])

From 12f7388b7889d55f4e63cac9e5f14bda96c76a77 Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Wed, 14 Aug 2024 17:06:06 -0700
Subject: [PATCH 11/15] Code review comments

---
 cloudpathlib/azure/azblobclient.py   | 4 ++--
 docs/docs/authentication.md          | 2 +-
 tests/mock_clients/mock_azureblob.py | 6 +++++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py
index 480bc348..98189378 100644
--- a/cloudpathlib/azure/azblobclient.py
+++ b/cloudpathlib/azure/azblobclient.py
@@ -170,7 +170,7 @@ def __init__(
         self._hns_enabled = None
 
     def _check_hns(self) -> Optional[bool]:
-        if not self._hns_enabled:
+        if self._hns_enabled is None:
             account_info = self.service_client.get_account_information()  # type: ignore
             self._hns_enabled = account_info.get("is_hns_enabled", False)  # type: ignore
 
@@ -327,7 +327,7 @@ def _move_file(
                 metadata=dict(last_modified=str(datetime.utcnow().timestamp()))
             )
 
-        # we can use rename API same account and container on adls gen2
+        # we can use the rename API when src and dst are in the same account on adls gen2
         elif remove_src and (src.client is dst.client) and self._check_hns():
             fsc = self.data_lake_client.get_file_system_client(src.container)  # type: ignore
 

diff --git a/docs/docs/authentication.md b/docs/docs/authentication.md
index 0557c196..36018532 100644
--- a/docs/docs/authentication.md
+++ b/docs/docs/authentication.md
@@ -215,7 +215,7 @@ cp3 = CloudPath("s3://cloudpathlib-test-bucket/")
 
 Some Azure storage accounts are configured with "hierarchical namespace" enabled. This means that the storage account is backed by the Azure DataLake Storage Gen2 product rather than Azure Blob Storage. For many operations, the two are the same and one can use the Azure Blob Storage API. However, for some operations, a developer will need to use the Azure DataLake Storage API. The `AzureBlobClient` class implemented in cloudpathlib is designed to detect if hierarchical namespace is enabled and use the Azure DataLake Storage API in the places where it is necessary or it provides a performance improvement. Usually, a user of cloudpathlib will not need to know if hierarchical namespace is enabled and the storage account is backed by Azure DataLake Storage Gen2 or Azure Blob Storage.
 
-If needed, the Azure SDK provided `DataLakeServiceClient` object can be accessed via the `AzureBlobClient.data_lake_client`. The Azure SDK provided `BlobServiceClient` object can be accessed via `AzureBlobClient.blob_client`.
+If needed, the Azure SDK provided `DataLakeServiceClient` object can be accessed via `AzureBlobClient.data_lake_client`. The Azure SDK provided `BlobServiceClient` object can be accessed via `AzureBlobClient.service_client`.
 
 ## Pickling `CloudPath` objects

diff --git a/tests/mock_clients/mock_azureblob.py b/tests/mock_clients/mock_azureblob.py
index 1afb8a39..f99e0d4a 100644
--- a/tests/mock_clients/mock_azureblob.py
+++ b/tests/mock_clients/mock_azureblob.py
@@ -18,6 +18,10 @@
 
 
 class _JsonCache:
+    """Used to mock file metadata store on cloud storage; saves/writes to disk so
+    different clients can access the same metadata store.
+    """
+
     def __init__(self, path: Path):
         self.path = path
 
@@ -49,7 +53,7 @@ def __init__(self, test_dir, adls):
         # copy test assets for reference in tests without affecting assets
         shutil.copytree(TEST_ASSETS, test_dir, dirs_exist_ok=True)
 
-        # root is parent of the test specific directort
+        # root is parent of the test specific directory
         self.root = test_dir.parent
         self.test_dir = test_dir

From 55de1489de5d1f965a71ca522042004c15579ae4 Mon Sep 17 00:00:00 2001
From: Jay Qi <2721979+jayqi@users.noreply.github.com>
Date: Fri, 23 Aug 2024 19:55:00 -0400
Subject: [PATCH 12/15] Tweak HISTORY.md

---
 HISTORY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 603fec73..563f64bb 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -15,7 +15,7 @@
 - Changed `LocalClient` so that client instances using the default storage access the default local storage directory through the `get_default_storage_dir` rather than having an explicit reference to the path set at instantiation. This means that calling `get_default_storage_dir` will reset the local storage for all clients using the default local storage, whether the client has already been instantiated or is instantiated after resetting. This fixes unintuitive behavior where `reset_local_storage` did not reset local storage when using the default client. (Issue [#414](https://github.com/drivendataorg/cloudpathlib/issues/414))
 - Added a new `local_storage_dir` property to `LocalClient`. This will return the current local storage directory used by that client instance. by reference through the `get_default_ rather than with an explicit.
-- Add Azure Data Lake Storage Gen2 support (Issue [#161](https://github.com/drivendataorg/cloudpathlib/issues/161), PR [#450](https://github.com/drivendataorg/cloudpathlib/pull/450)), thanks to [@M0dEx](https://github.com/M0dEx) for PR [#447](https://github.com/drivendataorg/cloudpathlib/pull/447) and PR [#449](https://github.com/drivendataorg/cloudpathlib/pull/449)
+- Added Azure Data Lake Storage Gen2 support (Issue [#161](https://github.com/drivendataorg/cloudpathlib/issues/161), PR [#450](https://github.com/drivendataorg/cloudpathlib/pull/450)), thanks to [@M0dEx](https://github.com/M0dEx) for PR [#447](https://github.com/drivendataorg/cloudpathlib/pull/447) and PR [#449](https://github.com/drivendataorg/cloudpathlib/pull/449)
 
 ## v0.18.1 (2024-02-26)

From bb36a52753277a7197bbda7204602a885b8d626a Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Fri, 23 Aug 2024 17:17:32 -0700
Subject: [PATCH 13/15] TEMP: debug test code

---
 tests/test_caching.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/tests/test_caching.py b/tests/test_caching.py
index 71890c43..90370902 100644
--- a/tests/test_caching.py
+++ b/tests/test_caching.py
@@ -491,10 +491,8 @@ def test_manual_cache_clearing(rig: CloudProviderTestRig):
     # 2 files present in cache folder
     assert len(list(filter(lambda x: x.is_file(), client._local_cache_dir.rglob("*")))) == 2
 
-    # clears all files inside the folder, but containing folder still exists
-    client.clear_cache()
-
-    assert len(list(filter(lambda x: x.is_file(), client._local_cache_dir.rglob("*")))) == 0
+    # Enable debugging for garbage collection
+    gc.callbacks.append(lambda event, args: print(f"GC {event} - {args}"))
 
     # also removes containing folder on client cleanted up
     local_cache_path = cp._local
@@ -502,6 +500,19 @@ def test_manual_cache_clearing(rig: CloudProviderTestRig):
     del cp
     del client
 
+    def _debug(path):
+        import psutil
+
+        # processes with open files
+        open_files = []
+        for proc in psutil.process_iter(["pid", "name", "open_files"]):
+            for file in proc.info["open_files"] or []:
+                if path in file.path:
+                    open_files.append((proc.info["pid"], proc.info["name"], file.path))
+
+        print(f"  OPEN FILES INFO FOR {path}")
+        print(open_files)
+
     # in CI there can be a lag before the cleanup actually happens
     @retry(
         retry=retry_if_exception_type(AssertionError),
@@ -510,6 +521,9 @@ def _resilient_assert():
         reraise=True,
     )
     def _resilient_assert():
+        _debug(str(local_cache_path.resolve()))
+        _debug(str(client_cache_folder.resolve()))
+
         gc.collect()  # force gc before asserting
 
         assert not local_cache_path.exists()
@@ -517,6 +531,8 @@ def _resilient_assert():
 
     _resilient_assert()
 
+    gc.callbacks.pop()
+

From b34d964c4487873721c5e1daea0d294980fb676d Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Tue, 27 Aug 2024 21:29:19 -0700
Subject: [PATCH 14/15] don't close non-existent file

---
 cloudpathlib/cloudpath.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py
index 5cd92708..d7bf391b 100644
--- a/cloudpathlib/cloudpath.py
+++ b/cloudpathlib/cloudpath.py
@@ -251,7 +251,7 @@ def client(self):
 
     def __del__(self) -> None:
         # make sure that file handle to local path is closed
-        if self._handle is not None:
+        if self._handle is not None and self._local.exists():
             self._handle.close()
 
         # ensure file removed from cache when cloudpath object deleted

From 9853b6b1ea8086eceb5c715733af2685962768be Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Tue, 27 Aug 2024 21:42:52 -0700
Subject: [PATCH 15/15] Revert "TEMP: debug test code"

This reverts commit bb36a52753277a7197bbda7204602a885b8d626a.
---
 tests/test_caching.py | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/tests/test_caching.py b/tests/test_caching.py
index 90370902..71890c43 100644
--- a/tests/test_caching.py
+++ b/tests/test_caching.py
@@ -491,8 +491,10 @@ def test_manual_cache_clearing(rig: CloudProviderTestRig):
     # 2 files present in cache folder
     assert len(list(filter(lambda x: x.is_file(), client._local_cache_dir.rglob("*")))) == 2
 
-    # Enable debugging for garbage collection
-    gc.callbacks.append(lambda event, args: print(f"GC {event} - {args}"))
+    # clears all files inside the folder, but containing folder still exists
+    client.clear_cache()
+
+    assert len(list(filter(lambda x: x.is_file(), client._local_cache_dir.rglob("*")))) == 0
 
     # also removes containing folder on client cleanted up
     local_cache_path = cp._local
@@ -500,19 +502,6 @@ def test_manual_cache_clearing(rig: CloudProviderTestRig):
     del cp
     del client
 
-    def _debug(path):
-        import psutil
-
-        # processes with open files
-        open_files = []
-        for proc in psutil.process_iter(["pid", "name", "open_files"]):
-            for file in proc.info["open_files"] or []:
-                if path in file.path:
-                    open_files.append((proc.info["pid"], proc.info["name"], file.path))
-
-        print(f"  OPEN FILES INFO FOR {path}")
-        print(open_files)
-
     # in CI there can be a lag before the cleanup actually happens
     @retry(
         retry=retry_if_exception_type(AssertionError),
@@ -521,9 +510,6 @@ def _debug(path):
         reraise=True,
     )
     def _resilient_assert():
-        _debug(str(local_cache_path.resolve()))
-        _debug(str(client_cache_folder.resolve()))
-
         gc.collect()  # force gc before asserting
 
         assert not local_cache_path.exists()
@@ -531,8 +517,6 @@ def _resilient_assert():
 
     _resilient_assert()
 
-    gc.callbacks.pop()
-
 
 def test_reuse_cache_after_manual_cache_clear(rig: CloudProviderTestRig):
     # use client that we can delete rather than default
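
A minimal end-to-end sketch (illustrative only, not itself part of any patch above) of the behavior this series adds. It assumes an HNS-enabled (ADLS Gen2) account reachable via the `AZURE_STORAGE_GEN2_CONNECTION_STRING` variable from `.env.example`, and a placeholder container named `my-container`:

```python
import os

from azure.storage.blob import BlobServiceClient
from cloudpathlib import AzureBlobClient

# Passing only a BlobServiceClient: per PATCH 10, the client derives the
# matching DataLakeServiceClient by swapping ".blob." for ".dfs." in the
# account URL (and vice versa when only a DataLakeServiceClient is passed).
client = AzureBlobClient(
    blob_service_client=BlobServiceClient.from_connection_string(
        os.environ["AZURE_STORAGE_GEN2_CONNECTION_STRING"]
    )
)

root = client.CloudPath("az://my-container/")

# On an HNS-enabled account, mkdir creates a real (empty) directory via the
# Data Lake API; on a plain blob storage account it remains a no-op.
new_dir = root / "new_dir"
new_dir.mkdir(parents=True, exist_ok=True)

# Directory renames route through the Data Lake rename API (the replace()
# override added in azblobpath.py), so this succeeds on HNS accounts instead
# of raising CloudPathIsADirectoryError.
renamed = new_dir.rename(root / "renamed_dir")
```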
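
Whether an account has hierarchical namespace enabled can also be checked directly. This mirrors what `_check_hns()` caches internally (and what the mock's `get_account_information` stubs out), reusing the `client` from the sketch above:

```python
# Sketch: the same account-information call AzureBlobClient._check_hns() uses.
account_info = client.service_client.get_account_information()
print(account_info.get("is_hns_enabled", False))
```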