Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
johanneskoester authored Aug 19, 2024
2 parents cb14b4c + c7ba28e commit d8ee87d
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 24 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
A snakemake storage plugin for Google Cloud Storage.
For documentation and usage instructions, see the [Snakemake plugin catalog](https://snakemake.github.io/snakemake-plugin-catalog/plugins/storage/gcs.html).

## Testing
Set up a fake GCS server using [fake-gcs-server](https://github.com/fsouza/fake-gcs-server), then run the tests against it.

## License

Expand Down
55 changes: 34 additions & 21 deletions snakemake_storage_plugin_gcs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
StorageObjectGlob,
)
from snakemake_interface_storage_plugins.common import Operation
from snakemake_interface_storage_plugins.io import IOCacheStorageInterface
from snakemake_interface_storage_plugins.io import (
IOCacheStorageInterface,
get_constant_prefix,
Mtime,
)
from snakemake_interface_common.logging import get_logger

from urllib.parse import urlparse
Expand Down Expand Up @@ -286,23 +290,22 @@ async def inventory(self, cache: IOCacheStorageInterface):
- cache.mtime
- cache.size
"""
# TODO enable again later
# if self.get_inventory_parent() in cache.exists_in_storage:
# # bucket has been inventorized before, stop here
# return

# # check if bucket exists
# if not self.bucket_exists():
# cache.exists_in_storage[self.cache_key()] = False
# cache.exists_in_storage[self.get_inventory_parent()] = False
# else:
# subfolder = os.path.dirname(self.blob.name)
# for blob in self.client.list_blobs(self.bucket_name, prefix=subfolder):
# # By way of being listed, it exists. mtime is a datetime object
# key = self.cache_key(self._local_suffix_from_key(blob.name))
# cache.exists_in_storage[key] = True
# cache.mtime[key] = Mtime(remote=blob.updated.timestamp())
# cache.size[key] = blob.size
if self.get_inventory_parent() in cache.exists_in_storage:
# bucket has been inventorized before, stop here
return

# check if bucket exists
if not self.bucket.exists():
cache.exists_in_storage[self.cache_key()] = False
cache.exists_in_storage[self.get_inventory_parent()] = False
else:
subfolder = os.path.dirname(self.blob.name)
for blob in self.client.list_blobs(self.bucket_name, prefix=subfolder):
# By way of being listed, it exists. mtime is a datetime object
key = self.cache_key(self._local_suffix_from_key(blob.name))
cache.exists_in_storage[key] = True
cache.mtime[key] = Mtime(storage=blob.updated.timestamp())
cache.size[key] = blob.size
# # TODO cache "is directory" information

def get_inventory_parent(self) -> Optional[str]:
Expand Down Expand Up @@ -381,7 +384,7 @@ def store_object(self):
TODO: note from vsoch - I'm not sure I read this function name right,
but I didn't find an equivalent "upload" function so I thought this might
be it. The original function comment is below.
be it. The original function comment is below.
"""
# Ensure that the object is stored at the location specified by
# self.local_path().
Expand Down Expand Up @@ -463,8 +466,18 @@ def remove(self):
def list_candidate_matches(self) -> Iterable[str]:
    """Return candidate matches in the storage for the query.

    Used by glob_wildcards() to find matches for wildcards in the
    query; only concretized queries (no remaining wildcards) are
    yielded, each as a full ``gcs://<bucket>/<object>`` URL.

    Returns:
        A generator of ``gcs://`` URLs for blobs whose names start
        with the constant (wildcard-free) prefix of the query.

    Raises:
        WorkflowError: if the query does not start with
            ``gcs://<bucket-name>``.
    """
    prefix = get_constant_prefix(self.query)
    # The query must be rooted at this object's bucket; anything else
    # cannot be listed against self.bucket.
    scheme_and_bucket = f"gcs://{self.bucket.name}"
    if not prefix.startswith(scheme_and_bucket):
        raise WorkflowError(
            f"GCS storage object {self.query} must start with gcs://"
        )
    # Strip "gcs://<bucket>" (and any leading slash) so the remainder
    # is a plain object-name prefix suitable for list_blobs().
    prefix = prefix[len(scheme_and_bucket) :].lstrip("/")
    return (
        f"gcs://{self.bucket.name}/{item.name}"
        for item in self.bucket.list_blobs(prefix=prefix)
    )

# Helper functions and properties not part of standard interface
# TODO check parent class and determine if any of these are already implemented
Expand Down
21 changes: 19 additions & 2 deletions tests/test_fake_gcs.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# %%
import tempfile
import os

from google.auth.credentials import AnonymousCredentials
from google.cloud import storage
from google.api_core.exceptions import Conflict

# This endpoint assumes that you are using the default port 4443 from the container.
# If you are using a different port, please set the environment variable
Expand Down Expand Up @@ -36,7 +38,22 @@

# Create a new bucket, tolerating reruns: the fake-gcs-server keeps
# state between test invocations, so a second run raises Conflict.
bucket = client.bucket("snakemake-test-bucket")

try:
    client.create_bucket(bucket)
except Conflict:
    # Bucket already created by a previous run — safe to continue.
    pass

# Deterministic fixture objects consumed by the plugin test suite
# (e.g. test_list_candidate_matches expects test-file_2/3 to exist).
file_data = {
    "test-file.txt": "Hello World!",
    "test-file_2.txt": "Testing candidates",
    "test-file_3.txt": "What",
}

for file_name, contents in file_data.items():
    blob = bucket.blob(file_name)
    blob.upload_from_string(contents)

# Sanity checks against the fake server: unknown blobs don't exist,
# and the seeded blobs are listable.
assert not bucket.blob("foo").exists()
print(list(bucket.list_blobs()))
16 changes: 15 additions & 1 deletion tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@
import shutil
from typing import List, Optional, Type
import uuid

from snakemake_interface_storage_plugins.tests import TestStorageBase
from snakemake_interface_storage_plugins.storage_provider import StorageProviderBase
from snakemake_interface_storage_plugins.settings import StorageProviderSettingsBase

from snakemake_storage_plugin_gcs import StorageProvider, StorageProviderSettings
from snakemake_storage_plugin_gcs import (
StorageProvider,
StorageProviderSettings,
)

# Use local fake server as outlined here:
# https://github.com/fsouza/fake-gcs-server
Expand Down Expand Up @@ -110,3 +114,13 @@ def test_storage_nonempty_directory(self, tmp_path):
if not self.retrieve_only and stored and self.delete:
obj.remove()
shutil.rmtree(obj.local_path())

def test_list_candidate_matches(self, tmp_path):
    """Candidate listing yields exactly the seeded fixture blobs."""
    storage_obj = self._get_obj(tmp_path, "gcs://snakemake-test-bucket/")
    # NOTE(review): a preceding test appears to delete the first seeded
    # object, so only the remaining two fixtures are expected here.
    expected = [
        "gcs://snakemake-test-bucket/test-file_2.txt",
        "gcs://snakemake-test-bucket/test-file_3.txt",
    ]
    assert list(storage_obj.list_candidate_matches()) == expected

0 comments on commit d8ee87d

Please sign in to comment.