Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
johanneskoester authored Aug 19, 2024
2 parents cb14b4c + c7ba28e commit d8ee87d
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 24 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
A snakemake storage plugin for Google Cloud Storage.
For documentation and usage instructions, see the [Snakemake plugin catalog](https://snakemake.github.io/snakemake-plugin-catalog/plugins/storage/gcs.html).

## Testing
Set up a fake GCS server using [fake-gcs-server](https://github.com/fsouza/fake-gcs-server), then run the tests against it.

## License

Expand Down
55 changes: 34 additions & 21 deletions snakemake_storage_plugin_gcs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
StorageObjectGlob,
)
from snakemake_interface_storage_plugins.common import Operation
from snakemake_interface_storage_plugins.io import IOCacheStorageInterface
from snakemake_interface_storage_plugins.io import (
IOCacheStorageInterface,
get_constant_prefix,
Mtime,
)
from snakemake_interface_common.logging import get_logger

from urllib.parse import urlparse
Expand Down Expand Up @@ -286,23 +290,22 @@ async def inventory(self, cache: IOCacheStorageInterface):
- cache.mtime
- cache.size
"""
# TODO enable again later
# if self.get_inventory_parent() in cache.exists_in_storage:
# # bucket has been inventorized before, stop here
# return

# # check if bucket exists
# if not self.bucket_exists():
# cache.exists_in_storage[self.cache_key()] = False
# cache.exists_in_storage[self.get_inventory_parent()] = False
# else:
# subfolder = os.path.dirname(self.blob.name)
# for blob in self.client.list_blobs(self.bucket_name, prefix=subfolder):
# # By way of being listed, it exists. mtime is a datetime object
# key = self.cache_key(self._local_suffix_from_key(blob.name))
# cache.exists_in_storage[key] = True
# cache.mtime[key] = Mtime(remote=blob.updated.timestamp())
# cache.size[key] = blob.size
if self.get_inventory_parent() in cache.exists_in_storage:
# bucket has been inventorized before, stop here
return

# check if bucket exists
if not self.bucket.exists():
cache.exists_in_storage[self.cache_key()] = False
cache.exists_in_storage[self.get_inventory_parent()] = False
else:
subfolder = os.path.dirname(self.blob.name)
for blob in self.client.list_blobs(self.bucket_name, prefix=subfolder):
# By way of being listed, it exists. mtime is a datetime object
key = self.cache_key(self._local_suffix_from_key(blob.name))
cache.exists_in_storage[key] = True
cache.mtime[key] = Mtime(storage=blob.updated.timestamp())
cache.size[key] = blob.size
# # TODO cache "is directory" information

def get_inventory_parent(self) -> Optional[str]:
Expand Down Expand Up @@ -381,7 +384,7 @@ def store_object(self):
TODO: note from vsoch - I'm not sure I read this function name right,
but I didn't find an equivalent "upload" function so I thought this might
be it. The original function comment is below.
be it. The original function comment is below.
"""
# Ensure that the object is stored at the location specified by
# self.local_path().
Expand Down Expand Up @@ -463,8 +466,18 @@ def remove(self):
def list_candidate_matches(self) -> Iterable[str]:
    """Return candidate matches in the storage for the query.

    Used by glob_wildcards() to find matches for wildcards in the
    query; only concretized queries (no remaining wildcards) are
    yielded, each as a full ``gcs://<bucket>/<object>`` URL.

    Returns:
        A generator of ``gcs://`` URLs for blobs whose names start
        with the constant (wildcard-free) prefix of the query.

    Raises:
        WorkflowError: if the query does not start with
            ``gcs://<bucket-name>``.
    """
    prefix = get_constant_prefix(self.query)
    # The query must be rooted at this object's bucket; anything else
    # cannot be listed against self.bucket.
    scheme_and_bucket = f"gcs://{self.bucket.name}"
    if not prefix.startswith(scheme_and_bucket):
        raise WorkflowError(
            f"GCS storage object {self.query} must start with gcs://"
        )
    # Strip "gcs://<bucket>" (and any leading slash) so the remainder
    # is a plain object-name prefix suitable for list_blobs().
    prefix = prefix[len(scheme_and_bucket) :].lstrip("/")
    return (
        f"gcs://{self.bucket.name}/{item.name}"
        for item in self.bucket.list_blobs(prefix=prefix)
    )

# Helper functions and properties not part of standard interface
# TODO check parent class and determine if any of these are already implemented
Expand Down
21 changes: 19 additions & 2 deletions tests/test_fake_gcs.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# %%
import tempfile
import os

from google.auth.credentials import AnonymousCredentials
from google.cloud import storage
from google.api_core.exceptions import Conflict

# This endpoint assumes that you are using the default port 4443 from the container.
# If you are using a different port, please set the environment variable
Expand Down Expand Up @@ -36,7 +38,22 @@

# Create a new bucket, tolerating reruns: the fake-gcs-server keeps
# state between test invocations, so a second run raises Conflict.
bucket = client.bucket("snakemake-test-bucket")

try:
    client.create_bucket(bucket)
except Conflict:
    # Bucket already created by a previous run — safe to continue.
    pass

# Deterministic fixture objects consumed by the plugin test suite
# (e.g. test_list_candidate_matches expects test-file_2/3 to exist).
file_data = {
    "test-file.txt": "Hello World!",
    "test-file_2.txt": "Testing candidates",
    "test-file_3.txt": "What",
}

for file_name, contents in file_data.items():
    blob = bucket.blob(file_name)
    blob.upload_from_string(contents)

# Sanity checks against the fake server: unknown blobs don't exist,
# and the seeded blobs are listable.
assert not bucket.blob("foo").exists()
print(list(bucket.list_blobs()))
16 changes: 15 additions & 1 deletion tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@
import shutil
from typing import List, Optional, Type
import uuid

from snakemake_interface_storage_plugins.tests import TestStorageBase
from snakemake_interface_storage_plugins.storage_provider import StorageProviderBase
from snakemake_interface_storage_plugins.settings import StorageProviderSettingsBase

from snakemake_storage_plugin_gcs import StorageProvider, StorageProviderSettings
from snakemake_storage_plugin_gcs import (
StorageProvider,
StorageProviderSettings,
)

# Use local fake server as outlined here:
# https://github.com/fsouza/fake-gcs-server
Expand Down Expand Up @@ -110,3 +114,13 @@ def test_storage_nonempty_directory(self, tmp_path):
if not self.retrieve_only and stored and self.delete:
obj.remove()
shutil.rmtree(obj.local_path())

def test_list_candidate_matches(self, tmp_path):
    """Candidate listing yields exactly the seeded fixture blobs."""
    storage_obj = self._get_obj(tmp_path, "gcs://snakemake-test-bucket/")
    # NOTE(review): a preceding test appears to delete the first seeded
    # object, so only the remaining two fixtures are expected here.
    expected = [
        "gcs://snakemake-test-bucket/test-file_2.txt",
        "gcs://snakemake-test-bucket/test-file_3.txt",
    ]
    assert list(storage_obj.list_candidate_matches()) == expected

0 comments on commit d8ee87d

Please sign in to comment.