From b539f67363683d983469846b807b95fb9c653b97 Mon Sep 17 00:00:00 2001 From: tofarr Date: Mon, 27 Jan 2025 08:40:08 -0700 Subject: [PATCH] Fix S3FileStore / GoogleCloudFileStore directory list & deletion (#6449) Co-authored-by: openhands --- openhands/storage/google_cloud.py | 18 ++++- openhands/storage/s3.py | 67 ++++++++++------- tests/unit/test_storage.py | 117 +++++++++++++++++++++++++++++- 3 files changed, 174 insertions(+), 28 deletions(-) diff --git a/openhands/storage/google_cloud.py b/openhands/storage/google_cloud.py index da763363ab3d0..19465c0442cea 100644 --- a/openhands/storage/google_cloud.py +++ b/openhands/storage/google_cloud.py @@ -60,5 +60,19 @@ def list(self, path: str) -> List[str]: return list(blobs) def delete(self, path: str) -> None: - blob = self.bucket.blob(path) - blob.delete() + # Sanitize path + if not path or path == '/': + path = '' + if path.endswith('/'): + path = path[:-1] + + # Try to delete any child resources (Assume the path is a directory) + for blob in self.bucket.list_blobs(prefix=f'{path}/'): + blob.delete() + + # Next try to delete item as a file + try: + blob = self.bucket.blob(path) + blob.delete() + except NotFound: + pass diff --git a/openhands/storage/s3.py b/openhands/storage/s3.py index 86be398c5172e..429e660011e42 100644 --- a/openhands/storage/s3.py +++ b/openhands/storage/s3.py @@ -66,36 +66,53 @@ def read(self, path: str) -> str: ) def list(self, path: str) -> list[str]: - if path and path != '/' and not path.endswith('/'): + if not path or path == '/': + path = '' + elif not path.endswith('/'): path += '/' - try: - response = self.client.list_objects_v2(Bucket=self.bucket, Prefix=path) - # Check if 'Contents' exists in the response - if 'Contents' in response: - objects = [obj['Key'] for obj in response['Contents']] - return objects - else: - return list() - except botocore.exceptions.ClientError as e: - # Catch all S3-related errors - if e.response['Error']['Code'] == 'NoSuchBucket': - raise FileNotFoundError( - f"Error: The bucket '{self.bucket}' does not exist." - ) - elif e.response['Error']['Code'] == 'AccessDenied': - raise FileNotFoundError( - f"Error: Access denied to bucket '{self.bucket}'." - ) - else: - raise FileNotFoundError(f"Error: {e.response['Error']['Message']}") - except Exception as e: - raise FileNotFoundError( - f"Error: Failed to read from bucket '{self.bucket}' at path {path}: {e}" - ) + # The delimiter logic screens out directories, so we can't use it. :( + # For example, given a structure: + # foo/bar/zap.txt + # foo/bar/bang.txt + # ping.txt + # prefix=None, delimiter="/" yields ["ping.txt"] # :( + # prefix="foo", delimiter="/" yields [] # :( + results = set() + prefix_len = len(path) + response = self.client.list_objects_v2(Bucket=self.bucket, Prefix=path) + contents = response.get('Contents') + if not contents: + return [] + paths = [obj['Key'] for obj in response['Contents']] + for sub_path in paths: + if sub_path == path: + continue + try: + index = sub_path.index('/', prefix_len + 1) + if index != prefix_len: + results.add(sub_path[: index + 1]) + except ValueError: + results.add(sub_path) + return list(results) def delete(self, path: str) -> None: try: + # Sanitize path + if not path or path == '/': + path = '' + if path.endswith('/'): + path = path[:-1] + + # Try to delete any child resources (Assume the path is a directory) + response = self.client.list_objects_v2( + Bucket=self.bucket, Prefix=f'{path}/' + ) + for content in response.get('Contents') or []: + self.client.delete_object(Bucket=self.bucket, Key=content['Key']) + + # Next try to delete item as a file self.client.delete_object(Bucket=self.bucket, Key=path) + except botocore.exceptions.ClientError as e: if e.response['Error']['Code'] == 'NoSuchBucket': raise FileNotFoundError( diff --git a/tests/unit/test_storage.py b/tests/unit/test_storage.py index 46915c181492e..37b52f0a924ab 100644 --- a/tests/unit/test_storage.py +++ b/tests/unit/test_storage.py @@ -4,15 +4,19 @@ import shutil from abc import ABC from dataclasses import dataclass, field -from io import StringIO +from io import BytesIO, StringIO from typing import Dict, List, Optional from unittest import TestCase from unittest.mock import patch +import botocore.exceptions +from google.api_core.exceptions import NotFound + from openhands.storage.files import FileStore from openhands.storage.google_cloud import GoogleCloudFileStore from openhands.storage.local import LocalFileStore from openhands.storage.memory import InMemoryFileStore +from openhands.storage.s3 import S3FileStore class _StorageTest(ABC): @@ -73,6 +77,35 @@ def test_deep_list(self): store.delete('foo/bar/qux.txt') store.delete('foo/bar/quux.txt') + def test_directory_deletion(self): + store = self.get_store() + # Create a directory structure + store.write('foo/bar/baz.txt', 'Hello, world!') + store.write('foo/bar/qux.txt', 'Hello, world!') + store.write('foo/other.txt', 'Hello, world!') + store.write('foo/bar/subdir/file.txt', 'Hello, world!') + + # Verify initial structure + self.assertEqual(store.list(''), ['foo/']) + self.assertEqual(sorted(store.list('foo')), ['foo/bar/', 'foo/other.txt']) + self.assertEqual( + sorted(store.list('foo/bar')), + ['foo/bar/baz.txt', 'foo/bar/qux.txt', 'foo/bar/subdir/'], + ) + + # Delete a directory + store.delete('foo/bar') + + # Verify directory and its contents are gone, but other files remain + self.assertEqual(store.list(''), ['foo/']) + self.assertEqual(store.list('foo'), ['foo/other.txt']) + + # Delete root directory + store.delete('foo') + + # Verify everything is gone + self.assertEqual(store.list(''), []) + class TestLocalFileStore(TestCase, _StorageTest): def setUp(self): @@ -94,6 +127,12 @@ def setUp(self): self.store = GoogleCloudFileStore('dear-liza') +class TestS3FileStore(TestCase, _StorageTest): + def setUp(self): + with patch('boto3.client', lambda service, **kwargs: _MockS3Client()): + self.store = S3FileStore('dear-liza') + + # I would have liked to use cloud-storage-mocker here but the python versions were incompatible :( # If we write tests for the S3 storage class I would definitely recommend we use moto. class _MockGoogleCloudClient: @@ -131,6 +170,8 @@ def open(self, op: str): return _MockGoogleCloudBlobWriter(self) def delete(self): + if self.name not in self.bucket.blobs_by_path: + raise NotFound('Blob not found') del self.bucket.blobs_by_path[self.name] @@ -152,3 +193,77 @@ def __exit__(self, exc_type, exc_val, exc_tb): blob = self.blob blob.content = self.content blob.bucket.blobs_by_path[blob.name] = blob + + +class _MockS3Client: + def __init__(self): + self.objects_by_bucket: Dict[str, Dict[str, _MockS3Object]] = {} + + def put_object(self, Bucket: str, Key: str, Body: str | bytes) -> None: + if Bucket not in self.objects_by_bucket: + self.objects_by_bucket[Bucket] = {} + self.objects_by_bucket[Bucket][Key] = _MockS3Object(Key, Body) + + def get_object(self, Bucket: str, Key: str) -> Dict: + if Bucket not in self.objects_by_bucket: + raise botocore.exceptions.ClientError( + { + 'Error': { + 'Code': 'NoSuchBucket', + 'Message': f"The bucket '{Bucket}' does not exist", + } + }, + 'GetObject', + ) + if Key not in self.objects_by_bucket[Bucket]: + raise botocore.exceptions.ClientError( + { + 'Error': { + 'Code': 'NoSuchKey', + 'Message': f"The specified key '{Key}' does not exist", + } + }, + 'GetObject', + ) + content = self.objects_by_bucket[Bucket][Key].content + if isinstance(content, bytes): + return {'Body': BytesIO(content)} + return {'Body': StringIO(content)} + + def list_objects_v2(self, Bucket: str, Prefix: str = '') -> Dict: + if Bucket not in self.objects_by_bucket: + raise botocore.exceptions.ClientError( + { + 'Error': { + 'Code': 'NoSuchBucket', + 'Message': f"The bucket '{Bucket}' does not exist", + } + }, + 'ListObjectsV2', + ) + objects = self.objects_by_bucket[Bucket] + contents = [ + {'Key': key} + for key in objects.keys() + if not Prefix or key.startswith(Prefix) + ] + return {'Contents': contents} if contents else {} + + def delete_object(self, Bucket: str, Key: str) -> None: + if Bucket not in self.objects_by_bucket: + raise botocore.exceptions.ClientError( + { + 'Error': { + 'Code': 'NoSuchBucket', + 'Message': f"The bucket '{Bucket}' does not exist", + } + }, + 'DeleteObject', + ) + self.objects_by_bucket[Bucket].pop(Key, None) + + +@dataclass +class _MockS3Object: + key: str + content: str | bytes