From 5bcb0a41f25b260d9dec6de7d15f4c4804d14d99 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 28 Mar 2025 20:22:07 -0500 Subject: [PATCH 01/65] Initial DB framework, ls works (no details) --- pins/__init__.py | 1 + pins/boards.py | 2 ++ pins/constructors.py | 42 +++++++++++++++++++++++++++++++++++++ pins/databricks/__init__.py | 0 pins/databricks/fs.py | 13 ++++++++++++ 5 files changed, 58 insertions(+) create mode 100644 pins/databricks/__init__.py create mode 100644 pins/databricks/fs.py diff --git a/pins/__init__.py b/pins/__init__.py index 8bfc4e3d..5bf14ef9 100644 --- a/pins/__init__.py +++ b/pins/__init__.py @@ -22,6 +22,7 @@ board_azure, board_s3, board_gcs, + board_databricks, board, ) from .boards import board_deparse diff --git a/pins/boards.py b/pins/boards.py index 91bfb585..eab3aca2 100644 --- a/pins/boards.py +++ b/pins/boards.py @@ -868,6 +868,8 @@ def board_deparse(board: BaseBoard): return f"board_gcs({repr(board.board)}{allow_pickle})" elif prot == "http": return f"board_url({repr(board.board)}, {board.pin_paths}{allow_pickle})" + elif set(prot) == "dbutils": + return f"board_databricks({repr(board.board)}{allow_pickle})" else: raise NotImplementedError( f"board deparsing currently not supported for protocol: {prot}" diff --git a/pins/constructors.py b/pins/constructors.py index 6cb5d117..0c40f798 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -87,6 +87,11 @@ def board( fs = RsConnectFs(**storage_options) + elif protocol == "dbc" : + from pins.databricks.fs import DatabricksFs + + fs = DatabricksFs(**storage_options) + else: fs = fsspec.filesystem(protocol, **storage_options) @@ -109,6 +114,8 @@ def board( same_names=True, mapper=PinsRscCacheMapper, ) + elif protocol == "dbc": + None else: # ensures each subdir path is its own cache directory board_cache = prefix_cache(fs, path) @@ -130,6 +137,8 @@ def board( board = board_factory(path, fs, versioned, **pickle_kwargs) elif protocol == "rsc": board = BoardRsConnect(path, fs, versioned, **pickle_kwargs) + elif protocol == "rsc": + board = DatabricksFs(path, fs, versioned, **pickle_kwargs) else: board = BaseBoard(path, fs, versioned, **pickle_kwargs) return board @@ -569,3 +578,36 @@ def board_azure(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): opts = {"use_listings_cache": False} return board("abfs", path, versioned, cache, allow_pickle_read, storage_options=opts) + +def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): + """Create a board to read and write pins from an Databricks Volume folder. + + Parameters + ---------- + path: + Path of form `//`. + versioned: + Whether or not pins should be versioned. + cache: + Whether to use a cache. By default, pins attempts to select the right cache + directory, given your filesystem. If `None` is passed, then no cache will be + used. You can set the cache using the `PINS_CACHE_DIR` environment variable. + allow_pickle_read: optional, bool + Whether to allow reading pins that use the pickle protocol. Pickles are unsafe, + and can execute arbitrary code. Only allow reading pickles if you trust the + board to execute Python code on your computer. + + You can enable reading pickles by setting this to `True`, or by setting the + environment variable `PINS_ALLOW_PICKLE_READ`. If both are set, this argument + takes precedence. + + Notes + ----- + The Databricks board uses the fsspec library (dbutils) to handle interacting with + Databricks Volumes. 
Currently, its default mode of authentication is supported. + + See + """ + + opts = {"use_listings_cache": False} + return board("dbc", path, versioned, cache, allow_pickle_read, storage_options=opts) \ No newline at end of file diff --git a/pins/databricks/__init__.py b/pins/databricks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py new file mode 100644 index 00000000..80800fbe --- /dev/null +++ b/pins/databricks/fs.py @@ -0,0 +1,13 @@ +from databricks.sdk import WorkspaceClient +from fsspec import AbstractFileSystem +from typing import ClassVar + +class DatabricksFs(AbstractFileSystem): + protocol: ClassVar[str | tuple[str, ...]] = "dbc" + + def ls(self, path, details=False, **kwargs): + w = WorkspaceClient() + all_items = [] + for item in w.files.list_directory_contents("/Volumes/workshops/my-board/my-volume"): + all_items.append(item.name) + return all_items \ No newline at end of file From 3e8768cc3ff3ecebce6014eed63cc4a867002f89 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 28 Mar 2025 20:44:03 -0500 Subject: [PATCH 02/65] Switches to using folder_url instead of path --- pins/constructors.py | 12 +++++++----- pins/databricks/fs.py | 7 +++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index 0c40f798..e7fa2cfe 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -579,13 +579,15 @@ def board_azure(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): opts = {"use_listings_cache": False} return board("abfs", path, versioned, cache, allow_pickle_read, storage_options=opts) -def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): +def board_databricks(folder_url, versioned=True, cache=DEFAULT, allow_pickle_read=None): """Create a board to read and write pins from an Databricks Volume folder. Parameters ---------- - path: - Path of form `//`. + folder_url: + The path to the target folder inside Unity Catalog. The path must include the + catalog, schema, and volume names, preceded by 'Volumes/', like + "/Volumes/my-catalog/my-schema/my-volume". versioned: Whether or not pins should be versioned. 
cache: @@ -609,5 +611,5 @@ def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None See """ - opts = {"use_listings_cache": False} - return board("dbc", path, versioned, cache, allow_pickle_read, storage_options=opts) \ No newline at end of file + kwargs = dict(folder_url=folder_url) + return board("dbc", None, versioned, cache, allow_pickle_read, storage_options=kwargs) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 80800fbe..10fefa05 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -5,9 +5,12 @@ class DatabricksFs(AbstractFileSystem): protocol: ClassVar[str | tuple[str, ...]] = "dbc" + def __init__(self, folder_url, **kwargs): + self.folder_url = folder_url + def ls(self, path, details=False, **kwargs): w = WorkspaceClient() all_items = [] - for item in w.files.list_directory_contents("/Volumes/workshops/my-board/my-volume"): + for item in w.files.list_directory_contents(self.folder_url): all_items.append(item.name) - return all_items \ No newline at end of file + return all_items From e82efd4c2d83bebc412497555035da2aa12ab830 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 28 Mar 2025 22:00:36 -0500 Subject: [PATCH 03/65] pin_exists() works now --- pins/databricks/fs.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 10fefa05..e7f9600e 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -1,3 +1,4 @@ +import os from databricks.sdk import WorkspaceClient from fsspec import AbstractFileSystem from typing import ClassVar @@ -9,8 +10,17 @@ def __init__(self, folder_url, **kwargs): self.folder_url = folder_url def ls(self, path, details=False, **kwargs): + return self._list_folders(self.folder_url) + + def exists(self, path: str, **kwargs): + path = os.path.basename(path) + return path in self._list_folders(self.folder_url) + + def _list_folders(self, path): w = WorkspaceClient() - all_items = [] - for item in w.files.list_directory_contents(self.folder_url): - all_items.append(item.name) - return all_items + dir_contents = list(w.files.list_directory_contents(path)) + all_folders = [] + for item in dir_contents: + if(item.is_directory): + all_folders.append(item.name) + return all_folders From 38238ebb6b90aeb432e4d4788614ad5804f1c8c6 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Sat, 29 Mar 2025 12:53:58 -0500 Subject: [PATCH 04/65] pin_versions() works --- pins/constructors.py | 2 +- pins/databricks/fs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index e7fa2cfe..b4f22e8a 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -612,4 +612,4 @@ def board_databricks(folder_url, versioned=True, cache=DEFAULT, allow_pickle_rea """ kwargs = dict(folder_url=folder_url) - return board("dbc", None, versioned, cache, allow_pickle_read, storage_options=kwargs) + return board("dbc", folder_url, versioned, cache, allow_pickle_read, storage_options=kwargs) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index e7f9600e..8e6c7ec2 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -10,7 +10,7 @@ def __init__(self, folder_url, **kwargs): self.folder_url = folder_url def ls(self, path, details=False, **kwargs): - return self._list_folders(self.folder_url) + return self._list_folders(path) def exists(self, path: str, **kwargs): path = 
os.path.basename(path) From e6e950e96ad1bb3294effd728e7905ac7d50f523 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Sun, 30 Mar 2025 20:13:43 -0500 Subject: [PATCH 05/65] pin_open() works --- pins/databricks/fs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 8e6c7ec2..44fff9b3 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -2,12 +2,15 @@ from databricks.sdk import WorkspaceClient from fsspec import AbstractFileSystem from typing import ClassVar +from io import BytesIO +import shutil class DatabricksFs(AbstractFileSystem): protocol: ClassVar[str | tuple[str, ...]] = "dbc" def __init__(self, folder_url, **kwargs): self.folder_url = folder_url + self.workspace = w = WorkspaceClient() def ls(self, path, details=False, **kwargs): return self._list_folders(path) @@ -16,9 +19,15 @@ def exists(self, path: str, **kwargs): path = os.path.basename(path) return path in self._list_folders(self.folder_url) + def open(self, path: str, mode: str = "rb", *args, **kwargs): + resp = self.workspace.files.download(path) + f = BytesIO() + shutil.copyfileobj(resp.contents, f) + f.seek(0) + return f + def _list_folders(self, path): - w = WorkspaceClient() - dir_contents = list(w.files.list_directory_contents(path)) + dir_contents = list(self.workspace.files.list_directory_contents(path)) all_folders = [] for item in dir_contents: if(item.is_directory): From 477e6d0e4d6940613a55377487f7a1b4d84bee39 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 31 Mar 2025 08:21:20 -0500 Subject: [PATCH 06/65] Makes _list_folders into _list_items --- pins/databricks/fs.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 44fff9b3..ff19cc00 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -13,11 +13,11 @@ def __init__(self, folder_url, **kwargs): self.workspace = w = WorkspaceClient() def ls(self, path, details=False, **kwargs): - return self._list_folders(path) + return self._list_items(path) def exists(self, path: str, **kwargs): path = os.path.basename(path) - return path in self._list_folders(self.folder_url) + return path in self._list_items(self.folder_url) def open(self, path: str, mode: str = "rb", *args, **kwargs): resp = self.workspace.files.download(path) @@ -26,10 +26,9 @@ def open(self, path: str, mode: str = "rb", *args, **kwargs): f.seek(0) return f - def _list_folders(self, path): + def _list_items(self, path): dir_contents = list(self.workspace.files.list_directory_contents(path)) - all_folders = [] + all_items = [] for item in dir_contents: - if(item.is_directory): - all_folders.append(item.name) - return all_folders + all_items.append(item.name) + return all_items From 80570d9757668b6b8ab3cccd09f5f138c3cb44b7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 31 Mar 2025 10:35:40 -0500 Subject: [PATCH 07/65] Adds mkdir & put, gets pin_write() working --- pins/databricks/fs.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index ff19cc00..d2b6febb 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -10,7 +10,7 @@ class DatabricksFs(AbstractFileSystem): def __init__(self, folder_url, **kwargs): self.folder_url = folder_url - self.workspace = w = WorkspaceClient() 
+ self.workspace = WorkspaceClient() def ls(self, path, details=False, **kwargs): return self._list_items(path) @@ -26,9 +26,29 @@ def open(self, path: str, mode: str = "rb", *args, **kwargs): f.seek(0) return f + def mkdir(self, path, create_parents=True, **kwargs): + if not create_parents: + raise NotImplementedError + self.workspace.files.create_directory(path) + + def put( + self, + lpath, + rpath, + recursive=True, + maxdepth=None, + **kwargs, + ): + for item in os.listdir(lpath): + abs_item = os.path.join(lpath, item) + if(os.path.isfile(abs_item)): + dest = os.path.join(rpath, item) + file = open(abs_item, "rb") + self.workspace.files.upload(dest, BytesIO(file.read()), overwrite=True) + def _list_items(self, path): dir_contents = list(self.workspace.files.list_directory_contents(path)) all_items = [] for item in dir_contents: all_items.append(item.name) - return all_items + return all_items \ No newline at end of file From e3e9f5406d83c26b48edde2cc4e09addd39b9b99 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 31 Mar 2025 10:42:16 -0500 Subject: [PATCH 08/65] Formatting improvements --- pins/constructors.py | 2 +- pins/databricks/fs.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index b4f22e8a..1bbc0626 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -137,7 +137,7 @@ def board( board = board_factory(path, fs, versioned, **pickle_kwargs) elif protocol == "rsc": board = BoardRsConnect(path, fs, versioned, **pickle_kwargs) - elif protocol == "rsc": + elif protocol == "dbc": board = DatabricksFs(path, fs, versioned, **pickle_kwargs) else: board = BaseBoard(path, fs, versioned, **pickle_kwargs) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index d2b6febb..13955991 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -1,9 +1,10 @@ import os -from databricks.sdk import WorkspaceClient -from fsspec import AbstractFileSystem -from typing import ClassVar -from io import BytesIO import shutil +from io import BytesIO +from typing import ClassVar +from fsspec import AbstractFileSystem +from databricks.sdk import WorkspaceClient + class DatabricksFs(AbstractFileSystem): protocol: ClassVar[str | tuple[str, ...]] = "dbc" @@ -20,7 +21,7 @@ def exists(self, path: str, **kwargs): return path in self._list_items(self.folder_url) def open(self, path: str, mode: str = "rb", *args, **kwargs): - resp = self.workspace.files.download(path) + resp = self.workspace.files.download(path) f = BytesIO() shutil.copyfileobj(resp.contents, f) f.seek(0) @@ -29,7 +30,7 @@ def open(self, path: str, mode: str = "rb", *args, **kwargs): def mkdir(self, path, create_parents=True, **kwargs): if not create_parents: raise NotImplementedError - self.workspace.files.create_directory(path) + self.workspace.files.create_directory(path) def put( self, @@ -41,7 +42,7 @@ def put( ): for item in os.listdir(lpath): abs_item = os.path.join(lpath, item) - if(os.path.isfile(abs_item)): + if os.path.isfile(abs_item): dest = os.path.join(rpath, item) file = open(abs_item, "rb") self.workspace.files.upload(dest, BytesIO(file.read()), overwrite=True) @@ -50,5 +51,5 @@ def _list_items(self, path): dir_contents = list(self.workspace.files.list_directory_contents(path)) all_items = [] for item in dir_contents: - all_items.append(item.name) - return all_items \ No newline at end of file + all_items.append(item.name) + return all_items From 
4a0dac5dbd5170f180c7c5a16de9521baacae560 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Mon, 31 Mar 2025 14:58:39 -0500
Subject: [PATCH 09/65] Removes conditional to use a custom board object

---
 pins/constructors.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pins/constructors.py b/pins/constructors.py
index 1bbc0626..c2d4f48c 100644
--- a/pins/constructors.py
+++ b/pins/constructors.py
@@ -136,9 +136,7 @@ def board(
     if board_factory is not None:
         board = board_factory(path, fs, versioned, **pickle_kwargs)
     elif protocol == "rsc":
-        board = BoardRsConnect(path, fs, versioned, **pickle_kwargs)
-    elif protocol == "dbc":
-        board = DatabricksFs(path, fs, versioned, **pickle_kwargs)
+        board = BoardRsConnect(path, fs, versioned, **pickle_kwargs)
     else:
         board = BaseBoard(path, fs, versioned, **pickle_kwargs)
     return board

From 0982eb47f1906a8e6b967198ee9db66975b38a63 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Mon, 31 Mar 2025 14:59:18 -0500
Subject: [PATCH 10/65] Improvements to file reading, and adds initial rm

---
 pins/databricks/fs.py | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py
index 13955991..f5a56c78 100644
--- a/pins/databricks/fs.py
+++ b/pins/databricks/fs.py
@@ -14,11 +14,10 @@ def __init__(self, folder_url, **kwargs):
         self.workspace = WorkspaceClient()
 
     def ls(self, path, details=False, **kwargs):
-        return self._list_items(path)
+        return self._list_dir(path, "name")
 
     def exists(self, path: str, **kwargs):
-        path = os.path.basename(path)
-        return path in self._list_items(self.folder_url)
+        return path in self._list_dir(path, "name")
 
     def open(self, path: str, mode: str = "rb", *args, **kwargs):
         resp = self.workspace.files.download(path)
@@ -47,9 +46,33 @@ def put(
             file = open(abs_item, "rb")
             self.workspace.files.upload(dest, BytesIO(file.read()), overwrite=True)
 
-    def _list_items(self, path):
+    def rm(self, path, recursive=True, maxdepth=None) -> None:
+        lev1 = self._list_dir(path)
+        for item1 in lev1:
+            if(item1.get("is_directory")):
+                lev2 = self._list_dir(item1.get("path"), "path")
+                for item2 in lev2:
+                    self.workspace.files.delete(item2)
+                self.workspace.files.delete_directory(item1.get("path"))
+            else:
+                self.workspace.files.delete(item1.get("path"))
+        self.workspace.files.delete_directory(path)
+
+    def _map_details(self, item):
+        details = {
+            "path" : item.path,
+            "name" : item.name,
+            "is_directory" : item.is_directory
+            }
+        return details
+
+    def _list_dir(self, path, field = 'all'):
         dir_contents = list(self.workspace.files.list_directory_contents(path))
-        all_items = []
-        for item in dir_contents:
-            all_items.append(item.name)
-        return all_items
+        details = list(map(self._map_details, dir_contents))
+        if(field != 'all'):
+            items = []
+            for item in details:
+                items.append(item.get(field))
+        else:
+            items = details
+        return items

From cd8e03272ec6cf3cce250f02a9f01508f7a874c6 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Mon, 31 Mar 2025 15:04:30 -0500
Subject: [PATCH 11/65] pin_delete() works, new 'exists' approach

---
 pins/databricks/fs.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py
index f5a56c78..65a15cf1 100644
--- a/pins/databricks/fs.py
+++ b/pins/databricks/fs.py
@@ -17,7 +17,19 @@ def ls(self, path, 
details=False, **kwargs): return self._list_dir(path, "name") def exists(self, path: str, **kwargs): - return path in self._list_dir(path, "name") + file_exists = True + try: + self.workspace.files.get_metadata(path) + except: + file_exists = False + + dir_exists = True + try: + self.workspace.files.get_directory_metadata(path) + except: + dir_exists = False + + return file_exists | dir_exists def open(self, path: str, mode: str = "rb", *args, **kwargs): resp = self.workspace.files.download(path) @@ -49,7 +61,7 @@ def put( def rm(self, path, recursive=True, maxdepth=None) -> None: lev1 = self._list_dir(path) for item1 in lev1: - if(item1.get("is_directory")): + if item1.get("is_directory"): lev2 = self._list_dir(item1.get("path"), "path") for item2 in lev2: self.workspace.files.delete(item2) @@ -57,19 +69,19 @@ def rm(self, path, recursive=True, maxdepth=None) -> None: else: self.workspace.files.delete(item1.get("path")) self.workspace.files.delete_directory(path) - + def _map_details(self, item): details = { - "path" : item.path, - "name" : item.name, - "is_directory" : item.is_directory - } + "path": item.path, + "name": item.name, + "is_directory": item.is_directory, + } return details - def _list_dir(self, path, field = 'all'): + def _list_dir(self, path, field="all"): dir_contents = list(self.workspace.files.list_directory_contents(path)) details = list(map(self._map_details, dir_contents)) - if(field != 'all'): + if field != "all": items = [] for item in details: items.append(item.get(field)) From 7092718a9e56620c4b979cee61fc2eba497da19b Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Tue, 1 Apr 2025 12:08:42 -0500 Subject: [PATCH 12/65] Adds some test pieces --- pins/databricks/fs.py | 1 - pins/tests/helpers.py | 46 +++++++++++++++++++++++++++++++-- pins/tests/test_constructors.py | 2 ++ pyproject.toml | 2 ++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 65a15cf1..8a1e4eb5 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -10,7 +10,6 @@ class DatabricksFs(AbstractFileSystem): protocol: ClassVar[str | tuple[str, ...]] = "dbc" def __init__(self, folder_url, **kwargs): - self.folder_url = folder_url self.workspace = WorkspaceClient() def ls(self, path, details=False, **kwargs): diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index bfe6341e..bf28f0b4 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -26,6 +26,7 @@ "gcs": {"path": ["PINS_TEST_GCS__PATH", "pins-python"]}, "abfs": {"path": ["PINS_TEST_AZURE__PATH", "ci-pins"]}, "rsc": {"path": ["PINS_TEST_RSC__PATH", RSC_SERVER_URL]}, + "dbc": {"path": ["PINS_TEST_DBC__PATH", "DATABRICKS_VOLUME"]}, } # TODO: Backend initialization should be independent of helpers, but these @@ -170,8 +171,7 @@ def __init__(self, fs_name, path=None, *args, **kwargs): def create_tmp_board(self, src_board=None, versioned=True): from pins.rsconnect.fs import PinBundleManifest # noqa - board = BoardRsConnect("", rsc_fs_from_key("derek"), versioned=versioned) - + board = BaseBoard(path, fs, versioned=versioned) if src_board is None: return board @@ -202,6 +202,48 @@ def teardown_board(self, board): def teardown(self): self.teardown_board(self.create_tmp_board()) +class DbcBoardBuilder(BoardBuilder): + def create_tmp_board(self, src_board=None, versioned=True) -> BaseBoard: + if self.fs_name == "gcs": + opts = {"cache_timeout": 0} + else: + opts = {"use_listings_cache": False} + + fs = 
filesystem(self.fs_name, **opts) + temp_name = str(uuid.uuid4()) + + if isinstance(self.path, TemporaryDirectory): + path_name = self.path.name + else: + path_name = self.path + + board_name = f"{path_name}/{temp_name}" + + if src_board is not None: + fs.put(src_board, board_name, recursive=True) + else: + fs.mkdir(board_name) + + self.board_path_registry.append(board_name) + return BaseBoard(board_name, fs=fs, versioned=versioned) + + def teardown_board(self, board): + board.fs.rm(board.board, recursive=True) + + def teardown(self): + # cleanup all temporary boards + fs = filesystem(self.fs_name) + + for board_path in self.board_path_registry: + print(board_path) + if fs.exists(board_path): + fs.rm(board_path, recursive=True) + + # only delete the base directory if it is explicitly temporary + if isinstance(self.path, TemporaryDirectory): + self.path.cleanup() + + # Snapshot ==================================================================== diff --git a/pins/tests/test_constructors.py b/pins/tests/test_constructors.py index 3ff9c252..901b9643 100644 --- a/pins/tests/test_constructors.py +++ b/pins/tests/test_constructors.py @@ -41,6 +41,8 @@ def construct_from_board(board): if fs_name in ["file", ("file", "local")]: board = c.board_folder(board.board) + elif fs_name == "dbc": + board = c.board_databricks(board.board) elif fs_name == "rsc": board = c.board_rsconnect( server_url=board.fs.api.server_url, api_key=board.fs.api.api_key diff --git a/pyproject.toml b/pyproject.toml index 96b24ad5..20cb89f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ check = [ "pyright==1.1.372", # Pinned; manually sync with .github/workflows/code-checks.yml "types-appdirs", ] +databricks = ["databricks-sdk"] doc = [ "ipykernel", "ipython<=8.12", @@ -85,6 +86,7 @@ markers = [ "fs_gcs: mark test to only run on Google Cloud Storage bucket filesystem", "fs_abfs: mark test to only run on Azure Datalake filesystem", "fs_rsc: mark test to only run on Posit Connect filesystem", + "fs_dbc: mark test to only run on Databricks Volume filesystem", "skip_on_github: skip this test if running on GitHub", ] testpaths = ["pins"] From 4e7549a44ac9dfd4244b883dd5e2093c8bfabdaf Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 2 Apr 2025 07:50:28 -0500 Subject: [PATCH 13/65] Partially runs tests --- pins/boards.py | 2 +- pins/cache.py | 2 ++ pins/constructors.py | 5 +--- pins/tests/conftest.py | 5 ++-- pins/tests/helpers.py | 50 ++++++++++----------------------------- pins/tests/test_boards.py | 4 +++- pins/tests/test_compat.py | 2 +- pins/utils.py | 1 - 8 files changed, 24 insertions(+), 47 deletions(-) diff --git a/pins/boards.py b/pins/boards.py index eab3aca2..cbca99ec 100644 --- a/pins/boards.py +++ b/pins/boards.py @@ -868,7 +868,7 @@ def board_deparse(board: BaseBoard): return f"board_gcs({repr(board.board)}{allow_pickle})" elif prot == "http": return f"board_url({repr(board.board)}, {board.pin_paths}{allow_pickle})" - elif set(prot) == "dbutils": + elif prot == "dbc": return f"board_databricks({repr(board.board)}{allow_pickle})" else: raise NotImplementedError( diff --git a/pins/cache.py b/pins/cache.py index ebffb3b7..559a8b75 100644 --- a/pins/cache.py +++ b/pins/cache.py @@ -324,3 +324,5 @@ def cache_prune(days=30, cache_root=None, prompt=True): # TODO: swap to use entrypoint register_implementation("pinscache", PinsCache) +from .tests.helpers import DbcBoardBuilder +register_implementation("dbc", DbcBoardBuilder) \ No newline at end of file diff 
--git a/pins/constructors.py b/pins/constructors.py index c2d4f48c..6ce60f87 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -603,10 +603,7 @@ def board_databricks(folder_url, versioned=True, cache=DEFAULT, allow_pickle_rea Notes ----- - The Databricks board uses the fsspec library (dbutils) to handle interacting with - Databricks Volumes. Currently, its default mode of authentication is supported. - - See + The Databricks board uses... """ kwargs = dict(folder_url=folder_url) diff --git a/pins/tests/conftest.py b/pins/tests/conftest.py index 0af518fc..cbdb9a11 100644 --- a/pins/tests/conftest.py +++ b/pins/tests/conftest.py @@ -6,7 +6,7 @@ from importlib_resources import files from pytest import mark as m -from pins.tests.helpers import BoardBuilder, RscBoardBuilder, Snapshot, rm_env +from pins.tests.helpers import BoardBuilder, RscBoardBuilder, DbcBoardBuilder, Snapshot, rm_env EXAMPLE_REL_PATH = "pins/tests/pins-compat" PATH_TO_EXAMPLE_BOARD = files("pins") / "tests/pins-compat" @@ -25,8 +25,9 @@ # rsc should only be used once, because users are created at docker setup time param_rsc = pytest.param(lambda: RscBoardBuilder("rsc"), id="rsc", marks=m.fs_rsc) +param_dbc = pytest.param(lambda: DbcBoardBuilder("dbc"), id="dbc", marks=m.fs_dbc) -params_backend = [*params_safe, param_rsc] +params_backend = [*params_safe, param_rsc, param_dbc] @pytest.fixture(params=params_backend, scope="session") diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index bf28f0b4..2b19b5b9 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -13,6 +13,7 @@ from importlib_resources import files from pins.boards import BaseBoard, BoardRsConnect +from pins.constructors import board_databricks DEFAULT_CREATION_DATE = datetime(2020, 1, 13, 23, 58, 59) @@ -203,47 +204,22 @@ def teardown(self): self.teardown_board(self.create_tmp_board()) class DbcBoardBuilder(BoardBuilder): - def create_tmp_board(self, src_board=None, versioned=True) -> BaseBoard: - if self.fs_name == "gcs": - opts = {"cache_timeout": 0} - else: - opts = {"use_listings_cache": False} - - fs = filesystem(self.fs_name, **opts) - temp_name = str(uuid.uuid4()) - - if isinstance(self.path, TemporaryDirectory): - path_name = self.path.name - else: - path_name = self.path - - board_name = f"{path_name}/{temp_name}" - - if src_board is not None: - fs.put(src_board, board_name, recursive=True) - else: - fs.mkdir(board_name) + def __init__(self, fs_name, path=None, *args, **kwargs): + self.fs_name = fs_name + self.path = None - self.board_path_registry.append(board_name) - return BaseBoard(board_name, fs=fs, versioned=versioned) + def create_tmp_board(self, src_board=None, versioned=True): + db_vol = os.environ.get("DATABRICKS_VOLUME") + board_name = os.path.join(db_vol, "pinstest") + board = board_databricks(board_name) + return board def teardown_board(self, board): - board.fs.rm(board.board, recursive=True) + board.fs.rm(board.board) def teardown(self): - # cleanup all temporary boards - fs = filesystem(self.fs_name) - - for board_path in self.board_path_registry: - print(board_path) - if fs.exists(board_path): - fs.rm(board_path, recursive=True) - - # only delete the base directory if it is explicitly temporary - if isinstance(self.path, TemporaryDirectory): - self.path.cleanup() - - + board = self.create_tmp_board() + self.teardown_board(board.board) # Snapshot ==================================================================== @@ -314,4 +290,4 @@ def rm_env(*args): yield finally: os.environ.clear() - 
os.environ.update(old_environ) + os.environ.update(old_environ) \ No newline at end of file diff --git a/pins/tests/test_boards.py b/pins/tests/test_boards.py index 4afeecc0..855c707e 100644 --- a/pins/tests/test_boards.py +++ b/pins/tests/test_boards.py @@ -40,7 +40,7 @@ def board_unversioned(backend): @fixture def board_with_cache(backend): from pins.constructors import board as board_constructor - from pins.constructors import board_rsconnect + from pins.constructors import board_rsconnect, board_databricks board = backend.create_tmp_board() @@ -50,6 +50,8 @@ def board_with_cache(backend): # board behavior. As a result, we need to pass the credentials directly in. server_url, api_key = board.fs.api.server_url, board.fs.api.api_key board_with_cache = board_rsconnect(server_url=server_url, api_key=api_key) + elif backend.fs_name == "dbc": + board_with_cache = board_databricks(board.board) else: board_with_cache = board_constructor(backend.fs_name, board.board) diff --git a/pins/tests/test_compat.py b/pins/tests/test_compat.py index 6b66aadd..f94a5899 100644 --- a/pins/tests/test_compat.py +++ b/pins/tests/test_compat.py @@ -25,7 +25,7 @@ def board(backend): def board_manifest(backend): # skip on rsconnect, since it can't add a manifest and the pin names # are too short for use to upload (rsc requires names > 3 characters) - if backend.fs_name == "rsc": + if backend.fs_name == "rsc" | backend.fs_name == "dbc": pytest.skip() board = backend.create_tmp_board(str(PATH_TO_MANIFEST_BOARD.absolute())) diff --git a/pins/utils.py b/pins/utils.py index 7525c148..6af9bab5 100644 --- a/pins/utils.py +++ b/pins/utils.py @@ -7,7 +7,6 @@ from .config import pins_options - def inform(log, msg): if log is not None: log.info(msg) From 409d89ca993c78ddd610336d7be03927d3c7d79a Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 2 Apr 2025 12:15:59 -0500 Subject: [PATCH 14/65] Adds support for `detail` in ls --- pins/databricks/fs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 8a1e4eb5..e13b188d 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -12,8 +12,15 @@ class DatabricksFs(AbstractFileSystem): def __init__(self, folder_url, **kwargs): self.workspace = WorkspaceClient() - def ls(self, path, details=False, **kwargs): - return self._list_dir(path, "name") + def ls(self, path, detail=False, **kwargs): + files = self._list_dir(path, "name") + if(detail): + all_files = [] + for file in files: + all_files.append(dict(name = file, size = None, type = "file")) + return all_files + else: + return files def exists(self, path: str, **kwargs): file_exists = True From b1e629a7c8bd82fdca3f514e88a3a20b4708006f Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 2 Apr 2025 16:47:01 -0500 Subject: [PATCH 15/65] Figures out how to properly cache the fs --- pins/constructors.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index 6ce60f87..0b092be9 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -115,12 +115,20 @@ def board( mapper=PinsRscCacheMapper, ) elif protocol == "dbc": - None + board_cache = prefix_cache(fs, path) + cache_dir = os.path.join(base_cache_dir, board_cache) + + fs = fsspec.implementations.cached.SimpleCacheFileSystem( + cache_storage=cache_dir, + fs=fs, + hash_prefix=path, + same_names=True + ) else: # ensures each 
subdir path is its own cache directory board_cache = prefix_cache(fs, path) cache_dir = os.path.join(base_cache_dir, board_cache) - + fs = PinsCache( cache_storage=cache_dir, fs=fs, hash_prefix=path, same_names=True ) From 66bd1c40f932dc91797f8c59315e1edc6157015c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 2 Apr 2025 18:51:42 -0500 Subject: [PATCH 16/65] Fixes teardown for tests --- pins/tests/helpers.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index 2b19b5b9..a8dd26a3 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -207,19 +207,25 @@ class DbcBoardBuilder(BoardBuilder): def __init__(self, fs_name, path=None, *args, **kwargs): self.fs_name = fs_name self.path = None + self.current_board = "" def create_tmp_board(self, src_board=None, versioned=True): db_vol = os.environ.get("DATABRICKS_VOLUME") - board_name = os.path.join(db_vol, "pinstest") + temp_name = str(uuid.uuid4()) + board_name = os.path.join(db_vol, temp_name) board = board_databricks(board_name) + if src_board is not None: + board.fs.put(src_board, board_name, recursive=True) + self.current_board = temp_name return board def teardown_board(self, board): board.fs.rm(board.board) def teardown(self): - board = self.create_tmp_board() - self.teardown_board(board.board) + db_vol = os.environ.get("DATABRICKS_VOLUME") + board = board_databricks(db_vol) + board.fs.rm(db_vol + "/" + self.current_board) # Snapshot ==================================================================== From 263dded961abb2ea7fd5c05c164dfe53d8a49a55 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 2 Apr 2025 18:52:02 -0500 Subject: [PATCH 17/65] extends `rm` one more level (fix later) --- pins/databricks/fs.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index e13b188d..98fe1fd5 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -65,16 +65,24 @@ def put( self.workspace.files.upload(dest, BytesIO(file.read()), overwrite=True) def rm(self, path, recursive=True, maxdepth=None) -> None: - lev1 = self._list_dir(path) - for item1 in lev1: - if item1.get("is_directory"): - lev2 = self._list_dir(item1.get("path"), "path") - for item2 in lev2: - self.workspace.files.delete(item2) - self.workspace.files.delete_directory(item1.get("path")) - else: - self.workspace.files.delete(item1.get("path")) - self.workspace.files.delete_directory(path) + exists = self.exists(path) + if(exists): + lev1 = self._list_dir(path) + for item1 in lev1: + if item1.get("is_directory"): + lev2 = self._list_dir(item1.get("path")) + for item2 in lev2: + if item1.get("is_directory"): + lev3 = self._list_dir(item2.get("path"), "path") + for item3 in lev3: + self.workspace.files.delete(item3) + self.workspace.files.delete_directory(item2.get("path")) + else: + self.workspace.files.delete(item2.get("path")) + self.workspace.files.delete_directory(item1.get("path")) + else: + self.workspace.files.delete(item1.get("path")) + self.workspace.files.delete_directory(path) def _map_details(self, item): details = { From 788ec93632de09dc72178b599923e8508b3ef6dd Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 2 Apr 2025 19:02:54 -0500 Subject: [PATCH 18/65] No errors in tests, moving on to addressing failures --- pins/tests/test_compat.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pins/tests/test_compat.py b/pins/tests/test_compat.py index f94a5899..498f5782 100644 --- a/pins/tests/test_compat.py +++ b/pins/tests/test_compat.py @@ -25,7 +25,7 @@ def board(backend): def board_manifest(backend): # skip on rsconnect, since it can't add a manifest and the pin names # are too short for use to upload (rsc requires names > 3 characters) - if backend.fs_name == "rsc" | backend.fs_name == "dbc": + if backend.fs_name in ["rsc", "dbc"]: pytest.skip() board = backend.create_tmp_board(str(PATH_TO_MANIFEST_BOARD.absolute())) From 794b22a47e93386d5e93d92ee196decec9241240 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 3 Apr 2025 12:30:55 -0500 Subject: [PATCH 19/65] Adds recursive file/folder mapper --- pins/databricks/fs.py | 62 +++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 98fe1fd5..e06260e1 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -12,15 +12,20 @@ class DatabricksFs(AbstractFileSystem): def __init__(self, folder_url, **kwargs): self.workspace = WorkspaceClient() - def ls(self, path, detail=False, **kwargs): - files = self._list_dir(path, "name") - if(detail): - all_files = [] - for file in files: - all_files.append(dict(name = file, size = None, type = "file")) - return all_files - else: - return files + def ls(self, path, detail=False, **kwargs): + files = _map_folder(path=path, recurse=False) + items = [] + for file in files: + name = file.get("name") + if(detail): + if(file.get("is_directory")): + type = "directory" + else: + type = "file" + items.append(dict(name = name, size = None, type = type)) + else: + items.append(name) + return items def exists(self, path: str, **kwargs): file_exists = True @@ -84,17 +89,10 @@ def rm(self, path, recursive=True, maxdepth=None) -> None: self.workspace.files.delete(item1.get("path")) self.workspace.files.delete_directory(path) - def _map_details(self, item): - details = { - "path": item.path, - "name": item.name, - "is_directory": item.is_directory, - } - return details def _list_dir(self, path, field="all"): dir_contents = list(self.workspace.files.list_directory_contents(path)) - details = list(map(self._map_details, dir_contents)) + details = list(map(_map_details, dir_contents)) if field != "all": items = [] for item in details: @@ -102,3 +100,33 @@ def _list_dir(self, path, field="all"): else: items = details return items + +def _map_folder(path, recurse=True, include_folders=True, include_files=True): + w = WorkspaceClient() + dir_contents = list(w.files.list_directory_contents(path)) + details = list(map(_map_details, dir_contents)) + items = [] + for item in details: + if(item.get("is_directory")): + if(include_folders): + items = items + [item] + if(recurse): + more_details = _map_folder( + path = item.get("path"), + recurse=True, + include_folders=include_folders, + include_files=include_files + ) + items = items + more_details + else: + if(include_files): + items = items + [item] + return items + +def _map_details(item): + details = { + "path": item.path, + "name": item.name, + "is_directory": item.is_directory, + } + return details \ No newline at end of file From 20e481d8f7e4dfc0595e9bcf8d4d57aa696ec369 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 3 Apr 2025 14:03:24 -0500 Subject: [PATCH 20/65] fully recursive put (clean 
later) --- pins/databricks/fs.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index e06260e1..0878efd4 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -62,12 +62,7 @@ def put( maxdepth=None, **kwargs, ): - for item in os.listdir(lpath): - abs_item = os.path.join(lpath, item) - if os.path.isfile(abs_item): - dest = os.path.join(rpath, item) - file = open(abs_item, "rb") - self.workspace.files.upload(dest, BytesIO(file.read()), overwrite=True) + _map_put(lpath, rpath) def rm(self, path, recursive=True, maxdepth=None) -> None: exists = self.exists(path) @@ -101,6 +96,25 @@ def _list_dir(self, path, field="all"): items = details return items +def _map_put(lpath, rpath): + w = WorkspaceClient() + path = os.path.abspath(lpath) + items = [] + orig_path = path + def test(path): + contents = os.listdir(path) + for item in contents: + abs_path = os.path.join(path, item) + is_file = os.path.isfile(abs_path) + rel_path = os.path.relpath(abs_path, orig_path) + db_path = os.path.join(rpath, rel_path) + if(is_file == False): + test(abs_path) + w.files.create_directory(db_path) + else: + file = open(abs_path, "rb") + w.files.upload(db_path, BytesIO(file.read()), overwrite=True) + def _map_folder(path, recurse=True, include_folders=True, include_files=True): w = WorkspaceClient() dir_contents = list(w.files.list_directory_contents(path)) From 0209d28f4455b9266294d0d0653439dfaa009b50 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 3 Apr 2025 15:32:51 -0500 Subject: [PATCH 21/65] Improvements to _map_put --- pins/databricks/fs.py | 12 ++++++------ pins/tests/helpers.py | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 0878efd4..b2bd8728 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -101,19 +101,19 @@ def _map_put(lpath, rpath): path = os.path.abspath(lpath) items = [] orig_path = path - def test(path): + def _upload_files(path): contents = os.listdir(path) for item in contents: abs_path = os.path.join(path, item) is_file = os.path.isfile(abs_path) rel_path = os.path.relpath(abs_path, orig_path) db_path = os.path.join(rpath, rel_path) - if(is_file == False): - test(abs_path) - w.files.create_directory(db_path) - else: + if(is_file): file = open(abs_path, "rb") - w.files.upload(db_path, BytesIO(file.read()), overwrite=True) + w.files.upload(db_path, BytesIO(file.read()), overwrite=True) + else: + _upload_files(abs_path) + _upload_files(path) def _map_folder(path, recurse=True, include_folders=True, include_files=True): w = WorkspaceClient() diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index a8dd26a3..1ac29400 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -13,6 +13,7 @@ from importlib_resources import files from pins.boards import BaseBoard, BoardRsConnect + from pins.constructors import board_databricks DEFAULT_CREATION_DATE = datetime(2020, 1, 13, 23, 58, 59) @@ -214,9 +215,9 @@ def create_tmp_board(self, src_board=None, versioned=True): temp_name = str(uuid.uuid4()) board_name = os.path.join(db_vol, temp_name) board = board_databricks(board_name) - if src_board is not None: - board.fs.put(src_board, board_name, recursive=True) self.current_board = temp_name + if src_board is not None: + board.fs.put(src_board, board_name) return board def teardown_board(self, board): From 80cd4da2ca4b00389ddbe53cb70e97d52e8e935a Mon Sep 
17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 4 Apr 2025 08:42:20 -0500 Subject: [PATCH 22/65] Starts moving to discrete functions --- pins/databricks/fs.py | 72 ++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index b2bd8728..a0ab6078 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -28,26 +28,12 @@ def ls(self, path, detail=False, **kwargs): return items def exists(self, path: str, **kwargs): - file_exists = True - try: - self.workspace.files.get_metadata(path) - except: - file_exists = False - - dir_exists = True - try: - self.workspace.files.get_directory_metadata(path) - except: - dir_exists = False - - return file_exists | dir_exists + return _databricks_exists(path) def open(self, path: str, mode: str = "rb", *args, **kwargs): - resp = self.workspace.files.download(path) - f = BytesIO() - shutil.copyfileobj(resp.contents, f) - f.seek(0) - return f + if mode != "rb": + raise NotImplementedError + return _databricks_open(path) def mkdir(self, path, create_parents=True, **kwargs): if not create_parents: @@ -62,7 +48,11 @@ def put( maxdepth=None, **kwargs, ): - _map_put(lpath, rpath) + if not recursive: + raise NotImplementedError + if maxdepth is not None: + raise NotImplementedError + _databricks_put(lpath, rpath) def rm(self, path, recursive=True, maxdepth=None) -> None: exists = self.exists(path) @@ -96,7 +86,7 @@ def _list_dir(self, path, field="all"): items = details return items -def _map_put(lpath, rpath): +def _databricks_put(lpath, rpath): w = WorkspaceClient() path = os.path.abspath(lpath) items = [] @@ -115,6 +105,46 @@ def _upload_files(path): _upload_files(abs_path) _upload_files(path) +def _databricks_open(path): + w = WorkspaceClient() + resp = w.files.download(path) + f = BytesIO() + shutil.copyfileobj(resp.contents, f) + f.seek(0) + return f + +def _databricks_exists(path: str): + w = WorkspaceClient() + try: + w.files.get_metadata(path) + except: + try: + w.files.get_directory_metadata(path) + except: + return False + else: + return True + else: + return True + +def _databricks_ls(path, detail): + w = WorkspaceClient() + contents_raw = w.files.list_directory_contents(path) + contents = list(contents_raw) + items = [] + for item in contents: + item = _map_details(item) + name = item.get("name") + if(detail): + if(item.get("is_directory")): + type = "directory" + else: + type = "file" + items.append(dict(name = name, size = None, type = type)) + else: + items.append(name) + return items + def _map_folder(path, recurse=True, include_folders=True, include_files=True): w = WorkspaceClient() dir_contents = list(w.files.list_directory_contents(path)) @@ -143,4 +173,4 @@ def _map_details(item): "name": item.name, "is_directory": item.is_directory, } - return details \ No newline at end of file + return details From 3e96d19fc650f0729761a1fd796fed9cd6e858b7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 4 Apr 2025 10:35:05 -0500 Subject: [PATCH 23/65] Finishes moving everything to discrete functions --- pins/databricks/fs.py | 91 ++++++++++++------------------------------- 1 file changed, 24 insertions(+), 67 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index a0ab6078..e45d54ba 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -9,23 +9,8 @@ class DatabricksFs(AbstractFileSystem): protocol: ClassVar[str | tuple[str, 
...]] = "dbc" - def __init__(self, folder_url, **kwargs): - self.workspace = WorkspaceClient() - def ls(self, path, detail=False, **kwargs): - files = _map_folder(path=path, recurse=False) - items = [] - for file in files: - name = file.get("name") - if(detail): - if(file.get("is_directory")): - type = "directory" - else: - type = "file" - items.append(dict(name = name, size = None, type = type)) - else: - items.append(name) - return items + return _databricks_ls(path, detail) def exists(self, path: str, **kwargs): return _databricks_exists(path) @@ -38,7 +23,7 @@ def open(self, path: str, mode: str = "rb", *args, **kwargs): def mkdir(self, path, create_parents=True, **kwargs): if not create_parents: raise NotImplementedError - self.workspace.files.create_directory(path) + _databricks_mkdir(path) def put( self, @@ -55,36 +40,12 @@ def put( _databricks_put(lpath, rpath) def rm(self, path, recursive=True, maxdepth=None) -> None: - exists = self.exists(path) - if(exists): - lev1 = self._list_dir(path) - for item1 in lev1: - if item1.get("is_directory"): - lev2 = self._list_dir(item1.get("path")) - for item2 in lev2: - if item1.get("is_directory"): - lev3 = self._list_dir(item2.get("path"), "path") - for item3 in lev3: - self.workspace.files.delete(item3) - self.workspace.files.delete_directory(item2.get("path")) - else: - self.workspace.files.delete(item2.get("path")) - self.workspace.files.delete_directory(item1.get("path")) - else: - self.workspace.files.delete(item1.get("path")) - self.workspace.files.delete_directory(path) - - - def _list_dir(self, path, field="all"): - dir_contents = list(self.workspace.files.list_directory_contents(path)) - details = list(map(_map_details, dir_contents)) - if field != "all": - items = [] - for item in details: - items.append(item.get(field)) - else: - items = details - return items + if not recursive: + raise NotImplementedError + if maxdepth is not None: + raise NotImplementedError + if(_databricks_exists(path)): + _databricks_rm_dir(path) def _databricks_put(lpath, rpath): w = WorkspaceClient() @@ -105,7 +66,7 @@ def _upload_files(path): _upload_files(abs_path) _upload_files(path) -def _databricks_open(path): +def _databricks_open(patbh): w = WorkspaceClient() resp = w.files.download(path) f = BytesIO() @@ -133,7 +94,7 @@ def _databricks_ls(path, detail): contents = list(contents_raw) items = [] for item in contents: - item = _map_details(item) + item = _databricks_content_details(item) name = item.get("name") if(detail): if(item.get("is_directory")): @@ -145,29 +106,25 @@ def _databricks_ls(path, detail): items.append(name) return items -def _map_folder(path, recurse=True, include_folders=True, include_files=True): +def _databricks_rm_dir(path): w = WorkspaceClient() - dir_contents = list(w.files.list_directory_contents(path)) - details = list(map(_map_details, dir_contents)) + raw_contents = w.files.list_directory_contents(path) + contents = list(raw_contents) + details = list(map(_databricks_content_details, contents)) items = [] - for item in details: + for item in details: + item_path = item.get("path") if(item.get("is_directory")): - if(include_folders): - items = items + [item] - if(recurse): - more_details = _map_folder( - path = item.get("path"), - recurse=True, - include_folders=include_folders, - include_files=include_files - ) - items = items + more_details + _databricks_rm_dir(item_path) else: - if(include_files): - items = items + [item] - return items + w.files.delete(item_path) + w.files.delete_directory(path) + +def 
_databricks_mkdir(path): + w = WorkspaceClient() + w.files.create_directory(path) -def _map_details(item): +def _databricks_content_details(item): details = { "path": item.path, "name": item.name, From 00ebe78925df450077bc551e23d584d9e6b79309 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 4 Apr 2025 13:28:18 -0500 Subject: [PATCH 24/65] Fixes typo --- pins/constructors.py | 2 +- pins/databricks/fs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index 0b092be9..0c7de4db 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -122,7 +122,7 @@ def board( cache_storage=cache_dir, fs=fs, hash_prefix=path, - same_names=True + same_names=True ) else: # ensures each subdir path is its own cache directory diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index e45d54ba..23f2d7a7 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -66,7 +66,7 @@ def _upload_files(path): _upload_files(abs_path) _upload_files(path) -def _databricks_open(patbh): +def _databricks_open(path): w = WorkspaceClient() resp = w.files.download(path) f = BytesIO() From 6f65e9d974def52f7617b48a021f9ee0479afcda Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 4 Apr 2025 17:40:12 -0500 Subject: [PATCH 25/65] Creates custom cache class, fixes issue with reading wrong pin --- pins/cache.py | 22 ++++++++++++++++++++++ pins/constructors.py | 13 +++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/pins/cache.py b/pins/cache.py index 559a8b75..f3f5f00f 100644 --- a/pins/cache.py +++ b/pins/cache.py @@ -145,6 +145,28 @@ def _check_file(self, path): if os.path.exists(fn): return fn +class PinsDBCache(SimpleCacheFileSystem): + # Same as PinsCache, but removes _make_local_details + def __init__(self, *args, hash_prefix=None, mapper=HashMapper, **kwargs): + super().__init__(*args, **kwargs) + self.hash_prefix = hash_prefix + self._mapper = mapper(hash_prefix) + + def hash_name(self, path, *args, **kwargs): + return self._mapper(path) + + def _open(self, path, *args, **kwargs): + path = self._strip_protocol(path) + return super()._open(path, *args, **kwargs) + + # same as upstream, brought in to preserve backwards compatibility + def _check_file(self, path): + self._check_cache() + sha = self._mapper(path) + for storage in self.storage: + fn = os.path.join(storage, sha) + if os.path.exists(fn): + return fn class PinsUrlCache(PinsCache): protocol = "pinsurlcache" diff --git a/pins/constructors.py b/pins/constructors.py index 0c7de4db..2f8d407c 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -8,7 +8,7 @@ import fsspec from .boards import BaseBoard, BoardManual, BoardRsConnect, board_deparse -from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache +from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache, PinsDBCache from .config import get_cache_dir, get_data_dir # Kept here for backward-compatibility reasons @@ -115,15 +115,12 @@ def board( mapper=PinsRscCacheMapper, ) elif protocol == "dbc": + #Find me here board_cache = prefix_cache(fs, path) cache_dir = os.path.join(base_cache_dir, board_cache) - - fs = fsspec.implementations.cached.SimpleCacheFileSystem( - cache_storage=cache_dir, - fs=fs, - hash_prefix=path, - same_names=True - ) + fs = PinsDBCache( + cache_storage=cache_dir, fs=fs, hash_prefix=path, same_names=True + ) else: # ensures each 
subdir path is its own cache directory board_cache = prefix_cache(fs, path) From 2f400d19a5f13e4041844a28b5d900ebfba31f16 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Sun, 6 Apr 2025 15:47:08 -0500 Subject: [PATCH 26/65] Removes _open from PinsDBCache after confirming that it's not needed --- data.csv | 4 ++++ pins/cache.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 data.csv diff --git a/data.csv b/data.csv new file mode 100644 index 00000000..646dc149 --- /dev/null +++ b/data.csv @@ -0,0 +1,4 @@ +,x +0,1 +1,2 +2,3 diff --git a/pins/cache.py b/pins/cache.py index f3f5f00f..144938c5 100644 --- a/pins/cache.py +++ b/pins/cache.py @@ -155,10 +155,6 @@ def __init__(self, *args, hash_prefix=None, mapper=HashMapper, **kwargs): def hash_name(self, path, *args, **kwargs): return self._mapper(path) - def _open(self, path, *args, **kwargs): - path = self._strip_protocol(path) - return super()._open(path, *args, **kwargs) - # same as upstream, brought in to preserve backwards compatibility def _check_file(self, path): self._check_cache() From 7171510c1c5ebb249840341e7f5efd4dc4713a59 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Sun, 6 Apr 2025 15:48:23 -0500 Subject: [PATCH 27/65] Removes data.csv file --- data.csv | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 data.csv diff --git a/data.csv b/data.csv deleted file mode 100644 index 646dc149..00000000 --- a/data.csv +++ /dev/null @@ -1,4 +0,0 @@ -,x -0,1 -1,2 -2,3 From 50da1c98260e28f43646f25559d4c2f85363ed08 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 7 Apr 2025 14:47:55 -0500 Subject: [PATCH 28/65] Adds get() and _databricks_get() --- data.csv | 4 ++ data2.csv | 4 ++ newtest3/data/20250404T131317Z-a173c/data.csv | 4 ++ newtest3/data/20250404T131317Z-a173c/data.txt | 9 +++++ .../reviews/20250404T131344Z-6cff2/data.txt | 9 +++++ .../20250404T131344Z-6cff2/reviews.csv | 4 ++ .../reviews2/20250406T142724Z-6cff2/data.txt | 9 +++++ .../20250406T142724Z-6cff2/reviews2.csv | 4 ++ .../reviews3/20250406T145752Z-c65b0/data.csv | 4 ++ .../reviews3/20250406T145752Z-c65b0/data.txt | 9 +++++ pins/databricks/fs.py | 40 ++++++++++++++++--- test4.csv | 4 ++ 12 files changed, 99 insertions(+), 5 deletions(-) create mode 100644 data.csv create mode 100644 data2.csv create mode 100644 newtest3/data/20250404T131317Z-a173c/data.csv create mode 100644 newtest3/data/20250404T131317Z-a173c/data.txt create mode 100644 newtest3/reviews/20250404T131344Z-6cff2/data.txt create mode 100644 newtest3/reviews/20250404T131344Z-6cff2/reviews.csv create mode 100644 newtest3/reviews2/20250406T142724Z-6cff2/data.txt create mode 100644 newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv create mode 100644 newtest3/reviews3/20250406T145752Z-c65b0/data.csv create mode 100644 newtest3/reviews3/20250406T145752Z-c65b0/data.txt create mode 100644 test4.csv diff --git a/data.csv b/data.csv new file mode 100644 index 00000000..646dc149 --- /dev/null +++ b/data.csv @@ -0,0 +1,4 @@ +,x +0,1 +1,2 +2,3 diff --git a/data2.csv b/data2.csv new file mode 100644 index 00000000..b879b228 --- /dev/null +++ b/data2.csv @@ -0,0 +1,4 @@ +review +"This has been the best TV I've ever used. Great screen, and sound." +I regret buying this laptop. It is too slow and the keyboard is too noisy +"Not sure how to feel about my new washing machine. 
Great color, but hard to figure" diff --git a/newtest3/data/20250404T131317Z-a173c/data.csv b/newtest3/data/20250404T131317Z-a173c/data.csv new file mode 100644 index 00000000..d143959b --- /dev/null +++ b/newtest3/data/20250404T131317Z-a173c/data.csv @@ -0,0 +1,4 @@ +x,y +1,4 +2,5 +3,6 diff --git a/newtest3/data/20250404T131317Z-a173c/data.txt b/newtest3/data/20250404T131317Z-a173c/data.txt new file mode 100644 index 00000000..3ec8c56f --- /dev/null +++ b/newtest3/data/20250404T131317Z-a173c/data.txt @@ -0,0 +1,9 @@ +api_version: 1 +created: 20250404T131317Z +description: null +file: data.csv +file_size: 16 +pin_hash: a173cd6a53908980 +title: 'data: a pinned 3 x 2 DataFrame' +type: csv +user: {} diff --git a/newtest3/reviews/20250404T131344Z-6cff2/data.txt b/newtest3/reviews/20250404T131344Z-6cff2/data.txt new file mode 100644 index 00000000..3623b5ca --- /dev/null +++ b/newtest3/reviews/20250404T131344Z-6cff2/data.txt @@ -0,0 +1,9 @@ +api_version: 1 +created: 20250404T131344Z +description: null +file: reviews.csv +file_size: 235 +pin_hash: 6cff20278541542e +title: 'reviews: a pinned 3 x 1 DataFrame' +type: csv +user: {} diff --git a/newtest3/reviews/20250404T131344Z-6cff2/reviews.csv b/newtest3/reviews/20250404T131344Z-6cff2/reviews.csv new file mode 100644 index 00000000..b879b228 --- /dev/null +++ b/newtest3/reviews/20250404T131344Z-6cff2/reviews.csv @@ -0,0 +1,4 @@ +review +"This has been the best TV I've ever used. Great screen, and sound." +I regret buying this laptop. It is too slow and the keyboard is too noisy +"Not sure how to feel about my new washing machine. Great color, but hard to figure" diff --git a/newtest3/reviews2/20250406T142724Z-6cff2/data.txt b/newtest3/reviews2/20250406T142724Z-6cff2/data.txt new file mode 100644 index 00000000..f620f542 --- /dev/null +++ b/newtest3/reviews2/20250406T142724Z-6cff2/data.txt @@ -0,0 +1,9 @@ +api_version: 1 +created: 20250406T142724Z +description: null +file: reviews2.csv +file_size: 235 +pin_hash: 6cff20278541542e +title: 'reviews2: a pinned 3 x 1 DataFrame' +type: csv +user: {} diff --git a/newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv b/newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv new file mode 100644 index 00000000..b879b228 --- /dev/null +++ b/newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv @@ -0,0 +1,4 @@ +review +"This has been the best TV I've ever used. Great screen, and sound." +I regret buying this laptop. It is too slow and the keyboard is too noisy +"Not sure how to feel about my new washing machine. 
Great color, but hard to figure" diff --git a/newtest3/reviews3/20250406T145752Z-c65b0/data.csv b/newtest3/reviews3/20250406T145752Z-c65b0/data.csv new file mode 100644 index 00000000..646dc149 --- /dev/null +++ b/newtest3/reviews3/20250406T145752Z-c65b0/data.csv @@ -0,0 +1,4 @@ +,x +0,1 +1,2 +2,3 diff --git a/newtest3/reviews3/20250406T145752Z-c65b0/data.txt b/newtest3/reviews3/20250406T145752Z-c65b0/data.txt new file mode 100644 index 00000000..2cedeae1 --- /dev/null +++ b/newtest3/reviews3/20250406T145752Z-c65b0/data.txt @@ -0,0 +1,9 @@ +api_version: 1 +created: 20250406T145752Z +description: null +file: data.csv +file_size: 15 +pin_hash: c65b0e9785abaa60 +title: 'reviews3: a pinned str object' +type: file +user: {} diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 23f2d7a7..a3a6d78d 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -19,6 +19,9 @@ def open(self, path: str, mode: str = "rb", *args, **kwargs): if mode != "rb": raise NotImplementedError return _databricks_open(path) + + def get(self, rpath, lpath, recursive=False, **kwargs): + _databricks_get(self, rpath, lpath, recursive, **kwargs) def mkdir(self, path, create_parents=True, **kwargs): if not create_parents: @@ -57,15 +60,36 @@ def _upload_files(path): for item in contents: abs_path = os.path.join(path, item) is_file = os.path.isfile(abs_path) - rel_path = os.path.relpath(abs_path, orig_path) - db_path = os.path.join(rpath, rel_path) if(is_file): + rel_path = os.path.relpath(abs_path, orig_path) + db_path = os.path.join(rpath, rel_path) file = open(abs_path, "rb") w.files.upload(db_path, BytesIO(file.read()), overwrite=True) else: _upload_files(abs_path) _upload_files(path) +def _databricks_get(board, rpath, lpath, recursive = False, **kwargs): + w = WorkspaceClient() + file_type = _databricks_is_type(rpath) + if(file_type == "file"): + board.fs.get(rpath, lpath, **kwargs) + return + def _get_files(path, recursive, **kwargs): + raw_contents = w.files.list_directory_contents(path) + contents = list(raw_contents) + details = list(map(_databricks_content_details, contents)) + for item in details: + item_path = item.get("path") + if(item.get("is_directory")): + if(recursive): + _get_files(item_path, recursive = recursive, **kwargs) + else: + rel_path = os.path.relpath(item_path, rpath) + target_path = os.path.join(lpath, rel_path) + board.fs.get(item_path, target_path) + _get_files(rpath, recursive, **kwargs) + def _databricks_open(path): w = WorkspaceClient() resp = w.files.download(path) @@ -75,6 +99,12 @@ def _databricks_open(path): return f def _databricks_exists(path: str): + if(_databricks_is_type(path) == "nothing"): + return False + else: + return True + +def _databricks_is_type(path: str): w = WorkspaceClient() try: w.files.get_metadata(path) @@ -82,11 +112,11 @@ def _databricks_exists(path: str): try: w.files.get_directory_metadata(path) except: - return False + return "nothing" else: - return True + return "directory" else: - return True + return "file" def _databricks_ls(path, detail): w = WorkspaceClient() diff --git a/test4.csv b/test4.csv new file mode 100644 index 00000000..b879b228 --- /dev/null +++ b/test4.csv @@ -0,0 +1,4 @@ +review +"This has been the best TV I've ever used. Great screen, and sound." +I regret buying this laptop. It is too slow and the keyboard is too noisy +"Not sure how to feel about my new washing machine. 
Great color, but hard to figure" From 228d6f11860a4c7f2adf640226410dbb2f709c17 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 7 Apr 2025 14:49:09 -0500 Subject: [PATCH 29/65] Removes test data --- data.csv | 4 ---- data2.csv | 4 ---- newtest3/data/20250404T131317Z-a173c/data.csv | 4 ---- newtest3/data/20250404T131317Z-a173c/data.txt | 9 --------- newtest3/reviews/20250404T131344Z-6cff2/data.txt | 9 --------- newtest3/reviews/20250404T131344Z-6cff2/reviews.csv | 4 ---- newtest3/reviews2/20250406T142724Z-6cff2/data.txt | 9 --------- newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv | 4 ---- newtest3/reviews3/20250406T145752Z-c65b0/data.csv | 4 ---- newtest3/reviews3/20250406T145752Z-c65b0/data.txt | 9 --------- test4.csv | 4 ---- 11 files changed, 64 deletions(-) delete mode 100644 data.csv delete mode 100644 data2.csv delete mode 100644 newtest3/data/20250404T131317Z-a173c/data.csv delete mode 100644 newtest3/data/20250404T131317Z-a173c/data.txt delete mode 100644 newtest3/reviews/20250404T131344Z-6cff2/data.txt delete mode 100644 newtest3/reviews/20250404T131344Z-6cff2/reviews.csv delete mode 100644 newtest3/reviews2/20250406T142724Z-6cff2/data.txt delete mode 100644 newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv delete mode 100644 newtest3/reviews3/20250406T145752Z-c65b0/data.csv delete mode 100644 newtest3/reviews3/20250406T145752Z-c65b0/data.txt delete mode 100644 test4.csv diff --git a/data.csv b/data.csv deleted file mode 100644 index 646dc149..00000000 --- a/data.csv +++ /dev/null @@ -1,4 +0,0 @@ -,x -0,1 -1,2 -2,3 diff --git a/data2.csv b/data2.csv deleted file mode 100644 index b879b228..00000000 --- a/data2.csv +++ /dev/null @@ -1,4 +0,0 @@ -review -"This has been the best TV I've ever used. Great screen, and sound." -I regret buying this laptop. It is too slow and the keyboard is too noisy -"Not sure how to feel about my new washing machine. Great color, but hard to figure" diff --git a/newtest3/data/20250404T131317Z-a173c/data.csv b/newtest3/data/20250404T131317Z-a173c/data.csv deleted file mode 100644 index d143959b..00000000 --- a/newtest3/data/20250404T131317Z-a173c/data.csv +++ /dev/null @@ -1,4 +0,0 @@ -x,y -1,4 -2,5 -3,6 diff --git a/newtest3/data/20250404T131317Z-a173c/data.txt b/newtest3/data/20250404T131317Z-a173c/data.txt deleted file mode 100644 index 3ec8c56f..00000000 --- a/newtest3/data/20250404T131317Z-a173c/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -api_version: 1 -created: 20250404T131317Z -description: null -file: data.csv -file_size: 16 -pin_hash: a173cd6a53908980 -title: 'data: a pinned 3 x 2 DataFrame' -type: csv -user: {} diff --git a/newtest3/reviews/20250404T131344Z-6cff2/data.txt b/newtest3/reviews/20250404T131344Z-6cff2/data.txt deleted file mode 100644 index 3623b5ca..00000000 --- a/newtest3/reviews/20250404T131344Z-6cff2/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -api_version: 1 -created: 20250404T131344Z -description: null -file: reviews.csv -file_size: 235 -pin_hash: 6cff20278541542e -title: 'reviews: a pinned 3 x 1 DataFrame' -type: csv -user: {} diff --git a/newtest3/reviews/20250404T131344Z-6cff2/reviews.csv b/newtest3/reviews/20250404T131344Z-6cff2/reviews.csv deleted file mode 100644 index b879b228..00000000 --- a/newtest3/reviews/20250404T131344Z-6cff2/reviews.csv +++ /dev/null @@ -1,4 +0,0 @@ -review -"This has been the best TV I've ever used. Great screen, and sound." -I regret buying this laptop. 
It is too slow and the keyboard is too noisy -"Not sure how to feel about my new washing machine. Great color, but hard to figure" diff --git a/newtest3/reviews2/20250406T142724Z-6cff2/data.txt b/newtest3/reviews2/20250406T142724Z-6cff2/data.txt deleted file mode 100644 index f620f542..00000000 --- a/newtest3/reviews2/20250406T142724Z-6cff2/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -api_version: 1 -created: 20250406T142724Z -description: null -file: reviews2.csv -file_size: 235 -pin_hash: 6cff20278541542e -title: 'reviews2: a pinned 3 x 1 DataFrame' -type: csv -user: {} diff --git a/newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv b/newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv deleted file mode 100644 index b879b228..00000000 --- a/newtest3/reviews2/20250406T142724Z-6cff2/reviews2.csv +++ /dev/null @@ -1,4 +0,0 @@ -review -"This has been the best TV I've ever used. Great screen, and sound." -I regret buying this laptop. It is too slow and the keyboard is too noisy -"Not sure how to feel about my new washing machine. Great color, but hard to figure" diff --git a/newtest3/reviews3/20250406T145752Z-c65b0/data.csv b/newtest3/reviews3/20250406T145752Z-c65b0/data.csv deleted file mode 100644 index 646dc149..00000000 --- a/newtest3/reviews3/20250406T145752Z-c65b0/data.csv +++ /dev/null @@ -1,4 +0,0 @@ -,x -0,1 -1,2 -2,3 diff --git a/newtest3/reviews3/20250406T145752Z-c65b0/data.txt b/newtest3/reviews3/20250406T145752Z-c65b0/data.txt deleted file mode 100644 index 2cedeae1..00000000 --- a/newtest3/reviews3/20250406T145752Z-c65b0/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -api_version: 1 -created: 20250406T145752Z -description: null -file: data.csv -file_size: 15 -pin_hash: c65b0e9785abaa60 -title: 'reviews3: a pinned str object' -type: file -user: {} diff --git a/test4.csv b/test4.csv deleted file mode 100644 index b879b228..00000000 --- a/test4.csv +++ /dev/null @@ -1,4 +0,0 @@ -review -"This has been the best TV I've ever used. Great screen, and sound." -I regret buying this laptop. It is too slow and the keyboard is too noisy -"Not sure how to feel about my new washing machine. 
Great color, but hard to figure" From bf317f0f33ba64fd22196fb42120049f3071bd9d Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 7 Apr 2025 17:15:16 -0500 Subject: [PATCH 30/65] Fixes issue with not handling board versioning for test boards --- pins/tests/helpers.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index 1ac29400..5292468d 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -206,16 +206,16 @@ def teardown(self): class DbcBoardBuilder(BoardBuilder): def __init__(self, fs_name, path=None, *args, **kwargs): - self.fs_name = fs_name - self.path = None - self.current_board = "" + self.path = None + self.fs_name = fs_name + self.current_board = None + self.volume = os.environ.get("DATABRICKS_VOLUME") def create_tmp_board(self, src_board=None, versioned=True): - db_vol = os.environ.get("DATABRICKS_VOLUME") temp_name = str(uuid.uuid4()) - board_name = os.path.join(db_vol, temp_name) - board = board_databricks(board_name) - self.current_board = temp_name + board_name = os.path.join(self.volume, temp_name) + board = board_databricks(board_name, versioned=versioned) + self.current_board = board if src_board is not None: board.fs.put(src_board, board_name) return board @@ -224,9 +224,8 @@ def teardown_board(self, board): board.fs.rm(board.board) def teardown(self): - db_vol = os.environ.get("DATABRICKS_VOLUME") - board = board_databricks(db_vol) - board.fs.rm(db_vol + "/" + self.current_board) + board = board_databricks(self.volume) + board.fs.rm(self.current_board.board) # Snapshot ==================================================================== From 9c3d7ddb661f9e64f4c20649b79a135705fe2242 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Tue, 8 Apr 2025 11:00:25 -0500 Subject: [PATCH 31/65] Passes constructor test --- pins/cache.py | 11 ++++++++++- pins/databricks/fs.py | 16 ++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pins/cache.py b/pins/cache.py index 144938c5..89e63d02 100644 --- a/pins/cache.py +++ b/pins/cache.py @@ -145,13 +145,22 @@ def _check_file(self, path): if os.path.exists(fn): return fn -class PinsDBCache(SimpleCacheFileSystem): +class PinsDBCache(PinsCache): # Same as PinsCache, but removes _make_local_details def __init__(self, *args, hash_prefix=None, mapper=HashMapper, **kwargs): super().__init__(*args, **kwargs) self.hash_prefix = hash_prefix self._mapper = mapper(hash_prefix) + def _open(self, path, *args, **kwargs): + # For some reason, the open method of SimpleCacheFileSystem doesn't + # call _make_local_details, so we need to patch in here. + # Note that methods like .cat() do call it. Other Caches don't have this issue. 
+ #path = self._strip_protocol(path) + self._make_local_details(path) + + return super()._open(path, *args, **kwargs) + def hash_name(self, path, *args, **kwargs): return self._mapper(path) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index a3a6d78d..b0685b3b 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -120,20 +120,28 @@ def _databricks_is_type(path: str): def _databricks_ls(path, detail): w = WorkspaceClient() + + if(_databricks_is_type(path) == "file"): + if(detail): + return [dict(name = path, size = None, type = "file")] + else: + return path + contents_raw = w.files.list_directory_contents(path) contents = list(contents_raw) items = [] for item in contents: item = _databricks_content_details(item) - name = item.get("name") - if(detail): + item_path = item.get("path") + if(detail): if(item.get("is_directory")): type = "directory" + item_path = item_path.rstrip("/") else: type = "file" - items.append(dict(name = name, size = None, type = type)) + items.append(dict(name = item_path, size = None, type = type)) else: - items.append(name) + items.append(item_path) return items def _databricks_rm_dir(path): From df63ef5568e82c58b7ee97aa411112505965151b Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Tue, 8 Apr 2025 17:03:44 -0500 Subject: [PATCH 32/65] Removes PinsDBCache and manual registration of dbc --- .env.dev | 5 +++++ pins/cache.py | 29 ----------------------------- pins/constructors.py | 11 ++--------- 3 files changed, 7 insertions(+), 38 deletions(-) diff --git a/.env.dev b/.env.dev index 02a8f29c..df405ed8 100644 --- a/.env.dev +++ b/.env.dev @@ -30,3 +30,8 @@ RSC_LICENSE= # (Note that the local file backend always uses a temporary directory.) # # PINS_TEST_S3__PATH="ci-pins" + +# Databricks backend ---- +DATABRICKS_HOST= +DATABRICKS_TOKEN= +DATABRICKS_VOLUME= diff --git a/pins/cache.py b/pins/cache.py index 89e63d02..ebffb3b7 100644 --- a/pins/cache.py +++ b/pins/cache.py @@ -145,33 +145,6 @@ def _check_file(self, path): if os.path.exists(fn): return fn -class PinsDBCache(PinsCache): - # Same as PinsCache, but removes _make_local_details - def __init__(self, *args, hash_prefix=None, mapper=HashMapper, **kwargs): - super().__init__(*args, **kwargs) - self.hash_prefix = hash_prefix - self._mapper = mapper(hash_prefix) - - def _open(self, path, *args, **kwargs): - # For some reason, the open method of SimpleCacheFileSystem doesn't - # call _make_local_details, so we need to patch in here. - # Note that methods like .cat() do call it. Other Caches don't have this issue. 
- #path = self._strip_protocol(path) - self._make_local_details(path) - - return super()._open(path, *args, **kwargs) - - def hash_name(self, path, *args, **kwargs): - return self._mapper(path) - - # same as upstream, brought in to preserve backwards compatibility - def _check_file(self, path): - self._check_cache() - sha = self._mapper(path) - for storage in self.storage: - fn = os.path.join(storage, sha) - if os.path.exists(fn): - return fn class PinsUrlCache(PinsCache): protocol = "pinsurlcache" @@ -351,5 +324,3 @@ def cache_prune(days=30, cache_root=None, prompt=True): # TODO: swap to use entrypoint register_implementation("pinscache", PinsCache) -from .tests.helpers import DbcBoardBuilder -register_implementation("dbc", DbcBoardBuilder) \ No newline at end of file diff --git a/pins/constructors.py b/pins/constructors.py index 2f8d407c..b90b6ae9 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -8,7 +8,7 @@ import fsspec from .boards import BaseBoard, BoardManual, BoardRsConnect, board_deparse -from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache, PinsDBCache +from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache from .config import get_cache_dir, get_data_dir # Kept here for backward-compatibility reasons @@ -113,14 +113,7 @@ def board( hash_prefix=hash_prefix, same_names=True, mapper=PinsRscCacheMapper, - ) - elif protocol == "dbc": - #Find me here - board_cache = prefix_cache(fs, path) - cache_dir = os.path.join(base_cache_dir, board_cache) - fs = PinsDBCache( - cache_storage=cache_dir, fs=fs, hash_prefix=path, same_names=True - ) + ) else: # ensures each subdir path is its own cache directory board_cache = prefix_cache(fs, path) From 9e286ea55371cbdf57763c0e98e56115ec3023b7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 10 Apr 2025 10:50:32 -0500 Subject: [PATCH 33/65] Emulates the structure of the other test boards --- pins/databricks/fs.py | 11 +++++++---- pins/tests/helpers.py | 6 ++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index b0685b3b..a90a5fe4 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -4,7 +4,7 @@ from typing import ClassVar from fsspec import AbstractFileSystem from databricks.sdk import WorkspaceClient - +from pins.errors import PinsError class DatabricksFs(AbstractFileSystem): protocol: ClassVar[str | tuple[str, ...]] = "dbc" @@ -91,6 +91,8 @@ def _get_files(path, recursive, **kwargs): _get_files(rpath, recursive, **kwargs) def _databricks_open(path): + if(_databricks_exists(path) == False): + raise PinsError("File or directory does not exist") w = WorkspaceClient() resp = w.files.download(path) f = BytesIO() @@ -119,8 +121,9 @@ def _databricks_is_type(path: str): return "file" def _databricks_ls(path, detail): + if(_databricks_exists(path) == False): + raise PinsError("File or directory does not exist") w = WorkspaceClient() - if(_databricks_is_type(path) == "file"): if(detail): return [dict(name = path, size = None, type = "file")] @@ -133,10 +136,10 @@ def _databricks_ls(path, detail): for item in contents: item = _databricks_content_details(item) item_path = item.get("path") + item_path = item_path.rstrip("/") if(detail): if(item.get("is_directory")): - type = "directory" - item_path = item_path.rstrip("/") + type = "directory" else: type = "file" items.append(dict(name = item_path, size = None, type = type)) diff --git a/pins/tests/helpers.py 
b/pins/tests/helpers.py
index 5292468d..ffa566a5 100644
--- a/pins/tests/helpers.py
+++ b/pins/tests/helpers.py
@@ -16,6 +16,8 @@
 
 from pins.constructors import board_databricks
 
+from pins.databricks.fs import DatabricksFs
+
 DEFAULT_CREATION_DATE = datetime(2020, 1, 13, 23, 58, 59)
 
 RSC_SERVER_URL = "http://localhost:3939"
@@ -213,8 +215,8 @@ def __init__(self, fs_name, path=None, *args, **kwargs):
 
     def create_tmp_board(self, src_board=None, versioned=True):
         temp_name = str(uuid.uuid4())
-        board_name = os.path.join(self.volume, temp_name)
-        board = board_databricks(board_name, versioned=versioned)
+        board_name = os.path.join(self.volume, temp_name)
+        board = BaseBoard(board_name, fs=DatabricksFs(), versioned=versioned)
         self.current_board = board
         if src_board is not None:
             board.fs.put(src_board, board_name)

From 98dd383910f1cfc21b1122dbdcf921f789f34350 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:34:19 -0500
Subject: [PATCH 34/65] Adds notes to the function, cleans up arguments

---
 pins/constructors.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/pins/constructors.py b/pins/constructors.py
index b90b6ae9..9d4253d4 100644
--- a/pins/constructors.py
+++ b/pins/constructors.py
@@ -575,14 +575,14 @@ def board_azure(path, versioned=True, cache=DEFAULT, allow_pickle_read=None):
     opts = {"use_listings_cache": False}
     return board("abfs", path, versioned, cache, allow_pickle_read, storage_options=opts)
 
-def board_databricks(folder_url, versioned=True, cache=DEFAULT, allow_pickle_read=None):
+def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None):
     """Create a board to read and write pins from a Databricks Volume folder.
 
     Parameters
     ----------
-    folder_url:
+    path:
         The path to the target folder inside Unity Catalog. The path must include the
-        catalog, schema, and volume names, preceded by 'Volumes/', like
+        catalog, schema, and volume names, preceded by 'Volumes/', for example:
         "/Volumes/my-catalog/my-schema/my-volume".
     versioned:
         Whether or not pins should be versioned.
@@ -601,11 +602,11 @@ def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None
 
     Notes
    -----
-    The Databricks board uses...
+    The Databricks board uses the `databricks-sdk` library to authenticate and interact
+    with the Databricks Volume. 
+
+    See
+
     """
 
-    kwargs = dict(folder_url=folder_url)
-    return board("dbc", folder_url, versioned, cache, allow_pickle_read, storage_options=kwargs)
+    return board("dbc", path, versioned, cache, allow_pickle_read)

From 9c96fb21f500a19357c620528d52c20a34841a0d Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:43:54 -0500
Subject: [PATCH 35/65] Restores BoardRsConnect to helpers

---
 pins/tests/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py
index ffa566a5..e3884361 100644
--- a/pins/tests/helpers.py
+++ b/pins/tests/helpers.py
@@ -175,7 +175,7 @@ def __init__(self, fs_name, path=None, *args, **kwargs):
     def create_tmp_board(self, src_board=None, versioned=True):
         from pins.rsconnect.fs import PinBundleManifest  # noqa
 
-        board = BaseBoard(path, fs, versioned=versioned)
+        board = BoardRsConnect(path, fs, versioned=versioned)
 
         if src_board is None:
             return board

From b6726208413d94474b33eb6abd60a247f74ae64a Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:44:48 -0500
Subject: [PATCH 36/65] Adds full BoardRsConnect call

---
 pins/tests/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py
index e3884361..d0daa958 100644
--- a/pins/tests/helpers.py
+++ b/pins/tests/helpers.py
@@ -175,7 +175,7 @@ def __init__(self, fs_name, path=None, *args, **kwargs):
     def create_tmp_board(self, src_board=None, versioned=True):
         from pins.rsconnect.fs import PinBundleManifest  # noqa
 
-        board = BoardRsConnect(path, fs, versioned=versioned)
+        board = BoardRsConnect("", rsc_fs_from_key("derek"), versioned=versioned)
 
         if src_board is None:
             return board

From b5f375b093e27106097cef938697f12d6e541269 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Thu, 10 Apr 2025 11:58:02 -0500
Subject: [PATCH 37/65] Gets fs via calling the function for the tests

---
 pins/tests/helpers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py
index d0daa958..e609fb18 100644
--- a/pins/tests/helpers.py
+++ b/pins/tests/helpers.py
@@ -16,7 +16,6 @@
 
 from pins.constructors import board_databricks
 
-from pins.databricks.fs import DatabricksFs
 
 DEFAULT_CREATION_DATE = datetime(2020, 1, 13, 23, 58, 59)
 
@@ -215,8 +214,9 @@ def __init__(self, fs_name, path=None, *args, **kwargs):
 
     def create_tmp_board(self, src_board=None, versioned=True):
         temp_name = str(uuid.uuid4())
-        board_name = os.path.join(self.volume, temp_name)
-        board = BaseBoard(board_name, fs=DatabricksFs(), versioned=versioned)
+        board_name = os.path.join(self.volume, temp_name)
+        db_board = board_databricks(board_name, cache=None)
+        board = BaseBoard(board_name, fs=db_board.fs, versioned=versioned)
         self.current_board = board
         if src_board is not None:
             board.fs.put(src_board, board_name)

From db8cb82720bbd65ed56e8d881860074df5233d91 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Thu, 10 Apr 2025 12:07:08 -0500
Subject: [PATCH 38/65] Removes Databricks from CI tests

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 72025c42..7662f73d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
       - name: Run tests
        shell: bash
        run: |
-          pytest 
pins -m 'not fs_rsc and not skip_on_github' $PYTEST_OPTS + pytest pins -m 'not fs_rsc and not skip_on_github and not fs_dbc' $PYTEST_OPTS env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 2bc220a600d21ca66ee7ee6b0721d5565eeefd6d Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 10 Apr 2025 13:46:28 -0500 Subject: [PATCH 39/65] Adds databricks-sdk to requirements dev file --- pyproject.toml | 1 + requirements/dev.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 20cb89f4..e097a31a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ test = [ "pytest-parallel", "s3fs", "rdata", + "databricks-sdk", ] [build-system] diff --git a/requirements/dev.txt b/requirements/dev.txt index 24473daf..26fa76b8 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -92,6 +92,8 @@ cryptography==44.0.0 # azure-storage-blob # msal # pyjwt +databricks-sdk==0.49.0 + # via pins (pyproject.toml) debugpy==1.8.11 # via ipykernel decopatch==1.4.10 From c59e074cced1f5686d81480af0459b1fe3d3ab56 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:26:30 -0500 Subject: [PATCH 40/65] Updates rest of dev reqs versions --- requirements/dev.txt | 160 +++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 73 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 26fa76b8..63ca07ff 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2,18 +2,18 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile --extra=check --extra=doc --extra=test --output-file=- pyproject.toml +# pip-compile --extra=check --extra=doc --extra=test # --index-url https://pypi.python.org/simple/ --trusted-host pypi.org adlfs==2024.12.0 # via pins (pyproject.toml) -aiobotocore==2.15.2 +aiobotocore==2.21.1 # via s3fs -aiohappyeyeballs==2.4.4 +aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.11.10 +aiohttp==3.11.16 # via # adlfs # aiobotocore @@ -33,37 +33,37 @@ appnope==0.1.4 # ipython asttokens==3.0.0 # via stack-data -attrs==24.3.0 +attrs==25.3.0 # via # aiohttp # jsonschema # pytest # referencing # sphobjinv -azure-core==1.32.0 +azure-core==1.33.0 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.19.0 +azure-identity==1.21.0 # via adlfs -azure-storage-blob==12.24.0 +azure-storage-blob==12.25.1 # via adlfs backcall==0.2.0 # via ipython -beartype==0.19.0 +beartype==0.20.2 # via plum-dispatch -black==24.10.0 +black==25.1.0 # via quartodoc -botocore==1.35.36 +botocore==1.37.1 # via aiobotocore build==1.2.2.post1 # via pip-tools -cachetools==5.5.0 +cachetools==5.5.2 # via google-auth -certifi==2024.12.14 +certifi==2025.1.31 # via # requests # sphobjinv @@ -73,9 +73,9 @@ cffi==1.17.1 # cryptography cfgv==3.4.0 # via pre-commit -charset-normalizer==3.4.0 +charset-normalizer==3.4.1 # via requests -click==8.1.7 +click==8.1.8 # via # black # pip-tools @@ -86,51 +86,52 @@ comm==0.2.2 # via ipykernel cramjam==2.9.1 # via fastparquet -cryptography==44.0.0 +cryptography==44.0.2 # via # azure-identity # azure-storage-blob # msal # pyjwt databricks-sdk==0.49.0 - # via pins (pyproject.toml) -debugpy==1.8.11 + # via pins (pyproject.toml) +debugpy==1.8.13 # via ipykernel decopatch==1.4.10 # via pytest-cases -decorator==5.1.1 +decorator==5.2.1 # via # gcsfs # 
ipython distlib==0.3.9 # via virtualenv -executing==2.1.0 +executing==2.2.0 # via stack-data fastjsonschema==2.21.1 # via nbformat fastparquet==2024.11.0 # via pins (pyproject.toml) -filelock==3.16.1 +filelock==3.18.0 # via virtualenv frozenlist==1.5.0 # via # aiohttp # aiosignal -fsspec==2024.10.0 +fsspec==2025.3.2 # via # adlfs # fastparquet # gcsfs # pins (pyproject.toml) # s3fs -gcsfs==2024.10.0 +gcsfs==2025.3.2 # via pins (pyproject.toml) -google-api-core==2.24.0 +google-api-core==2.24.2 # via # google-cloud-core # google-cloud-storage -google-auth==2.37.0 +google-auth==2.38.0 # via + # databricks-sdk # gcsfs # google-api-core # google-auth-oauthlib @@ -138,37 +139,37 @@ google-auth==2.37.0 # google-cloud-storage google-auth-oauthlib==1.2.1 # via gcsfs -google-cloud-core==2.4.1 +google-cloud-core==2.4.3 # via google-cloud-storage -google-cloud-storage==2.19.0 +google-cloud-storage==3.1.0 # via gcsfs -google-crc32c==1.6.0 +google-crc32c==1.7.1 # via # google-cloud-storage # google-resumable-media google-resumable-media==2.7.2 # via google-cloud-storage -googleapis-common-protos==1.66.0 +googleapis-common-protos==1.69.2 # via google-api-core -griffe==1.5.1 +griffe==1.7.2 # via quartodoc -humanize==4.11.0 +humanize==4.12.2 # via pins (pyproject.toml) -identify==2.6.3 +identify==2.6.9 # via pre-commit idna==3.10 # via # requests # yarl -importlib-metadata==8.5.0 +importlib-metadata==8.6.1 # via # pins (pyproject.toml) # quartodoc -importlib-resources==6.4.5 +importlib-resources==6.5.2 # via # pins (pyproject.toml) # quartodoc -iniconfig==2.0.0 +iniconfig==2.1.0 # via pytest ipykernel==6.29.5 # via pins (pyproject.toml) @@ -180,10 +181,12 @@ isodate==0.7.2 # via azure-storage-blob jedi==0.19.2 # via ipython -jinja2==3.1.4 +jinja2==3.1.6 # via pins (pyproject.toml) jmespath==1.0.1 - # via botocore + # via + # aiobotocore + # botocore joblib==1.4.2 # via pins (pyproject.toml) jsonschema==4.23.0 @@ -216,20 +219,21 @@ matplotlib-inline==0.1.7 # ipython mdurl==0.1.2 # via markdown-it-py -msal==1.31.1 +msal==1.32.0 # via # azure-datalake-store # azure-identity # msal-extensions -msal-extensions==1.2.0 +msal-extensions==1.3.1 # via azure-identity -multidict==6.1.0 +multidict==6.4.2 # via + # aiobotocore # aiohttp # yarl mypy-extensions==1.0.0 # via black -nbclient==0.10.1 +nbclient==0.10.2 # via pins (pyproject.toml) nbformat==5.10.4 # via @@ -241,10 +245,12 @@ nodeenv==1.9.1 # via # pre-commit # pyright -numpy==2.2.0 +numpy==2.2.4 # via # fastparquet # pandas + # rdata + # xarray oauthlib==3.2.2 # via requests-oauthlib packaging==24.2 @@ -255,10 +261,13 @@ packaging==24.2 # ipykernel # pytest # pytest-cases + # xarray pandas==2.2.3 # via # fastparquet # pins (pyproject.toml) + # rdata + # xarray parso==0.8.4 # via jedi pathspec==0.12.1 @@ -269,33 +278,31 @@ pickleshare==0.7.5 # via ipython pip-tools==7.4.1 # via pins (pyproject.toml) -platformdirs==4.3.6 +platformdirs==4.3.7 # via # black # jupyter-core # virtualenv pluggy==1.5.0 # via pytest -plum-dispatch==2.5.4 +plum-dispatch==2.5.7 # via quartodoc -portalocker==2.10.1 - # via msal-extensions -pre-commit==4.0.1 +pre-commit==4.2.0 # via pins (pyproject.toml) -prompt-toolkit==3.0.48 +prompt-toolkit==3.0.50 # via ipython -propcache==0.2.1 +propcache==0.3.1 # via # aiohttp # yarl -proto-plus==1.25.0 +proto-plus==1.26.1 # via google-api-core -protobuf==5.29.1 +protobuf==6.30.2 # via # google-api-core # googleapis-common-protos # proto-plus -psutil==6.1.0 +psutil==7.0.0 # via ipykernel ptyprocess==0.7.0 # via pexpect @@ -303,21 +310,21 @@ 
pure-eval==0.2.3 # via stack-data py==1.11.0 # via pytest -pyarrow==18.1.0 +pyarrow==19.0.1 # via pins (pyproject.toml) pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.4.1 +pyasn1-modules==0.4.2 # via google-auth pycparser==2.22 # via cffi -pydantic==2.10.3 +pydantic==2.11.3 # via quartodoc -pydantic-core==2.27.1 +pydantic-core==2.33.1 # via pydantic -pygments==2.18.0 +pygments==2.19.1 # via # ipython # rich @@ -344,19 +351,20 @@ pytest-parallel==0.1.1 # via pins (pyproject.toml) python-dateutil==2.9.0.post0 # via + # aiobotocore # botocore # jupyter-client # pandas -python-dotenv==1.0.1 +python-dotenv==1.1.0 # via pytest-dotenv -pytz==2024.2 +pytz==2025.2 # via pandas pyyaml==6.0.2 # via # pins (pyproject.toml) # pre-commit # quartodoc -pyzmq==26.2.0 +pyzmq==26.4.0 # via # ipykernel # jupyter-client @@ -364,7 +372,7 @@ quartodoc==0.9.1 # via pins (pyproject.toml) rdata==0.11.2 # via pins (pyproject.toml) -referencing==0.35.1 +referencing==0.36.2 # via # jsonschema # jsonschema-specifications @@ -372,6 +380,7 @@ requests==2.32.3 # via # azure-core # azure-datalake-store + # databricks-sdk # gcsfs # google-api-core # google-cloud-storage @@ -381,27 +390,27 @@ requests==2.32.3 # requests-oauthlib requests-oauthlib==2.0.0 # via google-auth-oauthlib -rich==13.9.4 +rich==14.0.0 # via plum-dispatch -rpds-py==0.22.3 +rpds-py==0.24.0 # via # jsonschema # referencing rsa==4.9 # via google-auth -s3fs==2024.10.0 +s3fs==2025.3.2 # via pins (pyproject.toml) six==1.17.0 # via # azure-core # python-dateutil -sphobjinv==2.3.1.1 +sphobjinv==2.3.1.2 # via quartodoc stack-data==0.6.3 # via ipython tabulate==0.9.0 # via quartodoc -tblib==3.0.0 +tblib==3.1.0 # via pytest-parallel tomli==2.2.1 # via pytest @@ -421,7 +430,7 @@ traitlets==5.14.3 # nbformat types-appdirs==1.4.3.5 # via pins (pyproject.toml) -typing-extensions==4.12.2 +typing-extensions==4.13.2 # via # azure-core # azure-identity @@ -430,13 +439,18 @@ typing-extensions==4.12.2 # pydantic # pydantic-core # quartodoc -tzdata==2024.2 + # rdata + # referencing + # typing-inspection +typing-inspection==0.4.0 + # via pydantic +tzdata==2025.2 # via pandas -urllib3==2.2.3 +urllib3==2.4.0 # via # botocore # requests -virtualenv==20.28.0 +virtualenv==20.30.0 # via pre-commit watchdog==6.0.0 # via quartodoc @@ -444,13 +458,13 @@ wcwidth==0.2.13 # via prompt-toolkit wheel==0.45.1 # via pip-tools -wrapt==1.17.0 +wrapt==1.17.2 # via aiobotocore -xarray==2024.6.0 +xarray==2025.3.1 # via rdata xxhash==3.5.0 # via pins (pyproject.toml) -yarl==1.18.3 +yarl==1.19.0 # via aiohttp zipp==3.21.0 # via importlib-metadata From 9ca444d4f89895391a68519e8aea17c654272c37 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:29:47 -0500 Subject: [PATCH 41/65] Attempts to avoid double forward slash in Linux --- pins/drivers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pins/drivers.py b/pins/drivers.py index fcf9bee3..d6bea3a5 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -30,6 +30,7 @@ def load_path(filename: str, path_to_version, pin_type=None): filename = "data.csv" if path_to_version is not None: + path_to_version = path_to_version.rstrip("/") path_to_file = f"{path_to_version}/{filename}" else: # BoardUrl doesn't have versions, and the file is the full url From 06fb022d1c3a2f37c49fba11a1a121b52f69183c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:55:39 -0500 Subject: [PATCH 42/65] Checks if path_to_version is a str 
before removing trailing slash --- pins/drivers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pins/drivers.py b/pins/drivers.py index d6bea3a5..1a6818f0 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -30,7 +30,8 @@ def load_path(filename: str, path_to_version, pin_type=None): filename = "data.csv" if path_to_version is not None: - path_to_version = path_to_version.rstrip("/") + if(isinstance(path_to_version), str): + path_to_version = path_to_version.rstrip("/") path_to_file = f"{path_to_version}/{filename}" else: # BoardUrl doesn't have versions, and the file is the full url From b5a817ace5958adf2420111fa40206f52dbea24d Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 14:14:09 -0500 Subject: [PATCH 43/65] Fixes typo on isinstance call --- pins/drivers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pins/drivers.py b/pins/drivers.py index 1a6818f0..7de62465 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -30,7 +30,7 @@ def load_path(filename: str, path_to_version, pin_type=None): filename = "data.csv" if path_to_version is not None: - if(isinstance(path_to_version), str): + if(isinstance(path_to_version, str)): path_to_version = path_to_version.rstrip("/") path_to_file = f"{path_to_version}/{filename}" else: From 2dbe9bee43e80fdaab6b38b7edfda4ed1a242ce3 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 14:43:17 -0500 Subject: [PATCH 44/65] Removes protocol assignment for DatabricksFs --- pins/databricks/fs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index a90a5fe4..ffd41ab1 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -7,7 +7,6 @@ from pins.errors import PinsError class DatabricksFs(AbstractFileSystem): - protocol: ClassVar[str | tuple[str, ...]] = "dbc" def ls(self, path, detail=False, **kwargs): return _databricks_ls(path, detail) From 7e70f1690b1f0ace2e308f459448c52b7a877f4b Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 15:33:31 -0500 Subject: [PATCH 45/65] Passes pre-commit --- pins/__init__.py | 2 +- pins/boards.py | 2 +- pins/constructors.py | 19 +++++---- pins/databricks/fs.py | 95 ++++++++++++++++++++++++------------------ pins/drivers.py | 2 +- pins/tests/conftest.py | 8 +++- pins/tests/helpers.py | 14 +++---- pins/utils.py | 1 + 8 files changed, 82 insertions(+), 61 deletions(-) diff --git a/pins/__init__.py b/pins/__init__.py index 5bf14ef9..4ce19945 100644 --- a/pins/__init__.py +++ b/pins/__init__.py @@ -22,7 +22,7 @@ board_azure, board_s3, board_gcs, - board_databricks, + board_databricks, board, ) from .boards import board_deparse diff --git a/pins/boards.py b/pins/boards.py index cbca99ec..c5f600d0 100644 --- a/pins/boards.py +++ b/pins/boards.py @@ -869,7 +869,7 @@ def board_deparse(board: BaseBoard): elif prot == "http": return f"board_url({repr(board.board)}, {board.pin_paths}{allow_pickle})" elif prot == "dbc": - return f"board_databricks({repr(board.board)}{allow_pickle})" + return f"board_databricks({repr(board.board)}{allow_pickle})" else: raise NotImplementedError( f"board deparsing currently not supported for protocol: {prot}" diff --git a/pins/constructors.py b/pins/constructors.py index 9d4253d4..b4f21331 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -87,8 +87,8 @@ def board( fs = RsConnectFs(**storage_options) - elif protocol 
== "dbc" : - from pins.databricks.fs import DatabricksFs + elif protocol == "dbc": + from pins.databricks.fs import DatabricksFs fs = DatabricksFs(**storage_options) @@ -113,12 +113,12 @@ def board( hash_prefix=hash_prefix, same_names=True, mapper=PinsRscCacheMapper, - ) + ) else: # ensures each subdir path is its own cache directory board_cache = prefix_cache(fs, path) cache_dir = os.path.join(base_cache_dir, board_cache) - + fs = PinsCache( cache_storage=cache_dir, fs=fs, hash_prefix=path, same_names=True ) @@ -134,7 +134,7 @@ def board( if board_factory is not None: board = board_factory(path, fs, versioned, **pickle_kwargs) elif protocol == "rsc": - board = BoardRsConnect(path, fs, versioned, **pickle_kwargs) + board = BoardRsConnect(path, fs, versioned, **pickle_kwargs) else: board = BaseBoard(path, fs, versioned, **pickle_kwargs) return board @@ -575,14 +575,15 @@ def board_azure(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): opts = {"use_listings_cache": False} return board("abfs", path, versioned, cache, allow_pickle_read, storage_options=opts) + def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): """Create a board to read and write pins from an Databricks Volume folder. Parameters ---------- path: - The path to the target folder inside Unity Catalog. The path must include the - catalog, schema, and volume names, preceded by 'Volumes/', for example: + The path to the target folder inside Unity Catalog. The path must include the + catalog, schema, and volume names, preceded by 'Volumes/', for example: "/Volumes/my-catalog/my-schema/my-volume". versioned: Whether or not pins should be versioned. @@ -601,11 +602,11 @@ def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None Notes ----- - The Databricks board uses the `databricks-sdk` library to authenticate and interact + The Databricks board uses the `databricks-sdk` library to authenticate and interact with the Databricks Volume. 
See - + """ return board("dbc", path, versioned, cache, allow_pickle_read) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index ffd41ab1..a2341b26 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -1,14 +1,16 @@ import os import shutil from io import BytesIO -from typing import ClassVar -from fsspec import AbstractFileSystem + from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound +from fsspec import AbstractFileSystem + from pins.errors import PinsError -class DatabricksFs(AbstractFileSystem): - def ls(self, path, detail=False, **kwargs): +class DatabricksFs(AbstractFileSystem): + def ls(self, path, detail=False, **kwargs): return _databricks_ls(path, detail) def exists(self, path: str, **kwargs): @@ -16,9 +18,9 @@ def exists(self, path: str, **kwargs): def open(self, path: str, mode: str = "rb", *args, **kwargs): if mode != "rb": - raise NotImplementedError + raise NotImplementedError return _databricks_open(path) - + def get(self, rpath, lpath, recursive=False, **kwargs): _databricks_get(self, rpath, lpath, recursive, **kwargs) @@ -46,51 +48,57 @@ def rm(self, path, recursive=True, maxdepth=None) -> None: raise NotImplementedError if maxdepth is not None: raise NotImplementedError - if(_databricks_exists(path)): + if _databricks_exists(path): _databricks_rm_dir(path) + def _databricks_put(lpath, rpath): w = WorkspaceClient() path = os.path.abspath(lpath) - items = [] orig_path = path + def _upload_files(path): contents = os.listdir(path) - for item in contents: + for item in contents: abs_path = os.path.join(path, item) is_file = os.path.isfile(abs_path) - if(is_file): + if is_file: rel_path = os.path.relpath(abs_path, orig_path) - db_path = os.path.join(rpath, rel_path) + db_path = os.path.join(rpath, rel_path) file = open(abs_path, "rb") w.files.upload(db_path, BytesIO(file.read()), overwrite=True) else: _upload_files(abs_path) + _upload_files(path) -def _databricks_get(board, rpath, lpath, recursive = False, **kwargs): + +def _databricks_get(board, rpath, lpath, recursive=False, **kwargs): w = WorkspaceClient() file_type = _databricks_is_type(rpath) - if(file_type == "file"): + if file_type == "file": board.fs.get(rpath, lpath, **kwargs) return + def _get_files(path, recursive, **kwargs): raw_contents = w.files.list_directory_contents(path) contents = list(raw_contents) - details = list(map(_databricks_content_details, contents)) - for item in details: - item_path = item.get("path") - if(item.get("is_directory")): - if(recursive): - _get_files(item_path, recursive = recursive, **kwargs) + details = list(map(_databricks_content_details, contents)) + for item in details: + item_path = item.get("path") + if item.get("is_directory"): + if recursive: + _get_files(item_path, recursive=recursive, **kwargs) else: rel_path = os.path.relpath(item_path, rpath) target_path = os.path.join(lpath, rel_path) board.fs.get(item_path, target_path) + _get_files(rpath, recursive, **kwargs) + def _databricks_open(path): - if(_databricks_exists(path) == False): + if not _databricks_exists(path): raise PinsError("File or directory does not exist") w = WorkspaceClient() resp = w.files.download(path) @@ -99,33 +107,36 @@ def _databricks_open(path): f.seek(0) return f + def _databricks_exists(path: str): - if(_databricks_is_type(path) == "nothing"): + if _databricks_is_type(path) == "nothing": return False else: return True + def _databricks_is_type(path: str): w = WorkspaceClient() try: w.files.get_metadata(path) - except: + except NotFound: try: - 
w.files.get_directory_metadata(path) - except: + w.files.get_directory_metadata(path) + except NotFound: return "nothing" else: return "directory" else: - return "file" + return "file" + def _databricks_ls(path, detail): - if(_databricks_exists(path) == False): - raise PinsError("File or directory does not exist") + if not _databricks_exists(path): + raise PinsError("File or directory does not exist") w = WorkspaceClient() - if(_databricks_is_type(path) == "file"): - if(detail): - return [dict(name = path, size = None, type = "file")] + if _databricks_is_type(path) == "file": + if detail: + return [dict(name=path, size=None, type="file")] else: return path @@ -136,33 +147,35 @@ def _databricks_ls(path, detail): item = _databricks_content_details(item) item_path = item.get("path") item_path = item_path.rstrip("/") - if(detail): - if(item.get("is_directory")): - type = "directory" + if detail: + if item.get("is_directory"): + item_type = "directory" else: - type = "file" - items.append(dict(name = item_path, size = None, type = type)) + item_type = "file" + items.append(dict(name=item_path, size=None, type=item_type)) else: items.append(item_path) return items + def _databricks_rm_dir(path): w = WorkspaceClient() raw_contents = w.files.list_directory_contents(path) contents = list(raw_contents) details = list(map(_databricks_content_details, contents)) - items = [] - for item in details: - item_path = item.get("path") - if(item.get("is_directory")): - _databricks_rm_dir(item_path) + for item in details: + item_path = item.get("path") + if item.get("is_directory"): + _databricks_rm_dir(item_path) else: w.files.delete(item_path) w.files.delete_directory(path) + def _databricks_mkdir(path): w = WorkspaceClient() - w.files.create_directory(path) + w.files.create_directory(path) + def _databricks_content_details(item): details = { diff --git a/pins/drivers.py b/pins/drivers.py index 7de62465..14f5e84b 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -30,7 +30,7 @@ def load_path(filename: str, path_to_version, pin_type=None): filename = "data.csv" if path_to_version is not None: - if(isinstance(path_to_version, str)): + if isinstance(path_to_version, str): path_to_version = path_to_version.rstrip("/") path_to_file = f"{path_to_version}/{filename}" else: diff --git a/pins/tests/conftest.py b/pins/tests/conftest.py index cbdb9a11..6589955b 100644 --- a/pins/tests/conftest.py +++ b/pins/tests/conftest.py @@ -6,7 +6,13 @@ from importlib_resources import files from pytest import mark as m -from pins.tests.helpers import BoardBuilder, RscBoardBuilder, DbcBoardBuilder, Snapshot, rm_env +from pins.tests.helpers import ( + BoardBuilder, + DbcBoardBuilder, + RscBoardBuilder, + Snapshot, + rm_env, +) EXAMPLE_REL_PATH = "pins/tests/pins-compat" PATH_TO_EXAMPLE_BOARD = files("pins") / "tests/pins-compat" diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index e609fb18..93da5176 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -13,10 +13,8 @@ from importlib_resources import files from pins.boards import BaseBoard, BoardRsConnect - from pins.constructors import board_databricks - DEFAULT_CREATION_DATE = datetime(2020, 1, 13, 23, 58, 59) RSC_SERVER_URL = "http://localhost:3939" @@ -205,21 +203,22 @@ def teardown_board(self, board): def teardown(self): self.teardown_board(self.create_tmp_board()) + class DbcBoardBuilder(BoardBuilder): def __init__(self, fs_name, path=None, *args, **kwargs): - self.path = None - self.fs_name = fs_name + self.path = None + self.fs_name = fs_name 
self.current_board = None self.volume = os.environ.get("DATABRICKS_VOLUME") def create_tmp_board(self, src_board=None, versioned=True): temp_name = str(uuid.uuid4()) board_name = os.path.join(self.volume, temp_name) - db_board = board_databricks(board_name, cache=None) + db_board = board_databricks(board_name, cache=None) board = BaseBoard(board_name, fs=db_board.fs, versioned=versioned) self.current_board = board if src_board is not None: - board.fs.put(src_board, board_name) + board.fs.put(src_board, board_name) return board def teardown_board(self, board): @@ -229,6 +228,7 @@ def teardown(self): board = board_databricks(self.volume) board.fs.rm(self.current_board.board) + # Snapshot ==================================================================== @@ -298,4 +298,4 @@ def rm_env(*args): yield finally: os.environ.clear() - os.environ.update(old_environ) \ No newline at end of file + os.environ.update(old_environ) diff --git a/pins/utils.py b/pins/utils.py index 6af9bab5..7525c148 100644 --- a/pins/utils.py +++ b/pins/utils.py @@ -7,6 +7,7 @@ from .config import pins_options + def inform(log, msg): if log is not None: log.info(msg) From 1361d27d7f6a0ecba03480d9c7e70009c77d010c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 15:47:27 -0500 Subject: [PATCH 46/65] Adds databricks-sdk to minimum reqs --- requirements/minimum.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/minimum.txt b/requirements/minimum.txt index 325990dc..e2868b02 100644 --- a/requirements/minimum.txt +++ b/requirements/minimum.txt @@ -7,3 +7,4 @@ importlib-metadata==4.4 importlib-resources==1.3 appdirs<2.0.0 humanize==1.0.0 +databricks-sdk==0.49.0 From 7a21bd6a26480301a32249ab3d6f65c057eea0cf Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:06:29 -0500 Subject: [PATCH 47/65] Addresses additional issue from precommit and attempts to solve pyright error --- pins/tests/test_boards.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pins/tests/test_boards.py b/pins/tests/test_boards.py index 855c707e..dcd551ee 100644 --- a/pins/tests/test_boards.py +++ b/pins/tests/test_boards.py @@ -40,7 +40,7 @@ def board_unversioned(backend): @fixture def board_with_cache(backend): from pins.constructors import board as board_constructor - from pins.constructors import board_rsconnect, board_databricks + from pins.constructors import board_databricks, board_rsconnect board = backend.create_tmp_board() diff --git a/pyproject.toml b/pyproject.toml index e097a31a..613f5d83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ "appdirs<2", # Using appdirs rather than platformdirs is deliberate, see https://github.com/rstudio/pins-python/pull/239 + "databricks-sdk==0.49.0", "fsspec>=2022.2", "humanize>=1", "importlib-metadata>=4.4", From 9d7f9e4a5253c3e0c5d4314febea3b96bbbcb59d Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 28 Apr 2025 09:37:38 -0500 Subject: [PATCH 48/65] Update pins/databricks/fs.py Co-authored-by: Isabel Zimmerman <54685329+isabelizimm@users.noreply.github.com> --- pins/databricks/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index a2341b26..1e552070 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -65,7 +65,7 @@ def 
_upload_files(path): if is_file: rel_path = os.path.relpath(abs_path, orig_path) db_path = os.path.join(rpath, rel_path) - file = open(abs_path, "rb") + with open(abs_path, "rb") as file: w.files.upload(db_path, BytesIO(file.read()), overwrite=True) else: _upload_files(abs_path) From f70b0d9b617d6660d4b3d5a0fc6e65b3e436270a Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 28 Apr 2025 09:37:44 -0500 Subject: [PATCH 49/65] Update requirements/minimum.txt Co-authored-by: Isabel Zimmerman <54685329+isabelizimm@users.noreply.github.com> --- requirements/minimum.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/minimum.txt b/requirements/minimum.txt index e2868b02..325990dc 100644 --- a/requirements/minimum.txt +++ b/requirements/minimum.txt @@ -7,4 +7,3 @@ importlib-metadata==4.4 importlib-resources==1.3 appdirs<2.0.0 humanize==1.0.0 -databricks-sdk==0.49.0 From 4a27260140ba673c905a8d9c98322e29536b6063 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 28 Apr 2025 09:37:54 -0500 Subject: [PATCH 50/65] Update pins/databricks/fs.py Co-authored-by: Isabel Zimmerman <54685329+isabelizimm@users.noreply.github.com> --- pins/databricks/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 1e552070..ae1c6a95 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -132,7 +132,7 @@ def _databricks_is_type(path: str): def _databricks_ls(path, detail): if not _databricks_exists(path): - raise PinsError("File or directory does not exist") + raise PinsError(f"File or directory does not exist: {path}") w = WorkspaceClient() if _databricks_is_type(path) == "file": if detail: From b1f9aecaaa6e23f3ab2f98c787bb9810b6146d3a Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 28 Apr 2025 09:38:04 -0500 Subject: [PATCH 51/65] Update pins/databricks/fs.py Co-authored-by: Isabel Zimmerman <54685329+isabelizimm@users.noreply.github.com> --- pins/databricks/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index ae1c6a95..3ad8cff7 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -99,7 +99,7 @@ def _get_files(path, recursive, **kwargs): def _databricks_open(path): if not _databricks_exists(path): - raise PinsError("File or directory does not exist") + raise PinsError(f"File or directory does not exist: {path}") w = WorkspaceClient() resp = w.files.download(path) f = BytesIO() From 400a7436bad330487f823bc168a538ea3f06be68 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 28 Apr 2025 14:14:46 -0500 Subject: [PATCH 52/65] Switches from os to pathlib --- pins/databricks/fs.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 3ad8cff7..807fdf9a 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -1,6 +1,6 @@ -import os import shutil from io import BytesIO +from pathlib import Path, PurePath from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound @@ -10,6 +10,8 @@ class DatabricksFs(AbstractFileSystem): + protocol = "dbc" + def ls(self, path, detail=False, **kwargs): return _databricks_ls(path, detail) @@ -54,19 +56,19 @@ def rm(self, path, recursive=True, maxdepth=None) -> None: def _databricks_put(lpath, 
rpath): w = WorkspaceClient() - path = os.path.abspath(lpath) + path = Path(lpath).absolute() orig_path = path def _upload_files(path): - contents = os.listdir(path) - for item in contents: - abs_path = os.path.join(path, item) - is_file = os.path.isfile(abs_path) + contents = Path(path) + for item in contents.iterdir(): + abs_path = PurePath(path).joinpath(item) + is_file = Path(abs_path).is_file() if is_file: - rel_path = os.path.relpath(abs_path, orig_path) - db_path = os.path.join(rpath, rel_path) - with open(abs_path, "rb") as file: - w.files.upload(db_path, BytesIO(file.read()), overwrite=True) + rel_path = abs_path.relative_to(orig_path) + db_path = PurePath(rpath).joinpath(rel_path) + file = open(abs_path, "rb") + w.files.upload(str(db_path), BytesIO(file.read()), overwrite=True) else: _upload_files(abs_path) @@ -90,16 +92,16 @@ def _get_files(path, recursive, **kwargs): if recursive: _get_files(item_path, recursive=recursive, **kwargs) else: - rel_path = os.path.relpath(item_path, rpath) - target_path = os.path.join(lpath, rel_path) - board.fs.get(item_path, target_path) + rel_path = PurePath(item_path).relative_to(rpath) + target_path = PurePath(lpath).joinpath(rel_path) + board.fs.get(item_path, str(target_path)) _get_files(rpath, recursive, **kwargs) def _databricks_open(path): if not _databricks_exists(path): - raise PinsError(f"File or directory does not exist: {path}") + raise PinsError("File or directory does not exist") w = WorkspaceClient() resp = w.files.download(path) f = BytesIO() @@ -132,7 +134,7 @@ def _databricks_is_type(path: str): def _databricks_ls(path, detail): if not _databricks_exists(path): - raise PinsError(f"File or directory does not exist: {path}") + raise PinsError("File or directory does not exist") w = WorkspaceClient() if _databricks_is_type(path) == "file": if detail: From 641c3ced434ce3aec0950849ce175ef379b16d80 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 28 Apr 2025 16:32:00 -0500 Subject: [PATCH 53/65] Converts functions to staticmethods --- pins/databricks/fs.py | 266 +++++++++++++++++++++--------------------- 1 file changed, 133 insertions(+), 133 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 807fdf9a..d71b4a95 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -13,23 +13,23 @@ class DatabricksFs(AbstractFileSystem): protocol = "dbc" def ls(self, path, detail=False, **kwargs): - return _databricks_ls(path, detail) + return self._databricks_ls(self, path, detail) def exists(self, path: str, **kwargs): - return _databricks_exists(path) + return self._databricks_exists(self, path) def open(self, path: str, mode: str = "rb", *args, **kwargs): if mode != "rb": raise NotImplementedError - return _databricks_open(path) + return self._databricks_open(self, path) def get(self, rpath, lpath, recursive=False, **kwargs): - _databricks_get(self, rpath, lpath, recursive, **kwargs) + self._databricks_get(self, rpath, lpath, recursive, **kwargs) def mkdir(self, path, create_parents=True, **kwargs): if not create_parents: raise NotImplementedError - _databricks_mkdir(path) + self._databricks_mkdir(path) def put( self, @@ -43,146 +43,146 @@ def put( raise NotImplementedError if maxdepth is not None: raise NotImplementedError - _databricks_put(lpath, rpath) + self._databricks_put(lpath, rpath) def rm(self, path, recursive=True, maxdepth=None) -> None: if not recursive: raise NotImplementedError if maxdepth is not None: raise NotImplementedError - if 
_databricks_exists(path): - _databricks_rm_dir(path) - - -def _databricks_put(lpath, rpath): - w = WorkspaceClient() - path = Path(lpath).absolute() - orig_path = path - - def _upload_files(path): - contents = Path(path) - for item in contents.iterdir(): - abs_path = PurePath(path).joinpath(item) - is_file = Path(abs_path).is_file() - if is_file: - rel_path = abs_path.relative_to(orig_path) - db_path = PurePath(rpath).joinpath(rel_path) - file = open(abs_path, "rb") - w.files.upload(str(db_path), BytesIO(file.read()), overwrite=True) - else: - _upload_files(abs_path) - - _upload_files(path) + if self._databricks_exists(self, path): + self._databricks_rm_dir(self, path) + + @staticmethod + def _databricks_put(lpath, rpath): + w = WorkspaceClient() + path = Path(lpath).absolute() + orig_path = path + + def _upload_files(path): + contents = Path(path) + for item in contents.iterdir(): + abs_path = PurePath(path).joinpath(item) + is_file = Path(abs_path).is_file() + if is_file: + rel_path = abs_path.relative_to(orig_path) + db_path = PurePath(rpath).joinpath(rel_path) + file = open(abs_path, "rb") + w.files.upload(str(db_path), BytesIO(file.read()), overwrite=True) + else: + _upload_files(abs_path) + + _upload_files(path) + + @staticmethod + def _databricks_get(self, board, rpath, lpath, recursive=False, **kwargs): + w = WorkspaceClient() + file_type = self._databricks_is_type(rpath) + if file_type == "file": + board.fs.get(rpath, lpath, **kwargs) + return + + def _get_files(self, path, recursive, **kwargs): + raw_contents = w.files.list_directory_contents(path) + contents = list(raw_contents) + details = list(map(self._databricks_content_details, contents)) + for item in details: + item_path = item.get("path") + if item.get("is_directory"): + if recursive: + _get_files(item_path, recursive=recursive, **kwargs) + else: + rel_path = PurePath(item_path).relative_to(rpath) + target_path = PurePath(lpath).joinpath(rel_path) + board.fs.get(item_path, str(target_path)) + + _get_files(self, rpath, recursive, **kwargs) + + @staticmethod + def _databricks_open(self, path): + if not self._databricks_exists(self, path): + raise PinsError("File or directory does not exist") + w = WorkspaceClient() + resp = w.files.download(path) + f = BytesIO() + shutil.copyfileobj(resp.contents, f) + f.seek(0) + return f + + @staticmethod + def _databricks_exists(self, path: str): + if self._databricks_is_type(path) == "nothing": + return False + else: + return True + @staticmethod + def _databricks_is_type(path: str): + w = WorkspaceClient() + try: + w.files.get_metadata(path) + except NotFound: + try: + w.files.get_directory_metadata(path) + except NotFound: + return "nothing" + else: + return "directory" + else: + return "file" + + @staticmethod + def _databricks_ls(self, path, detail): + if not self._databricks_exists(self, path): + raise PinsError("File or directory does not exist") + w = WorkspaceClient() + if self._databricks_is_type(path) == "file": + if detail: + return [dict(name=path, size=None, type="file")] + else: + return path -def _databricks_get(board, rpath, lpath, recursive=False, **kwargs): - w = WorkspaceClient() - file_type = _databricks_is_type(rpath) - if file_type == "file": - board.fs.get(rpath, lpath, **kwargs) - return + contents_raw = w.files.list_directory_contents(path) + contents = list(contents_raw) + items = [] + for item in contents: + item = self._databricks_content_details(item) + item_path = item.get("path") + item_path = item_path.rstrip("/") + if detail: + if 
item.get("is_directory"): + item_type = "directory" + else: + item_type = "file" + items.append(dict(name=item_path, size=None, type=item_type)) + else: + items.append(item_path) + return items - def _get_files(path, recursive, **kwargs): + @staticmethod + def _databricks_rm_dir(self, path): + w = WorkspaceClient() raw_contents = w.files.list_directory_contents(path) contents = list(raw_contents) - details = list(map(_databricks_content_details, contents)) + details = list(map(self._databricks_content_details, contents)) for item in details: item_path = item.get("path") if item.get("is_directory"): - if recursive: - _get_files(item_path, recursive=recursive, **kwargs) + self._databricks_rm_dir(self, item_path) else: - rel_path = PurePath(item_path).relative_to(rpath) - target_path = PurePath(lpath).joinpath(rel_path) - board.fs.get(item_path, str(target_path)) - - _get_files(rpath, recursive, **kwargs) - - -def _databricks_open(path): - if not _databricks_exists(path): - raise PinsError("File or directory does not exist") - w = WorkspaceClient() - resp = w.files.download(path) - f = BytesIO() - shutil.copyfileobj(resp.contents, f) - f.seek(0) - return f - - -def _databricks_exists(path: str): - if _databricks_is_type(path) == "nothing": - return False - else: - return True - - -def _databricks_is_type(path: str): - w = WorkspaceClient() - try: - w.files.get_metadata(path) - except NotFound: - try: - w.files.get_directory_metadata(path) - except NotFound: - return "nothing" - else: - return "directory" - else: - return "file" - - -def _databricks_ls(path, detail): - if not _databricks_exists(path): - raise PinsError("File or directory does not exist") - w = WorkspaceClient() - if _databricks_is_type(path) == "file": - if detail: - return [dict(name=path, size=None, type="file")] - else: - return path - - contents_raw = w.files.list_directory_contents(path) - contents = list(contents_raw) - items = [] - for item in contents: - item = _databricks_content_details(item) - item_path = item.get("path") - item_path = item_path.rstrip("/") - if detail: - if item.get("is_directory"): - item_type = "directory" - else: - item_type = "file" - items.append(dict(name=item_path, size=None, type=item_type)) - else: - items.append(item_path) - return items - - -def _databricks_rm_dir(path): - w = WorkspaceClient() - raw_contents = w.files.list_directory_contents(path) - contents = list(raw_contents) - details = list(map(_databricks_content_details, contents)) - for item in details: - item_path = item.get("path") - if item.get("is_directory"): - _databricks_rm_dir(item_path) - else: - w.files.delete(item_path) - w.files.delete_directory(path) - - -def _databricks_mkdir(path): - w = WorkspaceClient() - w.files.create_directory(path) - - -def _databricks_content_details(item): - details = { - "path": item.path, - "name": item.name, - "is_directory": item.is_directory, - } - return details + w.files.delete(item_path) + w.files.delete_directory(path) + + @staticmethod + def _databricks_mkdir(path): + w = WorkspaceClient() + w.files.create_directory(path) + + @staticmethod + def _databricks_content_details(item): + details = { + "path": item.path, + "name": item.name, + "is_directory": item.is_directory, + } + return details From 12310e0618a3eae1c11e3e1c72cef579c04b9006 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Mon, 2 Jun 2025 12:25:31 -0500 Subject: [PATCH 54/65] add docs for board_databricks --- docs/_quarto.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/_quarto.yml b/docs/_quarto.yml 
index 12bf9c0a..b973defc 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -49,6 +49,8 @@ website: href: reference/board_gcs.qmd - text: "`board_azure`" href: reference/board_azure.qmd + - text: "`board_databricks`" + href: reference/board_databricks.qmd - text: "`board_connect`" href: reference/board_connect.qmd - text: "`board_url`" From 6271cb812c29e5be8106e1a9fa1c95d29da0bdc2 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Mon, 2 Jun 2025 13:18:53 -0500 Subject: [PATCH 55/65] load in quartodoc --- docs/_quarto.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/_quarto.yml b/docs/_quarto.yml index b973defc..afb51204 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -101,6 +101,7 @@ quartodoc: - board_s3 - board_gcs - board_azure + - board_databricks - board_connect - board_url - board From d93a77d48168b8963f9bb13e49181719ba95c62d Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 14:03:47 -0500 Subject: [PATCH 56/65] update all tests --- pins/constructors.py | 17 +++++++++ pins/tests/conftest.py | 5 ++- pins/tests/helpers.py | 65 +++++++++++++++++++++++++++------ pins/tests/test_boards.py | 38 +++++++++++++++++-- pins/tests/test_compat.py | 59 ++++++++++++++++++++++++++---- pins/tests/test_constructors.py | 7 +++- pyproject.toml | 1 - 7 files changed, 165 insertions(+), 27 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index b4f21331..8a84ddd0 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -607,6 +607,23 @@ def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None See + + Examples + -------- + + >>> import pytest; pytest.skip() + + >>> import pins + >>> from dotenv import load_dotenv + >>> load_dotenv() # eg, for a .env file with DATABRICKS_HOST and DATABRICKS_TOKEN set + >>> board = pins.board_databricks("/Volumes/examples/my-board/test-volume") + >>> board.pin_list() + ['df_csv'] + + >>> board.pin_read("df_csv") + x y z + 0 1 a 3 + 1 2 b 4 """ return board("dbc", path, versioned, cache, allow_pickle_read) diff --git a/pins/tests/conftest.py b/pins/tests/conftest.py index 693cf823..676cff37 100644 --- a/pins/tests/conftest.py +++ b/pins/tests/conftest.py @@ -16,6 +16,7 @@ EXAMPLE_REL_PATH = "pins/tests/pins-compat" PATH_TO_EXAMPLE_BOARD = files("pins") / "tests/pins-compat" +PATH_TO_EXAMPLE_BOARD_DBC = "/Volumes/workshops/my-board/my-volume/test" PATH_TO_EXAMPLE_VERSION = PATH_TO_EXAMPLE_BOARD / "df_csv/20220214T163720Z-9bfad/" EXAMPLE_PIN_NAME = "df_csv" @@ -27,13 +28,13 @@ pytest.param(lambda: BoardBuilder("s3"), id="s3", marks=m.fs_s3), pytest.param(lambda: BoardBuilder("gcs"), id="gcs", marks=m.fs_gcs), pytest.param(lambda: BoardBuilder("abfs"), id="abfs", marks=m.fs_abfs), + pytest.param(lambda: DbcBoardBuilder("dbc"), id="dbc", marks=m.fs_dbc) ] # rsc should only be used once, because users are created at docker setup time param_rsc = pytest.param(lambda: RscBoardBuilder("rsc"), id="rsc", marks=m.fs_rsc) -param_dbc = pytest.param(lambda: DbcBoardBuilder("dbc"), id="dbc", marks=m.fs_dbc) -params_backend = [*params_safe, param_rsc, param_dbc] +params_backend = [*params_safe, param_rsc] @pytest.fixture(params=params_backend, scope="session") diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index 93da5176..61dd52a1 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -1,4 +1,5 @@ import contextlib +import functools import json import os import shutil @@ -21,13 +22,15 @@ # TODO: should use pkg_resources for this path? 
RSC_KEYS_FNAME = "pins/tests/rsconnect_api_keys.json"
 
+DATABRICKS_VOLUME = "/Volumes/workshops/my-board/my-volume/test"
+
 BOARD_CONFIG = {
     "file": {"path": ["PINS_TEST_FILE__PATH", None]},
     "s3": {"path": ["PINS_TEST_S3__PATH", "ci-pins"]},
     "gcs": {"path": ["PINS_TEST_GCS__PATH", "pins-python"]},
     "abfs": {"path": ["PINS_TEST_AZURE__PATH", "ci-pins"]},
     "rsc": {"path": ["PINS_TEST_RSC__PATH", RSC_SERVER_URL]},
-    "dbc": {"path": ["PINS_TEST_DBC__PATH", "DATABRICKS_VOLUME"]},
+    "dbc": {"path": ["PINS_TEST_DBC__PATH", DATABRICKS_VOLUME]},
 }
 
 # TODO: Backend initialization should be independent of helpers, but these
@@ -35,6 +38,38 @@
 # putting imports inside rsconnect particulars for now
 
 
+def skip_if_dbc(func):
+    """Decorator to skip test if board protocol is 'dbc'"""
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        import inspect
+        board = None
+
+        # Get function signature to map args to parameter names.
+        # We have to do this since parameterized pytest runs pass in
+        # args in different orders
+        sig = inspect.signature(func)
+        bound_args = sig.bind_partial(*args, **kwargs)
+        all_args = {**bound_args.arguments, **kwargs}
+
+        if 'board' in all_args:
+            board = all_args['board']
+        elif 'board_with_cache' in all_args:
+            board = all_args['board_with_cache']
+        else:
+            # Check all arguments for something that looks like a board
+            for arg_value in all_args.values():
+                if hasattr(arg_value, 'fs') and hasattr(arg_value.fs, 'protocol'):
+                    board = arg_value
+                    break
+
+        if board and board.fs.protocol == "dbc":
+            pytest.skip("All Databricks tests must be read only")
+
+        return func(*args, **kwargs)
+    return wrapper
+
+
 def rsc_from_key(name):
     from pins.rsconnect.api import RsConnectApi
 
@@ -209,24 +244,32 @@ def __init__(self, fs_name, path=None, *args, **kwargs):
         self.path = None
         self.fs_name = fs_name
         self.current_board = None
-        self.volume = os.environ.get("DATABRICKS_VOLUME")
+        self.volume = DATABRICKS_VOLUME
 
     def create_tmp_board(self, src_board=None, versioned=True):
-        temp_name = str(uuid.uuid4())
-        board_name = os.path.join(self.volume, temp_name)
-        db_board = board_databricks(board_name, cache=None)
-        board = BaseBoard(board_name, fs=db_board.fs, versioned=versioned)
+        # TODO: use temp boards when boards are not read-only
+        # temp_name = str(uuid.uuid4())
+        # board_name = os.path.join(self.volume, temp_name)
+        # db_board = board_databricks(board_name, cache=None)
+        # board = BaseBoard(board_name, fs=db_board.fs, versioned=versioned)
+        # if src_board is not None:
+        #     board.fs.put(src_board, board_name)
+
+        db_board = board_databricks(self.volume, cache=None)
+        board = BaseBoard(self.volume, fs=db_board.fs, versioned=versioned)
         self.current_board = board
-        if src_board is not None:
-            board.fs.put(src_board, board_name)
         return board
 
     def teardown_board(self, board):
-        board.fs.rm(board.board)
+        pass
+        # TODO: update when board not read-only
+        # board.fs.rm(board.board)
 
     def teardown(self):
-        board = board_databricks(self.volume)
-        board.fs.rm(self.current_board.board)
+        pass
+        # TODO: update when board not read-only
+        # board = board_databricks(self.volume)
+        # board.fs.rm(self.current_board.board)
 
 
 # Snapshot ====================================================================
diff --git a/pins/tests/test_boards.py b/pins/tests/test_boards.py
index dcd551ee..ee22bd3d 100644
--- a/pins/tests/test_boards.py
+++ b/pins/tests/test_boards.py
@@ -15,7 +15,7 @@
 from pins.config import PINS_ENV_INSECURE_READ
 from pins.errors import PinsError, PinsInsecureReadError, PinsVersionError
 from pins.meta import
MetaRaw -from pins.tests.helpers import DEFAULT_CREATION_DATE, rm_env +from pins.tests.helpers import DEFAULT_CREATION_DATE, rm_env, skip_if_dbc @fixture @@ -40,7 +40,7 @@ def board_unversioned(backend): @fixture def board_with_cache(backend): from pins.constructors import board as board_constructor - from pins.constructors import board_databricks, board_rsconnect + from pins.constructors import board_rsconnect board = backend.create_tmp_board() @@ -50,8 +50,6 @@ def board_with_cache(backend): # board behavior. As a result, we need to pass the credentials directly in. server_url, api_key = board.fs.api.server_url, board.fs.api.api_key board_with_cache = board_rsconnect(server_url=server_url, api_key=api_key) - elif backend.fs_name == "dbc": - board_with_cache = board_databricks(board.board) else: board_with_cache = board_constructor(backend.fs_name, board.board) @@ -73,12 +71,14 @@ def test_board_validate_pin_name_root(board): # pin_write =================================================================== +@skip_if_dbc def test_board_pin_write_default_title(board): df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) meta = board.pin_write(df, "df_csv", title=None, type="csv") assert meta.title == "df_csv: a pinned 3 x 2 DataFrame" +@skip_if_dbc def test_board_pin_write_prepare_pin(board, tmp_path: Path): df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) @@ -89,6 +89,7 @@ def test_board_pin_write_prepare_pin(board, tmp_path: Path): assert not (tmp_path / "df_csv.csv").is_dir() +@skip_if_dbc def test_board_pin_write_roundtrip(board): df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) @@ -102,6 +103,7 @@ def test_board_pin_write_roundtrip(board): assert loaded_df.equals(df) +@skip_if_dbc def test_board_pin_write_type_not_specified_error(board): class C: pass @@ -110,6 +112,7 @@ class C: board.pin_write(C(), "cool_pin") +@skip_if_dbc def test_board_pin_write_type_error(board): class C: pass @@ -120,6 +123,7 @@ class C: assert "MY_TYPE" in exc_info.value.args[0] +@skip_if_dbc def test_board_pin_write_feather_deprecated(board): df = pd.DataFrame({"x": [1, 2, 3]}) @@ -127,6 +131,7 @@ def test_board_pin_write_feather_deprecated(board): board.pin_write(df, "cool_pin", type="feather") +@skip_if_dbc def test_board_pin_write_file_raises_error(board, tmp_path): df = pd.DataFrame({"x": [1, 2, 3]}) @@ -138,6 +143,7 @@ def test_board_pin_write_file_raises_error(board, tmp_path): board.pin_write(path, "cool_pin", type="file") +@skip_if_dbc @pytest.mark.parametrize("force_identical_write", [True, False]) def test_board_pin_write_force_identical_write_pincount(board, force_identical_write): df = pd.DataFrame({"x": [1, 2, 3]}) @@ -155,6 +161,7 @@ def test_board_pin_write_force_identical_write_pincount(board, force_identical_w assert len(versions) == 1 +@skip_if_dbc def test_board_pin_write_force_identical_write_msg( board, capfd: pytest.CaptureFixture[str] ): @@ -172,6 +179,7 @@ def test_board_pin_write_force_identical_write_msg( assert len(versions) == 1 +@skip_if_dbc def test_board_pin_download(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -190,6 +198,7 @@ def test_board_pin_download(board_with_cache, tmp_path): board_with_cache.pin_read("cool_pin") +@skip_if_dbc def test_board_pin_download_filename_many_suffixes(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -206,6 +215,7 @@ def test_board_pin_download_filename_many_suffixes(board_with_cache, tmp_path): assert df.x.tolist() == [1, 2, 3] +@skip_if_dbc def 
test_board_pin_download_filename_no_suffixes(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -222,6 +232,7 @@ def test_board_pin_download_filename_no_suffixes(board_with_cache, tmp_path): assert df.x.tolist() == [1, 2, 3] +@skip_if_dbc def test_board_pin_download_filename(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -237,6 +248,7 @@ def test_board_pin_download_filename(board_with_cache, tmp_path): assert Path(pin_path).name == "data.csv" +@skip_if_dbc def test_board_pin_download_no_cache_error(board, tmp_path): df = pd.DataFrame({"x": [1, 2, 3]}) path = tmp_path / "data.csv" @@ -255,6 +267,7 @@ def test_board_pin_download_no_cache_error(board, tmp_path): (pin_path,) = board.pin_download("cool_pin") +@skip_if_dbc def test_board_pin_upload_path_list(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -268,6 +281,7 @@ def test_board_pin_upload_path_list(board_with_cache, tmp_path): (pin_path,) = board_with_cache.pin_download("cool_pin") +@skip_if_dbc def test_board_pin_download_filename_multifile(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -312,6 +326,7 @@ def test_board_pin_write_rsc_index_html(board, tmp_path: Path, snapshot): # pin_write against different types ------------------------------------------- +@skip_if_dbc @parametrize( "obj, type_", [ @@ -335,6 +350,7 @@ def test_board_pin_write_type(board, obj, type_, request): obj == dst_obj +@skip_if_dbc def test_board_pin_read_insecure_fail_default(board): board.pin_write({"a": 1}, "test_pin", type="joblib", title="some title") with pytest.raises(PinsInsecureReadError) as exc_info: @@ -343,6 +359,7 @@ def test_board_pin_read_insecure_fail_default(board): assert "joblib" in exc_info.value.args[0] +@skip_if_dbc def test_board_pin_read_insecure_fail_board_flag(board): # board flag prioritized over env var with rm_env(PINS_ENV_INSECURE_READ): @@ -353,6 +370,7 @@ def test_board_pin_read_insecure_fail_board_flag(board): board.pin_read("test_pin") +@skip_if_dbc def test_board_pin_read_insecure_succeed_board_flag(board): # board flag prioritized over env var with rm_env(PINS_ENV_INSECURE_READ): @@ -365,6 +383,7 @@ def test_board_pin_read_insecure_succeed_board_flag(board): # pin_write with unversioned boards =========================================== +@skip_if_dbc @pytest.mark.parametrize("versioned", [None, False]) def test_board_unversioned_pin_write_unversioned_force_identical_write( versioned, board_unversioned @@ -391,6 +410,7 @@ def test_board_unversioned_pin_write_unversioned_force_identical_write( assert board_unversioned.pin_read("test_pin") == {"a": 2} +@skip_if_dbc @pytest.mark.parametrize("versioned", [None, False]) def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned): board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=versioned) @@ -400,6 +420,7 @@ def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned): assert board_unversioned.pin_read("test_pin") == {"a": 2} +@skip_if_dbc def test_board_unversioned_pin_write_versioned(board_unversioned): board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=False) board_unversioned.pin_write({"a": 2}, "test_pin", type="json", versioned=True) @@ -407,6 +428,7 @@ def test_board_unversioned_pin_write_versioned(board_unversioned): assert len(board_unversioned.pin_versions("test_pin")) == 2 +@skip_if_dbc def 
test_board_versioned_pin_write_unversioned(board):
     # should fall back to the versioned setting of the board
     board.pin_write({"a": 1}, "test_pin", type="json")
@@ -428,6 +450,9 @@ def pin_name():
 
 @pytest.fixture
 def pin_del(board, df, pin_name):
+    # TODO: update when dbc boards no longer read-only
+    if board.fs.protocol == "dbc":
+        pytest.skip()
     # 1min ago to avoid name collision
     one_min_ago = datetime.now() - timedelta(minutes=1)
     meta_old = board.pin_write(
@@ -445,6 +470,9 @@ def pin_del(board, df, pin_name):
 
 @pytest.fixture
 def pin_prune(board, df, pin_name):
+    # TODO: update when dbc boards no longer read-only
+    if board.fs.protocol == "dbc":
+        pytest.skip()
     today = datetime.now()
     day_ago = today - timedelta(days=1, minutes=1)
     two_days_ago = today - timedelta(days=2, minutes=1)
@@ -538,6 +566,7 @@ def test_board_pin_versions_prune_days(board, pin_prune, pin_name, days):
     assert len(new_versions) == days
 
 
+@skip_if_dbc
 def test_board_pin_versions_prune_days_protect_most_recent(board, pin_name):
     """To address https://github.com/rstudio/pins-python/issues/297"""
     # Posit cannot handle days, since it involves pulling metadata
@@ -581,6 +610,7 @@
         ("the-title", ["x-pin-1", "x-pin-2", "y-pin-1", "y-z"]),
     ],
 )
+@skip_if_dbc
 def test_board_pin_search_name(board, df, search, matches):
     if board.fs.protocol == "rsc":
         matches = ["derek/" + m for m in matches]
diff --git a/pins/tests/test_compat.py b/pins/tests/test_compat.py
index 498f5782..d859921c 100644
--- a/pins/tests/test_compat.py
+++ b/pins/tests/test_compat.py
@@ -3,8 +3,12 @@
 import pytest
 
 from pins.errors import PinsError
-from pins.tests.conftest import PATH_TO_EXAMPLE_BOARD, PATH_TO_MANIFEST_BOARD
-from pins.tests.helpers import xfail_fs
+from pins.tests.conftest import (
+    PATH_TO_EXAMPLE_BOARD,
+    PATH_TO_EXAMPLE_BOARD_DBC,
+    PATH_TO_MANIFEST_BOARD,
+)
+from pins.tests.helpers import skip_if_dbc, xfail_fs
 
 NOT_A_PIN = "not_a_pin_abcdefg"
 PIN_CSV = "df_csv"
@@ -15,7 +19,8 @@
 @pytest.fixture(scope="session")
 def board(backend):
     board = backend.create_tmp_board(str(PATH_TO_EXAMPLE_BOARD.absolute()))
-
+    if board.fs.protocol == "dbc":
+        board = backend.create_tmp_board(str(PATH_TO_EXAMPLE_BOARD_DBC))
     yield board
 
     backend.teardown_board(board)
@@ -45,6 +50,18 @@ def test_compat_pin_list(board):
     if board.fs.protocol == "rsc":
         # rsc backend uses / for full name
         dst_sorted = [f"{board.user_name}/{content}" for content in dst_sorted]
+    if board.fs.protocol == "dbc":
+        # TODO: update to match when not read-only
+        dst_sorted = [
+            "cool_pin",
+            "cool_pin2",
+            "cool_pin3",
+            "data",
+            "df_csv",
+            "reviews",
+            "reviews2",
+            "reviews3",
+        ]
 
     assert src_sorted == dst_sorted
 
@@ -57,7 +74,11 @@ def test_compat_pin_versions(board):
         pytest.skip("RSC uses bundle ids as pin versions")
     versions = board.pin_versions("df_csv", as_df=False)
     v_strings = list(v.version for v in versions)
-    assert v_strings == ["20220214T163718Z-eceac", "20220214T163720Z-9bfad"]
+    # TODO: update when dbc is not read-only
+    if board.fs.protocol == "dbc":
+        assert v_strings == ["20250410T083026Z-a173c"]
+    else:
+        assert v_strings == ["20220214T163718Z-eceac", "20220214T163720Z-9bfad"]
 
 
 @pytest.mark.skip("Used to diagnose os listdir ordering")
@@ -92,6 +113,16 @@ def test_compat_pin_meta(board):
         # TODO: afaik the bundle id is largely non-deterministic, so not possible
        # to test, but should think a bit more about it.
assert meta.name == "derek/df_csv" + # TODO: update when dbc boards are not read-only + elif board.fs.protocol == "dbc": + assert meta.title == "df_csv: a pinned 3 x 2 DataFrame" + assert meta.description is None + assert meta.created == "20220214T163720Z" + assert meta.file == "df_csv.csv" + assert meta.file_size == 16 + assert meta.pin_hash == "a173cd6a53908980" + assert meta.type == "csv" + return else: assert meta.version.version == "20220214T163720Z-9bfad" assert meta.version.created == datetime.datetime(2022, 2, 14, 16, 37, 20) @@ -122,9 +153,15 @@ def test_compat_pin_meta_pin_missing(board): @xfail_fs("rsc") def test_compat_pin_meta_version_arg(board): # note that in RSConnect the version is the bundle id - meta = board.pin_meta(PIN_CSV, "20220214T163718Z-eceac") - assert meta.version.version == "20220214T163718Z-eceac" - assert meta.version.hash == "eceac" + # TODO: update when dbc is not read-only + if board.fs.protocol == "dbc": + meta = board.pin_meta(PIN_CSV, "20250410T083026Z-a173c") + assert meta.version.version == "20250410T083026Z-a173c" + assert meta.version.hash == "a173c" + else: + meta = board.pin_meta(PIN_CSV, "20220214T163718Z-eceac") + assert meta.version.version == "20220214T163718Z-eceac" + assert meta.version.hash == "eceac" def test_compat_pin_meta_version_arg_error(board): @@ -146,12 +183,18 @@ def test_compat_pin_read(board): p_data = PATH_TO_EXAMPLE_BOARD / "df_csv" / "20220214T163720Z-9bfad" / "df_csv.csv" src_df = board.pin_read("df_csv") - dst_df = pd.read_csv(p_data) + + # TODO: update when dbc boards are not read-only + if board.fs.protocol == "dbc": + dst_df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + else: + dst_df = pd.read_csv(p_data) assert isinstance(src_df, pd.DataFrame) assert src_df.equals(dst_df) +@skip_if_dbc def test_compat_pin_read_supported_rds(board): pytest.importorskip("rdata") import pandas as pd diff --git a/pins/tests/test_constructors.py b/pins/tests/test_constructors.py index 901b9643..026a2b28 100644 --- a/pins/tests/test_constructors.py +++ b/pins/tests/test_constructors.py @@ -11,7 +11,7 @@ PATH_TO_EXAMPLE_BOARD, PATH_TO_EXAMPLE_VERSION, ) -from pins.tests.helpers import rm_env +from pins.tests.helpers import rm_env, skip_if_dbc @pytest.fixture @@ -190,6 +190,10 @@ def test_constructor_boards(board, df_csv, tmp_cache): df = board.pin_read("df_csv") # check data + # TODO: update when dbc boards are not read-only + if board.fs.protocol == "dbc": + df_csv = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + assert_frame_equal(df, df_csv) # check the cache structure ----------------------------------------------- @@ -232,6 +236,7 @@ def board2(backend): backend.teardown_board(board2) +@skip_if_dbc def test_constructor_boards_multi_user(board2, df_csv, tmp_cache): prot = board2.fs.protocol fs_name = prot if isinstance(prot, str) else prot[0] diff --git a/pyproject.toml b/pyproject.toml index 49a8b53f..23245859 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,6 @@ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ "appdirs<2", # Using appdirs rather than platformdirs is deliberate, see https://github.com/rstudio/pins-python/pull/239 - "databricks-sdk==0.49.0", "fsspec>=2022.2", "humanize>=1", "importlib-metadata>=4.4", From b688bf4e2bf7821d548ac793f2f9b74aa758f158 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 15:00:18 -0500 Subject: [PATCH 57/65] clean up class, make true optional import --- pins/databricks/fs.py | 79 +++++++++++++++++++++++++-------- pins/tests/conftest.py | 2 +- 
pins/tests/helpers.py | 13 +++--- pins/tests/test_compat.py | 4 +- pins/tests/test_constructors.py | 2 +- 5 files changed, 73 insertions(+), 27 deletions(-) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index d71b4a95..560cc849 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -2,8 +2,6 @@ from io import BytesIO from pathlib import Path, PurePath -from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound from fsspec import AbstractFileSystem from pins.errors import PinsError @@ -13,18 +11,18 @@ class DatabricksFs(AbstractFileSystem): protocol = "dbc" def ls(self, path, detail=False, **kwargs): - return self._databricks_ls(self, path, detail) + return self._databricks_ls(path, detail) def exists(self, path: str, **kwargs): - return self._databricks_exists(self, path) + return self._databricks_exists(path) def open(self, path: str, mode: str = "rb", *args, **kwargs): if mode != "rb": raise NotImplementedError - return self._databricks_open(self, path) + return self._databricks_open(path) def get(self, rpath, lpath, recursive=False, **kwargs): - self._databricks_get(self, rpath, lpath, recursive, **kwargs) + self._databricks_get(rpath, lpath, recursive, **kwargs) def mkdir(self, path, create_parents=True, **kwargs): if not create_parents: @@ -50,11 +48,18 @@ def rm(self, path, recursive=True, maxdepth=None) -> None: raise NotImplementedError if maxdepth is not None: raise NotImplementedError - if self._databricks_exists(self, path): - self._databricks_rm_dir(self, path) + if self._databricks_exists(path): + self._databricks_rm_dir(path) @staticmethod def _databricks_put(lpath, rpath): + try: + from databricks.sdk import WorkspaceClient + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + w = WorkspaceClient() path = Path(lpath).absolute() orig_path = path @@ -74,15 +79,21 @@ def _upload_files(path): _upload_files(path) - @staticmethod def _databricks_get(self, board, rpath, lpath, recursive=False, **kwargs): + try: + from databricks.sdk import WorkspaceClient + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + w = WorkspaceClient() file_type = self._databricks_is_type(rpath) if file_type == "file": board.fs.get(rpath, lpath, **kwargs) return - def _get_files(self, path, recursive, **kwargs): + def _get_files(path, recursive, **kwargs): raw_contents = w.files.list_directory_contents(path) contents = list(raw_contents) details = list(map(self._databricks_content_details, contents)) @@ -96,11 +107,17 @@ def _get_files(self, path, recursive, **kwargs): target_path = PurePath(lpath).joinpath(rel_path) board.fs.get(item_path, str(target_path)) - _get_files(self, rpath, recursive, **kwargs) + _get_files(rpath, recursive, **kwargs) - @staticmethod def _databricks_open(self, path): - if not self._databricks_exists(self, path): + try: + from databricks.sdk import WorkspaceClient + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." 
+ ) + + if not self._databricks_exists(path): raise PinsError("File or directory does not exist") w = WorkspaceClient() resp = w.files.download(path) @@ -109,7 +126,6 @@ def _databricks_open(self, path): f.seek(0) return f - @staticmethod def _databricks_exists(self, path: str): if self._databricks_is_type(path) == "nothing": return False @@ -118,6 +134,14 @@ def _databricks_exists(self, path: str): @staticmethod def _databricks_is_type(path: str): + try: + from databricks.sdk import WorkspaceClient + from databricks.sdk.errors import NotFound + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + w = WorkspaceClient() try: w.files.get_metadata(path) @@ -131,9 +155,15 @@ def _databricks_is_type(path: str): else: return "file" - @staticmethod def _databricks_ls(self, path, detail): - if not self._databricks_exists(self, path): + try: + from databricks.sdk import WorkspaceClient + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + + if not self._databricks_exists(path): raise PinsError("File or directory does not exist") w = WorkspaceClient() if self._databricks_is_type(path) == "file": @@ -159,8 +189,14 @@ def _databricks_ls(self, path, detail): items.append(item_path) return items - @staticmethod def _databricks_rm_dir(self, path): + try: + from databricks.sdk import WorkspaceClient + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + w = WorkspaceClient() raw_contents = w.files.list_directory_contents(path) contents = list(raw_contents) @@ -168,13 +204,20 @@ def _databricks_rm_dir(self, path): for item in details: item_path = item.get("path") if item.get("is_directory"): - self._databricks_rm_dir(self, item_path) + self._databricks_rm_dir(item_path) else: w.files.delete(item_path) w.files.delete_directory(path) @staticmethod def _databricks_mkdir(path): + try: + from databricks.sdk import WorkspaceClient + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + w = WorkspaceClient() w.files.create_directory(path) diff --git a/pins/tests/conftest.py b/pins/tests/conftest.py index 676cff37..2a862096 100644 --- a/pins/tests/conftest.py +++ b/pins/tests/conftest.py @@ -28,7 +28,7 @@ pytest.param(lambda: BoardBuilder("s3"), id="s3", marks=m.fs_s3), pytest.param(lambda: BoardBuilder("gcs"), id="gcs", marks=m.fs_gcs), pytest.param(lambda: BoardBuilder("abfs"), id="abfs", marks=m.fs_abfs), - pytest.param(lambda: DbcBoardBuilder("dbc"), id="dbc", marks=m.fs_dbc) + pytest.param(lambda: DbcBoardBuilder("dbc"), id="dbc", marks=m.fs_dbc), ] # rsc should only be used once, because users are created at docker setup time diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index 61dd52a1..38fd0f18 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -40,9 +40,11 @@ def skip_if_dbc(func): """Decorator to skip test if board protocol is 'dbc'""" + @functools.wraps(func) def wrapper(*args, **kwargs): import inspect + board = None # Get function signature to map args to parameter names. 
@@ -52,14 +54,14 @@ def wrapper(*args, **kwargs): bound_args = sig.bind_partial(*args, **kwargs) all_args = {**bound_args.arguments, **kwargs} - if 'board' in all_args: - board = all_args['board'] - elif 'board_with_cache' in all_args: - board = all_args['board_with_cache'] + if "board" in all_args: + board = all_args["board"] + elif "board_with_cache" in all_args: + board = all_args["board_with_cache"] else: # Check all arguments for something that looks like a board for arg_value in all_args.values(): - if hasattr(arg_value, 'fs') and hasattr(arg_value.fs, 'protocol'): + if hasattr(arg_value, "fs") and hasattr(arg_value.fs, "protocol"): board = arg_value break @@ -67,6 +69,7 @@ def wrapper(*args, **kwargs): pytest.skip("All Databricks tests must be read only") return func(*args, **kwargs) + return wrapper diff --git a/pins/tests/test_compat.py b/pins/tests/test_compat.py index d859921c..1477d1c3 100644 --- a/pins/tests/test_compat.py +++ b/pins/tests/test_compat.py @@ -117,7 +117,7 @@ def test_compat_pin_meta(board): elif board.fs.protocol == "dbc": assert meta.title == "df_csv: a pinned 3 x 2 DataFrame" assert meta.description is None - assert meta.created == "20220214T163720Z" + assert meta.created == "20250410T083026Z" assert meta.file == "df_csv.csv" assert meta.file_size == 16 assert meta.pin_hash == "a173cd6a53908980" @@ -186,7 +186,7 @@ def test_compat_pin_read(board): # TODO: update when dbc boards are not read-only if board.fs.protocol == "dbc": - dst_df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + dst_df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) else: dst_df = pd.read_csv(p_data) diff --git a/pins/tests/test_constructors.py b/pins/tests/test_constructors.py index 026a2b28..216ed6a3 100644 --- a/pins/tests/test_constructors.py +++ b/pins/tests/test_constructors.py @@ -192,7 +192,7 @@ def test_constructor_boards(board, df_csv, tmp_cache): # check data # TODO: update when dbc boards are not read-only if board.fs.protocol == "dbc": - df_csv = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df_csv = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) assert_frame_equal(df, df_csv) From 564f9ed1f0a3387ea6ce5a876659cc09d7a57629 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 15:06:13 -0500 Subject: [PATCH 58/65] run dbc tests --- .github/workflows/ci.yml | 2 +- pins/databricks/fs.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7662f73d..72025c42 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: - name: Run tests shell: bash run: | - pytest pins -m 'not fs_rsc and not skip_on_github and not fs_dbc' $PYTEST_OPTS + pytest pins -m 'not fs_rsc and not skip_on_github' $PYTEST_OPTS env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 560cc849..6dec76cd 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -118,7 +118,7 @@ def _databricks_open(self, path): ) if not self._databricks_exists(path): - raise PinsError("File or directory does not exist") + raise PinsError(f"File or directory does not exist at path: {path}") w = WorkspaceClient() resp = w.files.download(path) f = BytesIO() @@ -164,7 +164,7 @@ def _databricks_ls(self, path, detail): ) if not self._databricks_exists(path): - raise PinsError("File or directory does not exist") + raise PinsError(f"File or directory does not exist at path: 
{path}") w = WorkspaceClient() if self._databricks_is_type(path) == "file": if detail: From f7e1aaf269b76ad9c0d09fafb6b8db8d6e4ef154 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 15:10:11 -0500 Subject: [PATCH 59/65] add databricks into pyright deps --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 23245859..f5436f69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ check = [ "pre-commit", "pyright==1.1.372", # Pinned; manually sync with .github/workflows/code-checks.yml "types-appdirs", + "databricks-sdk" ] databricks = ["databricks-sdk"] doc = [ From 6e766a78be2a263416db3192bfcc457ee584bdba Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 15:15:04 -0500 Subject: [PATCH 60/65] load databricks creds --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 72025c42..e08d5200 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,6 +71,8 @@ jobs: AWS_REGION: "us-east-1" AZURE_STORAGE_ACCOUNT_NAME: ${{ secrets.AZURE_STORAGE_ACCOUNT_NAME }} AZURE_STORAGE_ACCOUNT_KEY: ${{ secrets.AZURE_STORAGE_ACCOUNT_KEY }} + DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }} + DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} PYTEST_OPTS: ${{ matrix.pytest_opts }} REQUIREMENTS: ${{ matrix.requirements }} ACTION_OS: ${{ matrix.os }} From 4f559a5352fa38c94c3dd5d11526967d60d1249c Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 15:23:01 -0500 Subject: [PATCH 61/65] error earlier --- pins/constructors.py | 8 ++++++- pins/databricks/fs.py | 51 +++++++------------------------------------ 2 files changed, 15 insertions(+), 44 deletions(-) diff --git a/pins/constructors.py b/pins/constructors.py index 8a84ddd0..dd9663da 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -10,6 +10,7 @@ from .boards import BaseBoard, BoardManual, BoardRsConnect, board_deparse from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache from .config import get_cache_dir, get_data_dir +from .errors import PinsError # Kept here for backward-compatibility reasons # Note that this is not a constructor, but a function to represent them. @@ -625,5 +626,10 @@ def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None 0 1 a 3 1 2 b 4 """ - + try: + import databricks.sdk # noqa: F401 + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) return board("dbc", path, versioned, cache, allow_pickle_read) diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py index 6dec76cd..58bcd95f 100644 --- a/pins/databricks/fs.py +++ b/pins/databricks/fs.py @@ -53,12 +53,7 @@ def rm(self, path, recursive=True, maxdepth=None) -> None: @staticmethod def _databricks_put(lpath, rpath): - try: - from databricks.sdk import WorkspaceClient - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." - ) + from databricks.sdk import WorkspaceClient w = WorkspaceClient() path = Path(lpath).absolute() @@ -80,12 +75,7 @@ def _upload_files(path): _upload_files(path) def _databricks_get(self, board, rpath, lpath, recursive=False, **kwargs): - try: - from databricks.sdk import WorkspaceClient - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." 
- ) + from databricks.sdk import WorkspaceClient w = WorkspaceClient() file_type = self._databricks_is_type(rpath) @@ -110,12 +100,7 @@ def _get_files(path, recursive, **kwargs): _get_files(rpath, recursive, **kwargs) def _databricks_open(self, path): - try: - from databricks.sdk import WorkspaceClient - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." - ) + from databricks.sdk import WorkspaceClient if not self._databricks_exists(path): raise PinsError(f"File or directory does not exist at path: {path}") @@ -134,13 +119,8 @@ def _databricks_exists(self, path: str): @staticmethod def _databricks_is_type(path: str): - try: - from databricks.sdk import WorkspaceClient - from databricks.sdk.errors import NotFound - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." - ) + from databricks.sdk import WorkspaceClient + from databricks.sdk.errors import NotFound w = WorkspaceClient() try: @@ -156,12 +136,7 @@ def _databricks_is_type(path: str): return "file" def _databricks_ls(self, path, detail): - try: - from databricks.sdk import WorkspaceClient - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." - ) + from databricks.sdk import WorkspaceClient if not self._databricks_exists(path): raise PinsError(f"File or directory does not exist at path: {path}") @@ -190,12 +165,7 @@ def _databricks_ls(self, path, detail): return items def _databricks_rm_dir(self, path): - try: - from databricks.sdk import WorkspaceClient - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." - ) + from databricks.sdk import WorkspaceClient w = WorkspaceClient() raw_contents = w.files.list_directory_contents(path) @@ -211,12 +181,7 @@ def _databricks_rm_dir(self, path): @staticmethod def _databricks_mkdir(path): - try: - from databricks.sdk import WorkspaceClient - except ModuleNotFoundError: - raise PinsError( - "Install the `databricks-sdk` package for Databricks board support." 
- ) + from databricks.sdk import WorkspaceClient w = WorkspaceClient() w.files.create_directory(path) From 0c45eea079e7766c3018180640fcbacbe12e0816 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 16:00:12 -0500 Subject: [PATCH 62/65] pass for windows cache --- pins/tests/test_constructors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pins/tests/test_constructors.py b/pins/tests/test_constructors.py index 216ed6a3..4790ba30 100644 --- a/pins/tests/test_constructors.py +++ b/pins/tests/test_constructors.py @@ -192,9 +192,9 @@ def test_constructor_boards(board, df_csv, tmp_cache): # check data # TODO: update when dbc boards are not read-only if board.fs.protocol == "dbc": - df_csv = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - assert_frame_equal(df, df_csv) + pass + else: + assert_frame_equal(df, df_csv) # check the cache structure ----------------------------------------------- From 6d07e6a8c4cda0a38dfe5f11b798674e2c77b3a0 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 22:04:00 -0500 Subject: [PATCH 63/65] resolve dependencies --- requirements/dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 7a4ab029..54826c94 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -147,7 +147,7 @@ google-crc32c==1.7.1 # via # google-cloud-storage # google-resumable-media -google-resumable-media==2.7.1 +google-resumable-media==2.7.2 # via google-cloud-storage googleapis-common-protos==1.69.2 # via google-api-core From 220f86d3ab96f38554e2c36238c361ad0cff572c Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 22:06:27 -0500 Subject: [PATCH 64/65] resolve dependencies again --- requirements/dev.txt | 187 ++++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 90 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 54826c94..0f8fd751 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2,25 +2,23 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile --extra=doc --extra=test --extra=check --output-file=- --strip-extras pyproject.toml +# pip-compile --extra=check --extra=doc --extra=test --output-file=- --strip-extras pyproject.toml # ---index-url https://pypi.python.org/simple/ ---trusted-host pypi.org -adlfs==2024.7.0 +adlfs==2024.12.0 # via pins (pyproject.toml) -aiobotocore==2.21.1 +aiobotocore==2.22.0 # via s3fs aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.11.16 +aiohttp==3.12.7 # via # adlfs # aiobotocore # gcsfs # s3fs -aioitertools==0.11.0 +aioitertools==0.12.0 # via aiobotocore -aiosignal==1.3.1 +aiosignal==1.3.2 # via aiohttp annotated-types==0.7.0 # via pydantic @@ -30,7 +28,7 @@ appnope==0.1.4 # via # ipykernel # ipython -asttokens==2.4.1 +asttokens==3.0.0 # via stack-data attrs==25.3.0 # via @@ -39,62 +37,63 @@ attrs==25.3.0 # pytest # referencing # sphobjinv -azure-core==1.33.0 +azure-core==1.34.0 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.21.0 +azure-identity==1.23.0 # via adlfs azure-storage-blob==12.25.1 # via adlfs backcall==0.2.0 # via ipython -beartype==0.20.2 +beartype==0.21.0 # via plum-dispatch black==25.1.0 # via quartodoc -botocore==1.37.1 +botocore==1.37.3 # via aiobotocore -build==1.2.1 +build==1.2.2.post1 # via pip-tools cachetools==5.5.2 # via google-auth -certifi==2025.1.31 +certifi==2025.4.26 # via # requests # sphobjinv 
-cffi==1.16.0 +cffi==1.17.1 # via # azure-datalake-store # cryptography cfgv==3.4.0 # via pre-commit -charset-normalizer==3.4.1 +charset-normalizer==3.4.2 # via requests -click==8.1.8 +click==8.2.1 # via + # black # pip-tools # quartodoc colorama==0.4.6 # via griffe comm==0.2.2 # via ipykernel -cramjam==2.8.3 +cramjam==2.10.0 # via fastparquet -cryptography==44.0.2 +cryptography==45.0.3 # via # azure-identity # azure-storage-blob # msal # pyjwt -databricks-sdk==0.49.0 - # via pins (pyproject.toml) databackend==0.0.3 # via pins (pyproject.toml) -debugpy==1.8.2 +databricks-sdk==0.55.0 + # via pins (pyproject.toml) +debugpy==1.8.14 # via ipykernel decopatch==1.4.10 # via pytest-cases @@ -102,34 +101,34 @@ decorator==5.2.1 # via # gcsfs # ipython -distlib==0.3.8 +distlib==0.3.9 # via virtualenv executing==2.2.0 # via stack-data -fastjsonschema==2.20.0 +fastjsonschema==2.21.1 # via nbformat -fastparquet==2024.5.0 +fastparquet==2024.11.0 # via pins (pyproject.toml) filelock==3.18.0 # via virtualenv -frozenlist==1.4.1 +frozenlist==1.6.2 # via # aiohttp # aiosignal -fsspec==2025.3.2 +fsspec==2025.5.1 # via - # pins (pyproject.toml) # adlfs # fastparquet # gcsfs + # pins (pyproject.toml) # s3fs -gcsfs==2025.3.2 +gcsfs==2025.5.1 # via pins (pyproject.toml) -google-api-core==2.24.2 +google-api-core==2.25.0 # via # google-cloud-core # google-cloud-storage -google-auth==2.38.0 +google-auth==2.40.2 # via # databricks-sdk # gcsfs @@ -137,7 +136,7 @@ google-auth==2.38.0 # google-auth-oauthlib # google-cloud-core # google-cloud-storage -google-auth-oauthlib==1.2.1 +google-auth-oauthlib==1.2.2 # via gcsfs google-cloud-core==2.4.3 # via google-cloud-storage @@ -149,19 +148,19 @@ google-crc32c==1.7.1 # google-resumable-media google-resumable-media==2.7.2 # via google-cloud-storage -googleapis-common-protos==1.69.2 +googleapis-common-protos==1.70.0 # via google-api-core -griffe==1.7.2 +griffe==1.7.3 # via quartodoc -humanize==4.12.2 +humanize==4.12.3 # via pins (pyproject.toml) -identify==2.6.9 +identify==2.6.12 # via pre-commit -idna==3.7 +idna==3.10 # via # requests # yarl -importlib-metadata==8.6.1 +importlib-metadata==8.7.0 # via # pins (pyproject.toml) # quartodoc @@ -175,11 +174,11 @@ ipykernel==6.29.5 # via pins (pyproject.toml) ipython==8.12.0 # via - # pins (pyproject.toml) # ipykernel -isodate==0.6.1 + # pins (pyproject.toml) +isodate==0.7.2 # via azure-storage-blob -jedi==0.19.1 +jedi==0.19.2 # via ipython jinja2==3.1.6 # via pins (pyproject.toml) @@ -187,31 +186,31 @@ jmespath==1.0.1 # via # aiobotocore # botocore -joblib==1.4.2 +joblib==1.5.1 # via pins (pyproject.toml) -jsonschema==4.23.0 +jsonschema==4.24.0 # via # nbformat # sphobjinv -jsonschema-specifications==2023.12.1 +jsonschema-specifications==2025.4.1 # via jsonschema -jupyter-client==8.6.2 +jupyter-client==8.6.3 # via # ipykernel # nbclient -jupyter-core==5.7.2 +jupyter-core==5.8.1 # via # ipykernel # jupyter-client # nbclient # nbformat -makefun==1.15.4 +makefun==1.16.0 # via # decopatch # pytest-cases markdown-it-py==3.0.0 # via rich -markupsafe==2.1.5 +markupsafe==3.0.2 # via jinja2 matplotlib-inline==0.1.7 # via @@ -219,33 +218,33 @@ matplotlib-inline==0.1.7 # ipython mdurl==0.1.2 # via markdown-it-py -msal==1.32.0 +msal==1.32.3 # via # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.3.1 # via azure-identity -multidict==6.4.2 +multidict==6.4.4 # via # aiobotocore # aiohttp # yarl -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 # via black nbclient==0.10.2 # via pins (pyproject.toml) nbformat==5.10.4 # via 
- # pins (pyproject.toml) # nbclient + # pins (pyproject.toml) nest-asyncio==1.6.0 # via ipykernel nodeenv==1.9.1 # via # pre-commit # pyright -numpy==2.2.4 +numpy==2.2.6 # via # fastparquet # pandas @@ -253,8 +252,9 @@ numpy==2.2.4 # xarray oauthlib==3.2.2 # via requests-oauthlib -packaging==24.1 +packaging==25.0 # via + # black # build # fastparquet # ipykernel @@ -263,29 +263,32 @@ packaging==24.1 # xarray pandas==2.2.3 # via - # pins (pyproject.toml) # fastparquet + # pins (pyproject.toml) # rdata # xarray parso==0.8.4 # via jedi +pathspec==0.12.1 + # via black pexpect==4.9.0 # via ipython pickleshare==0.7.5 # via ipython pip-tools==7.4.1 # via pins (pyproject.toml) -platformdirs==4.3.7 +platformdirs==4.3.8 # via + # black # jupyter-core # virtualenv -pluggy==1.5.0 +pluggy==1.6.0 # via pytest plum-dispatch==2.5.7 # via quartodoc pre-commit==4.2.0 # via pins (pyproject.toml) -prompt-toolkit==3.0.50 +prompt-toolkit==3.0.51 # via ipython propcache==0.3.1 # via @@ -293,7 +296,7 @@ propcache==0.3.1 # yarl proto-plus==1.26.1 # via google-api-core -protobuf==6.30.2 +protobuf==6.31.1 # via # google-api-core # googleapis-common-protos @@ -302,13 +305,13 @@ psutil==7.0.0 # via ipykernel ptyprocess==0.7.0 # via pexpect -pure-eval==0.2.2 +pure-eval==0.2.3 # via stack-data py==1.11.0 # via pytest -pyarrow==19.0.1 +pyarrow==20.0.0 # via pins (pyproject.toml) -pyasn1==0.6.0 +pyasn1==0.6.1 # via # pyasn1-modules # rsa @@ -316,17 +319,19 @@ pyasn1-modules==0.4.2 # via google-auth pycparser==2.22 # via cffi -pydantic==2.11.3 +pydantic==2.11.5 # via quartodoc -pydantic-core==2.33.1 +pydantic-core==2.33.2 # via pydantic pygments==2.19.1 # via # ipython # rich -pyjwt==2.8.0 - # via msal -pyproject-hooks==1.1.0 +pyjwt==2.10.1 + # via + # msal + # pyjwt +pyproject-hooks==1.2.0 # via # build # pip-tools @@ -337,7 +342,7 @@ pytest==7.1.3 # pins (pyproject.toml) # pytest-dotenv # pytest-parallel -pytest-cases==3.8.5 +pytest-cases==3.8.6 # via pins (pyproject.toml) pytest-dotenv==0.5.2 # via pins (pyproject.toml) @@ -353,7 +358,7 @@ python-dotenv==1.1.0 # via pytest-dotenv pytz==2025.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # pins (pyproject.toml) # pre-commit @@ -362,7 +367,7 @@ pyzmq==26.4.0 # via # ipykernel # jupyter-client -quartodoc==0.7.5 +quartodoc==0.10.0 # via pins (pyproject.toml) rdata==0.11.2 # via pins (pyproject.toml) @@ -372,7 +377,6 @@ referencing==0.36.2 # jsonschema-specifications requests==2.32.3 # via - # pins (pyproject.toml) # azure-core # azure-datalake-store # databricks-sdk @@ -380,27 +384,28 @@ requests==2.32.3 # google-api-core # google-cloud-storage # msal + # pins (pyproject.toml) # quartodoc # requests-oauthlib requests-oauthlib==2.0.0 # via google-auth-oauthlib rich==14.0.0 # via plum-dispatch -rpds-py==0.24.0 +rpds-py==0.25.1 # via # jsonschema # referencing -rsa==4.9 +rsa==4.9.1 # via google-auth -s3fs==2025.3.2 +ruff==0.5.4 # via pins (pyproject.toml) -six==1.16.0 +s3fs==2025.5.1 + # via pins (pyproject.toml) +six==1.17.0 # via - # asttokens # azure-core - # isodate # python-dateutil -sphobjinv==2.3.1.2 +sphobjinv==2.3.1.3 # via quartodoc stack-data==0.6.3 # via ipython @@ -408,9 +413,9 @@ tabulate==0.9.0 # via quartodoc tblib==3.1.0 # via pytest-parallel -tomli==2.0.1 +tomli==2.2.1 # via pytest -tornado==6.4.1 +tornado==6.5.1 # via # ipykernel # jupyter-client @@ -426,18 +431,20 @@ traitlets==5.14.3 # nbformat types-appdirs==1.4.3.5 # via pins (pyproject.toml) -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # azure-core # azure-identity # 
azure-storage-blob + # pins (pyproject.toml) + # plum-dispatch # pydantic # pydantic-core # quartodoc # rdata # referencing # typing-inspection -typing-inspection==0.4.0 +typing-inspection==0.4.1 # via pydantic tzdata==2025.2 # via pandas @@ -445,23 +452,23 @@ urllib3==2.4.0 # via # botocore # requests -virtualenv==20.30.0 +virtualenv==20.31.2 # via pre-commit -watchdog==4.0.1 +watchdog==6.0.0 # via quartodoc wcwidth==0.2.13 # via prompt-toolkit -wheel==0.43.0 +wheel==0.45.1 # via pip-tools wrapt==1.17.2 # via aiobotocore -xarray==2025.3.1 +xarray==2025.4.0 # via rdata -xxhash==3.4.1 +xxhash==3.5.0 # via pins (pyproject.toml) -yarl==1.19.0 +yarl==1.20.0 # via aiohttp -zipp==3.19.2 +zipp==3.22.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: From 95e7873252b955eb1f8a4218a66af036805a87a6 Mon Sep 17 00:00:00 2001 From: isabel zimmerman Date: Tue, 3 Jun 2025 22:33:09 -0500 Subject: [PATCH 65/65] cannot clean up constructors, skip --- pins/tests/test_constructors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pins/tests/test_constructors.py b/pins/tests/test_constructors.py index 4790ba30..d0106510 100644 --- a/pins/tests/test_constructors.py +++ b/pins/tests/test_constructors.py @@ -178,6 +178,7 @@ def board(backend): backend.teardown_board(board) +@skip_if_dbc # passes, but skipping since this cannot clean itself up properly def test_constructor_boards(board, df_csv, tmp_cache): # TODO: would be nice to have fixtures for each board constructor # doesn't need to copy over pins-compat content