
Commit 751449d

Add support for uploading to volumes
2 parents 42a2fa0 + 1bda33e commit 751449d

5 files changed: 105 additions & 28 deletions

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,8 @@
 # Changelog db-rocket
 
+## Version 3.0.0
+- Add `use_volumes` and `dst_path` arguments to support uploading to Unity Catalog Volumes.
+
 ## Version 2.1.0
 - New parameter for ``rocket launch --glob_path=<...>``, which allows specifying a list of globs for files to deploy during launch.
 
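For context, a minimal usage sketch of the new arguments. The catalog/schema/volume names are hypothetical placeholders; `Rocket` and `launch` are as defined in `rocket/rocket.py`, and `DATABRICKS_TOKEN` must be set:

```python
# Minimal sketch: upload to a Unity Catalog Volume instead of DBFS.
# The /Volumes/... destination is a hypothetical example path.
from rocket.rocket import Rocket

Rocket().launch(
    project_location=".",
    use_volumes=True,
    dst_path="/Volumes/my_catalog/my_schema/my_volume/db_rocket",
    watch=False,  # one-off upload; set True to keep syncing on file changes
)
```

Since the CLI is a `fire` wrapper over the same method, the equivalent invocation is `rocket launch --use_volumes=True --dst_path=/Volumes/...`.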

README.md

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ stevenmi@MacBook db-rocket % rocket launch --watch=False
 - Databricks: >=7
 - Python: >=3.7
 - Tested on platforms: Linux, macOS. Windows will probably not work, but contributions are welcome!
+- Supports uploading to Unity Catalog Volumes starting from version 3.0.0. Note that the underlying dependency, `databricks-sdk`, is still in beta. We do not recommend using UC Volumes in production.
 
 ## Acknowledgments
 

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -9,4 +9,5 @@ poetry
 mypy
 SecretStorage
 readme-renderer
-twine
+twine
+databricks-sdk==0.33.0

rocket/rocket.py

Lines changed: 97 additions & 25 deletions
@@ -1,9 +1,9 @@
 import os
-import glob
 from typing import Optional, List, Union
 
 import fire
 
+from databricks.sdk import WorkspaceClient
 from rocket.file_watcher import FileWatcher
 from rocket.logger import logger
 from rocket.utils import (
@@ -54,49 +54,63 @@ def launch(
         project_location: str = ".",
         dbfs_path: Optional[str] = None,
         watch: bool = True,
-        glob_path: Optional[Union[str, List[str]]] = None
+        glob_path: Optional[Union[str, List[str]]] = None,
+        use_volumes: Optional[bool] = False,
+        dst_path: Optional[str] = None,
     ) -> None:
         """
         Entrypoint of the application, triggers a build and deploy
         :param project_location: path to project code, default: `"."`
-        :param dbfs_path: path where the wheel will be stored, ex: dbfs:/tmp/myteam/myproject
+        :param dbfs_path: path where the wheel will be stored, ex: dbfs:/tmp/myteam/myproject. Only supports DBFS paths.
         :param watch: Set to false if you don't want to automatically sync your files
         :param glob_path: glob string or list of strings for additional files to deploy, e.g. "*.json"
+        :param use_volumes: whether to upload files to Unity Catalog Volumes.
+        :param dst_path: destination path for the files. Supports both dbfs:/ and /Volumes paths. Prefer dst_path; dbfs_path is planned for deprecation.
         :return:
         """
         if os.getenv("DATABRICKS_TOKEN") is None:
             raise Exception("DATABRICKS_TOKEN must be set for db-rocket to work")
 
-        if dbfs_path is not None and not dbfs_path.startswith("dbfs:/"):
-            raise Exception("`dbfs_path` must start with dbfs:/")
-
-        try:
-            execute_shell_command(f"databricks fs ls dbfs:/")
-        except Exception as e:
-            raise Exception(
-                f"Error accessing DBFS via databricks-cli. Please check if your databricks token is set and valid? Try to generate a new token and update existing one with `databricks configure --token`. Error details: {e}"
-            )
+        base_dbfs_access_error_message = ("Please check that your Databricks token is set and valid. "
+                                          "Try generating a new token and updating the existing one with "
+                                          "`databricks configure --token`.")
+        if use_volumes:
+            try:
+                workspace_client = WorkspaceClient()
+                workspace_client.dbutils.fs.ls("dbfs:/")
+            except Exception as e:
+                raise Exception(
+                    f"Could not access DBFS using the Databricks SDK. {base_dbfs_access_error_message} Error details: {e}"
+                )
+            db_path = self.get_volumes_path(dst_path)
+        else:
+            try:
+                execute_shell_command("databricks fs ls dbfs:/")
+            except Exception as e:
+                raise Exception(
+                    f"Error accessing DBFS via databricks-cli. {base_dbfs_access_error_message} Error details: {e}"
+                )
+            path_to_use = dst_path if dst_path else dbfs_path
+            db_path = self.get_dbfs_path(path_to_use)
 
-        if not dbfs_path:
-            dbfs_path = f"dbfs:/temp/{os.environ['USER']}"
         if watch:
             project_name = os.path.abspath(project_location).split("/")[-1]
-            dbfs_path = f"{dbfs_path}/{project_name}"
+            db_path = f"{db_path}/{project_name}"
 
         glob_paths = []
         if isinstance(glob_path, str):
             glob_paths = [os.path.join(project_location, glob_path)]
         elif isinstance(glob_path, list):
             glob_paths = [os.path.join(project_location, path) for path in glob_path]
 
-        self._build_and_deploy(watch=watch, project_location=project_location, dbfs_path=dbfs_path, glob_paths=glob_paths)
+        self._build_and_deploy(watch=watch, project_location=project_location, db_path=db_path, glob_paths=glob_paths)
         if watch:
             watcher = FileWatcher(
                 project_location,
                 lambda x: self._build_and_deploy(
                     watch=watch,
                     modified_files=watcher.modified_files,
-                    dbfs_path=dbfs_path,
+                    db_path=db_path,
                     project_location=project_location,
                     glob_paths=glob_path
                 ),
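As a quick reference, a sketch of how the destination resolves under the new logic. Values are hypothetical; defaults follow the `get_dbfs_path`/`get_volumes_path` helpers added further down, and this assumes `Rocket()` is default-constructible (as `fire.Fire(Rocket)` suggests):

```python
# Sketch of destination resolution in launch(); paths are hypothetical examples.
from rocket.rocket import Rocket

r = Rocket()

# use_volumes=True: dst_path must start with /Volumes, else a default is used.
r.get_volumes_path("/Volumes/my_catalog/my_schema/my_volume")  # -> returned unchanged
r.get_volumes_path(None)  # -> "/Volumes/main/data_products/volume/db_rocket/<USER>"

# use_volumes=False: dst_path (preferred) or dbfs_path must start with dbfs:/.
r.get_dbfs_path("dbfs:/tmp/myteam/myproject")  # -> returned unchanged, with a deprecation warning
r.get_dbfs_path(None)  # -> "dbfs:/temp/<USER>"
```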
@@ -108,15 +122,15 @@ def _build_and_deploy(
         self,
         watch: bool,
         project_location: str,
-        dbfs_path: str,
+        db_path: str,
         modified_files: Optional[List[str]] = None,
         glob_paths: Optional[List[str]] = None
     ) -> None:
         if modified_files:
             logger.info(f"Found changes in {modified_files}. Overwriting them.")
             self._deploy(
                 file_paths=modified_files,
-                dbfs_path=dbfs_path,
+                db_path=db_path,
                 project_location=project_location,
             )
             return
@@ -128,10 +142,10 @@ def _build_and_deploy(
         wheel_path, wheel_file = self._create_python_project_wheel(project_location)
         self._deploy(
             file_paths=[wheel_path],
-            dbfs_path=dbfs_path,
+            db_path=db_path,
             project_location=os.path.dirname(wheel_path),
         )
-        install_path = f'{dbfs_path.replace("dbfs:/", "/dbfs/")}/{wheel_file}'
+        install_path = f"{self.get_install_path(db_path)}/{wheel_file}"
 
         dependency_files = ["requirements.in", "requirements.txt"]
         index_urls = []
@@ -183,10 +197,10 @@ def _build_and_deploy(
                 line.strip() for line in f.readlines() if "index-url" in line
             ]
         self._deploy(
-            file_paths=list(files), dbfs_path=dbfs_path, project_location=project_location
+            file_paths=list(files), db_path=db_path, project_location=project_location
         )
 
-        install_path = f'{dbfs_path.replace("dbfs:/", "/dbfs/")}'
+        install_path = self.get_install_path(db_path)
         index_urls_options = " ".join(index_urls)
 
         if dependency_file_exist:
@@ -215,16 +229,54 @@ def _build_and_deploy(
     def _deploy(
         self,
         file_paths: List[str],
-        dbfs_path: str,
+        db_path: str,
         project_location: str
     ) -> None:
+        if self.is_dbfs(db_path):
+            self._deploy_dbfs(file_paths, db_path, project_location)
+        else:
+            w = WorkspaceClient()
+            self._deploy_volumes(file_paths, db_path, project_location, w)
+
+    def _deploy_dbfs(
+        self,
+        file_paths: List[str],
+        db_path: str,
+        project_location: str
+    ):
         def helper(file: str) -> None:
-            target_path = f"{dbfs_path}/{os.path.relpath(file, project_location)}"
+            target_path = f"{db_path}/{os.path.relpath(file, project_location)}"
             execute_shell_command(f"databricks fs cp --recursive --overwrite {file} {target_path}")
             logger.info(f"Uploaded {file} to {target_path}")
 
         execute_for_each_multithreaded(file_paths, lambda x: helper(x))
 
+    def _deploy_volumes(
+        self,
+        file_paths: List[str],
+        db_path: str,
+        project_location: str,
+        workspace_client
+    ):
+        def helper(wc, file: str) -> None:
+            # The SDK requires an absolute local path.
+            if not os.path.isabs(file):
+                cwd = os.getcwd()
+                file = f"{cwd}/{file}"
+            target_path = f"{db_path}/{os.path.relpath(file, project_location)}"
+            # If the target already exists, the SDK fails with "The file being created already exists",
+            # so remove it first. Overwrite support is tracked in https://github.com/databricks/databricks-sdk-py/issues/548
+            try:
+                wc.dbutils.fs.rm(target_path)
+            except Exception:
+                pass
+            # The SDK uses urllib3 to parse paths: the source must be prefixed with file://
+            # to be recognized as a local file, otherwise it raises a file-not-found error.
+            wc.dbutils.fs.cp(f"file://{file}", target_path)
+            logger.info(f"Uploaded {file} to {target_path}")
+
+        execute_for_each_multithreaded(file_paths, lambda x: helper(workspace_client, x))
+
     def _create_python_project_wheel(self, project_location: str) -> (str, str):
         dist_location = f"{project_location}/dist"
         execute_shell_command(f"rm {dist_location}/* 2>/dev/null || true")
@@ -250,6 +302,26 @@ def _create_python_project_wheel(self, project_location: str) -> (str, str):
         wheel_path = f"{dist_location}/{wheel_file}"
         return wheel_path, wheel_file
 
+    def get_dbfs_path(self, path: Optional[str]) -> str:
+        if path:
+            logger.warning("The `dbfs_path` parameter is planned for deprecation. Please use the `dst_path` parameter instead.")
+            if not self.is_dbfs(path):
+                raise Exception("`dbfs_path` must start with dbfs:/")
+        return path or f"dbfs:/temp/{os.environ['USER']}"
+
+    def get_volumes_path(self, path: Optional[str]) -> str:
+        if path and not path.startswith("/Volumes"):
+            raise Exception("`use_volumes` is true, so `dst_path` must start with /Volumes")
+        return path or f"/Volumes/main/data_products/volume/db_rocket/{os.environ['USER']}"
+
+    def get_install_path(self, db_path):
+        if self.is_dbfs(db_path):
+            return db_path.replace("dbfs:/", "/dbfs/")
+        return db_path
+
+    def is_dbfs(self, db_path: str):
+        return db_path.startswith("dbfs:/")
+
 
 def main():
     fire.Fire(Rocket)
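The Volumes upload works around two SDK quirks: there is no overwrite option, and local sources need a `file://` prefix. A standalone sketch of the same pattern, assuming Databricks auth is configured (e.g. `DATABRICKS_HOST`/`DATABRICKS_TOKEN`) and using a hypothetical wheel and volume path:

```python
# Standalone sketch of the pattern used in _deploy_volumes above.
# The wheel filename and /Volumes/... target are hypothetical examples.
import os
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
local_file = os.path.abspath("dist/my_project-0.1.0-py3-none-any.whl")  # SDK needs an absolute path
target = "/Volumes/my_catalog/my_schema/my_volume/db_rocket/my_project-0.1.0-py3-none-any.whl"

# No overwrite flag yet, so delete any existing target first.
try:
    w.dbutils.fs.rm(target)
except Exception:
    pass

# file:// marks the source as a local path for the SDK's URL parsing.
w.dbutils.fs.cp(f"file://{local_file}", target)
```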

setup.py

Lines changed: 2 additions & 2 deletions
@@ -9,15 +9,15 @@
 
 setuptools.setup(
     name="databricks-rocket",
-    version="2.1.0",
+    version="3.0.0",
     author="GetYourGuide",
     author_email="[email protected]",
     description="Keep your local python scripts installed and in sync with a databricks notebook. Shortens the feedback loop to develop projects using a hybrid environment",
     long_description=readme,
     long_description_content_type="text/markdown",
     url="https://github.com/getyourguide/db-rocket",
     packages=setuptools.find_packages(),
-    install_requires=["fire", "watchdog~=2.1.9", "build", "databricks_cli"],
+    install_requires=["fire", "watchdog~=2.1.9", "build", "databricks_cli", "databricks-sdk==0.33.0"],
     entry_points={
         "console_scripts": ["rocket=rocket.rocket:main", "dbrocket=rocket.rocket:main"]
     },
