Add script for simple building and installation (#597)
* Add script for simple building and installation

Signed-off-by: Henry Davidge <[email protected]>

* yapf

Signed-off-by: Henry Davidge <[email protected]>

* add databricks-sdk to conda environment

Signed-off-by: Henry Davidge <[email protected]>

* go into python directory to build whl

Signed-off-by: Henry Davidge <[email protected]>

---------

Signed-off-by: Henry Davidge <[email protected]>
Co-authored-by: Henry Davidge <[email protected]>
henrydavidge and Henry Davidge authored Feb 26, 2024
1 parent 1fcb9ab commit 2bf9957
Showing 4 changed files with 109 additions and 3 deletions.
99 changes: 99 additions & 0 deletions bin/build
@@ -0,0 +1,99 @@
#!/usr/bin/env python
import argparse
from pathlib import Path
import subprocess
import sys
import re
import os
import glob
import datetime
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.compute import Library, LibraryFullStatusStatus, State


def run_cmd(cmd):
    try:
        return subprocess.run(cmd, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        error_str = f'Command "{" ".join(cmd)}" returned code {e.returncode}\n\nStdout:\n{e.stdout}\n\nStderr:\n{e.stderr}'
        print(error_str, file=sys.stderr)  # Surface the captured output before exiting
        sys.exit(e.returncode)


def uninstall_if_matches(w, cluster_id, name, lib_type):
    libs = [l for l in w.libraries.cluster_status(cluster_id) if l.status == LibraryFullStatusStatus.INSTALLED]
    libs = [l.library for l in libs if lib_type in l.library.as_dict() and name in l.library.as_dict()[lib_type]]
    if len(libs) == 0:
        return False
    print(f'Uninstalling existing libraries {", ".join([l.as_dict()[lib_type] for l in libs])} from cluster')
    w.libraries.uninstall(cluster_id, libs)
    return True


class ChDir(object):
    def __init__(self, dir):
        self.dir = dir
        self.olddir = os.getcwd()

    def __enter__(self):
        os.chdir(self.dir)

    def __exit__(self, *args):
        os.chdir(self.olddir)


def main(args):
    project_root = Path(__file__).parent.parent
    os.chdir(project_root)

    whl_path = None
    if args.python:
        with ChDir('python'):
            out = run_cmd(['python', 'setup.py', 'bdist_wheel'])
        whl_path = str(project_root / ('python/dist/' + re.search(r'glow\.py\S+\.whl', out.stdout).group(0)))
        print(f'Built Python wheel {Path(whl_path).resolve()}')

    jar_path = None
    if args.scala:
        out = run_cmd(['sbt', 'core/assembly'])
        core_target_dir = re.search(r'core/\S+/scala-[.\d]+/', out.stdout).group(0)
        jars = glob.glob(core_target_dir + '*assembly*.jar')
        jar_path = max(jars, key=os.path.getctime)
        print(f'Built Scala assembly jar {Path(jar_path).resolve()}')

    if args.install:
        now = datetime.datetime.now().strftime('%d-%m-%Y_%H:%M:%S,%f')
        remote_fname_prefix = f'dbfs:/FileStore/glow/{now}'
        print(f'Uploading artifacts to {remote_fname_prefix}')
        client = WorkspaceClient()

        uninstalled_lib = False
        if jar_path is not None:
            jar_name = jar_path.split('/')[-1]
            uninstalled_lib = uninstall_if_matches(client, args.install, jar_name, 'jar') or uninstalled_lib
            remote_path = f'{remote_fname_prefix}/{jar_name}'
            with open(jar_path, 'rb') as f:  # closed automatically when the with block exits
                client.dbfs.upload(remote_path, f)
            client.libraries.install(args.install, [Library(jar=remote_path)])
            print(f'Installed jar {remote_path}')

        if whl_path is not None:
            whl_name = whl_path.split('/')[-1]
            uninstalled_lib = uninstall_if_matches(client, args.install, whl_name, 'whl') or uninstalled_lib
            remote_path = f'{remote_fname_prefix}/{whl_name}'
            with open(whl_path, 'rb') as f:
                client.dbfs.upload(remote_path, f)
            client.libraries.install(args.install, [Library(whl=remote_path)])
            print(f'Installed whl {remote_path}')

        if uninstalled_lib and client.clusters.get(args.install).state in [State.RUNNING, State.RESIZING]:
            print('Restarting cluster so the new libraries take effect')
            client.clusters.restart(args.install)


parser = argparse.ArgumentParser(description='''
A script to build Glow artifacts and install them on a Databricks cluster. It assumes that the
local environment is already set up (conda environment, sbt, and Java installation) for whichever
artifacts are requested and, if installation is requested, that the cluster already exists.
Artifacts uploaded to DBFS are not deleted automatically; delete them manually or with a cloud
storage retention policy.''')
parser.add_argument('--python', help='Build a Python wheel', action='store_true')
parser.add_argument('--scala', help='Build a Scala assembly jar', action='store_true')
parser.add_argument('--install', metavar='CLUSTER_ID',
                    help='If provided, install built artifacts on this cluster, restarting it if existing Glow '
                         'libraries had to be uninstalled first. Databricks authentication must be provided via '
                         'environment variables')
main(parser.parse_args())
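For reference, a typical invocation might look like the following; the cluster ID is a placeholder, and Databricks credentials are assumed to come from the standard environment variables (DATABRICKS_HOST and DATABRICKS_TOKEN):

    bin/build --python --scala --install 0123-456789-abcdef

Once the script finishes, the installation can be double-checked through the same SDK surface the script uses; a minimal sketch, using the same placeholder cluster ID:

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.compute import LibraryFullStatusStatus

    w = WorkspaceClient()  # reads auth from the environment, as the script does
    for lib in w.libraries.cluster_status('0123-456789-abcdef'):
        if lib.status == LibraryFullStatusStatus.INSTALLED:
            print(lib.library.as_dict())  # e.g. {'whl': 'dbfs:/FileStore/glow/...'}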
2 changes: 1 addition & 1 deletion build.sbt
@@ -77,7 +77,7 @@ val testJavaOptions = Vector(
 // Spark session used by many tasks cannot be used concurrently.
 val testConcurrency = 1
 Test / fork := true
-concurrentRestrictions in Global := Seq(
+Global / concurrentRestrictions := Seq(
   Tags.limit(Tags.ForkedTestGroup, testConcurrency)
 )

1 change: 1 addition & 0 deletions python/environment.yml
@@ -24,6 +24,7 @@ dependencies:
   - pip:
     - pyspark==3.5.0
     - databricks-cli==0.18 # Docs notebook source generation
+    - databricks-sdk
     - setuptools==65.6.3 # Python packaging
     - twine # Pypi publishing
     - pygments
10 changes: 8 additions & 2 deletions python/setup.py
@@ -14,8 +14,14 @@

 from setuptools import setup, setuptools
 import imp
+from pathlib import Path

-version = imp.load_source('version', 'version.py').VERSION
+
+def relative_file(path):
+    return (Path(__file__).parent / path).as_posix()
+
+
+version = imp.load_source('version', relative_file('version.py')).VERSION

 setup(name='glow.py',
       version=version,
@@ -31,7 +37,7 @@
       ],
       author='The Glow Authors',
       description='An open-source toolkit for large-scale genomic analysis',
-      long_description=open('README.rst').read(),
+      long_description=open(relative_file('README.rst')).read(),
       long_description_content_type='text/x-rst',
       license='Apache License 2.0',
       classifiers=[
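The relative_file helper resolves packaged files against setup.py's own location rather than the current working directory, so the version and README lookups succeed no matter where the build is launched from. A minimal standalone sketch of the pattern (names here are illustrative):

    from pathlib import Path

    def relative_file(path):
        # Resolve against the directory containing this file, not os.getcwd(),
        # so the lookup works from any working directory.
        return (Path(__file__).parent / path).as_posix()

    # cwd-relative lookup breaks when the script runs from another directory:
    #     open('README.rst')
    # file-relative lookup stays stable:
    #     open(relative_file('README.rst'))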
