From 2bf9957aea29393ebf79a8b49a17f196650cd67c Mon Sep 17 00:00:00 2001
From: Henry Davidge
Date: Mon, 26 Feb 2024 12:59:26 +0900
Subject: [PATCH] Add script for simple building and installation (#597)

* Add script for simple building and installation

Signed-off-by: Henry Davidge

* yapf

Signed-off-by: Henry Davidge

* add databricks-sdk to conda environment

Signed-off-by: Henry Davidge

* go into python directory to build whl

Signed-off-by: Henry Davidge

---------

Signed-off-by: Henry Davidge
Co-authored-by: Henry Davidge
---
 bin/build              | 99 ++++++++++++++++++++++++++++++++++++++++++
 build.sbt              |  2 +-
 python/environment.yml |  1 +
 python/setup.py        | 10 ++++-
 4 files changed, 109 insertions(+), 3 deletions(-)
 create mode 100755 bin/build

diff --git a/bin/build b/bin/build
new file mode 100755
index 000000000..898fc88d9
--- /dev/null
+++ b/bin/build
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+import argparse
+from pathlib import Path
+import subprocess
+import sys
+import re
+import os
+import glob
+import datetime
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.compute import Library, LibraryFullStatusStatus, State
+
+
+def run_cmd(cmd):
+    try:
+        return subprocess.run(cmd, check=True, text=True, capture_output=True)
+    except subprocess.CalledProcessError as e:
+        error_str = f'Command "{" ".join(cmd)}" returned code {e.returncode}\n\nStdout:\n{e.stdout}\n\nStderr:\n{e.stderr}'
+        print(error_str, file=sys.stderr)
+        sys.exit(e.returncode)
+
+
+def uninstall_if_matches(w, cluster_id, name, lib_type):
+    libs = [l for l in w.libraries.cluster_status(cluster_id) if l.status == LibraryFullStatusStatus.INSTALLED]
+    libs = [l.library for l in libs if lib_type in l.library.as_dict() and name in l.library.as_dict()[lib_type]]
+    if len(libs) == 0:
+        return False
+    print(f'Uninstalling existing libraries {", ".join([l.as_dict()[lib_type] for l in libs])} from cluster')
+    w.libraries.uninstall(cluster_id, libs)
+    return True
+
+
+class ChDir(object):
+    def __init__(self, dir):
+        self.dir = dir
+        self.olddir = os.getcwd()
+
+    def __enter__(self):
+        os.chdir(self.dir)
+
+    def __exit__(self, *args):
+        os.chdir(self.olddir)
+
+
+def main(args):
+    project_root = Path(__file__).parent.parent
+    os.chdir(project_root)
+    whl_path = None
+    if args.python:
+        with ChDir('python'):
+            out = run_cmd(['python', 'setup.py', 'bdist_wheel'])
+            whl_path = str(project_root / ('python/dist/' + re.search(r'glow\.py\S+\.whl', out.stdout).group(0)))
+        print(f'Built Python wheel {Path(whl_path).resolve()}')
+    jar_path = None
+    if args.scala:
+        out = run_cmd(['sbt', 'core/assembly'])
+        core_target_dir = re.search(r'core/\S+/scala-[.\d]+/', out.stdout).group(0)
+        jars = glob.glob(core_target_dir + '*assembly*.jar')
+        jar_path = max(jars, key=os.path.getctime)
+        print(f'Built Scala assembly jar {Path(jar_path).resolve()}')
+    if args.install:
+        now = datetime.datetime.now().strftime('%d-%m-%Y_%H:%M:%S,%f')
+        remote_fname_prefix = f'dbfs:/FileStore/glow/{now}'
+        print(f'Uploading artifacts to {remote_fname_prefix}')
+        client = WorkspaceClient()
+        uninstalled_lib = False
+        if jar_path is not None:
+            jar_name = jar_path.split('/')[-1]
+            uninstalled_lib = uninstall_if_matches(client, args.install, jar_name, 'jar') or uninstalled_lib
+            remote_path = f'{remote_fname_prefix}/{jar_name}'
+            with open(jar_path, 'rb') as f:
+                client.dbfs.upload(remote_path, f)
+            client.libraries.install(args.install, [Library(jar=remote_path)])
+            print(f'Installed jar {remote_path}')
+        if whl_path is not None:
+            whl_name = whl_path.split('/')[-1]
+            uninstalled_lib = uninstall_if_matches(client, args.install, whl_name, 'whl') or uninstalled_lib
+            remote_path = f'{remote_fname_prefix}/{whl_name}'
+            with open(whl_path, 'rb') as f:
+                client.dbfs.upload(remote_path, f)
+            client.libraries.install(args.install, [Library(whl=remote_path)])
+            print(f'Installed whl {remote_path}')
+        if uninstalled_lib and client.clusters.get(args.install).state in [State.RUNNING, State.RESIZING]:
+            print('Restarting cluster so new libraries will take effect')
+            client.clusters.restart(args.install)
+
+
+parser = argparse.ArgumentParser(description='''
+    A script to build Glow artifacts and install them on a Databricks cluster. This script assumes that
+    the local environment is already set up (conda environment, sbt and Java installation) for whichever artifacts are requested, and,
+    if installation is requested, that the cluster already exists.
+    Any artifacts uploaded to DBFS are not automatically deleted. Deletion should be performed manually or with a cloud storage retention policy.''')
+parser.add_argument('--python', help='Build a Python wheel', action='store_true')
+parser.add_argument('--scala', help='Build a Scala assembly jar', action='store_true')
+parser.add_argument('--install', metavar='CLUSTER_ID', help='If provided, install built artifacts on this cluster. If currently running, the cluster will be restarted. ' +
+                    'Databricks authentication must be provided via environment variables')
+main(parser.parse_args())
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
index 0ab751d4a..8167c3c27 100644
--- a/build.sbt
+++ b/build.sbt
@@ -77,7 +77,7 @@ val testJavaOptions = Vector(
 // Spark session used by many tasks cannot be used concurrently.
 val testConcurrency = 1
 Test / fork := true
-concurrentRestrictions in Global := Seq(
+Global / concurrentRestrictions := Seq(
   Tags.limit(Tags.ForkedTestGroup, testConcurrency)
 )
 
diff --git a/python/environment.yml b/python/environment.yml
index ca646fd02..109ab31de 100644
--- a/python/environment.yml
+++ b/python/environment.yml
@@ -24,6 +24,7 @@ dependencies:
   - pip:
     - pyspark==3.5.0
     - databricks-cli==0.18 # Docs notebook source generation
+    - databricks-sdk
     - setuptools==65.6.3 # Python packaging
     - twine # Pypi publishing
     - pygments
diff --git a/python/setup.py b/python/setup.py
index 9022aa7c7..e52bc01ae 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -14,8 +14,14 @@
 
 from setuptools import setup, setuptools
 import imp
+from pathlib import Path
 
-version = imp.load_source('version', 'version.py').VERSION
+
+def relative_file(path):
+    return (Path(__file__).parent / path).as_posix()
+
+
+version = imp.load_source('version', relative_file('version.py')).VERSION
 
 setup(name='glow.py',
       version=version,
@@ -31,7 +37,7 @@
       ],
       author='The Glow Authors',
      description='An open-source toolkit for large-scale genomic analysis',
-      long_description=open('README.rst').read(),
+      long_description=open(relative_file('README.rst')).read(),
       long_description_content_type='text/x-rst',
       license='Apache License 2.0',
       classifiers=[
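
For illustration, a typical invocation of the new script might look like the following. The workspace URL, token, and cluster ID are placeholders, not part of the patch; DATABRICKS_HOST and DATABRICKS_TOKEN are the environment variables read by the Databricks SDK's default authentication, which the --install help text refers to:

    export DATABRICKS_HOST=https://<workspace-url>
    export DATABRICKS_TOKEN=<personal-access-token>
    bin/build --python --scala --install <cluster-id>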