From bd9fc169ed6823ea5eeb34031d2f50810ac25c3f Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Mon, 26 Aug 2024 20:51:22 -0600 Subject: [PATCH 01/10] adding output of recommended config --- omics/cli/run_analyzer/__main__.py | 43 +++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index d5ad679..dcbf73b 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -12,6 +12,7 @@ [--out=] [--plot=] [--headroom=] + [--config=] [--help] Options: @@ -24,6 +25,7 @@ -o, --out= Write output to file -P, --plot= Plot a run timeline to a directory -H, --headroom= Adds a fractional buffer to the size of recommended memory and CPU. Values must be between 0.0 and 1.0. + -c, --config= Output a config file with recommended resources -h, --help Show help text Examples: @@ -54,7 +56,7 @@ import dateutil.utils import docopt from bokeh.plotting import output_file - +import textwrap from . import timeline # type: ignore exename = os.path.basename(sys.argv[0]) @@ -410,6 +412,27 @@ def get_timeline_event(res, resources): "running": (time3 - time2).total_seconds(), } +def create_config(engine, task_resources, filename): + + if engine == 'NEXTFLOW': + with open(filename, 'w') as out: + for task in task_resources: + task_string = textwrap.dedent(f""" + withName: {task} {{ + cpu = {task_resources[task]['cpus']} + memory = {task_resources[task]['mem']} + }} + """) + out.write(task_string) + + elif engine == 'CWL': + pass + elif engine == 'WDL': + pass + else: + raise ValueError("Unknown workflow engine") + + if __name__ == "__main__": # Parse command-line options @@ -522,11 +545,28 @@ def tocsv(val): writer = csv.writer(out, lineterminator="\n") writer.writerow(formatted_headers) + config = {} + for res in resources: add_metrics(res, resources, pricing, headroom) metrics = res.get("metrics", {}) + if res['type'] == 'run': + omics = session.client("omics") + wfid = res['workflow'].split('/')[-1] + engine = omics.get_workflow(id=wfid)['engine'] + if res['type'] == 'task': + task_name = res['name'].split(" ")[0] + if task_name not in config.keys(): + config[task_name] ={ + 'cpus': metrics['recommendedCpus'], + 'mem': metrics['recommendedMemoryGiB'] + } row = [tocsv(metrics.get(h, res.get(h))) for h in hdrs] writer.writerow(row) + + if opts["--config"]: + filename = opts['--config'] + create_config(engine, config, filename) if opts["--out"]: sys.stderr.write(f"{exename}: wrote {opts['--out']}\n") if opts["--plot"]: @@ -558,3 +598,4 @@ def tocsv(val): title = f"arn: {run['arn']}, name: {run.get('name')}" timeline.plot_timeline(resources, title=title, max_duration_hrs=run_duration_hrs) + From 7138ded7de66b4e095c3435c5805971ee952c570 Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Wed, 28 Aug 2024 21:55:53 -0600 Subject: [PATCH 02/10] compare all tasks with the same name and keep the max resources --- omics/cli/run_analyzer/__main__.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index dcbf73b..459d780 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -426,7 +426,14 @@ def create_config(engine, task_resources, filename): out.write(task_string) elif engine == 'CWL': - pass + with open(filename, "w") as out: + for task in task_resources: + task_string = textwrap.dedent(f""" + {task}: + coresMin: {task_resources[task]['cpus']} + ramMin: {task_resources[task]['mem']} + """) + out.write(task_string) elif engine == 'WDL': pass else: @@ -561,6 +568,11 @@ def tocsv(val): 'cpus': metrics['recommendedCpus'], 'mem': metrics['recommendedMemoryGiB'] } + else: + config[task_name] ={ + 'cpus': max(metrics['recommendedCpus'], config[task_name]['cpus']), + 'mem': max(metrics['recommendedMemoryGiB'], config[task_name]['mem']) + } row = [tocsv(metrics.get(h, res.get(h))) for h in hdrs] writer.writerow(row) From 8bb899c969ee69a67a866c5ae3bd1c71ed5d02a0 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Fri, 6 Sep 2024 14:37:49 -0400 Subject: [PATCH 03/10] Update README.md (#64) Updates instructions for building from source --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bef7036..4008f22 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,17 @@ AWS HealthOmics Tools is available through pypi. To install, type: pip install amazon-omics-tools ``` -To install from source: +### Install from source + +Installing from source requires that your machine has the following prerequisites installed: +- `python3.10` or above +- `poetry` package manager +- `make` build tool ``` git clone https://github.com/awslabs/amazon-omics-tools.git -pip install ./amazon-omics-tools +cd ./amazon-omics-tools +make install ``` ## SDK Tools From 92b04f17e65bf025939aa26e09f8d263c4899d05 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Wed, 11 Sep 2024 19:31:23 -0400 Subject: [PATCH 04/10] fixes label offsets and increases the threshold where we stop adding labels (#65) --- omics/cli/run_analyzer/timeline.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/omics/cli/run_analyzer/timeline.py b/omics/cli/run_analyzer/timeline.py index dcc1812..4cdce10 100644 --- a/omics/cli/run_analyzer/timeline.py +++ b/omics/cli/run_analyzer/timeline.py @@ -89,6 +89,7 @@ def plot_timeline(tasks, title="", time_units="min", max_duration_hrs=5, show_pl p_run = figure(width=960, height=800, sizing_mode="stretch_both", tooltips=tooltips) p_run.hbar( + # start time bar y="y", left="starting_left", right="starting_right", @@ -98,6 +99,7 @@ def plot_timeline(tasks, title="", time_units="min", max_duration_hrs=5, show_pl legend_label="starting", ) p_run.hbar( + # running time bar y="y", left="running_left", right="running_right", @@ -106,9 +108,12 @@ def plot_timeline(tasks, title="", time_units="min", max_duration_hrs=5, show_pl source=source, legend_label="running", ) - if len(data) < 50: + if len(data) < 101: p_run.text( - x="text_x", + # task name label + color="black", + x="running_right", + x_offset=10, y="y", text="name", alpha=0.4, From 8ea9adc92a1f7f67f6175181e23e5c910424c1b0 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Mon, 16 Sep 2024 10:53:59 -0400 Subject: [PATCH 05/10] Update README.md (#66) caveats on price estimation --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 4008f22..081d7bf 100644 --- a/README.md +++ b/README.md @@ -399,6 +399,9 @@ For rows that are a _task_ type, the maximums, averages and reserved columns ref Based on the metrics observed and calculated for a run, the application will recommend the smallest instance type that could be used for each task in the run. The type is reported in the `omicsInstanceTypeMinimum` column. To obtain this type for a task you can set the task CPU and memory requested for the task to the values of `recommendedCpus` and `recommendedMemoryGiB` in you workflow definition. Based on this change each task would be estimated to reduce the cost of the run by `estimatedUSD` minus `minimumUSD`. The total potential cost reduction for the entire run can be estimated by subtracting the `minimumUSD` value from the `estimatedUSD` value in the row where the `type` is "`run`". +> [!WARNING] +> Cost estimates are based on the AWS list price at the time the run analysis is performed. In the event prices have changed these may not reflect the price you were charged at the time of the run. Further, the run analyzer does not account for any discounts, credits or price agreements you may have. Price estimates for recommended instance sizes (`minimumUSD`) assume that the runtime of the task will remain the same on the recommended instance. Actual costs will be determined based on the actual runtime. + #### Add headroom to recommendations Sometimes you will see variance in the amount of memory and CPU used in a run task, especially if you expect to run workflows with larger input files than were used in the analyzed run. For this reason you might want to allow add some headroom to the recommendations produced by the the run analyzer. From 288157f047fd2ca002a1f0cbddba74a750002a60 Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Fri, 13 Sep 2024 15:17:47 -0600 Subject: [PATCH 06/10] updating config --- omics/cli/run_analyzer/__main__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index 459d780..46edf35 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -435,7 +435,14 @@ def create_config(engine, task_resources, filename): """) out.write(task_string) elif engine == 'WDL': - pass + with open(filename, "w") as out: + for task in task_resources: + task_string = textwrap.dedent(f""" + {task}: + cpu: "{task_resources[task]['cpus']}" + memory: "{task_resources[task]['mem']}" + """) + out.write(task_string) else: raise ValueError("Unknown workflow engine") From 2a76bdb83f2376d0a2626fd3db89dbee4625dbeb Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Mon, 16 Sep 2024 21:24:46 -0600 Subject: [PATCH 07/10] function to handle base task --- omics/cli/run_analyzer/__main__.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index 46edf35..a06eb86 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -445,7 +445,20 @@ def create_config(engine, task_resources, filename): out.write(task_string) else: raise ValueError("Unknown workflow engine") - + +def get_base_task(engine, task): + # Returns the base task name + if engine == 'NEXTFLOW': + individual_task = task.split(" ")[0] + return individual_task + elif engine == 'CWL': + individual_task = task.split(" ")[0] + return individual_task + elif engine == 'WDL': + individual_task = task.split(" ")[0] + return individual_task + else: + raise ValueError("Unknown workflow engine") if __name__ == "__main__": @@ -569,7 +582,7 @@ def tocsv(val): wfid = res['workflow'].split('/')[-1] engine = omics.get_workflow(id=wfid)['engine'] if res['type'] == 'task': - task_name = res['name'].split(" ")[0] + task_name = get_base_task(engine, res['name']) if task_name not in config.keys(): config[task_name] ={ 'cpus': metrics['recommendedCpus'], From 440533b94b991b70fbf259661e4ac41a5fa5ad19 Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Tue, 17 Sep 2024 21:37:28 -0600 Subject: [PATCH 08/10] Limiting config to nextflow --- omics/cli/run_analyzer/__main__.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index a06eb86..622084d 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -25,7 +25,7 @@ -o, --out= Write output to file -P, --plot= Plot a run timeline to a directory -H, --headroom= Adds a fractional buffer to the size of recommended memory and CPU. Values must be between 0.0 and 1.0. - -c, --config= Output a config file with recommended resources + -c, --config= Output a config file with recommended resources (Nextflow only) -h, --help Show help text Examples: @@ -426,23 +426,9 @@ def create_config(engine, task_resources, filename): out.write(task_string) elif engine == 'CWL': - with open(filename, "w") as out: - for task in task_resources: - task_string = textwrap.dedent(f""" - {task}: - coresMin: {task_resources[task]['cpus']} - ramMin: {task_resources[task]['mem']} - """) - out.write(task_string) + raise ValueError("--config does not currently support CWL workflows") elif engine == 'WDL': - with open(filename, "w") as out: - for task in task_resources: - task_string = textwrap.dedent(f""" - {task}: - cpu: "{task_resources[task]['cpus']}" - memory: "{task_resources[task]['mem']}" - """) - out.write(task_string) + raise ValueError("--config does not currently support WDL workflows") else: raise ValueError("Unknown workflow engine") @@ -452,11 +438,9 @@ def get_base_task(engine, task): individual_task = task.split(" ")[0] return individual_task elif engine == 'CWL': - individual_task = task.split(" ")[0] - return individual_task + return task elif engine == 'WDL': - individual_task = task.split(" ")[0] - return individual_task + return task else: raise ValueError("Unknown workflow engine") From 7ffb8eef43e5a79590072c22628b64c99b097168 Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Mon, 23 Sep 2024 22:03:40 -0600 Subject: [PATCH 09/10] moving writeconfig to new file and adding tests --- omics/cli/run_analyzer/__main__.py | 47 +++------------------ omics/cli/run_analyzer/writeconfig.py | 32 ++++++++++++++ tests/run_analyzer/__init__.py | 0 tests/run_analyzer/unit/test_writeconfig.py | 14 ++++++ 4 files changed, 53 insertions(+), 40 deletions(-) create mode 100644 omics/cli/run_analyzer/writeconfig.py create mode 100644 tests/run_analyzer/__init__.py create mode 100644 tests/run_analyzer/unit/test_writeconfig.py diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index 622084d..c0b9d21 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -12,7 +12,7 @@ [--out=] [--plot=] [--headroom=] - [--config=] + [--write-config=] [--help] Options: @@ -25,7 +25,7 @@ -o, --out= Write output to file -P, --plot= Plot a run timeline to a directory -H, --headroom= Adds a fractional buffer to the size of recommended memory and CPU. Values must be between 0.0 and 1.0. - -c, --config= Output a config file with recommended resources (Nextflow only) + -c, --write-config= Output a config file with recommended resources (Nextflow only) -h, --help Show help text Examples: @@ -56,8 +56,8 @@ import dateutil.utils import docopt from bokeh.plotting import output_file -import textwrap from . import timeline # type: ignore +from . import writeconfig exename = os.path.basename(sys.argv[0]) OMICS_LOG_GROUP = "/aws/omics/WorkflowLog" @@ -412,39 +412,6 @@ def get_timeline_event(res, resources): "running": (time3 - time2).total_seconds(), } -def create_config(engine, task_resources, filename): - - if engine == 'NEXTFLOW': - with open(filename, 'w') as out: - for task in task_resources: - task_string = textwrap.dedent(f""" - withName: {task} {{ - cpu = {task_resources[task]['cpus']} - memory = {task_resources[task]['mem']} - }} - """) - out.write(task_string) - - elif engine == 'CWL': - raise ValueError("--config does not currently support CWL workflows") - elif engine == 'WDL': - raise ValueError("--config does not currently support WDL workflows") - else: - raise ValueError("Unknown workflow engine") - -def get_base_task(engine, task): - # Returns the base task name - if engine == 'NEXTFLOW': - individual_task = task.split(" ")[0] - return individual_task - elif engine == 'CWL': - return task - elif engine == 'WDL': - return task - else: - raise ValueError("Unknown workflow engine") - - if __name__ == "__main__": # Parse command-line options opts = docopt.docopt(__doc__) @@ -566,7 +533,7 @@ def tocsv(val): wfid = res['workflow'].split('/')[-1] engine = omics.get_workflow(id=wfid)['engine'] if res['type'] == 'task': - task_name = get_base_task(engine, res['name']) + task_name = writeconfig.get_base_task(engine, res['name']) if task_name not in config.keys(): config[task_name] ={ 'cpus': metrics['recommendedCpus'], @@ -580,9 +547,9 @@ def tocsv(val): row = [tocsv(metrics.get(h, res.get(h))) for h in hdrs] writer.writerow(row) - if opts["--config"]: - filename = opts['--config'] - create_config(engine, config, filename) + if opts["--write-config"]: + filename = opts['--write-config'] + writeconfig.create_config(engine, config, filename) if opts["--out"]: sys.stderr.write(f"{exename}: wrote {opts['--out']}\n") if opts["--plot"]: diff --git a/omics/cli/run_analyzer/writeconfig.py b/omics/cli/run_analyzer/writeconfig.py new file mode 100644 index 0000000..a1950a7 --- /dev/null +++ b/omics/cli/run_analyzer/writeconfig.py @@ -0,0 +1,32 @@ +import textwrap + +def create_config(engine, task_resources, filename): + if engine == 'NEXTFLOW': + with open(filename, 'w') as out: + for task in task_resources: + task_string = textwrap.dedent(f""" + withName: {task} {{ + cpu = {task_resources[task]['cpus']} + memory = {task_resources[task]['mem']} + }} + """) + out.write(task_string) + + elif engine == 'CWL': + raise ValueError("--write-config does not currently support CWL workflows") + elif engine == 'WDL': + raise ValueError("--write-config does not currently support WDL workflows") + else: + raise ValueError("Unknown workflow engine") + +def get_base_task(engine, task): + # Returns the base task name + if engine == 'NEXTFLOW': + individual_task = task.split(" ")[0] + return individual_task + elif engine == 'CWL': + return task + elif engine == 'WDL': + return task + else: + raise ValueError("Unknown workflow engine") \ No newline at end of file diff --git a/tests/run_analyzer/__init__.py b/tests/run_analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/run_analyzer/unit/test_writeconfig.py b/tests/run_analyzer/unit/test_writeconfig.py new file mode 100644 index 0000000..4b4151e --- /dev/null +++ b/tests/run_analyzer/unit/test_writeconfig.py @@ -0,0 +1,14 @@ +import unittest +from omics.cli.run_analyzer import writeconfig + +class TestGetBaseTask(unittest.TestCase): + def test_get_base_task_nextflow(self): + result = writeconfig.get_base_task('NEXTFLOW', 'task1 (sample1)') + self.assertEqual(result, 'task1') + + def test_get_base_task_cwl(self): + result = writeconfig.get_base_task('CWL', 'task1 (sample1)') + self.assertRaises(ValueError) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From b8f4b5a56c51e8b2c0ae5938e731bd979eae273f Mon Sep 17 00:00:00 2001 From: Kevin Sayers Date: Mon, 23 Sep 2024 22:21:34 -0600 Subject: [PATCH 10/10] adding README details --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 081d7bf..3cb7b91 100644 --- a/README.md +++ b/README.md @@ -432,6 +432,15 @@ this returns something like: omics-run-analyzer: wrote run-1234567.json ``` +#### Output optimized configuration +> [!WARNING] +> Currently this feature only supports Nextflow workflows. + +The `--write-config` option will write a new configuration file with the `recommendedCpus` and `recommendedMemoryGiB` as the resource requirements. This will take the maximum values if the task is run multiple times with different inputs. + +```bash +python -m omics.cli.run_analyzer 123456 --write-config=optimized.config +``` ## Security See [CONTRIBUTING](https://github.com/awslabs/amazon-omics-tools/blob/main/CONTRIBUTING.md#security-issue-notifications) for more information.