Add option to output a new config file (#63)

Add write-config option to the runanalyzer. This provides an optimized configuration file. --------- Co-authored-by: Kevin Sayers <[email protected]> Co-authored-by: Mark Schreiber <[email protected]>
awslabs · Sep 24, 2024 · e2a8286 · e2a8286
1 parent 40da936
commit e2a8286
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -432,6 +432,15 @@ this returns something like:
 omics-run-analyzer: wrote run-1234567.json
 ```
 
+#### Output optimized configuration
+> [!WARNING]
+> Currently this feature only supports Nextflow workflows.
+
+The `--write-config` option will write a new configuration file with the `recommendedCpus` and `recommendedMemoryGiB` as the resource requirements. This will take the maximum values if the task is run multiple times with different inputs. 
+
+```bash
+python -m omics.cli.run_analyzer 123456 --write-config=optimized.config
+```
 ## Security
 
 See [CONTRIBUTING](https://github.com/awslabs/amazon-omics-tools/blob/main/CONTRIBUTING.md#security-issue-notifications) for more information.

diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py
@@ -12,6 +12,7 @@
                           [--out=<path>]
                           [--plot=<directory>]
                           [--headroom=<float>]
+                          [--write-config=<path>]
                           [--help]
 
 Options:
@@ -24,6 +25,7 @@
  -o, --out=<path>         Write output to file
  -P, --plot=<directory>   Plot a run timeline to a directory
  -H, --headroom=<float>   Adds a fractional buffer to the size of recommended memory and CPU. Values must be between 0.0 and 1.0.
+ -c, --write-config=<path>      Output a config file with recommended resources (Nextflow only)
  -h, --help               Show help text
 
 Examples:
@@ -54,8 +56,8 @@
 import dateutil.utils
 import docopt
 from bokeh.plotting import output_file
-
 from . import timeline  # type: ignore
+from . import writeconfig 
 
 exename = os.path.basename(sys.argv[0])
 OMICS_LOG_GROUP = "/aws/omics/WorkflowLog"
@@ -407,7 +409,6 @@ def get_timeline_event(res, resources):
         "running": (time3 - time2).total_seconds(),
     }
 
-
 if __name__ == "__main__":
     # Parse command-line options
     opts = docopt.docopt(__doc__)
@@ -519,11 +520,33 @@ def tocsv(val):
 
             writer = csv.writer(out, lineterminator="\n")
             writer.writerow(formatted_headers)
+            config = {}
+
             for res in resources:
                 add_metrics(res, resources, pricing, headroom)
                 metrics = res.get("metrics", {})
+                if res['type'] == 'run':
+                    omics = session.client("omics")
+                    wfid = res['workflow'].split('/')[-1]
+                    engine = omics.get_workflow(id=wfid)['engine']
+                if res['type'] == 'task':
+                    task_name = writeconfig.get_base_task(engine, res['name'])
+                    if task_name not in config.keys():
+                        config[task_name] ={
+                            'cpus': metrics['recommendedCpus'],
+                            'mem': metrics['recommendedMemoryGiB']
+                        }
+                    else:
+                        config[task_name] ={
+                            'cpus': max(metrics['recommendedCpus'], config[task_name]['cpus']),
+                            'mem': max(metrics['recommendedMemoryGiB'], config[task_name]['mem'])
+                        }
                 row = [tocsv(metrics.get(h, res.get(h))) for h in hdrs]
                 writer.writerow(row)
+
+            if opts["--write-config"]:
+                filename = opts['--write-config']
+                writeconfig.create_config(engine, config, filename)
         if opts["--out"]:
             sys.stderr.write(f"{exename}: wrote {opts['--out']}\n")
     if opts["--plot"]:
@@ -555,3 +578,4 @@ def tocsv(val):
         title = f"arn: {run['arn']}, name: {run.get('name')}"
 
         timeline.plot_timeline(resources, title=title, max_duration_hrs=run_duration_hrs)
+
diff --git a/omics/cli/run_analyzer/writeconfig.py b/omics/cli/run_analyzer/writeconfig.py
@@ -0,0 +1,32 @@
+import textwrap
+
+def create_config(engine, task_resources, filename):
+    if engine == 'NEXTFLOW':
+        with open(filename, 'w') as out:
+            for task in task_resources:
+                task_string = textwrap.dedent(f"""
+                withName: {task} {{
+                    cpu = {task_resources[task]['cpus']}
+                    memory = {task_resources[task]['mem']}
+                }}
+                """)
+                out.write(task_string)
+
+    elif engine == 'CWL':
+        raise ValueError("--write-config does not currently support CWL workflows")
+    elif engine == 'WDL':
+        raise ValueError("--write-config does not currently support WDL workflows")
+    else:
+        raise ValueError("Unknown workflow engine")
+
+def get_base_task(engine, task):
+    # Returns the base task name
+    if engine == 'NEXTFLOW':
+        individual_task = task.split(" ")[0]
+        return individual_task
+    elif engine == 'CWL':
+        return task
+    elif engine == 'WDL':
+        return task
+    else:
+        raise ValueError("Unknown workflow engine")
diff --git a/tests/run_analyzer/__init__.py b/tests/run_analyzer/__init__.py
diff --git a/tests/run_analyzer/unit/test_writeconfig.py b/tests/run_analyzer/unit/test_writeconfig.py
@@ -0,0 +1,14 @@
+import unittest
+from omics.cli.run_analyzer import writeconfig
+
+class TestGetBaseTask(unittest.TestCase):
+    def test_get_base_task_nextflow(self):
+        result = writeconfig.get_base_task('NEXTFLOW', 'task1 (sample1)')
+        self.assertEqual(result, 'task1')
+
+    def test_get_base_task_cwl(self):
+        result = writeconfig.get_base_task('CWL', 'task1 (sample1)')
+        self.assertRaises(ValueError)
+
+if __name__ == '__main__':
+    unittest.main()