From 0a6193795e08dfa9d6668aada90813f79b2211d2 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Mon, 26 Aug 2024 15:55:57 -0400 Subject: [PATCH] Headroom (#58) * adds headroom variable to recommendation calculations * details how to use the metrics to optimize a workflow and calculate the potential cost-reduction --- README.md | 20 +++++++++++++++++++- omics/cli/run_analyzer/__main__.py | 29 +++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 77d33af..bef7036 100644 --- a/README.md +++ b/README.md @@ -364,7 +364,9 @@ The CSV output by the command above includes the following columns: * __gpusRequested__ : The number of GPUs requested by the workflow task * __memoryRequestedGiB__ : Gibibytes of memory requested by the workflow task * __omicsInstanceTypeReserved__ : Requested HealthOmics instance type for each task -* __omicsInstanceTypeMinimum__ : Minimum HealthOmics instance type that could run each task. +* __omicsInstanceTypeMinimum__ : Minimum HealthOmics instance type that could run each task. 
+* __recommendedCpus__: The number of CPUs recommended for this task (corresponding to the number of CPUs in the omicsInstanceTypeMinimum)
+* __recommendedMemoryGiB__: The amount of GiB of memory recommended for this task (corresponding to the amount of memory in the omicsInstanceTypeMinimum)
 * __estimatedUSD__ : Estimated HealthOmics charges (USD) for the workflow based on _sizeReserved_ and _runningSeconds_
 * __minimumUSD__ : Estimated HealthOmics charges (USD) for the workflow based on the recommended _omicsInstanceTypeMinimum_ and _runningSeconds_
 * __cpuUtilizationRatio__ : CPU utilization (_cpusMaximum_ / _cpusReserved_) for workflow task(s)
@@ -381,9 +383,25 @@ The CSV output by the command above includes the following columns:
 * __storageMaximumGiB__ : Maximum gibibytes of storage used during a single 1-minute interval
 * __storageAverageGiB__ : Average gibibytes of storage used by the workflow run
 
+For rows that are a _task_ type, the maximums, averages and reserved columns refer to the maximum, average or reserved amounts of the respective resource used by that task. These values can be used to guide the resources that should be requested for that task. For rows that are a _run_ type the maximums, averages and reserved columns refer to the maximum, average or reserved amounts of the respective resource used __concurrently__ by that run. These values can be used to determine if the account's HealthOmics active CPUs/memory limits are being reached which might indicate the run is constrained by these limits.
+
 > [!WARNING]
 > At this time AWS HealthOmics does not report the average or maximum storage used by runs that use "DYNAMIC" storage that run for under two hours. Because of this limitation the `storageMaximumGiB` and `storageAverageGiB` are set to zero and will not be included in the estimate run cost. 
+#### Run Optimization and Estimated Cost Reduction
+
+Based on the metrics observed and calculated for a run, the application will recommend the smallest instance type that could be used for each task in the run. The type is reported in the `omicsInstanceTypeMinimum` column. To obtain this type for a task you can set the task CPU and memory requested for the task to the values of `recommendedCpus` and `recommendedMemoryGiB` in your workflow definition. Based on this change each task would be estimated to
+reduce the cost of the run by `estimatedUSD` minus `minimumUSD`. The total potential cost reduction for the entire run can be estimated by subtracting the `minimumUSD` value from the `estimatedUSD` value in the row where the `type` is "`run`".
+
+#### Add headroom to recommendations
+
+Sometimes you will see variance in the amount of memory and CPU used in a run task, especially if you expect to run workflows with larger input files than were used in the analyzed run. For this reason you might want to add some headroom to the recommendations produced by the run analyzer.
+
+The `-H` or `--headroom` flag can be used to add an additional `0.0` to `1.0` times the max CPU or memory used by a task to the calculation used to determine
+the `omicsInstanceTypeMinimum` recommendation. For example if a task used a max of 3.6 GiB of memory and the headroom value is 0.5 then 6 GiB of memory - `math.ceil(3.6 * (1 + 0.5))` - will be used to determine the minimum instance type that should be used.
+
+If your analyzed run is already close to optimal then adding headroom might result in the recommended minimum instance being larger than the instance used in the run which will also cause the "`minimumUSD`" to be larger than the "`estimatedUSD`".
+
 #### Produce a timeline plot for a run
 
 The RunAnalyzer tool can produce an interative timeline plot of a workflow. The plots allow you to visualize how individual tasks ran over the course of the run. 
diff --git a/omics/cli/run_analyzer/__main__.py b/omics/cli/run_analyzer/__main__.py index d1cf37f..d5ad679 100755 --- a/omics/cli/run_analyzer/__main__.py +++ b/omics/cli/run_analyzer/__main__.py @@ -11,6 +11,7 @@ [--file=] [--out=] [--plot=] + [--headroom=] [--help] Options: @@ -22,6 +23,7 @@ -f, --file= Load input from file -o, --out= Write output to file -P, --plot= Plot a run timeline to a directory + -H, --headroom= Adds a fractional buffer to the size of recommended memory and CPU. Values must be between 0.0 and 1.0. -h, --help Show help text Examples: @@ -34,10 +36,15 @@ omics-run-analyzer 2345678:12345678-1234-5678-9012-123456789012 # Output workflow run and tasks in JSON format omics-run-analyzer 1234567 -s -o run-1234567.json + # Plot a timeline of a workflow run and write the plot the HTML to "out/" + omics-run-analyzer 1234567 -P out + # Putput a workflow run analysis with 10% headroom added to recommended CPU and memory + omics-run-analyzer 1234567 -P timeline -H 0.1 """ import csv import datetime import json +import math import os import re import sys @@ -293,13 +300,15 @@ def add_run_util(run, tasks): metrics[name] /= time -def add_metrics(res, resources, pricing): +def add_metrics(res, resources, pricing, headroom): """Add run/task metrics""" arn = re.split(r"[:/]", res["arn"]) rtype = arn[-2] region = arn[3] res["type"] = rtype + headroom_multiplier = 1 + headroom + metrics = res.get("metrics", {}) # if a resource has no metrics body then we can skip the rest if res.get("metrics") is None: @@ -330,7 +339,7 @@ def add_metrics(res, resources, pricing): if store_res and store_max: metrics["storageUtilizationRatio"] = float(store_max) / float(store_res) - storage_type = res.get("storageType") + storage_type = res.get("storageType", STORAGE_TYPE_STATIC_RUN_STORAGE) if rtype == "run": # Get capacity requested (static), capacity max. 
used (dynamic) and @@ -352,7 +361,7 @@ def add_metrics(res, resources, pricing): # Get price for optimal static storage if store_max: - capacity = get_static_storage_gib(store_max) + capacity = get_static_storage_gib(store_max * headroom_multiplier) gib_hrs = capacity * running / SECS_PER_HOUR price = get_pricing(pricing, PRICE_RESOURCE_TYPE_STATIC_RUN_STORAGE, region, gib_hrs) if price: @@ -366,6 +375,9 @@ def add_metrics(res, resources, pricing): if price: metrics["estimatedUSD"] = price if cpus_max and mem_max and not gpus_res: + # Get smallest instance type that meets the requirements + cpus_max = math.ceil(cpus_max * headroom_multiplier) + mem_max = math.ceil(mem_max * headroom_multiplier) (itype, cpus, mem) = get_instance(cpus_max, mem_max) metrics["omicsInstanceTypeMinimum"] = itype metrics["recommendedCpus"] = cpus @@ -453,6 +465,15 @@ def get_timeline_event(res, resources): row = [event.get(h, "") for h in hdrs] writer.writerow(row) else: + headroom = 0.0 + if opts["--headroom"]: + try: + headroom = float(opts["--headroom"]) + except Exception: + die(f'the --headroom argument {opts["--headroom"]} is not a valid float value') + if headroom > 1.0 or headroom < 0.0: + die(f"the --headroom argument {headroom} must be between 0.0 and 1.0") + # Show run statistics def tocsv(val): if val is None: @@ -502,7 +523,7 @@ def tocsv(val): writer = csv.writer(out, lineterminator="\n") writer.writerow(formatted_headers) for res in resources: - add_metrics(res, resources, pricing) + add_metrics(res, resources, pricing, headroom) metrics = res.get("metrics", {}) row = [tocsv(metrics.get(h, res.get(h))) for h in hdrs] writer.writerow(row)