From 98586b33090cd6c6fc37ca94e6825bb41645bd7a Mon Sep 17 00:00:00 2001 From: Alec Scott Date: Thu, 21 Dec 2023 11:35:07 -0800 Subject: [PATCH 01/27] Add basic GitHub Actions CI --- .github/workflows/ci.yml | 50 +++++++++++++++++++ .github/workflows/requirements/style.txt | 2 + .github/workflows/requirements/unit-tests.txt | 2 + .github/workflows/style.yml | 27 ++++++++++ .github/workflows/unit-tests.yml | 25 ++++++++++ gantry/__init__.py | 0 gantry/__main__.py | 6 +++ 7 files changed, 112 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/requirements/style.txt create mode 100644 .github/workflows/requirements/unit-tests.txt create mode 100644 .github/workflows/style.yml create mode 100644 .github/workflows/unit-tests.yml create mode 100644 gantry/__init__.py create mode 100644 gantry/__main__.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3eda4da --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,50 @@ +name: ci +on: + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ci-${{github.ref}}-${{github.event.pull_request.number || github.run_number}} + cancel-in-progress: true + +jobs: + changes: + runs-on: ubuntu-latest + permissions: + pull-requests: read + outputs: + style: ${{ steps.filter.outputs.style }} + unit-tests: ${{ steps.filter.outputs.unit-tests }} + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # @v2 + if: ${{ github.event_name == 'push' }} + with: + fetch-depth: 0 + + # For pull requests it's not necessary to checkout the code + - uses: dorny/paths-filter@4512585405083f25c027a35db413c2b3b9006d50 + id: filter + with: + filters: | + style: + - '.github/**' + - 'gantry/**' + - 'pyproject.toml' + unit-tests: + - '.github/**' + - 'gantry/**' + - 'pyproject.toml' + + style: + if: ${{ needs.changes.outputs.style == 'true' }} + needs: changes + uses: ./.github/workflows/style.yml + + unit-tests: + if: ${{ needs.changes.outputs.unit-tests == 'true' }} + needs: [changes, style] + uses: ./.github/workflows/unit-tests.yml diff --git a/.github/workflows/requirements/style.txt b/.github/workflows/requirements/style.txt new file mode 100644 index 0000000..dd22bb4 --- /dev/null +++ b/.github/workflows/requirements/style.txt @@ -0,0 +1,2 @@ +black==23.12.0 +flake8==6.1.0 diff --git a/.github/workflows/requirements/unit-tests.txt b/.github/workflows/requirements/unit-tests.txt new file mode 100644 index 0000000..1393afd --- /dev/null +++ b/.github/workflows/requirements/unit-tests.txt @@ -0,0 +1,2 @@ +pytest==7.4.3 +pytest-asyncio==0.23.2 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..d5f7155 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,27 @@ +name: Linting & Style Checks +on: + # This Workflow can be triggered manually + workflow_dispatch: + workflow_call: + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 + + - name: Set up Python 3.11 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c + with: + python-version: '3.11' + cache: 'pip' + cache-dependency-path: '.github/workflows/requirements/style.txt' + + - name: Install Python dependencies + run: | + pip install -r .github/workflows/requirements/style.txt + + - name: Lint and Format Check with Flake8 and Black + run: | + black --diff --check . 
+ flake8 hubcast/ diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000..551403a --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,25 @@ +name: Unit Tests +on: + # This Workflow can be triggered manually + workflow_dispatch: + workflow_call: + +jobs: + ubuntu: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.11'] + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: '.github/workflows/requirements/unit-tests.txt' + - name: Install Python dependencies + run: | + pip install -r .github/workflows/requirements/unit-tests.txt + - name: Run Unit Tests with Pytest + run: | + pytest diff --git a/gantry/__init__.py b/gantry/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gantry/__main__.py b/gantry/__main__.py new file mode 100644 index 0000000..491f8ff --- /dev/null +++ b/gantry/__main__.py @@ -0,0 +1,6 @@ +def main(): + print("Hello World") + + +if __name__ == "__main__": + main() From 57ca71c4e74edf03302582aab7093f7ec12d4a90 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 25 Dec 2023 14:51:56 -0800 Subject: [PATCH 02/27] rough draft of collection functionality --- .envrc | 12 ++ .flake8 | 3 + .gitignore | 5 + db/schema.sql | 55 ++++++++ gantry/tests/test_utils.py | 20 +++ gantry/utils/__init__.py | 0 gantry/utils/collect.py | 281 +++++++++++++++++++++++++++++++++++++ gantry/utils/db.py | 40 ++++++ gantry/utils/gitlab.py | 25 ++++ gantry/utils/misc.py | 26 ++++ gantry/utils/prometheus.py | 75 ++++++++++ pyproject.toml | 4 + spack.yaml | 13 ++ 13 files changed, 559 insertions(+) create mode 100644 .envrc create mode 100644 .flake8 create mode 100644 .gitignore create mode 100644 db/schema.sql create mode 100644 gantry/tests/test_utils.py create mode 100644 gantry/utils/__init__.py create mode 100644 gantry/utils/collect.py create mode 100644 gantry/utils/db.py create mode 100644 gantry/utils/gitlab.py create mode 100644 gantry/utils/misc.py create mode 100644 gantry/utils/prometheus.py create mode 100644 pyproject.toml create mode 100644 spack.yaml diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..5283fcd --- /dev/null +++ b/.envrc @@ -0,0 +1,12 @@ +#------------------------------------------------------------------------ +# Load Development Spack Environment (If Spack is installed.) +# +# Run 'direnv allow' from within the cloned repository to automatically +# load the spack environment when you enter the directory. +#------------------------------------------------------------------------ +if type spack &>/dev/null; then + . $SPACK_ROOT/share/spack/setup-env.sh + spack env activate -d . 
+fi + +dotenv diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..f295e07 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203, E704 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..372e265 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +.env +spack.lock +.spack-env +db/*.db diff --git a/db/schema.sql b/db/schema.sql new file mode 100644 index 0000000..c6fcac0 --- /dev/null +++ b/db/schema.sql @@ -0,0 +1,55 @@ +CREATE TABLE vms ( + id INTEGER PRIMARY KEY, + start INTEGER NOT NULL, + -- VM end is the max of the build end times + hostname TEXT NOT NULL, + cores REAL NOT NULL, + mem REAL NOT NULL, + arch TEXT NOT NULL, + os TEXT NOT NULL, + instance_type TEXT NOT NULL +); + + +CREATE TABLE builds ( + -- TODO do we want an entry here for if the job has been retried? + id INTEGER PRIMARY KEY, + pod TEXT NOT NULL UNIQUE, + vm INTEGER NOT NULL, + start INTEGER NOT NULL, + end INTEGER NOT NULL, + job_id INTEGER NOT NULL, + job_status TEXT NOT NULL, + ref TEXT NOT NULL, + pkg_name TEXT NOT NULL, + pkg_version TEXT NOT NULL, + pkg_variants TEXT NOT NULL, -- can be stored as JSONB in the future? + compiler_name TEXT NOT NULL, + compiler_version TEXT NOT NULL, + arch TEXT NOT NULL, + stack TEXT NOT NULL, + build_jobs INTEGER NOT NULL, + cpu_request REAL NOT NULL, + cpu_limit REAL, -- this can be null + cpu_mean REAL NOT NULL, + cpu_median REAL NOT NULL, + cpu_max REAL NOT NULL, + cpu_min REAL NOT NULL, + cpu_stddev REAL NOT NULL, + mem_request REAL NOT NULL, + mem_limit REAL NOT NULL, + mem_mean REAL NOT NULL, + mem_median REAL NOT NULL, + mem_max REAL NOT NULL, + mem_min REAL NOT NULL, + mem_stddev REAL NOT NULL, + FOREIGN KEY (vm) + REFERENCES vms (id) + ON UPDATE CASCADE + ON DELETE CASCADE +); + +CREATE TABLE ghost_jobs ( + id INTEGER PRIMARY KEY, + job_id INTEGER NOT NULL +); diff --git a/gantry/tests/test_utils.py b/gantry/tests/test_utils.py new file mode 100644 index 0000000..3e83088 --- /dev/null +++ b/gantry/tests/test_utils.py @@ -0,0 +1,20 @@ +import pytest + +from gantry.utils.misc import spec_variants + +# write tests for spec_variants here +# +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on has to equal {} + + +@pytest.fixture +def variant_string(): + return "+adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on" + + +def test_spec_variants(variant_string): + assert spec_variants(variant_string) == { + "adios2": True, + "advanced_debug": False, + "patches": ["02253c7", "acb3805", "b724e6a"], + "use_vtkm": "on", + } diff --git a/gantry/utils/__init__.py b/gantry/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py new file mode 100644 index 0000000..b19dd4d --- /dev/null +++ b/gantry/utils/collect.py @@ -0,0 +1,281 @@ +import json +import logging +import math +import re +import statistics +import sys +from datetime import datetime + +from utils.db import SqliteClient +from utils.gitlab import GitlabClient +from utils.misc import spec_variants +from utils.prometheus import PrometheusClient + + +async def fetch_job(job: dict) -> dict: + # TODO match gitlab webhook payload and process datetimes? 
+ if job["build_status"] not in ("success", "failed"): + return + + if job["build_status"] == "failed": + # TODO implement retry mechanism + return + + job_name_pattern = re.compile(r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)") + job_name_match = job_name_pattern.match(job["build_name"]) + if not job_name_match: + # generate jobs, non build jobs, etc + return + + gitlab = GitlabClient() + prometheus = PrometheusClient() + db = SqliteClient() + + # check if job has already been inserted into the database + db.execute("select job_id from builds where job_id = ?", (job["build_id"],)) + if db.fetchone(): + logging.info(f"job {job['build_id']} already in database") + return + + job_log = await gitlab.job_log(job["build_id"]) + if is_ghost(job_log): + db.insert("ghost_jobs", (None, job["build_id"])) + return + + job["start"] = datetime.fromisoformat(job["build_started_at"]).timestamp() + job["end"] = datetime.fromisoformat(job["build_finished_at"]).timestamp() + + # prometheus is not guaranteed to have data at the exact start and end times + # instead of creating an arbitrary buffer, ask for data in the middle of the job + query_time = (job["end"] + job["start"]) / 2 + + pod_annotations_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": job["build_id"]}, + }, + time=query_time, + ) + + job.update( + { + "pod": pod_annotations_res[0]["labels"]["pod"], + # TODO int? is it guaranteed to be here? + "build_jobs": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_build_jobs" + ], + "arch": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_arch" + ], + "pkg_name": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_pkg_name" + ], + "pkg_version": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_pkg_version" + ], + "pkg_variants": spec_variants( + pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_variants" + ] + ), + "compiler_name": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_compiler_name" + ], + "compiler_version": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_compiler_version" + ], + "stack": job_name_match.group(6), + } + ) + + job_requests_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": job["pod"]}, + }, + time=query_time, + ) + + job_limits_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": job["pod"]}, + }, + time=query_time, + ) + + mem_usage = process_usage( + await prometheus.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": job["pod"]}, + }, + start=job["start"], + end=job["end"], + ), + job["build_id"], + ) + + cpu_usage = process_usage( + await prometheus.query( + type="range", + custom_query=f"rate(container_cpu_usage_seconds_total{{pod='{job['pod']}', container='build'}}[90s])", + start=job["start"], + end=job["end"], + ), + job["build_id"], + ) + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + vm = await fetch_vm(job_limits_res[0]["labels"]["node"], 
query_time) + requests = process_resources_res(job_requests_res) + limits = process_resources_res(job_limits_res) + + # TODO insert into db here + + return db.insert( + "builds", + ( + None, + job["pod"], + vm, + job["start"], + job["end"], + job["build_id"], + job["build_status"], + job["ref"], + job["pkg_name"], + job["pkg_version"], + # dict to string + json.dumps(job["pkg_variants"]), + job["compiler_name"], + job["compiler_version"], + job["arch"], + job["stack"], + job["build_jobs"], + requests["cpu"]["value"], + # currently not set as of 12-23 + limits.get("cpu", {}).get("value"), + cpu_usage["mean"], + cpu_usage["median"], + cpu_usage["max"], + cpu_usage["min"], + cpu_usage["stddev"], + requests["memory"]["value"], + limits["memory"]["value"], + mem_usage["mean"], + mem_usage["median"], + mem_usage["max"], + mem_usage["min"], + mem_usage["stddev"], + ), + ) + + +async def fetch_vm(hostname: str, query_time: float) -> dict: + prometheus = PrometheusClient() + db = SqliteClient() + vm_start_res = await prometheus.query( + type="single", + query={ + "metric": "kube_node_created", + "filters": {"node": hostname}, + }, + time=query_time, + ) + + vm_start = float(vm_start_res[0]["values"][1]) + + db.execute( + "select id from vms where hostname = ? and start = ?", (hostname, vm_start) + ) + vm_id = db.fetchone() + + if vm_id: + logging.info(f"vm {hostname} already in database with id {vm_id[0]}") + return vm_id[0] + + vm_capacity = process_resources_res( + await prometheus.query( + type="single", + query={ + "metric": "kube_node_status_capacity", + "filters": {"node": hostname}, + }, + time=query_time, + ) + ) + + vm_labels = await prometheus.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": hostname}, + }, + time=query_time, + ) + + return db.insert( + "vms", + ( + None, + vm_start, + hostname, + vm_capacity["cpu"]["value"], + vm_capacity["memory"]["value"], + vm_labels[0]["labels"]["label_kubernetes_io_arch"], + vm_labels[0]["labels"]["label_kubernetes_io_os"], + vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], + ), + ) + + +def is_ghost(log): + return "No need to rebuild" in log + + +def process_resources_res(res: dict) -> dict: + processed = {} + for item in res: + # duplicates are ignored by overwriting the previous entry + processed[item["labels"]["resource"]] = { + "unit": item["labels"]["unit"], + "value": float(item["values"][1]), + } + + return processed + + +def process_usage(res: dict, job_id: int) -> dict: + if not res: + # sometimes prometheus reports no data for a job if the time range is too small + logging.error(f"lack of usage data for job {job_id}") + sys.exit() + + usage = [float(value) for timestamp, value in res[0]["values"]] + + sum_stats = { + "mean": statistics.fmean(usage), + # use pstdev because we have the whole population + "stddev": statistics.pstdev(usage), + "max": max(usage), + "min": min(usage), + "median": statistics.median(usage), + } + + if ( + sum_stats["stddev"] == 0 + or sum_stats["mean"] == 0 + or math.isnan(sum_stats["stddev"]) + ): + logging.error(f"usage data is invalid for job {job_id}") + sys.exit() + + return sum_stats diff --git a/gantry/utils/db.py b/gantry/utils/db.py new file mode 100644 index 0000000..5ea9de5 --- /dev/null +++ b/gantry/utils/db.py @@ -0,0 +1,40 @@ +import os +import sqlite3 + + +class SqliteClient: + def __init__(self): + self.conn = sqlite3.connect(os.environ["DB_FILE"]) + self.cursor = self.conn.cursor() + self.execute("PRAGMA foreign_keys = ON;") + + def 
execute(self, query, params=None): + if params: + self.cursor.execute(query, params) + else: + self.cursor.execute(query) + + def insert(self, table, values): + self.execute( + f"insert into {table} values ({','.join(['?'] * len(values))})", values + ) + self.commit() + return self.cursor.lastrowid + + def fetchall(self): + return self.cursor.fetchall() + + def fetchone(self): + return self.cursor.fetchone() + + def commit(self): + self.conn.commit() + + def close(self): + self.conn.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() diff --git a/gantry/utils/gitlab.py b/gantry/utils/gitlab.py new file mode 100644 index 0000000..4ad652d --- /dev/null +++ b/gantry/utils/gitlab.py @@ -0,0 +1,25 @@ +import logging +import os + +import aiohttp + + +class GitlabClient: + def __init__(self): + self.base_url = os.environ["GITLAB_URL"] + self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]} + + async def request(self, url: str, response_type: str) -> dict: + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=self.headers) as resp: + if resp.status != 200: + logging.error(f"Gitlab query failed with status {resp.status}") + return {} + if response_type == "json": + return await resp.json() + if response_type == "text": + return await resp.text() + + async def job_log(self, id: int) -> str: + url = f"{self.base_url}/jobs/{id}/trace" + return await self.request(url, "text") diff --git a/gantry/utils/misc.py b/gantry/utils/misc.py new file mode 100644 index 0000000..31c67ff --- /dev/null +++ b/gantry/utils/misc.py @@ -0,0 +1,26 @@ +def spec_variants(spec: str) -> dict: + """Given a spec's concrete variants, return a dict of variant name: value.""" + # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on + + # TODO handle errors and invalid inputs + + variants = {} + spec = spec.replace("+", " +") + spec = spec.replace("~", " ~") + parts = spec.split(" ") + + for part in parts: + if "=" in part: + name, value = part.split("=") + # multiple values + if "," in value: + variants[name] = value.split(",") + else: + variants[name] = value + else: + if part.startswith("+"): + variants[part[1:]] = True + elif part.startswith("~"): + variants[part[1:]] = False + + return variants diff --git a/gantry/utils/prometheus.py b/gantry/utils/prometheus.py new file mode 100644 index 0000000..ca1eac1 --- /dev/null +++ b/gantry/utils/prometheus.py @@ -0,0 +1,75 @@ +import logging +import math +import os +import urllib.parse + +import aiohttp + + +class PrometheusClient: + # TODO error handling for unexpected data + # todo retry mechanism for failed requests? 
+ + def __init__(self): + self.base_url = os.environ["PROMETHEUS_URL"] + self.cookies = {"_oauth2_proxy": os.environ["PROMETHEUS_COOKIE"]} + + async def query(self, type: str, **kwargs) -> dict: + # TODO add validation for kwargs and comments + query_str = ( + kwargs["custom_query"] + if kwargs.get("custom_query") + else query_to_str(**kwargs["query"]) + ) + + if type == "range": + # prometheus will only return this many frames + max_resolution = 10_000 + # calculating the max step size to get the desired resolution + step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) + url = f"{self.base_url}/query_range?query={query_str}&start={kwargs['start']}&end={kwargs['end']}&step={step}s" + return await self._query(url) + elif type == "single": + url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" + return await self._query(url) + + async def _query(self, url: str) -> dict: + """Query Prometheus with a query string""" + async with aiohttp.ClientSession() as session: + # submit cookie with request + async with session.get(url, cookies=self.cookies) as resp: + if resp.status != 200: + logging.error(f"Prometheus query failed with status {resp.status}") + return {} + try: + return self.process_response(await resp.json()) + except aiohttp.ContentTypeError: + logging.error( + """Prometheus query failed with unexpected response. + The cookie may have expired.""" + ) + return {} + + def process_response(self, response: dict) -> dict: + """Process Prometheus response into a more usable format""" + result_type = response.get("data", {}).get("resultType") + values_dict = { + "matrix": "values", + "vector": "value", + } + + if result_type not in values_dict: + logging.error(f"Prometheus response type {result_type} not supported") + return {} + + return [ + {"labels": result["metric"], "values": result[values_dict[result_type]]} + for result in response["data"]["result"] + ] + + +def query_to_str(metric: str, filters: dict) -> str: + # TODO add a test for this + # expected output: metric{key1="val1", key2="val2"} + filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) + return urllib.parse.quote(f"{metric}{{{filters_str}}}") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4620ee1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.isort] +profile = "black" +skip_gitignore = true +color_output = true diff --git a/spack.yaml b/spack.yaml new file mode 100644 index 0000000..7249289 --- /dev/null +++ b/spack.yaml @@ -0,0 +1,13 @@ +spack: + specs: + - python + - py-aiohttp + - py-pytest + - py-pytest-asyncio + - py-flake8 + - py-black + - py-isort + - sqlite + view: true + concretizer: + unify: true From c8a39d8cb125ce41f50b09cb522b66c513a064ab Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 25 Dec 2023 15:05:52 -0800 Subject: [PATCH 03/27] Revert "Add basic GitHub Actions CI" This reverts commit 98586b33090cd6c6fc37ca94e6825bb41645bd7a. 
--- .github/workflows/ci.yml | 50 ------------------- .github/workflows/requirements/style.txt | 2 - .github/workflows/requirements/unit-tests.txt | 2 - .github/workflows/style.yml | 27 ---------- .github/workflows/unit-tests.yml | 25 ---------- gantry/__init__.py | 0 gantry/__main__.py | 6 --- 7 files changed, 112 deletions(-) delete mode 100644 .github/workflows/ci.yml delete mode 100644 .github/workflows/requirements/style.txt delete mode 100644 .github/workflows/requirements/unit-tests.txt delete mode 100644 .github/workflows/style.yml delete mode 100644 .github/workflows/unit-tests.yml delete mode 100644 gantry/__init__.py delete mode 100644 gantry/__main__.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 3eda4da..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: ci -on: - push: - branches: - - main - pull_request: - branches: - - main - -concurrency: - group: ci-${{github.ref}}-${{github.event.pull_request.number || github.run_number}} - cancel-in-progress: true - -jobs: - changes: - runs-on: ubuntu-latest - permissions: - pull-requests: read - outputs: - style: ${{ steps.filter.outputs.style }} - unit-tests: ${{ steps.filter.outputs.unit-tests }} - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # @v2 - if: ${{ github.event_name == 'push' }} - with: - fetch-depth: 0 - - # For pull requests it's not necessary to checkout the code - - uses: dorny/paths-filter@4512585405083f25c027a35db413c2b3b9006d50 - id: filter - with: - filters: | - style: - - '.github/**' - - 'gantry/**' - - 'pyproject.toml' - unit-tests: - - '.github/**' - - 'gantry/**' - - 'pyproject.toml' - - style: - if: ${{ needs.changes.outputs.style == 'true' }} - needs: changes - uses: ./.github/workflows/style.yml - - unit-tests: - if: ${{ needs.changes.outputs.unit-tests == 'true' }} - needs: [changes, style] - uses: ./.github/workflows/unit-tests.yml diff --git a/.github/workflows/requirements/style.txt b/.github/workflows/requirements/style.txt deleted file mode 100644 index dd22bb4..0000000 --- a/.github/workflows/requirements/style.txt +++ /dev/null @@ -1,2 +0,0 @@ -black==23.12.0 -flake8==6.1.0 diff --git a/.github/workflows/requirements/unit-tests.txt b/.github/workflows/requirements/unit-tests.txt deleted file mode 100644 index 1393afd..0000000 --- a/.github/workflows/requirements/unit-tests.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytest==7.4.3 -pytest-asyncio==0.23.2 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml deleted file mode 100644 index d5f7155..0000000 --- a/.github/workflows/style.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Linting & Style Checks -on: - # This Workflow can be triggered manually - workflow_dispatch: - workflow_call: - -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 - - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c - with: - python-version: '3.11' - cache: 'pip' - cache-dependency-path: '.github/workflows/requirements/style.txt' - - - name: Install Python dependencies - run: | - pip install -r .github/workflows/requirements/style.txt - - - name: Lint and Format Check with Flake8 and Black - run: | - black --diff --check . 
- flake8 hubcast/ diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml deleted file mode 100644 index 551403a..0000000 --- a/.github/workflows/unit-tests.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: Unit Tests -on: - # This Workflow can be triggered manually - workflow_dispatch: - workflow_call: - -jobs: - ubuntu: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.8', '3.11'] - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c - with: - python-version: ${{ matrix.python-version }} - cache: 'pip' - cache-dependency-path: '.github/workflows/requirements/unit-tests.txt' - - name: Install Python dependencies - run: | - pip install -r .github/workflows/requirements/unit-tests.txt - - name: Run Unit Tests with Pytest - run: | - pytest diff --git a/gantry/__init__.py b/gantry/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gantry/__main__.py b/gantry/__main__.py deleted file mode 100644 index 491f8ff..0000000 --- a/gantry/__main__.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello World") - - -if __name__ == "__main__": - main() From a8dec44c9b0f8d061f9a2cdd7a53ce393cc67afb Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 10 Jan 2024 23:11:56 -0800 Subject: [PATCH 04/27] line breaks --- gantry/utils/collect.py | 6 +++++- gantry/utils/prometheus.py | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py index b19dd4d..6704b01 100644 --- a/gantry/utils/collect.py +++ b/gantry/utils/collect.py @@ -123,7 +123,10 @@ async def fetch_job(job: dict) -> dict: cpu_usage = process_usage( await prometheus.query( type="range", - custom_query=f"rate(container_cpu_usage_seconds_total{{pod='{job['pod']}', container='build'}}[90s])", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{job['pod']}', container='build'}}[90s])" + ), start=job["start"], end=job["end"], ), @@ -257,6 +260,7 @@ def process_usage(res: dict, job_id: int) -> dict: if not res: # sometimes prometheus reports no data for a job if the time range is too small logging.error(f"lack of usage data for job {job_id}") + # TODO throw exception sys.exit() usage = [float(value) for timestamp, value in res[0]["values"]] diff --git a/gantry/utils/prometheus.py b/gantry/utils/prometheus.py index ca1eac1..94ecab8 100644 --- a/gantry/utils/prometheus.py +++ b/gantry/utils/prometheus.py @@ -27,7 +27,13 @@ async def query(self, type: str, **kwargs) -> dict: max_resolution = 10_000 # calculating the max step size to get the desired resolution step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) - url = f"{self.base_url}/query_range?query={query_str}&start={kwargs['start']}&end={kwargs['end']}&step={step}s" + url = ( + f"{self.base_url}/query_range?" 
+ f"query={query_str}&" + f"start={kwargs['start']}&" + f"end={kwargs['end']}&" + f"step={step}s" + ) return await self._query(url) elif type == "single": url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" From 1f3de5ccfb321fdb3b8553823d107d88de7bedaa Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 15 Jan 2024 22:49:15 -0800 Subject: [PATCH 05/27] improvements to collection --- db/schema.sql | 9 +- gantry/utils/collect.py | 219 ++++++++++++++++++++----------------- gantry/utils/db.py | 40 ------- gantry/utils/gitlab.py | 6 +- gantry/utils/misc.py | 17 ++- gantry/utils/prometheus.py | 47 +++++--- 6 files changed, 169 insertions(+), 169 deletions(-) delete mode 100644 gantry/utils/db.py diff --git a/db/schema.sql b/db/schema.sql index c6fcac0..316b132 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,7 +1,6 @@ CREATE TABLE vms ( id INTEGER PRIMARY KEY, - start INTEGER NOT NULL, - -- VM end is the max of the build end times + uuid TEXT NOT NULL, hostname TEXT NOT NULL, cores REAL NOT NULL, mem REAL NOT NULL, @@ -12,7 +11,6 @@ CREATE TABLE vms ( CREATE TABLE builds ( - -- TODO do we want an entry here for if the job has been retried? id INTEGER PRIMARY KEY, pod TEXT NOT NULL UNIQUE, vm INTEGER NOT NULL, @@ -20,17 +18,18 @@ CREATE TABLE builds ( end INTEGER NOT NULL, job_id INTEGER NOT NULL, job_status TEXT NOT NULL, + num_retries INTEGER NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, pkg_version TEXT NOT NULL, - pkg_variants TEXT NOT NULL, -- can be stored as JSONB in the future? + pkg_variants TEXT NOT NULL, compiler_name TEXT NOT NULL, compiler_version TEXT NOT NULL, arch TEXT NOT NULL, stack TEXT NOT NULL, build_jobs INTEGER NOT NULL, cpu_request REAL NOT NULL, - cpu_limit REAL, -- this can be null + cpu_limit REAL, -- this can be null becasue it's currently not set cpu_mean REAL NOT NULL, cpu_median REAL NOT NULL, cpu_max REAL NOT NULL, diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py index 6704b01..702ed34 100644 --- a/gantry/utils/collect.py +++ b/gantry/utils/collect.py @@ -3,22 +3,22 @@ import math import re import statistics -import sys from datetime import datetime -from utils.db import SqliteClient from utils.gitlab import GitlabClient -from utils.misc import spec_variants +from utils.misc import db_insert, spec_variants from utils.prometheus import PrometheusClient -async def fetch_job(job: dict) -> dict: - # TODO match gitlab webhook payload and process datetimes? 
- if job["build_status"] not in ("success", "failed"): - return +class InvalidDataError(Exception): + pass - if job["build_status"] == "failed": - # TODO implement retry mechanism + +async def fetch_job(job: dict, db) -> dict: + gitlab = GitlabClient() + prometheus = PrometheusClient() + + if job["build_status"] not in ("success", "failed"): return job_name_pattern = re.compile(r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)") @@ -27,19 +27,19 @@ async def fetch_job(job: dict) -> dict: # generate jobs, non build jobs, etc return - gitlab = GitlabClient() - prometheus = PrometheusClient() - db = SqliteClient() - # check if job has already been inserted into the database - db.execute("select job_id from builds where job_id = ?", (job["build_id"],)) - if db.fetchone(): - logging.info(f"job {job['build_id']} already in database") - return + async with db.execute( + "select job_id from builds where job_id = ?", (job["build_id"],) + ) as cursor: + if await cursor.fetchone(): + logging.info(f"job {job['build_id']} already in database") + return job_log = await gitlab.job_log(job["build_id"]) if is_ghost(job_log): - db.insert("ghost_jobs", (None, job["build_id"])) + await db.execute( + ("insert into ghost_jobs (name) values (?)"), (job["build_id"],) + ) return job["start"] = datetime.fromisoformat(job["build_started_at"]).timestamp() @@ -61,10 +61,11 @@ async def fetch_job(job: dict) -> dict: job.update( { "pod": pod_annotations_res[0]["labels"]["pod"], - # TODO int? is it guaranteed to be here? - "build_jobs": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_build_jobs" - ], + "build_jobs": int( + pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_build_jobs" + ] + ), "arch": pod_annotations_res[0]["labels"][ "annotation_metrics_spack_job_spec_arch" ], @@ -133,88 +134,98 @@ async def fetch_job(job: dict) -> dict: job["build_id"], ) + if job["build_status"] == "failed": + oom_status = prometheus.query( + type="range", + query={ + "metric": "kube_pod_container_status_last_terminated_reason", + "filters": { + "container": "build", + "pod": job["pod"], + "reason": "OOMKilled", + }, + }, + start=job["start"], + end=job["end"] + 10 * 60, # give a 10 minute buffer + ) + # TODO retry the job if OOM, do not return as we still want to save the build + if not oom_status: + return + # instead of needing to fetch the node where the pod ran from kube_pod_info # we can grab it from kube_pod_container_resource_limits # weirdly, it's not available in kube_pod_labels or annotations # https://github.com/kubernetes/kube-state-metrics/issues/1148 - vm = await fetch_vm(job_limits_res[0]["labels"]["node"], query_time) + vm = await fetch_vm(job_limits_res[0]["labels"]["node"], query_time, db) requests = process_resources_res(job_requests_res) limits = process_resources_res(job_limits_res) - # TODO insert into db here - - return db.insert( - "builds", - ( - None, - job["pod"], - vm, - job["start"], - job["end"], - job["build_id"], - job["build_status"], - job["ref"], - job["pkg_name"], - job["pkg_version"], - # dict to string - json.dumps(job["pkg_variants"]), - job["compiler_name"], - job["compiler_version"], - job["arch"], - job["stack"], - job["build_jobs"], - requests["cpu"]["value"], - # currently not set as of 12-23 - limits.get("cpu", {}).get("value"), - cpu_usage["mean"], - cpu_usage["median"], - cpu_usage["max"], - cpu_usage["min"], - cpu_usage["stddev"], - requests["memory"]["value"], - limits["memory"]["value"], - mem_usage["mean"], - mem_usage["median"], - mem_usage["max"], 
- mem_usage["min"], - mem_usage["stddev"], - ), + await db.execute( + *db_insert( + "builds", + ( + None, + job["pod"], + vm, + job["start"], + job["end"], + job["build_id"], + job["build_status"], + job["retries_count"], + job["ref"], + job["pkg_name"], + job["pkg_version"], + json.dumps(job["pkg_variants"]), # dict to string + job["compiler_name"], + job["compiler_version"], + job["arch"], + job["stack"], + job["build_jobs"], + requests["cpu"]["value"], + # currently not set as of 12-23 + limits.get("cpu", {}).get("value"), + cpu_usage["mean"], + cpu_usage["median"], + cpu_usage["max"], + cpu_usage["min"], + cpu_usage["stddev"], + requests["memory"]["value"], + limits["memory"]["value"], + mem_usage["mean"], + mem_usage["median"], + mem_usage["max"], + mem_usage["min"], + mem_usage["stddev"], + ), + ) ) + # vm and build will get saved at the same time to make sure + # we don't accidentally commit a vm without a build + await db.commit() + + return + -async def fetch_vm(hostname: str, query_time: float) -> dict: +async def fetch_vm(hostname: str, query_time: float, db) -> dict: prometheus = PrometheusClient() - db = SqliteClient() - vm_start_res = await prometheus.query( + vm_info = await prometheus.query( type="single", query={ - "metric": "kube_node_created", + "metric": "kube_node_info", "filters": {"node": hostname}, }, time=query_time, ) - vm_start = float(vm_start_res[0]["values"][1]) - - db.execute( - "select id from vms where hostname = ? and start = ?", (hostname, vm_start) - ) - vm_id = db.fetchone() + vm_uuid = vm_info[0]["labels"]["system_uuid"] - if vm_id: - logging.info(f"vm {hostname} already in database with id {vm_id[0]}") - return vm_id[0] + async with db.execute("select id from vms where uuid = ?", (vm_uuid,)) as cursor: + old_vm = await cursor.fetchone() - vm_capacity = process_resources_res( - await prometheus.query( - type="single", - query={ - "metric": "kube_node_status_capacity", - "filters": {"node": hostname}, - }, - time=query_time, - ) - ) + if old_vm: + logging.info(f"vm {hostname} already in database with id {old_vm[0]}") + return old_vm[0] vm_labels = await prometheus.query( type="single", @@ -225,19 +236,26 @@ async def fetch_vm(hostname: str, query_time: float) -> dict: time=query_time, ) - return db.insert( - "vms", - ( - None, - vm_start, - hostname, - vm_capacity["cpu"]["value"], - vm_capacity["memory"]["value"], - vm_labels[0]["labels"]["label_kubernetes_io_arch"], - vm_labels[0]["labels"]["label_kubernetes_io_os"], - vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], - ), - ) + async with db.execute( + *db_insert( + "vms", + ( + None, + vm_uuid, + hostname, + float(vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_cpu"]), + float( + vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_memory"] + ), + vm_labels[0]["labels"]["label_kubernetes_io_arch"], + vm_labels[0]["labels"]["label_kubernetes_io_os"], + vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], + ), + ) + ) as cursor: + vm_id = cursor.lastrowid + + return vm_id def is_ghost(log): @@ -260,14 +278,13 @@ def process_usage(res: dict, job_id: int) -> dict: if not res: # sometimes prometheus reports no data for a job if the time range is too small logging.error(f"lack of usage data for job {job_id}") - # TODO throw exception - sys.exit() + raise InvalidDataError usage = [float(value) for timestamp, value in res[0]["values"]] sum_stats = { "mean": statistics.fmean(usage), - # use pstdev because we have the whole population + # pstdev because we have the 
whole population "stddev": statistics.pstdev(usage), "max": max(usage), "min": min(usage), @@ -280,6 +297,6 @@ def process_usage(res: dict, job_id: int) -> dict: or math.isnan(sum_stats["stddev"]) ): logging.error(f"usage data is invalid for job {job_id}") - sys.exit() + raise InvalidDataError return sum_stats diff --git a/gantry/utils/db.py b/gantry/utils/db.py deleted file mode 100644 index 5ea9de5..0000000 --- a/gantry/utils/db.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import sqlite3 - - -class SqliteClient: - def __init__(self): - self.conn = sqlite3.connect(os.environ["DB_FILE"]) - self.cursor = self.conn.cursor() - self.execute("PRAGMA foreign_keys = ON;") - - def execute(self, query, params=None): - if params: - self.cursor.execute(query, params) - else: - self.cursor.execute(query) - - def insert(self, table, values): - self.execute( - f"insert into {table} values ({','.join(['?'] * len(values))})", values - ) - self.commit() - return self.cursor.lastrowid - - def fetchall(self): - return self.cursor.fetchall() - - def fetchone(self): - return self.cursor.fetchone() - - def commit(self): - self.conn.commit() - - def close(self): - self.conn.close() - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() diff --git a/gantry/utils/gitlab.py b/gantry/utils/gitlab.py index 4ad652d..96dcf0d 100644 --- a/gantry/utils/gitlab.py +++ b/gantry/utils/gitlab.py @@ -1,4 +1,3 @@ -import logging import os import aiohttp @@ -10,11 +9,8 @@ def __init__(self): self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]} async def request(self, url: str, response_type: str) -> dict: - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(raise_for_status=True) as session: async with session.get(url, headers=self.headers) as resp: - if resp.status != 200: - logging.error(f"Gitlab query failed with status {resp.status}") - return {} if response_type == "json": return await resp.json() if response_type == "text": diff --git a/gantry/utils/misc.py b/gantry/utils/misc.py index 31c67ff..0376c67 100644 --- a/gantry/utils/misc.py +++ b/gantry/utils/misc.py @@ -1,15 +1,15 @@ def spec_variants(spec: str) -> dict: - """Given a spec's concrete variants, return a dict of variant name: value.""" + """Given a spec's concrete variants, return a dict of name: value.""" # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on - # TODO handle errors and invalid inputs - variants = {} spec = spec.replace("+", " +") spec = spec.replace("~", " ~") parts = spec.split(" ") for part in parts: + if len(part) < 2: + continue if "=" in part: name, value = part.split("=") # multiple values @@ -24,3 +24,14 @@ def spec_variants(spec: str) -> dict: variants[part[1:]] = False return variants + + +def db_insert(table, values): + """ + Returns an INSERT statement given a table name and tuple of values. + Must provide values for all columns in the table, including the primary key. + """ + return ( + f"insert into {table} values ({','.join(['?'] * (len(values)) )})", + values, + ) diff --git a/gantry/utils/prometheus.py b/gantry/utils/prometheus.py index 94ecab8..b1a6cb2 100644 --- a/gantry/utils/prometheus.py +++ b/gantry/utils/prometheus.py @@ -7,16 +7,34 @@ class PrometheusClient: - # TODO error handling for unexpected data - # todo retry mechanism for failed requests? 
- def __init__(self): self.base_url = os.environ["PROMETHEUS_URL"] self.cookies = {"_oauth2_proxy": os.environ["PROMETHEUS_COOKIE"]} async def query(self, type: str, **kwargs) -> dict: - # TODO add validation for kwargs and comments - query_str = ( + """ + type: "range" or "single" + + for range queries: set `start` and `end` (unix timestamps) + for single queries: set `time` (unix timestamp) + + for custom queries: set `custom_query` (string) + + for metric queries: set `query` (dict) + example: + "query": { + "metric": "metric_name", + "filters": {"filter1": "value1", "filter2": "value2"} + } + """ + + # validate that one of query or custom_query is set, but not both or neither + if not kwargs.get("query") and not kwargs.get("custom_query"): + raise ValueError("query or custom_query must be set") + if kwargs.get("query") and kwargs.get("custom_query"): + raise ValueError("query and custom_query cannot both be set") + + query_str = urllib.parse.quote( kwargs["custom_query"] if kwargs.get("custom_query") else query_to_str(**kwargs["query"]) @@ -41,14 +59,11 @@ async def query(self, type: str, **kwargs) -> dict: async def _query(self, url: str) -> dict: """Query Prometheus with a query string""" - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(raise_for_status=True) as session: # submit cookie with request async with session.get(url, cookies=self.cookies) as resp: - if resp.status != 200: - logging.error(f"Prometheus query failed with status {resp.status}") - return {} try: - return self.process_response(await resp.json()) + return self.prettify_res(await resp.json()) except aiohttp.ContentTypeError: logging.error( """Prometheus query failed with unexpected response. @@ -56,8 +71,8 @@ async def _query(self, url: str) -> dict: ) return {} - def process_response(self, response: dict) -> dict: - """Process Prometheus response into a more usable format""" + def prettify_res(self, response: dict) -> dict: + """Process Prometheus response into an arrray of dicts with {label: value}""" result_type = response.get("data", {}).get("resultType") values_dict = { "matrix": "values", @@ -75,7 +90,9 @@ def process_response(self, response: dict) -> dict: def query_to_str(metric: str, filters: dict) -> str: - # TODO add a test for this - # expected output: metric{key1="val1", key2="val2"} + """ + In: "metric", {key1: value1, key2: value2} + Out: "metric{key1="value1", key2="value2"}" + """ filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) - return urllib.parse.quote(f"{metric}{{{filters_str}}}") + return f"{metric}{{{filters_str}}}" From 130a1f103e8c9fe11a617f4abb5e9163a94519d4 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 15 Jan 2024 22:49:43 -0800 Subject: [PATCH 06/27] aiohttp server basics --- gantry/main.py | 19 +++++++++++++++++++ gantry/views.py | 16 ++++++++++++++++ spack.yaml | 1 + 3 files changed, 36 insertions(+) create mode 100644 gantry/main.py create mode 100644 gantry/views.py diff --git a/gantry/main.py b/gantry/main.py new file mode 100644 index 0000000..496140a --- /dev/null +++ b/gantry/main.py @@ -0,0 +1,19 @@ +import os + +import aiosqlite +from aiohttp import web +from views import routes + + +async def init_db(app: web.Application): + db = await aiosqlite.connect(os.environ["DB_FILE"]) + await db.execute("PRAGMA foreign_keys = ON;") + app["db"] = db + yield + await db.close() + + +app = web.Application() +app.add_routes(routes) +app.cleanup_ctx.append(init_db) +web.run_app(app) diff --git a/gantry/views.py 
b/gantry/views.py new file mode 100644 index 0000000..0820b23 --- /dev/null +++ b/gantry/views.py @@ -0,0 +1,16 @@ +from aiohttp import web +from utils.collect import fetch_job + +routes = web.RouteTableDef() + + +@routes.post("/collect") +async def collect_job(request: web.Request) -> web.Response: + payload = await request.json() + + # TODO validate gitlab token + if request.headers.get("X-Gitlab-Event") != "Job Hook": + return web.Response(status=400, text="invalid event type") + + await fetch_job(payload, request.app["db"]) + return web.Response(status=200) diff --git a/spack.yaml b/spack.yaml index 7249289..44863c0 100644 --- a/spack.yaml +++ b/spack.yaml @@ -7,6 +7,7 @@ spack: - py-flake8 - py-black - py-isort + - py-aiosqlite - sqlite view: true concretizer: From 45d8ef153837da9dfd34b156ba511e3e231326d2 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 18 Jan 2024 00:22:07 -0800 Subject: [PATCH 07/27] refactoring of collection --- db/schema.sql | 6 +- gantry/__main__.py | 21 +- gantry/collection.py | 91 ++++++++ gantry/main.py | 19 -- gantry/models/__init__.py | 3 + gantry/models/build.py | 255 ++++++++++++++++++++++ gantry/models/vm.py | 107 ++++++++++ gantry/tests/test_utils.py | 2 +- gantry/{utils => util}/__init__.py | 0 gantry/util/gitlab.py | 33 +++ gantry/util/misc.py | 59 ++++++ gantry/{utils => util}/prometheus.py | 68 ++++++ gantry/utils/collect.py | 302 --------------------------- gantry/utils/gitlab.py | 21 -- gantry/utils/misc.py | 37 ---- gantry/views.py | 17 +- 16 files changed, 653 insertions(+), 388 deletions(-) create mode 100644 gantry/collection.py delete mode 100644 gantry/main.py create mode 100644 gantry/models/__init__.py create mode 100644 gantry/models/build.py create mode 100644 gantry/models/vm.py rename gantry/{utils => util}/__init__.py (100%) create mode 100644 gantry/util/gitlab.py create mode 100644 gantry/util/misc.py rename gantry/{utils => util}/prometheus.py (66%) delete mode 100644 gantry/utils/collect.py delete mode 100644 gantry/utils/gitlab.py delete mode 100644 gantry/utils/misc.py diff --git a/db/schema.sql b/db/schema.sql index 316b132..6b1c24f 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,6 +1,6 @@ CREATE TABLE vms ( id INTEGER PRIMARY KEY, - uuid TEXT NOT NULL, + uuid TEXT NOT NULL UNIQUE, hostname TEXT NOT NULL, cores REAL NOT NULL, mem REAL NOT NULL, @@ -16,9 +16,9 @@ CREATE TABLE builds ( vm INTEGER NOT NULL, start INTEGER NOT NULL, end INTEGER NOT NULL, - job_id INTEGER NOT NULL, + job_id INTEGER NOT NULL UNIQUE, job_status TEXT NOT NULL, - num_retries INTEGER NOT NULL, + retries INTEGER NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, pkg_version TEXT NOT NULL, diff --git a/gantry/__main__.py b/gantry/__main__.py index 491f8ff..64f408e 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -1,5 +1,24 @@ +import os + +import aiosqlite +from aiohttp import web + +from gantry.views import routes + + +async def init_db(app: web.Application): + db = await aiosqlite.connect(os.environ["DB_FILE"]) + await db.execute("PRAGMA foreign_keys = ON;") + app["db"] = db + yield + await db.close() + + def main(): - print("Hello World") + app = web.Application() + app.add_routes(routes) + app.cleanup_ctx.append(init_db) + web.run_app(app) if __name__ == "__main__": diff --git a/gantry/collection.py b/gantry/collection.py new file mode 100644 index 0000000..9aa8121 --- /dev/null +++ b/gantry/collection.py @@ -0,0 +1,91 @@ +import logging + +import aiosqlite + +from gantry.models import VM, Build +from gantry.util.gitlab import 
GitlabClient +from gantry.util.prometheus import IncompleteData, PrometheusClient + + +async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None: + """ + Fetches a job's information from Prometheus and inserts it into the database. + If there is data missing at any point, the function will still return so the webhook + responds as expected. If an exception is thrown, that behavior was unanticipated by + this program and should be investigated. + + args: + payload: a dictionary containing the information from the Gitlab job hook + db: an active aiosqlite connection + + returns: None in order to accomodate a 200 response for the webhook. + """ + + gitlab = GitlabClient() + prometheus = PrometheusClient() + + build = Build( + status=payload["build_status"], + name=payload["build_name"], + id=payload["build_id"], + start=payload["build_started_at"], + end=payload["build_finished_at"], + retries=payload["retries_count"], + ref=payload["ref"], + ) + + # perform checks to see if we should collect data for this job + if ( + build.status not in ("success",) + or not build.valid_name # is not a build job + or await build.in_db(db) # job already in the database + or await build.is_ghost(db, gitlab) + ): + return + + try: + await build.get_annotations(prometheus) + await build.get_resources(prometheus) + await build.get_usage(prometheus) + vm_id = await fetch_vm(db, prometheus, build.node, build.midpoint) + except IncompleteData as e: + # missing data, skip this job + logging.error(e) + return + + await build.insert(db, vm_id) + # vm and build will get saved at the same time to make sure + # we don't accidentally commit a vm without a build + await db.commit() + + return + + +async def fetch_vm( + db: aiosqlite.Connection, + prometheus: PrometheusClient, + hostname: dict, + query_time: float, +) -> int: + """ + Finds an existing VM in the database or inserts a new one. 
+ + args: + db: an active aiosqlite connection + prometheus: + hostname: the hostname of the VM + query_time: any point during VM runtime, usually grabbed from build + + returns: id of the inserted or existing VM + """ + vm = VM( + hostname=hostname, + query_time=query_time, + ) + + # do not proceed if the VM exists + if existing_vm := await vm.db_id(db, prometheus): + return existing_vm + + await vm.get_labels(prometheus) + return await vm.insert(db) diff --git a/gantry/main.py b/gantry/main.py deleted file mode 100644 index 496140a..0000000 --- a/gantry/main.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -import aiosqlite -from aiohttp import web -from views import routes - - -async def init_db(app: web.Application): - db = await aiosqlite.connect(os.environ["DB_FILE"]) - await db.execute("PRAGMA foreign_keys = ON;") - app["db"] = db - yield - await db.close() - - -app = web.Application() -app.add_routes(routes) -app.cleanup_ctx.append(init_db) -web.run_app(app) diff --git a/gantry/models/__init__.py b/gantry/models/__init__.py new file mode 100644 index 0000000..57e9b66 --- /dev/null +++ b/gantry/models/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .build import Build +from .vm import VM diff --git a/gantry/models/build.py b/gantry/models/build.py new file mode 100644 index 0000000..4289bb8 --- /dev/null +++ b/gantry/models/build.py @@ -0,0 +1,255 @@ +import json +import logging +import re +from datetime import datetime + +import aiosqlite + +from gantry.util.gitlab import GitlabClient +from gantry.util.misc import insert_dict, setattrs, spec_variants +from gantry.util.prometheus import ( + IncompleteData, + PrometheusClient, + process_resources, + process_usage, +) + + +class Build: + def __init__( + self, + status: str, + name: str, + id: int, + start: str, + end: str, + retries: int, + ref: str, + ): + self.status = status + self.name = name + self.id = id + self.start = datetime.fromisoformat(start).timestamp() + self.end = datetime.fromisoformat(end).timestamp() + self.retries = retries + self.ref = ref + + @property + def valid_name(self) -> bool: + """Returns True if the job is a build job, False otherwise.""" + + # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 + # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 + job_name_pattern = re.compile( + r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" + ) + job_name_match = job_name_pattern.match(self.name) + # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack + return bool(job_name_match) + + @property + def midpoint(self) -> float: + """Returns the midpoint of the job in unix time.""" + # prometheus is not guaranteed to have data at the exact start and end times + # instead of creating an arbitrary buffer, ask for data in the middle of the job + return (self.start + self.end) / 2 + + async def is_ghost(self, db: aiosqlite.Connection, gl: GitlabClient) -> bool: + """Returns the job's ghost status.""" + + # prevent duplicate jobs from being inserted into the database + async with db.execute( + "select job_id from ghost_jobs where job_id = ?", (self.id,) + ) as cursor: + if await cursor.fetchone(): + # ghost job is already in the database + return True + + log = await gl.job_log(self.id) + ghost = "No need to rebuild" in log + + if ghost: + await db.execute(("insert into ghost_jobs (name) values (?)"), (self.id,)) + + return ghost + + async def in_db(self, db: aiosqlite.Connection) -> bool: + """Checks if the job is already in the db.""" + + async with db.execute( + "select job_id from builds where job_id = 
?", (self.id,) + ) as cursor: + found = bool(await cursor.fetchone()) + + if found: + logging.warning(f"job {self.id} already in database") + + return found + + async def get_annotations(self, prometheus: PrometheusClient): + """Fetches the annotations and assigns multiple attributes.""" + + annotations_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": self.id}, + }, + time=self.midpoint, + ) + + if not annotations_res: + raise IncompleteData("missing annotations") + + annotations = annotations_res[0]["labels"] + + setattrs( + self, + pod=annotations["pod"], + # if build jobs is not set, defaults to 16 due to spack settings + build_jobs=annotations.get("annotation_metrics_spack_job_build_jobs", 16), + arch=annotations["annotation_metrics_spack_job_spec_arch"], + pkg_name=annotations["annotation_metrics_spack_job_spec_pkg_name"], + pkg_version=annotations["annotation_metrics_spack_job_spec_pkg_version"], + pkg_variants=spec_variants( + annotations["annotation_metrics_spack_job_spec_variants"] + ), + compiler_name=annotations[ + "annotation_metrics_spack_job_spec_compiler_name" + ], + compiler_version=annotations[ + "annotation_metrics_spack_job_spec_compiler_version" + ], + stack="testing" + # stack=job_name_dict["stack"], + ) + + async def get_resources(self, prometheus: PrometheusClient): + """fetches pod requests and limits, and also sets the node hostname""" + requests = process_resources( + await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": self.pod}, + }, + time=self.midpoint, + ), + self.id, + ) + + limits_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": self.pod}, + }, + time=self.midpoint, + ) + + if not limits_res: + raise IncompleteData(f"missing limits for job {self.id}") + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + + self.node = limits_res[0]["labels"]["node"] + limits = process_resources(limits_res, self.id) + + setattrs( + self, + cpu_request=requests["cpu"]["value"], + mem_request=requests["memory"]["value"], + cpu_limit=limits.get("cpu", {}).get("value"), + mem_limit=limits["memory"]["value"], + ) + + async def get_usage(self, prometheus: PrometheusClient): + """Sets resource usage attributes.""" + + mem_usage = process_usage( + await prometheus.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": self.pod}, + }, + start=self.start, + end=self.end, + ), + self.id, + ) + + cpu_usage = process_usage( + await prometheus.query( + type="range", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{self.pod}', container='build'}}[90s])" + ), + start=self.start, + end=self.end, + ), + self.id, + ) + + setattrs( + self, + cpu_mean=cpu_usage["mean"], + cpu_median=cpu_usage["median"], + cpu_max=cpu_usage["max"], + cpu_min=cpu_usage["min"], + cpu_stddev=cpu_usage["stddev"], + mem_mean=mem_usage["mean"], + mem_median=mem_usage["median"], + mem_max=mem_usage["max"], + mem_min=mem_usage["min"], + mem_stddev=mem_usage["stddev"], + ) + + async def insert(self, db: aiosqlite.Connection, vm_id: int) 
-> int: + """Inserts the build into the database and returns its id.""" + + async with db.execute( + *insert_dict( + "builds", + { + "pod": self.pod, + "vm": vm_id, + "start": self.start, + "end": self.end, + "job_id": self.id, + "job_status": self.status, + "retries": self.retries, + "ref": self.ref, + "pkg_name": self.pkg_name, + "pkg_version": self.pkg_version, + "pkg_variants": json.dumps(self.pkg_variants), # dict to string + "compiler_name": self.compiler_name, + "compiler_version": self.compiler_version, + "arch": self.arch, + "stack": self.stack, + "build_jobs": self.build_jobs, + "cpu_request": self.cpu_request, + "cpu_limit": self.cpu_limit, + "cpu_mean": self.cpu_mean, + "cpu_median": self.cpu_median, + "cpu_max": self.cpu_max, + "cpu_min": self.cpu_min, + "cpu_stddev": self.cpu_stddev, + "mem_request": self.mem_request, + "mem_limit": self.mem_limit, + "mem_mean": self.mem_mean, + "mem_median": self.mem_median, + "mem_max": self.mem_max, + "mem_min": self.mem_min, + "mem_stddev": self.mem_stddev, + }, + # if the job somehow gets added into the db (pod+id being unique) + # then ignore the insert + ignore=True, + ) + ) as cursor: + return cursor.lastrowid diff --git a/gantry/models/vm.py b/gantry/models/vm.py new file mode 100644 index 0000000..b763a91 --- /dev/null +++ b/gantry/models/vm.py @@ -0,0 +1,107 @@ +import aiosqlite + +from gantry.util.misc import insert_dict, setattrs +from gantry.util.prometheus import IncompleteData, PrometheusClient + +MB_IN_BYTES = 1_000_000 + + +class VM: + def __init__(self, hostname: str, query_time: float): + """ + args: + hostname: the hostname of the VM + query_time: any point during VM runtime, usually grabbed from build + """ + self.hostname = hostname + self.query_time = query_time + + async def db_id( + self, db: aiosqlite.Connection, prometheus: PrometheusClient + ) -> int | None: + """ + Returns the id of the vm if it exists in the database, otherwise returns None. + Also sets the uuid of the vm. 
+ """ + vm_info = await prometheus.query( + type="single", + query={ + "metric": "kube_node_info", + "filters": {"node": self.hostname}, + }, + time=self.query_time, + ) + + if not vm_info: + raise IncompleteData(f"missing vm info for {self.hostname}") + + self.uuid = vm_info[0]["labels"]["system_uuid"] + + # look for the vm in the database + async with db.execute( + "select id from vms where uuid = ?", (self.uuid,) + ) as cursor: + old_vm = await cursor.fetchone() + + if old_vm: + return old_vm[0] + + return None + + async def get_labels(self, prometheus: PrometheusClient): + """Sets multiple attributes of the VM based on its labels.""" + + vm_labels_res = await prometheus.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": self.hostname}, + }, + time=self.query_time, + ) + + if not vm_labels_res: + raise IncompleteData(f"missing vm labels for {self.hostname}") + + labels = vm_labels_res[0]["labels"] + + setattrs( + self, + cores=float(labels["label_karpenter_k8s_aws_instance_cpu"]), + mem=float(labels["label_karpenter_k8s_aws_instance_memory"]), + arch=labels["label_kubernetes_io_arch"], + os=labels["label_kubernetes_io_os"], + instance_type=labels["label_node_kubernetes_io_instance_type"], + ) + + async def insert(self, db: aiosqlite.Connection) -> int: + """Inserts the VM into the database and returns its id.""" + async with db.execute( + *insert_dict( + "vms", + { + "uuid": self.uuid, + "hostname": self.hostname, + "cores": self.cores, + # convert to bytes to be consistent with other resource metrics + "mem": self.mem * MB_IN_BYTES, + "arch": self.arch, + "os": self.os, + "instance_type": self.instance_type, + }, + # deal with races + ignore=True, + ) + ) as cursor: + pk = cursor.lastrowid + + if pk == 0: + # the ignore part of the query was triggered, some other call + # must have inserted the vm before this one + async with db.execute( + "select id from vms where uuid = ?", (self.uuid,) + ) as cursor: + pk_res = await cursor.fetchone() + pk = pk_res[0] + + return pk diff --git a/gantry/tests/test_utils.py b/gantry/tests/test_utils.py index 3e83088..010e08b 100644 --- a/gantry/tests/test_utils.py +++ b/gantry/tests/test_utils.py @@ -1,6 +1,6 @@ import pytest -from gantry.utils.misc import spec_variants +from gantry.util.misc import spec_variants # write tests for spec_variants here # +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on has to equal {} diff --git a/gantry/utils/__init__.py b/gantry/util/__init__.py similarity index 100% rename from gantry/utils/__init__.py rename to gantry/util/__init__.py diff --git a/gantry/util/gitlab.py b/gantry/util/gitlab.py new file mode 100644 index 0000000..6658377 --- /dev/null +++ b/gantry/util/gitlab.py @@ -0,0 +1,33 @@ +import os + +import aiohttp + + +class GitlabClient: + def __init__(self): + self.base_url = os.environ["GITLAB_URL"] + self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_API_TOKEN"]} + + async def _request(self, url: str, response_type: str) -> dict | str: + """ + Helper for requests to the Gitlab API. 
+ + args: + url: the url to request + response_type: the type of response to expect (json or text) + + returns: the response from Gitlab in the specified format + """ + + async with aiohttp.ClientSession(raise_for_status=True) as session: + async with session.get(url, headers=self.headers) as resp: + if response_type == "json": + return await resp.json() + if response_type == "text": + return await resp.text() + + async def job_log(self, job_id: int) -> str: + """Given a job id, returns the log from that job""" + + url = f"{self.base_url}/jobs/{job_id}/trace" + return await self._request(url, "text") diff --git a/gantry/util/misc.py b/gantry/util/misc.py new file mode 100644 index 0000000..2c6a69c --- /dev/null +++ b/gantry/util/misc.py @@ -0,0 +1,59 @@ +def spec_variants(spec: str) -> dict: + """Given a spec's concrete variants, return a dict in name: value format.""" + # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on + + variants = {} + # give some padding to + and ~ so we can split on them + spec = spec.replace("+", " +") + spec = spec.replace("~", " ~") + parts = spec.split(" ") + + for part in parts: + if len(part) < 2: + continue + if "=" in part: + name, value = part.split("=") + if "," in value: + # array of the multiple values + variants[name] = value.split(",") + else: + # string of the single value + variants[name] = value + else: + # anything after the first character is the value + if part.startswith("+"): + variants[part[1:]] = True + elif part.startswith("~"): + variants[part[1:]] = False + + return variants + + +def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: + """ + Crafts an SQLite INSERT statement from a dictionary. + + args: + table: name of the table to insert into + input: dictionary of values to insert + ignore: whether to ignore duplicate entries + + returns: tuple of (query, values) + """ + + columns = ", ".join(input.keys()) + values = ", ".join(["?" for _ in range(len(input))]) + query = f"INSERT INTO {table} ({columns}) VALUES ({values})" + + if ignore: + query = query.replace("INSERT", "INSERT OR IGNORE") + + # using a tuple of values from the dictionary + values_tuple = tuple(input.values()) + return query, values_tuple + + +def setattrs(_self, **kwargs): + """Sets multiple attributes of an object from a dictionary.""" + for k, v in kwargs.items(): + setattr(_self, k, v) diff --git a/gantry/utils/prometheus.py b/gantry/util/prometheus.py similarity index 66% rename from gantry/utils/prometheus.py rename to gantry/util/prometheus.py index b1a6cb2..921069a 100644 --- a/gantry/utils/prometheus.py +++ b/gantry/util/prometheus.py @@ -1,11 +1,16 @@ import logging import math import os +import statistics import urllib.parse import aiohttp +class IncompleteData(Exception): + pass + + class PrometheusClient: def __init__(self): self.base_url = os.environ["PROMETHEUS_URL"] @@ -96,3 +101,66 @@ def query_to_str(metric: str, filters: dict) -> str: """ filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) return f"{metric}{{{filters_str}}}" + + +def process_resources(res: dict, job_id: int) -> dict: + """ + Processes the resource limits and requests from a Prometheus response into + readable format. 
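The two pure helpers added in gantry/util/misc.py above are easy to sanity-check in isolation. A small sketch, with the spec string borrowed from the test comment and the table and column values invented for illustration:

from gantry.util.misc import insert_dict, spec_variants

variants = spec_variants("+adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on")
assert variants == {
    "adios2": True,
    "advanced_debug": False,
    "patches": ["02253c7", "acb3805", "b724e6a"],
    "use_vtkm": "on",
}

# insert_dict builds a parameterized statement straight from a column: value mapping
query, values = insert_dict("vms", {"uuid": "abc-123", "hostname": "node-1"}, ignore=True)
assert query == "INSERT OR IGNORE INTO vms (uuid, hostname) VALUES (?, ?)"
assert values == ("abc-123", "node-1")

Keeping these as plain functions rather than methods is what lets the test suite exercise them without a database or Prometheus connection.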
+ + args: + res: Prometheus response + job_id: job id for error logging + + returns: dict with {resource: {unit: value}} format + """ + + if not res: + raise IncompleteData(f"resource data is missing for job {job_id}") + + processed = {} + for item in res: + # duplicates are ignored by overwriting the previous entry + processed[item["labels"]["resource"]] = { + "unit": item["labels"]["unit"], + "value": float(item["values"][1]), + } + + return processed + + +def process_usage(res: dict, job_id: int) -> dict: + """ + Processes the usage data from a Prometheus response into readable format. + This could either be CPU usage or memory usage. + + args: + res: Prometheus response + job_id: job id for error logging + + returns: dict with {statistic: value} format + """ + + if not res: + # sometimes prometheus reports no data for a job if the time range is too small + raise IncompleteData(f"usage data is missing for job {job_id}") + + usage = [float(value) for timestamp, value in res[0]["values"]] + + sum_stats = { + "mean": statistics.fmean(usage), + # pstdev because we have the whole population + "stddev": statistics.pstdev(usage), + "max": max(usage), + "min": min(usage), + "median": statistics.median(usage), + } + + if ( + sum_stats["stddev"] == 0 + or sum_stats["mean"] == 0 + or math.isnan(sum_stats["stddev"]) + ): + raise IncompleteData(f"usage data is invalid for job {job_id}") + + return sum_stats diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py deleted file mode 100644 index 702ed34..0000000 --- a/gantry/utils/collect.py +++ /dev/null @@ -1,302 +0,0 @@ -import json -import logging -import math -import re -import statistics -from datetime import datetime - -from utils.gitlab import GitlabClient -from utils.misc import db_insert, spec_variants -from utils.prometheus import PrometheusClient - - -class InvalidDataError(Exception): - pass - - -async def fetch_job(job: dict, db) -> dict: - gitlab = GitlabClient() - prometheus = PrometheusClient() - - if job["build_status"] not in ("success", "failed"): - return - - job_name_pattern = re.compile(r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)") - job_name_match = job_name_pattern.match(job["build_name"]) - if not job_name_match: - # generate jobs, non build jobs, etc - return - - # check if job has already been inserted into the database - async with db.execute( - "select job_id from builds where job_id = ?", (job["build_id"],) - ) as cursor: - if await cursor.fetchone(): - logging.info(f"job {job['build_id']} already in database") - return - - job_log = await gitlab.job_log(job["build_id"]) - if is_ghost(job_log): - await db.execute( - ("insert into ghost_jobs (name) values (?)"), (job["build_id"],) - ) - return - - job["start"] = datetime.fromisoformat(job["build_started_at"]).timestamp() - job["end"] = datetime.fromisoformat(job["build_finished_at"]).timestamp() - - # prometheus is not guaranteed to have data at the exact start and end times - # instead of creating an arbitrary buffer, ask for data in the middle of the job - query_time = (job["end"] + job["start"]) / 2 - - pod_annotations_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": job["build_id"]}, - }, - time=query_time, - ) - - job.update( - { - "pod": pod_annotations_res[0]["labels"]["pod"], - "build_jobs": int( - pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_build_jobs" - ] - ), - "arch": pod_annotations_res[0]["labels"][ - 
"annotation_metrics_spack_job_spec_arch" - ], - "pkg_name": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_pkg_name" - ], - "pkg_version": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_pkg_version" - ], - "pkg_variants": spec_variants( - pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_variants" - ] - ), - "compiler_name": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_compiler_name" - ], - "compiler_version": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_compiler_version" - ], - "stack": job_name_match.group(6), - } - ) - - job_requests_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_requests", - "filters": {"container": "build", "pod": job["pod"]}, - }, - time=query_time, - ) - - job_limits_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_limits", - "filters": {"container": "build", "pod": job["pod"]}, - }, - time=query_time, - ) - - mem_usage = process_usage( - await prometheus.query( - type="range", - query={ - "metric": "container_memory_working_set_bytes", - "filters": {"container": "build", "pod": job["pod"]}, - }, - start=job["start"], - end=job["end"], - ), - job["build_id"], - ) - - cpu_usage = process_usage( - await prometheus.query( - type="range", - custom_query=( - f"rate(container_cpu_usage_seconds_total{{" - f"pod='{job['pod']}', container='build'}}[90s])" - ), - start=job["start"], - end=job["end"], - ), - job["build_id"], - ) - - if job["build_status"] == "failed": - oom_status = prometheus.query( - type="range", - query={ - "metric": "kube_pod_container_status_last_terminated_reason", - "filters": { - "container": "build", - "pod": job["pod"], - "reason": "OOMKilled", - }, - }, - start=job["start"], - end=job["end"] + 10 * 60, # give a 10 minute buffer - ) - # TODO retry the job if OOM, do not return as we still want to save the build - if not oom_status: - return - - # instead of needing to fetch the node where the pod ran from kube_pod_info - # we can grab it from kube_pod_container_resource_limits - # weirdly, it's not available in kube_pod_labels or annotations - # https://github.com/kubernetes/kube-state-metrics/issues/1148 - vm = await fetch_vm(job_limits_res[0]["labels"]["node"], query_time, db) - requests = process_resources_res(job_requests_res) - limits = process_resources_res(job_limits_res) - - await db.execute( - *db_insert( - "builds", - ( - None, - job["pod"], - vm, - job["start"], - job["end"], - job["build_id"], - job["build_status"], - job["retries_count"], - job["ref"], - job["pkg_name"], - job["pkg_version"], - json.dumps(job["pkg_variants"]), # dict to string - job["compiler_name"], - job["compiler_version"], - job["arch"], - job["stack"], - job["build_jobs"], - requests["cpu"]["value"], - # currently not set as of 12-23 - limits.get("cpu", {}).get("value"), - cpu_usage["mean"], - cpu_usage["median"], - cpu_usage["max"], - cpu_usage["min"], - cpu_usage["stddev"], - requests["memory"]["value"], - limits["memory"]["value"], - mem_usage["mean"], - mem_usage["median"], - mem_usage["max"], - mem_usage["min"], - mem_usage["stddev"], - ), - ) - ) - - # vm and build will get saved at the same time to make sure - # we don't accidentally commit a vm without a build - await db.commit() - - return - - -async def fetch_vm(hostname: str, query_time: float, db) -> dict: - prometheus = PrometheusClient() - vm_info = await prometheus.query( - 
type="single", - query={ - "metric": "kube_node_info", - "filters": {"node": hostname}, - }, - time=query_time, - ) - - vm_uuid = vm_info[0]["labels"]["system_uuid"] - - async with db.execute("select id from vms where uuid = ?", (vm_uuid,)) as cursor: - old_vm = await cursor.fetchone() - - if old_vm: - logging.info(f"vm {hostname} already in database with id {old_vm[0]}") - return old_vm[0] - - vm_labels = await prometheus.query( - type="single", - query={ - "metric": "kube_node_labels", - "filters": {"node": hostname}, - }, - time=query_time, - ) - - async with db.execute( - *db_insert( - "vms", - ( - None, - vm_uuid, - hostname, - float(vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_cpu"]), - float( - vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_memory"] - ), - vm_labels[0]["labels"]["label_kubernetes_io_arch"], - vm_labels[0]["labels"]["label_kubernetes_io_os"], - vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], - ), - ) - ) as cursor: - vm_id = cursor.lastrowid - - return vm_id - - -def is_ghost(log): - return "No need to rebuild" in log - - -def process_resources_res(res: dict) -> dict: - processed = {} - for item in res: - # duplicates are ignored by overwriting the previous entry - processed[item["labels"]["resource"]] = { - "unit": item["labels"]["unit"], - "value": float(item["values"][1]), - } - - return processed - - -def process_usage(res: dict, job_id: int) -> dict: - if not res: - # sometimes prometheus reports no data for a job if the time range is too small - logging.error(f"lack of usage data for job {job_id}") - raise InvalidDataError - - usage = [float(value) for timestamp, value in res[0]["values"]] - - sum_stats = { - "mean": statistics.fmean(usage), - # pstdev because we have the whole population - "stddev": statistics.pstdev(usage), - "max": max(usage), - "min": min(usage), - "median": statistics.median(usage), - } - - if ( - sum_stats["stddev"] == 0 - or sum_stats["mean"] == 0 - or math.isnan(sum_stats["stddev"]) - ): - logging.error(f"usage data is invalid for job {job_id}") - raise InvalidDataError - - return sum_stats diff --git a/gantry/utils/gitlab.py b/gantry/utils/gitlab.py deleted file mode 100644 index 96dcf0d..0000000 --- a/gantry/utils/gitlab.py +++ /dev/null @@ -1,21 +0,0 @@ -import os - -import aiohttp - - -class GitlabClient: - def __init__(self): - self.base_url = os.environ["GITLAB_URL"] - self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]} - - async def request(self, url: str, response_type: str) -> dict: - async with aiohttp.ClientSession(raise_for_status=True) as session: - async with session.get(url, headers=self.headers) as resp: - if response_type == "json": - return await resp.json() - if response_type == "text": - return await resp.text() - - async def job_log(self, id: int) -> str: - url = f"{self.base_url}/jobs/{id}/trace" - return await self.request(url, "text") diff --git a/gantry/utils/misc.py b/gantry/utils/misc.py deleted file mode 100644 index 0376c67..0000000 --- a/gantry/utils/misc.py +++ /dev/null @@ -1,37 +0,0 @@ -def spec_variants(spec: str) -> dict: - """Given a spec's concrete variants, return a dict of name: value.""" - # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on - - variants = {} - spec = spec.replace("+", " +") - spec = spec.replace("~", " ~") - parts = spec.split(" ") - - for part in parts: - if len(part) < 2: - continue - if "=" in part: - name, value = part.split("=") - # multiple values - if "," in value: - variants[name] = value.split(",") 
- else: - variants[name] = value - else: - if part.startswith("+"): - variants[part[1:]] = True - elif part.startswith("~"): - variants[part[1:]] = False - - return variants - - -def db_insert(table, values): - """ - Returns an INSERT statement given a table name and tuple of values. - Must provide values for all columns in the table, including the primary key. - """ - return ( - f"insert into {table} values ({','.join(['?'] * (len(values)) )})", - values, - ) diff --git a/gantry/views.py b/gantry/views.py index 0820b23..6b11b80 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,16 +1,25 @@ +import os +import json + from aiohttp import web -from utils.collect import fetch_job + +from gantry.collection import fetch_build routes = web.RouteTableDef() @routes.post("/collect") async def collect_job(request: web.Request) -> web.Response: - payload = await request.json() + try: + payload = await request.json() + except json.decoder.JSONDecodeError: + return web.Response(status=400, text="invalid json") + + if request.headers.get("X-Gitlab-Token") != os.environ["GITLAB_WEBHOOK_TOKEN"]: + return web.Response(status=401, text="invalid token") - # TODO validate gitlab token if request.headers.get("X-Gitlab-Event") != "Job Hook": return web.Response(status=400, text="invalid event type") - await fetch_job(payload, request.app["db"]) + await fetch_build(payload, request.app["db"]) return web.Response(status=200) From a1a864b4644f045181fe727b65371615b1b8c6ea Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 22 Jan 2024 17:33:38 -0800 Subject: [PATCH 08/27] isort --- gantry/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index 6b11b80..180fc9e 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,5 +1,5 @@ -import os import json +import os from aiohttp import web From 63165508ac84976c96b359b2646079e0d61ce799 Mon Sep 17 00:00:00 2001 From: Caetano Melone Date: Wed, 24 Jan 2024 00:02:01 -0800 Subject: [PATCH 09/27] don't depend on dotenv for .env sourcing Co-authored-by: Alec Scott --- .envrc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.envrc b/.envrc index 5283fcd..dcea6a5 100644 --- a/.envrc +++ b/.envrc @@ -9,4 +9,9 @@ if type spack &>/dev/null; then spack env activate -d . 
fi -dotenv +#------------------------------------------------------------------------ +# Load Environment Variables from .env (if files exists) +#------------------------------------------------------------------------ +if [ -e .env ]; then + source .env +fi From 0f89fba3fb5b01a7d8d7226370254f1f6245d1f6 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 12:53:49 -0800 Subject: [PATCH 10/27] add stack --- gantry/models/build.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gantry/models/build.py b/gantry/models/build.py index 4289bb8..c34e433 100644 --- a/gantry/models/build.py +++ b/gantry/models/build.py @@ -99,7 +99,7 @@ async def get_annotations(self, prometheus: PrometheusClient): ) if not annotations_res: - raise IncompleteData("missing annotations") + raise IncompleteData(f"missing annotations for job {self.id}") annotations = annotations_res[0]["labels"] @@ -120,8 +120,7 @@ async def get_annotations(self, prometheus: PrometheusClient): compiler_version=annotations[ "annotation_metrics_spack_job_spec_compiler_version" ], - stack="testing" - # stack=job_name_dict["stack"], + stack=annotations["annotation_metrics_spack_ci_stack_name"], ) async def get_resources(self, prometheus: PrometheusClient): From 4e5324d756d1b316bdb96827be6da3e9447d2fb9 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 12:54:05 -0800 Subject: [PATCH 11/27] restructure how clients are initialized --- gantry/__main__.py | 12 ++++++++++++ gantry/collection.py | 10 ++++++---- gantry/util/gitlab.py | 8 +++----- gantry/util/prometheus.py | 12 ++++++++---- gantry/views.py | 4 +++- 5 files changed, 32 insertions(+), 14 deletions(-) diff --git a/gantry/__main__.py b/gantry/__main__.py index 64f408e..c19dae0 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -3,6 +3,8 @@ import aiosqlite from aiohttp import web +from gantry.util.gitlab import GitlabClient +from gantry.util.prometheus import PrometheusClient from gantry.views import routes @@ -14,10 +16,20 @@ async def init_db(app: web.Application): await db.close() +async def init_clients(app: web.Application): + app["gitlab"] = GitlabClient( + os.environ["GITLAB_URL"], os.environ["GITLAB_API_TOKEN"] + ) + app["prometheus"] = PrometheusClient( + os.environ["PROMETHEUS_URL"], os.environ.get("PROMETHEUS_COOKIE", "") + ) + + def main(): app = web.Application() app.add_routes(routes) app.cleanup_ctx.append(init_db) + app.on_startup.append(init_clients) web.run_app(app) diff --git a/gantry/collection.py b/gantry/collection.py index 9aa8121..c651dea 100644 --- a/gantry/collection.py +++ b/gantry/collection.py @@ -7,7 +7,12 @@ from gantry.util.prometheus import IncompleteData, PrometheusClient -async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None: +async def fetch_build( + payload: dict, + db: aiosqlite.Connection, + gitlab: GitlabClient, + prometheus: PrometheusClient, +) -> None: """ Fetches a job's information from Prometheus and inserts it into the database. If there is data missing at any point, the function will still return so the webhook @@ -21,9 +26,6 @@ async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None: returns: None in order to accomodate a 200 response for the webhook. 
""" - gitlab = GitlabClient() - prometheus = PrometheusClient() - build = Build( status=payload["build_status"], name=payload["build_name"], diff --git a/gantry/util/gitlab.py b/gantry/util/gitlab.py index 6658377..7ab672e 100644 --- a/gantry/util/gitlab.py +++ b/gantry/util/gitlab.py @@ -1,12 +1,10 @@ -import os - import aiohttp class GitlabClient: - def __init__(self): - self.base_url = os.environ["GITLAB_URL"] - self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_API_TOKEN"]} + def __init__(self, base_url: str, api_token: str): + self.base_url = base_url + self.headers = {"PRIVATE-TOKEN": api_token} async def _request(self, url: str, response_type: str) -> dict | str: """ diff --git a/gantry/util/prometheus.py b/gantry/util/prometheus.py index 921069a..a3db981 100644 --- a/gantry/util/prometheus.py +++ b/gantry/util/prometheus.py @@ -1,6 +1,5 @@ import logging import math -import os import statistics import urllib.parse @@ -12,9 +11,14 @@ class IncompleteData(Exception): class PrometheusClient: - def __init__(self): - self.base_url = os.environ["PROMETHEUS_URL"] - self.cookies = {"_oauth2_proxy": os.environ["PROMETHEUS_COOKIE"]} + def __init__(self, base_url: str, auth_cookie: str = ""): + # cookie will only be used if set + if auth_cookie: + self.cookies = {"_oauth2_proxy": auth_cookie} + else: + self.cookies = {} + + self.base_url = base_url async def query(self, type: str, **kwargs) -> dict: """ diff --git a/gantry/views.py b/gantry/views.py index 180fc9e..8967c19 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -21,5 +21,7 @@ async def collect_job(request: web.Request) -> web.Response: if request.headers.get("X-Gitlab-Event") != "Job Hook": return web.Response(status=400, text="invalid event type") - await fetch_build(payload, request.app["db"]) + await fetch_build( + payload, request.app["db"], request.app["gitlab"], request.app["prometheus"] + ) return web.Response(status=200) From 45820b400f6463bfaf7b9d88991ce3fe3634a4a9 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 15:41:21 -0800 Subject: [PATCH 12/27] reorganize files into clients/models/routes --- gantry/__main__.py | 3 +-- gantry/clients/__init__.py | 3 +++ gantry/{util => clients}/gitlab.py | 0 gantry/{util => clients}/prometheus.py | 0 gantry/models/build.py | 4 ++-- gantry/models/vm.py | 2 +- gantry/{ => routes}/collection.py | 4 ++-- gantry/views.py | 2 +- 8 files changed, 10 insertions(+), 8 deletions(-) create mode 100644 gantry/clients/__init__.py rename gantry/{util => clients}/gitlab.py (100%) rename gantry/{util => clients}/prometheus.py (100%) rename gantry/{ => routes}/collection.py (95%) diff --git a/gantry/__main__.py b/gantry/__main__.py index c19dae0..ebb3e34 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -3,8 +3,7 @@ import aiosqlite from aiohttp import web -from gantry.util.gitlab import GitlabClient -from gantry.util.prometheus import PrometheusClient +from gantry.clients import GitlabClient, PrometheusClient from gantry.views import routes diff --git a/gantry/clients/__init__.py b/gantry/clients/__init__.py new file mode 100644 index 0000000..2dbe3f6 --- /dev/null +++ b/gantry/clients/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .gitlab import GitlabClient +from .prometheus import PrometheusClient diff --git a/gantry/util/gitlab.py b/gantry/clients/gitlab.py similarity index 100% rename from gantry/util/gitlab.py rename to gantry/clients/gitlab.py diff --git a/gantry/util/prometheus.py b/gantry/clients/prometheus.py similarity index 100% rename from 
gantry/util/prometheus.py rename to gantry/clients/prometheus.py diff --git a/gantry/models/build.py b/gantry/models/build.py index c34e433..67de8e4 100644 --- a/gantry/models/build.py +++ b/gantry/models/build.py @@ -5,9 +5,9 @@ import aiosqlite -from gantry.util.gitlab import GitlabClient +from gantry.clients.gitlab import GitlabClient from gantry.util.misc import insert_dict, setattrs, spec_variants -from gantry.util.prometheus import ( +from gantry.clients.prometheus import ( IncompleteData, PrometheusClient, process_resources, diff --git a/gantry/models/vm.py b/gantry/models/vm.py index b763a91..59fe864 100644 --- a/gantry/models/vm.py +++ b/gantry/models/vm.py @@ -1,7 +1,7 @@ import aiosqlite from gantry.util.misc import insert_dict, setattrs -from gantry.util.prometheus import IncompleteData, PrometheusClient +from gantry.clients.prometheus import IncompleteData, PrometheusClient MB_IN_BYTES = 1_000_000 diff --git a/gantry/collection.py b/gantry/routes/collection.py similarity index 95% rename from gantry/collection.py rename to gantry/routes/collection.py index c651dea..66d987b 100644 --- a/gantry/collection.py +++ b/gantry/routes/collection.py @@ -3,8 +3,8 @@ import aiosqlite from gantry.models import VM, Build -from gantry.util.gitlab import GitlabClient -from gantry.util.prometheus import IncompleteData, PrometheusClient +from gantry.clients.gitlab import GitlabClient +from gantry.clients.prometheus import IncompleteData, PrometheusClient async def fetch_build( diff --git a/gantry/views.py b/gantry/views.py index 8967c19..b311e8d 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -3,7 +3,7 @@ from aiohttp import web -from gantry.collection import fetch_build +from gantry.routes.collection import fetch_build routes = web.RouteTableDef() From 4f78c3a5c43d28b4b706546bce21d852384f3f63 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 15:51:00 -0800 Subject: [PATCH 13/27] decouple spec utility functions from misc.py --- gantry/models/build.py | 13 --------- gantry/routes/collection.py | 3 +- gantry/util/misc.py | 55 ------------------------------------- gantry/util/spec.py | 46 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 69 deletions(-) create mode 100644 gantry/util/spec.py diff --git a/gantry/models/build.py b/gantry/models/build.py index 67de8e4..169a6b8 100644 --- a/gantry/models/build.py +++ b/gantry/models/build.py @@ -34,19 +34,6 @@ def __init__( self.retries = retries self.ref = ref - @property - def valid_name(self) -> bool: - """Returns True if the job is a build job, False otherwise.""" - - # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 - # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 - job_name_pattern = re.compile( - r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" - ) - job_name_match = job_name_pattern.match(self.name) - # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack - return bool(job_name_match) - @property def midpoint(self) -> float: """Returns the midpoint of the job in unix time.""" diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 66d987b..b6c48a8 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -5,6 +5,7 @@ from gantry.models import VM, Build from gantry.clients.gitlab import GitlabClient from gantry.clients.prometheus import IncompleteData, PrometheusClient +from gantry.util.spec import valid_build_name async def fetch_build( @@ -39,7 +40,7 @@ async def fetch_build( # perform checks to see if we should collect data for this job 
if ( build.status not in ("success",) - or not build.valid_name # is not a build job + or not valid_build_name(build.name) # is not a build job or await build.in_db(db) # job already in the database or await build.is_ghost(db, gitlab) ): diff --git a/gantry/util/misc.py b/gantry/util/misc.py index 2c6a69c..0ff0892 100644 --- a/gantry/util/misc.py +++ b/gantry/util/misc.py @@ -1,58 +1,3 @@ -def spec_variants(spec: str) -> dict: - """Given a spec's concrete variants, return a dict in name: value format.""" - # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on - - variants = {} - # give some padding to + and ~ so we can split on them - spec = spec.replace("+", " +") - spec = spec.replace("~", " ~") - parts = spec.split(" ") - - for part in parts: - if len(part) < 2: - continue - if "=" in part: - name, value = part.split("=") - if "," in value: - # array of the multiple values - variants[name] = value.split(",") - else: - # string of the single value - variants[name] = value - else: - # anything after the first character is the value - if part.startswith("+"): - variants[part[1:]] = True - elif part.startswith("~"): - variants[part[1:]] = False - - return variants - - -def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: - """ - Crafts an SQLite INSERT statement from a dictionary. - - args: - table: name of the table to insert into - input: dictionary of values to insert - ignore: whether to ignore duplicate entries - - returns: tuple of (query, values) - """ - - columns = ", ".join(input.keys()) - values = ", ".join(["?" for _ in range(len(input))]) - query = f"INSERT INTO {table} ({columns}) VALUES ({values})" - - if ignore: - query = query.replace("INSERT", "INSERT OR IGNORE") - - # using a tuple of values from the dictionary - values_tuple = tuple(input.values()) - return query, values_tuple - - def setattrs(_self, **kwargs): """Sets multiple attributes of an object from a dictionary.""" for k, v in kwargs.items(): diff --git a/gantry/util/spec.py b/gantry/util/spec.py new file mode 100644 index 0000000..9376ece --- /dev/null +++ b/gantry/util/spec.py @@ -0,0 +1,46 @@ +import re + +def spec_variants(spec: str) -> dict: + """Given a spec's concrete variants, return a dict in name: value format.""" + # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on + + variants = {} + # give some padding to + and ~ so we can split on them + spec = spec.replace("+", " +") + spec = spec.replace("~", " ~") + parts = spec.split(" ") + + for part in parts: + if len(part) < 2: + continue + if "=" in part: + name, value = part.split("=") + if "," in value: + # array of the multiple values + variants[name] = value.split(",") + else: + # string of the single value + variants[name] = value + else: + # anything after the first character is the value + if part.startswith("+"): + variants[part[1:]] = True + elif part.startswith("~"): + variants[part[1:]] = False + + return variants + +def valid_build_name(name): + """Returns True if the job is a build job, False otherwise.""" + + # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 + # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 + job_name_pattern = re.compile( + r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" + ) + job_name_match = job_name_pattern.match(name) + # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack + return bool(job_name_match) + + + From fe2449d4a92e1d2ff9116b5af3093bb420fdbc1c Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 
15:08:30 -0800 Subject: [PATCH 14/27] rename vm: node build: job --- db/schema.sql | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/db/schema.sql b/db/schema.sql index 6b1c24f..26352e7 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,4 +1,4 @@ -CREATE TABLE vms ( +CREATE TABLE nodes ( id INTEGER PRIMARY KEY, uuid TEXT NOT NULL UNIQUE, hostname TEXT NOT NULL, @@ -10,15 +10,14 @@ CREATE TABLE vms ( ); -CREATE TABLE builds ( +CREATE TABLE jobs ( id INTEGER PRIMARY KEY, pod TEXT NOT NULL UNIQUE, - vm INTEGER NOT NULL, + node INTEGER NOT NULL, start INTEGER NOT NULL, end INTEGER NOT NULL, job_id INTEGER NOT NULL UNIQUE, job_status TEXT NOT NULL, - retries INTEGER NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, pkg_version TEXT NOT NULL, @@ -42,8 +41,8 @@ CREATE TABLE builds ( mem_max REAL NOT NULL, mem_min REAL NOT NULL, mem_stddev REAL NOT NULL, - FOREIGN KEY (vm) - REFERENCES vms (id) + FOREIGN KEY (node) + REFERENCES nodes (id) ON UPDATE CASCADE ON DELETE CASCADE ); From 7b05c8ce0869b4cbbee5b80e7516b796528c873e Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 15:08:50 -0800 Subject: [PATCH 15/27] reorganize functionality around clients rather than models --- gantry/clients/prometheus.py | 207 +++++++++++++++++++++++++++++- gantry/db/__init__.py | 3 + gantry/db/get.py | 41 ++++++ gantry/db/insert.py | 69 ++++++++++ gantry/models/__init__.py | 3 +- gantry/models/build.py | 241 ----------------------------------- gantry/models/job.py | 40 ++++++ gantry/models/vm.py | 107 ---------------- gantry/routes/collection.py | 104 ++++++++++----- gantry/util/misc.py | 4 - gantry/util/spec.py | 17 --- gantry/views.py | 12 +- 12 files changed, 432 insertions(+), 416 deletions(-) create mode 100644 gantry/db/__init__.py create mode 100644 gantry/db/get.py create mode 100644 gantry/db/insert.py delete mode 100644 gantry/models/build.py create mode 100644 gantry/models/job.py delete mode 100644 gantry/models/vm.py delete mode 100644 gantry/util/misc.py diff --git a/gantry/clients/prometheus.py b/gantry/clients/prometheus.py index a3db981..720c8cc 100644 --- a/gantry/clients/prometheus.py +++ b/gantry/clients/prometheus.py @@ -1,3 +1,4 @@ +import json import logging import math import statistics @@ -5,6 +6,8 @@ import aiohttp +from gantry.util.spec import spec_variants + class IncompleteData(Exception): pass @@ -97,6 +100,198 @@ def prettify_res(self, response: dict) -> dict: for result in response["data"]["result"] ] + async def get_job_annotations(self, job_id: int, time: float) -> dict: + """ + args: + job_id: job id + time: when to query (unix timestamp) + returns: dict of annotations + """ + + res = await self.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": job_id}, + }, + time=time, + ) + + if not res: + raise IncompleteData("annotation data is missing") + + annotations = res[0]["labels"] + + return { + "pod": annotations["pod"], + # if build jobs is not set, defaults to 16 due to spack config + "build_jobs": annotations.get( + "annotation_metrics_spack_job_build_jobs", 16 + ), + "arch": annotations["annotation_metrics_spack_job_spec_arch"], + "pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"], + "pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"], + "pkg_variants": json.dumps( + spec_variants(annotations["annotation_metrics_spack_job_spec_variants"]) + ), + "compiler_name": annotations[ + "annotation_metrics_spack_job_spec_compiler_name" 
+ ], + "compiler_version": annotations[ + "annotation_metrics_spack_job_spec_compiler_version" + ], + "stack": annotations["annotation_metrics_spack_ci_stack_name"], + } + + async def get_job_resources(self, pod: str, time: float) -> tuple[dict, str]: + """ + args: + job_id: job id + pod: pod name + time: when to query (unix timestamp) + returns: dict of resources and node hostname + """ + + requests = process_resources( + await self.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + ) + + limits_res = await self.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + + if not limits_res: + raise IncompleteData("missing limits") + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + node = limits_res[0]["labels"]["node"] + limits = process_resources(limits_res) + + return ( + { + "cpu_request": requests["cpu"]["value"], + "mem_request": requests["memory"]["value"], + "cpu_limit": limits.get("cpu", {}).get("value"), + "mem_limit": limits["memory"]["value"], + }, + node, + ) + + async def get_job_usage(self, pod: str, start: float, end: float) -> dict: + """ + Gets resource usage attributes for a job. + + args: + pod: pod name + start: start time (unix timestamp) + end: end time (unix timestamp) + returns: dict of usage stats + """ + + mem_usage = process_usage( + await self.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": pod}, + }, + start=start, + end=end, + ) + ) + + cpu_usage = process_usage( + await self.query( + type="range", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{pod}', container='build'}}[90s])" + ), + start=start, + end=end, + ) + ) + + return { + "cpu_mean": cpu_usage["mean"], + "cpu_median": cpu_usage["median"], + "cpu_max": cpu_usage["max"], + "cpu_min": cpu_usage["min"], + "cpu_stddev": cpu_usage["stddev"], + "mem_mean": mem_usage["mean"], + "mem_median": mem_usage["median"], + "mem_max": mem_usage["max"], + "mem_min": mem_usage["min"], + "mem_stddev": mem_usage["stddev"], + } + + async def get_node_uuid(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node info (UUID as of now) + """ + + res = await self.query( + type="single", + query={ + "metric": "kube_node_info", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise IncompleteData(f"node info is missing. hostname={hostname}") + + return res[0]["labels"]["system_uuid"] + + async def get_node_labels(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node labels + """ + + res = await self.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise IncompleteData(f"node labels are missing. 
hostname={hostname}") + + labels = res[0]["labels"] + + return { + "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), + "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), + "arch": labels["label_kubernetes_io_arch"], + "os": labels["label_kubernetes_io_os"], + "instance_type": labels["label_node_kubernetes_io_instance_type"], + } + def query_to_str(metric: str, filters: dict) -> str: """ @@ -107,20 +302,19 @@ def query_to_str(metric: str, filters: dict) -> str: return f"{metric}{{{filters_str}}}" -def process_resources(res: dict, job_id: int) -> dict: +def process_resources(res: dict) -> dict: """ Processes the resource limits and requests from a Prometheus response into readable format. args: res: Prometheus response - job_id: job id for error logging returns: dict with {resource: {unit: value}} format """ if not res: - raise IncompleteData(f"resource data is missing for job {job_id}") + raise IncompleteData("resource data is missing") processed = {} for item in res: @@ -133,21 +327,20 @@ def process_resources(res: dict, job_id: int) -> dict: return processed -def process_usage(res: dict, job_id: int) -> dict: +def process_usage(res: dict) -> dict: """ Processes the usage data from a Prometheus response into readable format. This could either be CPU usage or memory usage. args: res: Prometheus response - job_id: job id for error logging returns: dict with {statistic: value} format """ if not res: # sometimes prometheus reports no data for a job if the time range is too small - raise IncompleteData(f"usage data is missing for job {job_id}") + raise IncompleteData("usage data is missing") usage = [float(value) for timestamp, value in res[0]["values"]] @@ -165,6 +358,6 @@ def process_usage(res: dict, job_id: int) -> dict: or sum_stats["mean"] == 0 or math.isnan(sum_stats["stddev"]) ): - raise IncompleteData(f"usage data is invalid for job {job_id}") + raise IncompleteData("usage data is invalid") return sum_stats diff --git a/gantry/db/__init__.py b/gantry/db/__init__.py new file mode 100644 index 0000000..dab0a74 --- /dev/null +++ b/gantry/db/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .get import * +from .insert import * diff --git a/gantry/db/get.py b/gantry/db/get.py new file mode 100644 index 0000000..8ef7977 --- /dev/null +++ b/gantry/db/get.py @@ -0,0 +1,41 @@ +import logging + +import aiosqlite + + +async def get_node(db: aiosqlite.Connection, uuid: str) -> int | None: + """return the primary key if found, otherwise return None""" + + async with db.execute("select id from nodes where uuid = ?", (uuid,)) as cursor: + if cur_node := await cursor.fetchone(): + return cur_node[0] + + return None + + +async def job_exists(db: aiosqlite.Connection, job_id: int) -> bool: + """return if the job exists in the database""" + + async with db.execute("select id from jobs where job_id = ?", (job_id,)) as cursor: + if await cursor.fetchone(): + logging.warning( + f""" + job {job_id} already in database. + check why multiple requests are being sent. 
+ """ + ) + return True + + return False + + +async def ghost_exists(db: aiosqlite.Connection, job_id: int) -> bool: + """return if the ghost job exists in the database""" + + async with db.execute( + "select id from ghost_jobs where job_id = ?", (job_id,) + ) as cursor: + if await cursor.fetchone(): + return True + + return False diff --git a/gantry/db/insert.py b/gantry/db/insert.py new file mode 100644 index 0000000..da35620 --- /dev/null +++ b/gantry/db/insert.py @@ -0,0 +1,69 @@ +import aiosqlite + +from gantry.db.get import get_node + + +def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: + """ + crafts an sqlite insert statement from a dictionary. + + args: + table: name of the table to insert into + input: dictionary of values to insert + ignore: whether to ignore duplicate entries + + returns: tuple of (query, values) + """ + + columns = ", ".join(input.keys()) + values = ", ".join(["?" for _ in range(len(input))]) + query = f"INSERT INTO {table} ({columns}) VALUES ({values})" + + if ignore: + query = query.replace("INSERT", "INSERT OR IGNORE") + + # using a tuple of values from the dictionary + values_tuple = tuple(input.values()) + return query, values_tuple + + +async def insert_ghost(db: aiosqlite.Connection, job_id: int) -> None: + """Inserts a ghost job into the database.""" + + await db.execute(("insert into ghost_jobs (name) values (?)"), (job_id,)) + + +async def insert_node(db: aiosqlite.Connection, node: dict) -> int: + """Inserts a node into the database.""" + + async with db.execute( + *insert_dict( + "nodes", + node, + # deal with races + ignore=True, + ) + ) as cursor: + pk = cursor.lastrowid + + if pk == 0: + # the ignore part of the query was triggered, some other call + # must have inserted the node before this one + pk = await get_node(db, node["uuid"]) + + return pk + + +async def insert_job(db: aiosqlite.Connection, job: dict) -> int: + """Inserts a job into the database.""" + + async with db.execute( + *insert_dict( + "jobs", + job, + # if the job somehow gets added into the db (pod+id being unique) + # then ignore the insert + ignore=True, + ) + ) as cursor: + return cursor.lastrowid diff --git a/gantry/models/__init__.py b/gantry/models/__init__.py index 57e9b66..73d8633 100644 --- a/gantry/models/__init__.py +++ b/gantry/models/__init__.py @@ -1,3 +1,2 @@ # flake8: noqa -from .build import Build -from .vm import VM +from .job import Job diff --git a/gantry/models/build.py b/gantry/models/build.py deleted file mode 100644 index 169a6b8..0000000 --- a/gantry/models/build.py +++ /dev/null @@ -1,241 +0,0 @@ -import json -import logging -import re -from datetime import datetime - -import aiosqlite - -from gantry.clients.gitlab import GitlabClient -from gantry.util.misc import insert_dict, setattrs, spec_variants -from gantry.clients.prometheus import ( - IncompleteData, - PrometheusClient, - process_resources, - process_usage, -) - - -class Build: - def __init__( - self, - status: str, - name: str, - id: int, - start: str, - end: str, - retries: int, - ref: str, - ): - self.status = status - self.name = name - self.id = id - self.start = datetime.fromisoformat(start).timestamp() - self.end = datetime.fromisoformat(end).timestamp() - self.retries = retries - self.ref = ref - - @property - def midpoint(self) -> float: - """Returns the midpoint of the job in unix time.""" - # prometheus is not guaranteed to have data at the exact start and end times - # instead of creating an arbitrary buffer, ask for data in the middle of the job - return 
(self.start + self.end) / 2 - - async def is_ghost(self, db: aiosqlite.Connection, gl: GitlabClient) -> bool: - """Returns the job's ghost status.""" - - # prevent duplicate jobs from being inserted into the database - async with db.execute( - "select job_id from ghost_jobs where job_id = ?", (self.id,) - ) as cursor: - if await cursor.fetchone(): - # ghost job is already in the database - return True - - log = await gl.job_log(self.id) - ghost = "No need to rebuild" in log - - if ghost: - await db.execute(("insert into ghost_jobs (name) values (?)"), (self.id,)) - - return ghost - - async def in_db(self, db: aiosqlite.Connection) -> bool: - """Checks if the job is already in the db.""" - - async with db.execute( - "select job_id from builds where job_id = ?", (self.id,) - ) as cursor: - found = bool(await cursor.fetchone()) - - if found: - logging.warning(f"job {self.id} already in database") - - return found - - async def get_annotations(self, prometheus: PrometheusClient): - """Fetches the annotations and assigns multiple attributes.""" - - annotations_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": self.id}, - }, - time=self.midpoint, - ) - - if not annotations_res: - raise IncompleteData(f"missing annotations for job {self.id}") - - annotations = annotations_res[0]["labels"] - - setattrs( - self, - pod=annotations["pod"], - # if build jobs is not set, defaults to 16 due to spack settings - build_jobs=annotations.get("annotation_metrics_spack_job_build_jobs", 16), - arch=annotations["annotation_metrics_spack_job_spec_arch"], - pkg_name=annotations["annotation_metrics_spack_job_spec_pkg_name"], - pkg_version=annotations["annotation_metrics_spack_job_spec_pkg_version"], - pkg_variants=spec_variants( - annotations["annotation_metrics_spack_job_spec_variants"] - ), - compiler_name=annotations[ - "annotation_metrics_spack_job_spec_compiler_name" - ], - compiler_version=annotations[ - "annotation_metrics_spack_job_spec_compiler_version" - ], - stack=annotations["annotation_metrics_spack_ci_stack_name"], - ) - - async def get_resources(self, prometheus: PrometheusClient): - """fetches pod requests and limits, and also sets the node hostname""" - requests = process_resources( - await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_requests", - "filters": {"container": "build", "pod": self.pod}, - }, - time=self.midpoint, - ), - self.id, - ) - - limits_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_limits", - "filters": {"container": "build", "pod": self.pod}, - }, - time=self.midpoint, - ) - - if not limits_res: - raise IncompleteData(f"missing limits for job {self.id}") - - # instead of needing to fetch the node where the pod ran from kube_pod_info - # we can grab it from kube_pod_container_resource_limits - # weirdly, it's not available in kube_pod_labels or annotations - # https://github.com/kubernetes/kube-state-metrics/issues/1148 - - self.node = limits_res[0]["labels"]["node"] - limits = process_resources(limits_res, self.id) - - setattrs( - self, - cpu_request=requests["cpu"]["value"], - mem_request=requests["memory"]["value"], - cpu_limit=limits.get("cpu", {}).get("value"), - mem_limit=limits["memory"]["value"], - ) - - async def get_usage(self, prometheus: PrometheusClient): - """Sets resource usage attributes.""" - - mem_usage = process_usage( - await prometheus.query( - type="range", - query={ - "metric": 
"container_memory_working_set_bytes", - "filters": {"container": "build", "pod": self.pod}, - }, - start=self.start, - end=self.end, - ), - self.id, - ) - - cpu_usage = process_usage( - await prometheus.query( - type="range", - custom_query=( - f"rate(container_cpu_usage_seconds_total{{" - f"pod='{self.pod}', container='build'}}[90s])" - ), - start=self.start, - end=self.end, - ), - self.id, - ) - - setattrs( - self, - cpu_mean=cpu_usage["mean"], - cpu_median=cpu_usage["median"], - cpu_max=cpu_usage["max"], - cpu_min=cpu_usage["min"], - cpu_stddev=cpu_usage["stddev"], - mem_mean=mem_usage["mean"], - mem_median=mem_usage["median"], - mem_max=mem_usage["max"], - mem_min=mem_usage["min"], - mem_stddev=mem_usage["stddev"], - ) - - async def insert(self, db: aiosqlite.Connection, vm_id: int) -> int: - """Inserts the build into the database and returns its id.""" - - async with db.execute( - *insert_dict( - "builds", - { - "pod": self.pod, - "vm": vm_id, - "start": self.start, - "end": self.end, - "job_id": self.id, - "job_status": self.status, - "retries": self.retries, - "ref": self.ref, - "pkg_name": self.pkg_name, - "pkg_version": self.pkg_version, - "pkg_variants": json.dumps(self.pkg_variants), # dict to string - "compiler_name": self.compiler_name, - "compiler_version": self.compiler_version, - "arch": self.arch, - "stack": self.stack, - "build_jobs": self.build_jobs, - "cpu_request": self.cpu_request, - "cpu_limit": self.cpu_limit, - "cpu_mean": self.cpu_mean, - "cpu_median": self.cpu_median, - "cpu_max": self.cpu_max, - "cpu_min": self.cpu_min, - "cpu_stddev": self.cpu_stddev, - "mem_request": self.mem_request, - "mem_limit": self.mem_limit, - "mem_mean": self.mem_mean, - "mem_median": self.mem_median, - "mem_max": self.mem_max, - "mem_min": self.mem_min, - "mem_stddev": self.mem_stddev, - }, - # if the job somehow gets added into the db (pod+id being unique) - # then ignore the insert - ignore=True, - ) - ) as cursor: - return cursor.lastrowid diff --git a/gantry/models/job.py b/gantry/models/job.py new file mode 100644 index 0000000..64b2f77 --- /dev/null +++ b/gantry/models/job.py @@ -0,0 +1,40 @@ +import re +from datetime import datetime + + +class Job: + def __init__( + self, + status: str, + name: str, + id: int, + start: str, + end: str, + ref: str, + ): + self.status = status + self.name = name + self.id = id + self.start = datetime.fromisoformat(start).timestamp() + self.end = datetime.fromisoformat(end).timestamp() + self.ref = ref + + @property + def midpoint(self) -> float: + """Returns the midpoint of the job in unix time.""" + # prometheus is not guaranteed to have data at the exact start and end times + # instead of creating an arbitrary buffer, ask for data in the middle of the job + return (self.start + self.end) / 2 + + @property + def valid_build_name(self) -> bool: + """validates the job name.""" + + # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 + # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 + job_name_pattern = re.compile( + r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" + ) + job_name_match = job_name_pattern.match(self.name) + # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack + return bool(job_name_match) diff --git a/gantry/models/vm.py b/gantry/models/vm.py deleted file mode 100644 index 59fe864..0000000 --- a/gantry/models/vm.py +++ /dev/null @@ -1,107 +0,0 @@ -import aiosqlite - -from gantry.util.misc import insert_dict, setattrs -from gantry.clients.prometheus import IncompleteData, PrometheusClient - -MB_IN_BYTES = 
1_000_000 - - -class VM: - def __init__(self, hostname: str, query_time: float): - """ - args: - hostname: the hostname of the VM - query_time: any point during VM runtime, usually grabbed from build - """ - self.hostname = hostname - self.query_time = query_time - - async def db_id( - self, db: aiosqlite.Connection, prometheus: PrometheusClient - ) -> int | None: - """ - Returns the id of the vm if it exists in the database, otherwise returns None. - Also sets the uuid of the vm. - """ - vm_info = await prometheus.query( - type="single", - query={ - "metric": "kube_node_info", - "filters": {"node": self.hostname}, - }, - time=self.query_time, - ) - - if not vm_info: - raise IncompleteData(f"missing vm info for {self.hostname}") - - self.uuid = vm_info[0]["labels"]["system_uuid"] - - # look for the vm in the database - async with db.execute( - "select id from vms where uuid = ?", (self.uuid,) - ) as cursor: - old_vm = await cursor.fetchone() - - if old_vm: - return old_vm[0] - - return None - - async def get_labels(self, prometheus: PrometheusClient): - """Sets multiple attributes of the VM based on its labels.""" - - vm_labels_res = await prometheus.query( - type="single", - query={ - "metric": "kube_node_labels", - "filters": {"node": self.hostname}, - }, - time=self.query_time, - ) - - if not vm_labels_res: - raise IncompleteData(f"missing vm labels for {self.hostname}") - - labels = vm_labels_res[0]["labels"] - - setattrs( - self, - cores=float(labels["label_karpenter_k8s_aws_instance_cpu"]), - mem=float(labels["label_karpenter_k8s_aws_instance_memory"]), - arch=labels["label_kubernetes_io_arch"], - os=labels["label_kubernetes_io_os"], - instance_type=labels["label_node_kubernetes_io_instance_type"], - ) - - async def insert(self, db: aiosqlite.Connection) -> int: - """Inserts the VM into the database and returns its id.""" - async with db.execute( - *insert_dict( - "vms", - { - "uuid": self.uuid, - "hostname": self.hostname, - "cores": self.cores, - # convert to bytes to be consistent with other resource metrics - "mem": self.mem * MB_IN_BYTES, - "arch": self.arch, - "os": self.os, - "instance_type": self.instance_type, - }, - # deal with races - ignore=True, - ) - ) as cursor: - pk = cursor.lastrowid - - if pk == 0: - # the ignore part of the query was triggered, some other call - # must have inserted the vm before this one - async with db.execute( - "select id from vms where uuid = ?", (self.uuid,) - ) as cursor: - pk_res = await cursor.fetchone() - pk = pk_res[0] - - return pk diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index b6c48a8..25e119f 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -2,15 +2,17 @@ import aiosqlite -from gantry.models import VM, Build +from gantry import db from gantry.clients.gitlab import GitlabClient from gantry.clients.prometheus import IncompleteData, PrometheusClient -from gantry.util.spec import valid_build_name +from gantry.models import Job +MB_IN_BYTES = 1_000_000 -async def fetch_build( + +async def fetch_job( payload: dict, - db: aiosqlite.Connection, + db_conn: aiosqlite.Connection, gitlab: GitlabClient, prometheus: PrometheusClient, ) -> None: @@ -27,68 +29,100 @@ async def fetch_build( returns: None in order to accomodate a 200 response for the webhook. 
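fetch_job immediately wraps the webhook payload in the Job model introduced above; a short sketch with invented payload values shows the two derived properties the route relies on (the build-name check and the query midpoint):

from gantry.models import Job

job = Job(
    status="success",
    name=(
        "plumed@2.9.0 /i4u7p6u %gcc@11.4.0 "
        "arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1"
    ),
    id=12345,
    start="2024-01-25T00:00:00+00:00",
    end="2024-01-25T00:10:00+00:00",
    ref="develop",
)
assert job.valid_build_name  # matches the spack build-job naming pattern
assert job.midpoint - job.start == 300.0  # halfway through the ten-minute job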
""" - build = Build( + job = Job( status=payload["build_status"], name=payload["build_name"], id=payload["build_id"], start=payload["build_started_at"], end=payload["build_finished_at"], - retries=payload["retries_count"], ref=payload["ref"], ) # perform checks to see if we should collect data for this job if ( - build.status not in ("success",) - or not valid_build_name(build.name) # is not a build job - or await build.in_db(db) # job already in the database - or await build.is_ghost(db, gitlab) + job.status != "success" + or not job.valid_build_name # is not a build job + or await db.job_exists(db_conn, job.id) # job already in the database + or await db.ghost_exists(db_conn, job.id) # ghost already in db ): return + # check if the job is a ghost + job_log = await gitlab.job_log(job.id) + is_ghost = "No need to rebuild" in job_log + if is_ghost: + db.insert_ghost(db_conn, job.id) + return + try: - await build.get_annotations(prometheus) - await build.get_resources(prometheus) - await build.get_usage(prometheus) - vm_id = await fetch_vm(db, prometheus, build.node, build.midpoint) + annotations = await prometheus.get_job_annotations(job.id, job.midpoint) + resources, node_hostname = await prometheus.get_job_resources( + annotations["pod"], job.midpoint + ) + usage = await prometheus.get_job_usage(annotations["pod"], job.start, job.end) + node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint) except IncompleteData as e: # missing data, skip this job - logging.error(e) + logging.error(f"{e} job={job.id}") return - await build.insert(db, vm_id) - # vm and build will get saved at the same time to make sure - # we don't accidentally commit a vm without a build - await db.commit() + await db.insert_job( + db_conn, + { + "node": node_id, + "start": job.start, + "end": job.end, + "job_id": job.id, + "job_status": job.status, + "ref": job.ref, + **annotations, + **resources, + **usage, + }, + ) + + # job and node will get saved at the same time to make sure + # we don't accidentally commit a node without a job + await db_conn.commit() return -async def fetch_vm( - db: aiosqlite.Connection, +async def fetch_node( + db_conn: aiosqlite.Connection, prometheus: PrometheusClient, hostname: dict, query_time: float, ) -> int: """ - Finds an existing VM in the database or inserts a new one. + Finds an existing node in the database or inserts a new one. 
args: db: an active aiosqlite connection prometheus: - hostname: the hostname of the VM - query_time: any point during VM runtime, usually grabbed from build + hostname: the hostname of the node + query_time: any point during node runtime, usually grabbed from job - returns: id of the inserted or existing VM + returns: id of the inserted or existing node """ - vm = VM( - hostname=hostname, - query_time=query_time, - ) - # do not proceed if the VM exists - if existing_vm := await vm.db_id(db, prometheus): - return existing_vm - - await vm.get_labels(prometheus) - return await vm.insert(db) + node_uuid = await prometheus.get_node_uuid(hostname, query_time) + + # do not proceed if the node exists + if existing_node := await db.get_node(db_conn, node_uuid): + return existing_node + + node_labels = await prometheus.get_node_labels(hostname, query_time) + return await db.insert_node( + db_conn, + { + "uuid": node_uuid, + "hostname": hostname, + "cores": node_labels["cores"], + # convert to bytes to be consistent with other resource metrics + "mem": node_labels["mem"] * MB_IN_BYTES, + "arch": node_labels["arch"], + "os": node_labels["os"], + "instance_type": node_labels["instance_type"], + }, + ) diff --git a/gantry/util/misc.py b/gantry/util/misc.py deleted file mode 100644 index 0ff0892..0000000 --- a/gantry/util/misc.py +++ /dev/null @@ -1,4 +0,0 @@ -def setattrs(_self, **kwargs): - """Sets multiple attributes of an object from a dictionary.""" - for k, v in kwargs.items(): - setattr(_self, k, v) diff --git a/gantry/util/spec.py b/gantry/util/spec.py index 9376ece..eb1b33d 100644 --- a/gantry/util/spec.py +++ b/gantry/util/spec.py @@ -1,5 +1,3 @@ -import re - def spec_variants(spec: str) -> dict: """Given a spec's concrete variants, return a dict in name: value format.""" # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on @@ -29,18 +27,3 @@ def spec_variants(spec: str) -> dict: variants[part[1:]] = False return variants - -def valid_build_name(name): - """Returns True if the job is a build job, False otherwise.""" - - # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 - # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 - job_name_pattern = re.compile( - r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" - ) - job_name_match = job_name_pattern.match(name) - # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack - return bool(job_name_match) - - - diff --git a/gantry/views.py b/gantry/views.py index b311e8d..d9a0bb4 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,9 +1,10 @@ +import asyncio import json import os from aiohttp import web -from gantry.routes.collection import fetch_build +from gantry.routes.collection import fetch_job routes = web.RouteTableDef() @@ -21,7 +22,12 @@ async def collect_job(request: web.Request) -> web.Response: if request.headers.get("X-Gitlab-Event") != "Job Hook": return web.Response(status=400, text="invalid event type") - await fetch_build( - payload, request.app["db"], request.app["gitlab"], request.app["prometheus"] + # will return immediately, but will not block the event loop + # allowing fetch_job to run in the background + asyncio.ensure_future( + fetch_job( + payload, request.app["db"], request.app["gitlab"], request.app["prometheus"] + ) ) + return web.Response(status=200) From 608c043214686013c58785e1990f9548338be624 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 15:28:29 -0800 Subject: [PATCH 16/27] job_id -> gitlab_id --- db/schema.sql | 5 ++--- gantry/clients/gitlab.py | 4 ++-- 
gantry/clients/prometheus.py | 7 +++---- gantry/db/get.py | 12 +++++++----- gantry/db/insert.py | 4 ++-- gantry/models/job.py | 4 ++-- gantry/routes/collection.py | 16 ++++++++-------- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/db/schema.sql b/db/schema.sql index 26352e7..8104a4e 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -9,14 +9,13 @@ CREATE TABLE nodes ( instance_type TEXT NOT NULL ); - CREATE TABLE jobs ( id INTEGER PRIMARY KEY, pod TEXT NOT NULL UNIQUE, node INTEGER NOT NULL, start INTEGER NOT NULL, end INTEGER NOT NULL, - job_id INTEGER NOT NULL UNIQUE, + gitlab_id INTEGER NOT NULL UNIQUE, job_status TEXT NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, @@ -49,5 +48,5 @@ CREATE TABLE jobs ( CREATE TABLE ghost_jobs ( id INTEGER PRIMARY KEY, - job_id INTEGER NOT NULL + gitlab_id INTEGER NOT NULL ); diff --git a/gantry/clients/gitlab.py b/gantry/clients/gitlab.py index 7ab672e..97f9500 100644 --- a/gantry/clients/gitlab.py +++ b/gantry/clients/gitlab.py @@ -24,8 +24,8 @@ async def _request(self, url: str, response_type: str) -> dict | str: if response_type == "text": return await resp.text() - async def job_log(self, job_id: int) -> str: + async def job_log(self, gl_id: int) -> str: """Given a job id, returns the log from that job""" - url = f"{self.base_url}/jobs/{job_id}/trace" + url = f"{self.base_url}/jobs/{gl_id}/trace" return await self._request(url, "text") diff --git a/gantry/clients/prometheus.py b/gantry/clients/prometheus.py index 720c8cc..87be7ce 100644 --- a/gantry/clients/prometheus.py +++ b/gantry/clients/prometheus.py @@ -100,10 +100,10 @@ def prettify_res(self, response: dict) -> dict: for result in response["data"]["result"] ] - async def get_job_annotations(self, job_id: int, time: float) -> dict: + async def get_job_annotations(self, gl_id: int, time: float) -> dict: """ args: - job_id: job id + gl_id: gitlab job id time: when to query (unix timestamp) returns: dict of annotations """ @@ -112,7 +112,7 @@ async def get_job_annotations(self, job_id: int, time: float) -> dict: type="single", query={ "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": job_id}, + "filters": {"annotation_gitlab_ci_job_id": gl_id}, }, time=time, ) @@ -146,7 +146,6 @@ async def get_job_annotations(self, job_id: int, time: float) -> dict: async def get_job_resources(self, pod: str, time: float) -> tuple[dict, str]: """ args: - job_id: job id pod: pod name time: when to query (unix timestamp) returns: dict of resources and node hostname diff --git a/gantry/db/get.py b/gantry/db/get.py index 8ef7977..c597c3e 100644 --- a/gantry/db/get.py +++ b/gantry/db/get.py @@ -13,14 +13,16 @@ async def get_node(db: aiosqlite.Connection, uuid: str) -> int | None: return None -async def job_exists(db: aiosqlite.Connection, job_id: int) -> bool: +async def job_exists(db: aiosqlite.Connection, gl_id: int) -> bool: """return if the job exists in the database""" - async with db.execute("select id from jobs where job_id = ?", (job_id,)) as cursor: + async with db.execute( + "select id from jobs where gitlab_id = ?", (gl_id,) + ) as cursor: if await cursor.fetchone(): logging.warning( f""" - job {job_id} already in database. + job {gl_id} already in database. check why multiple requests are being sent. 
""" ) @@ -29,11 +31,11 @@ async def job_exists(db: aiosqlite.Connection, job_id: int) -> bool: return False -async def ghost_exists(db: aiosqlite.Connection, job_id: int) -> bool: +async def ghost_exists(db: aiosqlite.Connection, gl_id: int) -> bool: """return if the ghost job exists in the database""" async with db.execute( - "select id from ghost_jobs where job_id = ?", (job_id,) + "select id from ghost_jobs where gitlab_id = ?", (gl_id,) ) as cursor: if await cursor.fetchone(): return True diff --git a/gantry/db/insert.py b/gantry/db/insert.py index da35620..3df157d 100644 --- a/gantry/db/insert.py +++ b/gantry/db/insert.py @@ -27,10 +27,10 @@ def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: return query, values_tuple -async def insert_ghost(db: aiosqlite.Connection, job_id: int) -> None: +async def insert_ghost(db: aiosqlite.Connection, gl_id: int) -> None: """Inserts a ghost job into the database.""" - await db.execute(("insert into ghost_jobs (name) values (?)"), (job_id,)) + await db.execute(("insert into ghost_jobs (gitlab_id) values (?)"), (gl_id,)) async def insert_node(db: aiosqlite.Connection, node: dict) -> int: diff --git a/gantry/models/job.py b/gantry/models/job.py index 64b2f77..3c3a794 100644 --- a/gantry/models/job.py +++ b/gantry/models/job.py @@ -7,14 +7,14 @@ def __init__( self, status: str, name: str, - id: int, + gl_id: int, start: str, end: str, ref: str, ): self.status = status self.name = name - self.id = id + self.gl_id = gl_id self.start = datetime.fromisoformat(start).timestamp() self.end = datetime.fromisoformat(end).timestamp() self.ref = ref diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 25e119f..d5125cd 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -32,7 +32,7 @@ async def fetch_job( job = Job( status=payload["build_status"], name=payload["build_name"], - id=payload["build_id"], + gl_id=payload["build_id"], start=payload["build_started_at"], end=payload["build_finished_at"], ref=payload["ref"], @@ -42,20 +42,20 @@ async def fetch_job( if ( job.status != "success" or not job.valid_build_name # is not a build job - or await db.job_exists(db_conn, job.id) # job already in the database - or await db.ghost_exists(db_conn, job.id) # ghost already in db + or await db.job_exists(db_conn, job.gl_id) # job already in the database + or await db.ghost_exists(db_conn, job.gl_id) # ghost already in db ): return # check if the job is a ghost - job_log = await gitlab.job_log(job.id) + job_log = await gitlab.job_log(job.gl_id) is_ghost = "No need to rebuild" in job_log if is_ghost: - db.insert_ghost(db_conn, job.id) + db.insert_ghost(db_conn, job.gl_id) return try: - annotations = await prometheus.get_job_annotations(job.id, job.midpoint) + annotations = await prometheus.get_job_annotations(job.gl_id, job.midpoint) resources, node_hostname = await prometheus.get_job_resources( annotations["pod"], job.midpoint ) @@ -63,7 +63,7 @@ async def fetch_job( node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint) except IncompleteData as e: # missing data, skip this job - logging.error(f"{e} job={job.id}") + logging.error(f"{e} job={job.gl_id}") return await db.insert_job( @@ -72,7 +72,7 @@ async def fetch_job( "node": node_id, "start": job.start, "end": job.end, - "job_id": job.id, + "gitlab_id": job.gl_id, "job_status": job.status, "ref": job.ref, **annotations, From 11d05fc99ddbe4df1ca68367ef00a02f50b32ce1 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 
2024 16:38:11 -0800 Subject: [PATCH 17/27] make prometheus client more modular --- gantry/clients/prometheus.py | 362 ------------------------ gantry/clients/prometheus/__init__.py | 2 + gantry/clients/prometheus/job.py | 148 ++++++++++ gantry/clients/prometheus/node.py | 58 ++++ gantry/clients/prometheus/prometheus.py | 105 +++++++ gantry/clients/prometheus/util.py | 76 +++++ gantry/routes/collection.py | 13 +- 7 files changed, 396 insertions(+), 368 deletions(-) delete mode 100644 gantry/clients/prometheus.py create mode 100644 gantry/clients/prometheus/__init__.py create mode 100644 gantry/clients/prometheus/job.py create mode 100644 gantry/clients/prometheus/node.py create mode 100644 gantry/clients/prometheus/prometheus.py create mode 100644 gantry/clients/prometheus/util.py diff --git a/gantry/clients/prometheus.py b/gantry/clients/prometheus.py deleted file mode 100644 index 87be7ce..0000000 --- a/gantry/clients/prometheus.py +++ /dev/null @@ -1,362 +0,0 @@ -import json -import logging -import math -import statistics -import urllib.parse - -import aiohttp - -from gantry.util.spec import spec_variants - - -class IncompleteData(Exception): - pass - - -class PrometheusClient: - def __init__(self, base_url: str, auth_cookie: str = ""): - # cookie will only be used if set - if auth_cookie: - self.cookies = {"_oauth2_proxy": auth_cookie} - else: - self.cookies = {} - - self.base_url = base_url - - async def query(self, type: str, **kwargs) -> dict: - """ - type: "range" or "single" - - for range queries: set `start` and `end` (unix timestamps) - for single queries: set `time` (unix timestamp) - - for custom queries: set `custom_query` (string) - - for metric queries: set `query` (dict) - example: - "query": { - "metric": "metric_name", - "filters": {"filter1": "value1", "filter2": "value2"} - } - """ - - # validate that one of query or custom_query is set, but not both or neither - if not kwargs.get("query") and not kwargs.get("custom_query"): - raise ValueError("query or custom_query must be set") - if kwargs.get("query") and kwargs.get("custom_query"): - raise ValueError("query and custom_query cannot both be set") - - query_str = urllib.parse.quote( - kwargs["custom_query"] - if kwargs.get("custom_query") - else query_to_str(**kwargs["query"]) - ) - - if type == "range": - # prometheus will only return this many frames - max_resolution = 10_000 - # calculating the max step size to get the desired resolution - step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) - url = ( - f"{self.base_url}/query_range?" - f"query={query_str}&" - f"start={kwargs['start']}&" - f"end={kwargs['end']}&" - f"step={step}s" - ) - return await self._query(url) - elif type == "single": - url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" - return await self._query(url) - - async def _query(self, url: str) -> dict: - """Query Prometheus with a query string""" - async with aiohttp.ClientSession(raise_for_status=True) as session: - # submit cookie with request - async with session.get(url, cookies=self.cookies) as resp: - try: - return self.prettify_res(await resp.json()) - except aiohttp.ContentTypeError: - logging.error( - """Prometheus query failed with unexpected response. 
- The cookie may have expired.""" - ) - return {} - - def prettify_res(self, response: dict) -> dict: - """Process Prometheus response into an arrray of dicts with {label: value}""" - result_type = response.get("data", {}).get("resultType") - values_dict = { - "matrix": "values", - "vector": "value", - } - - if result_type not in values_dict: - logging.error(f"Prometheus response type {result_type} not supported") - return {} - - return [ - {"labels": result["metric"], "values": result[values_dict[result_type]]} - for result in response["data"]["result"] - ] - - async def get_job_annotations(self, gl_id: int, time: float) -> dict: - """ - args: - gl_id: gitlab job id - time: when to query (unix timestamp) - returns: dict of annotations - """ - - res = await self.query( - type="single", - query={ - "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": gl_id}, - }, - time=time, - ) - - if not res: - raise IncompleteData("annotation data is missing") - - annotations = res[0]["labels"] - - return { - "pod": annotations["pod"], - # if build jobs is not set, defaults to 16 due to spack config - "build_jobs": annotations.get( - "annotation_metrics_spack_job_build_jobs", 16 - ), - "arch": annotations["annotation_metrics_spack_job_spec_arch"], - "pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"], - "pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"], - "pkg_variants": json.dumps( - spec_variants(annotations["annotation_metrics_spack_job_spec_variants"]) - ), - "compiler_name": annotations[ - "annotation_metrics_spack_job_spec_compiler_name" - ], - "compiler_version": annotations[ - "annotation_metrics_spack_job_spec_compiler_version" - ], - "stack": annotations["annotation_metrics_spack_ci_stack_name"], - } - - async def get_job_resources(self, pod: str, time: float) -> tuple[dict, str]: - """ - args: - pod: pod name - time: when to query (unix timestamp) - returns: dict of resources and node hostname - """ - - requests = process_resources( - await self.query( - type="single", - query={ - "metric": "kube_pod_container_resource_requests", - "filters": {"container": "build", "pod": pod}, - }, - time=time, - ) - ) - - limits_res = await self.query( - type="single", - query={ - "metric": "kube_pod_container_resource_limits", - "filters": {"container": "build", "pod": pod}, - }, - time=time, - ) - - if not limits_res: - raise IncompleteData("missing limits") - - # instead of needing to fetch the node where the pod ran from kube_pod_info - # we can grab it from kube_pod_container_resource_limits - # weirdly, it's not available in kube_pod_labels or annotations - # https://github.com/kubernetes/kube-state-metrics/issues/1148 - node = limits_res[0]["labels"]["node"] - limits = process_resources(limits_res) - - return ( - { - "cpu_request": requests["cpu"]["value"], - "mem_request": requests["memory"]["value"], - "cpu_limit": limits.get("cpu", {}).get("value"), - "mem_limit": limits["memory"]["value"], - }, - node, - ) - - async def get_job_usage(self, pod: str, start: float, end: float) -> dict: - """ - Gets resource usage attributes for a job. 
- - args: - pod: pod name - start: start time (unix timestamp) - end: end time (unix timestamp) - returns: dict of usage stats - """ - - mem_usage = process_usage( - await self.query( - type="range", - query={ - "metric": "container_memory_working_set_bytes", - "filters": {"container": "build", "pod": pod}, - }, - start=start, - end=end, - ) - ) - - cpu_usage = process_usage( - await self.query( - type="range", - custom_query=( - f"rate(container_cpu_usage_seconds_total{{" - f"pod='{pod}', container='build'}}[90s])" - ), - start=start, - end=end, - ) - ) - - return { - "cpu_mean": cpu_usage["mean"], - "cpu_median": cpu_usage["median"], - "cpu_max": cpu_usage["max"], - "cpu_min": cpu_usage["min"], - "cpu_stddev": cpu_usage["stddev"], - "mem_mean": mem_usage["mean"], - "mem_median": mem_usage["median"], - "mem_max": mem_usage["max"], - "mem_min": mem_usage["min"], - "mem_stddev": mem_usage["stddev"], - } - - async def get_node_uuid(self, hostname: str, time: float) -> dict: - """ - args: - hostname: node hostname - time: time to query (unix timestamp) - returns: dict of node info (UUID as of now) - """ - - res = await self.query( - type="single", - query={ - "metric": "kube_node_info", - "filters": {"node": hostname}, - }, - time=time, - ) - - if not res: - raise IncompleteData(f"node info is missing. hostname={hostname}") - - return res[0]["labels"]["system_uuid"] - - async def get_node_labels(self, hostname: str, time: float) -> dict: - """ - args: - hostname: node hostname - time: time to query (unix timestamp) - returns: dict of node labels - """ - - res = await self.query( - type="single", - query={ - "metric": "kube_node_labels", - "filters": {"node": hostname}, - }, - time=time, - ) - - if not res: - raise IncompleteData(f"node labels are missing. hostname={hostname}") - - labels = res[0]["labels"] - - return { - "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), - "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), - "arch": labels["label_kubernetes_io_arch"], - "os": labels["label_kubernetes_io_os"], - "instance_type": labels["label_node_kubernetes_io_instance_type"], - } - - -def query_to_str(metric: str, filters: dict) -> str: - """ - In: "metric", {key1: value1, key2: value2} - Out: "metric{key1="value1", key2="value2"}" - """ - filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) - return f"{metric}{{{filters_str}}}" - - -def process_resources(res: dict) -> dict: - """ - Processes the resource limits and requests from a Prometheus response into - readable format. - - args: - res: Prometheus response - - returns: dict with {resource: {unit: value}} format - """ - - if not res: - raise IncompleteData("resource data is missing") - - processed = {} - for item in res: - # duplicates are ignored by overwriting the previous entry - processed[item["labels"]["resource"]] = { - "unit": item["labels"]["unit"], - "value": float(item["values"][1]), - } - - return processed - - -def process_usage(res: dict) -> dict: - """ - Processes the usage data from a Prometheus response into readable format. - This could either be CPU usage or memory usage. 
- - args: - res: Prometheus response - - returns: dict with {statistic: value} format - """ - - if not res: - # sometimes prometheus reports no data for a job if the time range is too small - raise IncompleteData("usage data is missing") - - usage = [float(value) for timestamp, value in res[0]["values"]] - - sum_stats = { - "mean": statistics.fmean(usage), - # pstdev because we have the whole population - "stddev": statistics.pstdev(usage), - "max": max(usage), - "min": min(usage), - "median": statistics.median(usage), - } - - if ( - sum_stats["stddev"] == 0 - or sum_stats["mean"] == 0 - or math.isnan(sum_stats["stddev"]) - ): - raise IncompleteData("usage data is invalid") - - return sum_stats diff --git a/gantry/clients/prometheus/__init__.py b/gantry/clients/prometheus/__init__.py new file mode 100644 index 0000000..9234832 --- /dev/null +++ b/gantry/clients/prometheus/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from .prometheus import PrometheusClient diff --git a/gantry/clients/prometheus/job.py b/gantry/clients/prometheus/job.py new file mode 100644 index 0000000..48be608 --- /dev/null +++ b/gantry/clients/prometheus/job.py @@ -0,0 +1,148 @@ +import json + +from gantry.clients.prometheus import util +from gantry.util.spec import spec_variants + + +class PrometheusJobClient: + def __init__(self, client): + self.client = client + + async def get_annotations(self, gl_id: int, time: float) -> dict: + """ + args: + gl_id: gitlab job id + time: when to query (unix timestamp) + returns: dict of annotations + """ + + res = await self.client.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": gl_id}, + }, + time=time, + ) + + if not res: + raise util.IncompleteData("annotation data is missing") + + annotations = res[0]["labels"] + + return { + "pod": annotations["pod"], + # if build jobs is not set, defaults to 16 due to spack config + "build_jobs": annotations.get( + "annotation_metrics_spack_job_build_jobs", 16 + ), + "arch": annotations["annotation_metrics_spack_job_spec_arch"], + "pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"], + "pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"], + "pkg_variants": json.dumps( + spec_variants(annotations["annotation_metrics_spack_job_spec_variants"]) + ), + "compiler_name": annotations[ + "annotation_metrics_spack_job_spec_compiler_name" + ], + "compiler_version": annotations[ + "annotation_metrics_spack_job_spec_compiler_version" + ], + "stack": annotations["annotation_metrics_spack_ci_stack_name"], + } + + async def get_resources(self, pod: str, time: float) -> tuple[dict, str]: + """ + args: + pod: pod name + time: when to query (unix timestamp) + returns: dict of resources and node hostname + """ + + requests = util.process_resources( + await self.client.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + ) + + limits_res = await self.client.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + + if not limits_res: + raise util.IncompleteData("missing limits") + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + node = 
limits_res[0]["labels"]["node"] + limits = util.process_resources(limits_res) + + return ( + { + "cpu_request": requests["cpu"]["value"], + "mem_request": requests["memory"]["value"], + "cpu_limit": limits.get("cpu", {}).get("value"), + "mem_limit": limits["memory"]["value"], + }, + node, + ) + + async def get_usage(self, pod: str, start: float, end: float) -> dict: + """ + Gets resource usage attributes for a job. + + args: + pod: pod name + start: start time (unix timestamp) + end: end time (unix timestamp) + returns: dict of usage stats + """ + + mem_usage = util.process_usage( + await self.client.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": pod}, + }, + start=start, + end=end, + ) + ) + + cpu_usage = util.process_usage( + await self.client.query( + type="range", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{pod}', container='build'}}[90s])" + ), + start=start, + end=end, + ) + ) + + return { + "cpu_mean": cpu_usage["mean"], + "cpu_median": cpu_usage["median"], + "cpu_max": cpu_usage["max"], + "cpu_min": cpu_usage["min"], + "cpu_stddev": cpu_usage["stddev"], + "mem_mean": mem_usage["mean"], + "mem_median": mem_usage["median"], + "mem_max": mem_usage["max"], + "mem_min": mem_usage["min"], + "mem_stddev": mem_usage["stddev"], + } diff --git a/gantry/clients/prometheus/node.py b/gantry/clients/prometheus/node.py new file mode 100644 index 0000000..13a3f50 --- /dev/null +++ b/gantry/clients/prometheus/node.py @@ -0,0 +1,58 @@ +from gantry.clients.prometheus import util + + +class PrometheusNodeClient: + def __init__(self, client): + self.client = client + + async def get_uuid(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node info (UUID as of now) + """ + + res = await self.client.query( + type="single", + query={ + "metric": "kube_node_info", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise util.IncompleteData(f"node info is missing. hostname={hostname}") + + return res[0]["labels"]["system_uuid"] + + async def get_labels(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node labels + """ + + res = await self.client.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise util.IncompleteData(f"node labels are missing. 
hostname={hostname}") + + labels = res[0]["labels"] + + return { + "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), + "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), + "arch": labels["label_kubernetes_io_arch"], + "os": labels["label_kubernetes_io_os"], + "instance_type": labels["label_node_kubernetes_io_instance_type"], + } diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py new file mode 100644 index 0000000..8ee06b3 --- /dev/null +++ b/gantry/clients/prometheus/prometheus.py @@ -0,0 +1,105 @@ +import logging +import math +import urllib.parse + +import aiohttp + +from gantry.clients.prometheus import util +from gantry.clients.prometheus.job import PrometheusJobClient +from gantry.clients.prometheus.node import PrometheusNodeClient + + +class PrometheusClient: + def __init__(self, base_url: str, auth_cookie: str = ""): + # cookie will only be used if set + if auth_cookie: + self.cookies = {"_oauth2_proxy": auth_cookie} + else: + self.cookies = {} + + self.base_url = base_url + + async def query(self, type: str, **kwargs) -> dict: + """ + type: "range" or "single" + + for range queries: set `start` and `end` (unix timestamps) + for single queries: set `time` (unix timestamp) + + for custom queries: set `custom_query` (string) + + for metric queries: set `query` (dict) + example: + "query": { + "metric": "metric_name", + "filters": {"filter1": "value1", "filter2": "value2"} + } + """ + + # validate that one of query or custom_query is set, but not both or neither + if not kwargs.get("query") and not kwargs.get("custom_query"): + raise ValueError("query or custom_query must be set") + if kwargs.get("query") and kwargs.get("custom_query"): + raise ValueError("query and custom_query cannot both be set") + + query_str = urllib.parse.quote( + kwargs["custom_query"] + if kwargs.get("custom_query") + else util.query_to_str(**kwargs["query"]) + ) + + if type == "range": + # prometheus will only return this many frames + max_resolution = 10_000 + # calculating the max step size to get the desired resolution + step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) + url = ( + f"{self.base_url}/query_range?" + f"query={query_str}&" + f"start={kwargs['start']}&" + f"end={kwargs['end']}&" + f"step={step}s" + ) + return await self._query(url) + elif type == "single": + url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" + return await self._query(url) + + async def _query(self, url: str) -> dict: + """Query Prometheus with a query string""" + async with aiohttp.ClientSession(raise_for_status=True) as session: + # submit cookie with request + async with session.get(url, cookies=self.cookies) as resp: + try: + return self.prettify_res(await resp.json()) + except aiohttp.ContentTypeError: + logging.error( + """Prometheus query failed with unexpected response. 
+ The cookie may have expired.""" + ) + return {} + + def prettify_res(self, response: dict) -> dict: + """Process Prometheus response into an arrray of dicts with {label: value}""" + result_type = response.get("data", {}).get("resultType") + values_dict = { + "matrix": "values", + "vector": "value", + } + + if result_type not in values_dict: + logging.error(f"Prometheus response type {result_type} not supported") + return {} + + return [ + {"labels": result["metric"], "values": result[values_dict[result_type]]} + for result in response["data"]["result"] + ] + + @property + def job(self): + return PrometheusJobClient(self) + + @property + def node(self): + return PrometheusNodeClient(self) diff --git a/gantry/clients/prometheus/util.py b/gantry/clients/prometheus/util.py new file mode 100644 index 0000000..8bb5e42 --- /dev/null +++ b/gantry/clients/prometheus/util.py @@ -0,0 +1,76 @@ +import math +import statistics + + +class IncompleteData(Exception): + pass + + +def query_to_str(metric: str, filters: dict) -> str: + """ + In: "metric", {key1: value1, key2: value2} + Out: "metric{key1="value1", key2="value2"}" + """ + filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) + return f"{metric}{{{filters_str}}}" + + +def process_resources(res: dict) -> dict: + """ + Processes the resource limits and requests from a Prometheus response into + readable format. + + args: + res: Prometheus response + + returns: dict with {resource: {unit: value}} format + """ + + if not res: + raise IncompleteData("resource data is missing") + + processed = {} + for item in res: + # duplicates are ignored by overwriting the previous entry + processed[item["labels"]["resource"]] = { + "unit": item["labels"]["unit"], + "value": float(item["values"][1]), + } + + return processed + + +def process_usage(res: dict) -> dict: + """ + Processes the usage data from a Prometheus response into readable format. + This could either be CPU usage or memory usage. 
+ + args: + res: Prometheus response + + returns: dict with {statistic: value} format + """ + + if not res: + # sometimes prometheus reports no data for a job if the time range is too small + raise IncompleteData("usage data is missing") + + usage = [float(value) for timestamp, value in res[0]["values"]] + + sum_stats = { + "mean": statistics.fmean(usage), + # pstdev because we have the whole population + "stddev": statistics.pstdev(usage), + "max": max(usage), + "min": min(usage), + "median": statistics.median(usage), + } + + if ( + sum_stats["stddev"] == 0 + or sum_stats["mean"] == 0 + or math.isnan(sum_stats["stddev"]) + ): + raise IncompleteData("usage data is invalid") + + return sum_stats diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index d5125cd..08831d4 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -4,7 +4,8 @@ from gantry import db from gantry.clients.gitlab import GitlabClient -from gantry.clients.prometheus import IncompleteData, PrometheusClient +from gantry.clients.prometheus import PrometheusClient +from gantry.clients.prometheus.util import IncompleteData from gantry.models import Job MB_IN_BYTES = 1_000_000 @@ -55,11 +56,11 @@ async def fetch_job( return try: - annotations = await prometheus.get_job_annotations(job.gl_id, job.midpoint) - resources, node_hostname = await prometheus.get_job_resources( + annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint) + resources, node_hostname = await prometheus.job.get_resources( annotations["pod"], job.midpoint ) - usage = await prometheus.get_job_usage(annotations["pod"], job.start, job.end) + usage = await prometheus.job.get_usage(annotations["pod"], job.start, job.end) node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint) except IncompleteData as e: # missing data, skip this job @@ -106,13 +107,13 @@ async def fetch_node( returns: id of the inserted or existing node """ - node_uuid = await prometheus.get_node_uuid(hostname, query_time) + node_uuid = await prometheus.node.get_uuid(hostname, query_time) # do not proceed if the node exists if existing_node := await db.get_node(db_conn, node_uuid): return existing_node - node_labels = await prometheus.get_node_labels(hostname, query_time) + node_labels = await prometheus.node.get_labels(hostname, query_time) return await db.insert_node( db_conn, { From 49c1eaf48397f073bea56b7422e0638cd663af1f Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 17:12:24 -0800 Subject: [PATCH 18/27] lessen fatality of not receiving the right hook --- gantry/views.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index d9a0bb4..8363af8 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,5 +1,6 @@ import asyncio import json +import logging import os from aiohttp import web @@ -20,7 +21,9 @@ async def collect_job(request: web.Request) -> web.Response: return web.Response(status=401, text="invalid token") if request.headers.get("X-Gitlab-Event") != "Job Hook": - return web.Response(status=400, text="invalid event type") + logging.error(f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab.") + # return 200 so gitlab doesn't disable the webhook -- this is not fatal + return web.Response(status=200) # will return immediately, but will not block the event loop # allowing fetch_job to run in the background From 3bb1ebcb7cf499ff0be043293b843a77f889f29c Mon Sep 17 00:00:00 2001 From: caetano melone Date: 
Thu, 25 Jan 2024 18:25:08 -0800 Subject: [PATCH 19/27] black --- gantry/views.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index 8363af8..821016b 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -21,7 +21,9 @@ async def collect_job(request: web.Request) -> web.Response: return web.Response(status=401, text="invalid token") if request.headers.get("X-Gitlab-Event") != "Job Hook": - logging.error(f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab.") + logging.error( + f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab." + ) # return 200 so gitlab doesn't disable the webhook -- this is not fatal return web.Response(status=200) From 0bfe4e665463011f8c7aefbadd48ae08cc9f3a12 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Tue, 30 Jan 2024 22:30:50 -0800 Subject: [PATCH 20/27] no need to store ghost jobs as they are being collected by KW --- db/schema.sql | 5 ----- gantry/db/insert.py | 6 ------ gantry/routes/collection.py | 1 - 3 files changed, 12 deletions(-) diff --git a/db/schema.sql b/db/schema.sql index 8104a4e..bba3549 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -45,8 +45,3 @@ CREATE TABLE jobs ( ON UPDATE CASCADE ON DELETE CASCADE ); - -CREATE TABLE ghost_jobs ( - id INTEGER PRIMARY KEY, - gitlab_id INTEGER NOT NULL -); diff --git a/gantry/db/insert.py b/gantry/db/insert.py index 3df157d..7564ad9 100644 --- a/gantry/db/insert.py +++ b/gantry/db/insert.py @@ -27,12 +27,6 @@ def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: return query, values_tuple -async def insert_ghost(db: aiosqlite.Connection, gl_id: int) -> None: - """Inserts a ghost job into the database.""" - - await db.execute(("insert into ghost_jobs (gitlab_id) values (?)"), (gl_id,)) - - async def insert_node(db: aiosqlite.Connection, node: dict) -> int: """Inserts a node into the database.""" diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 08831d4..1ca6d82 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -52,7 +52,6 @@ async def fetch_job( job_log = await gitlab.job_log(job.gl_id) is_ghost = "No need to rebuild" in job_log if is_ghost: - db.insert_ghost(db_conn, job.gl_id) return try: From 004310646f06db854b114cdb2bc9b05175b4e69b Mon Sep 17 00:00:00 2001 From: caetano melone Date: Sun, 4 Feb 2024 23:21:07 -0800 Subject: [PATCH 21/27] don't try to collect UO-ran jobs [ci skip] --- gantry/routes/collection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 1ca6d82..1fd7ed7 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -27,7 +27,7 @@ async def fetch_job( payload: a dictionary containing the information from the Gitlab job hook db: an active aiosqlite connection - returns: None in order to accomodate a 200 response for the webhook. + returns: None in order to accommodate a 200 response for the webhook. 
""" job = Job( @@ -43,6 +43,8 @@ async def fetch_job( if ( job.status != "success" or not job.valid_build_name # is not a build job + # uo runners are not in Prometheus + or payload["runner"]["description"].startswith("uo") or await db.job_exists(db_conn, job.gl_id) # job already in the database or await db.ghost_exists(db_conn, job.gl_id) # ghost already in db ): From 53fb639a1080cec9a53d2f75b84e19ef67965647 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:25:30 -0800 Subject: [PATCH 22/27] remove tests [ci skip] --- gantry/tests/test_utils.py | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 gantry/tests/test_utils.py diff --git a/gantry/tests/test_utils.py b/gantry/tests/test_utils.py deleted file mode 100644 index 010e08b..0000000 --- a/gantry/tests/test_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from gantry.util.misc import spec_variants - -# write tests for spec_variants here -# +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on has to equal {} - - -@pytest.fixture -def variant_string(): - return "+adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on" - - -def test_spec_variants(variant_string): - assert spec_variants(variant_string) == { - "adios2": True, - "advanced_debug": False, - "patches": ["02253c7", "acb3805", "b724e6a"], - "use_vtkm": "on", - } From 9da7248d44d22bc14d74445a3cdb7d9fc0fcd50c Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:32:45 -0800 Subject: [PATCH 23/27] import clients individually [ci skip] --- gantry/__main__.py | 3 ++- gantry/clients/__init__.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gantry/__main__.py b/gantry/__main__.py index ebb3e34..ff25d64 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -3,7 +3,8 @@ import aiosqlite from aiohttp import web -from gantry.clients import GitlabClient, PrometheusClient +from gantry.clients.gitlab import GitlabClient +from gantry.clients.prometheus import PrometheusClient from gantry.views import routes diff --git a/gantry/clients/__init__.py b/gantry/clients/__init__.py index 2dbe3f6..e69de29 100644 --- a/gantry/clients/__init__.py +++ b/gantry/clients/__init__.py @@ -1,3 +0,0 @@ -# flake8: noqa -from .gitlab import GitlabClient -from .prometheus import PrometheusClient From 8e01222f2c3424dc6a2bca6b788d72f942714022 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:33:41 -0800 Subject: [PATCH 24/27] version the API [ci skip] --- gantry/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index 821016b..3acb80b 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -10,7 +10,7 @@ routes = web.RouteTableDef() -@routes.post("/collect") +@routes.post("/v1/collect") async def collect_job(request: web.Request) -> web.Response: try: payload = await request.json() From fd7ecfa813b52b3b078eaa1c60ad1192d77fcd90 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:58:12 -0800 Subject: [PATCH 25/27] break up PrometheusClient.query into query_single and query_range [ci skip] --- gantry/clients/prometheus/job.py | 17 ++---- gantry/clients/prometheus/node.py | 6 +- gantry/clients/prometheus/prometheus.py | 77 ++++++++++++------------- gantry/clients/prometheus/util.py | 14 +++++ 4 files changed, 60 insertions(+), 54 deletions(-) diff --git a/gantry/clients/prometheus/job.py b/gantry/clients/prometheus/job.py index 48be608..9f7162c 100644 --- a/gantry/clients/prometheus/job.py +++ 
b/gantry/clients/prometheus/job.py @@ -16,8 +16,7 @@ async def get_annotations(self, gl_id: int, time: float) -> dict: returns: dict of annotations """ - res = await self.client.query( - type="single", + res = await self.client.query_single( query={ "metric": "kube_pod_annotations", "filters": {"annotation_gitlab_ci_job_id": gl_id}, @@ -60,8 +59,7 @@ async def get_resources(self, pod: str, time: float) -> tuple[dict, str]: """ requests = util.process_resources( - await self.client.query( - type="single", + await self.client.query_single( query={ "metric": "kube_pod_container_resource_requests", "filters": {"container": "build", "pod": pod}, @@ -70,8 +68,7 @@ async def get_resources(self, pod: str, time: float) -> tuple[dict, str]: ) ) - limits_res = await self.client.query( - type="single", + limits_res = await self.client.query_single( query={ "metric": "kube_pod_container_resource_limits", "filters": {"container": "build", "pod": pod}, @@ -111,8 +108,7 @@ async def get_usage(self, pod: str, start: float, end: float) -> dict: """ mem_usage = util.process_usage( - await self.client.query( - type="range", + await self.client.query_range( query={ "metric": "container_memory_working_set_bytes", "filters": {"container": "build", "pod": pod}, @@ -123,9 +119,8 @@ async def get_usage(self, pod: str, start: float, end: float) -> dict: ) cpu_usage = util.process_usage( - await self.client.query( - type="range", - custom_query=( + await self.client.query_range( + query=( f"rate(container_cpu_usage_seconds_total{{" f"pod='{pod}', container='build'}}[90s])" ), diff --git a/gantry/clients/prometheus/node.py b/gantry/clients/prometheus/node.py index 13a3f50..abfb217 100644 --- a/gantry/clients/prometheus/node.py +++ b/gantry/clients/prometheus/node.py @@ -13,8 +13,7 @@ async def get_uuid(self, hostname: str, time: float) -> dict: returns: dict of node info (UUID as of now) """ - res = await self.client.query( - type="single", + res = await self.client.query_single( query={ "metric": "kube_node_info", "filters": {"node": hostname}, @@ -35,8 +34,7 @@ async def get_labels(self, hostname: str, time: float) -> dict: returns: dict of node labels """ - res = await self.client.query( - type="single", + res = await self.client.query_single( query={ "metric": "kube_node_labels", "filters": {"node": hostname}, diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py index 8ee06b3..e6ecaee 100644 --- a/gantry/clients/prometheus/prometheus.py +++ b/gantry/clients/prometheus/prometheus.py @@ -1,6 +1,5 @@ import logging import math -import urllib.parse import aiohttp @@ -19,51 +18,51 @@ def __init__(self, base_url: str, auth_cookie: str = ""): self.base_url = base_url - async def query(self, type: str, **kwargs) -> dict: + async def query_single(self, query: str | dict, time: int) -> dict: + """Query Prometheus for a single value + args: + + query: str or dict + if str, the query string + if dict, the metric and filters + example: + "query": { + "metric": "metric_name", + "filters": {"filter1": "value1", "filter2": "value2"} + } + time: int (unix timestamp) + + returns: dict with {label: value} format """ - type: "range" or "single" - for range queries: set `start` and `end` (unix timestamps) - for single queries: set `time` (unix timestamp) + query = util.process_query(query) + url = f"{self.base_url}/query?query={query}&time={time}" + return await self._query(url) - for custom queries: set `custom_query` (string) + async def query_range(self, query: str | dict, start: int, end: 
int) -> dict: + """Query Prometheus for a range of values - for metric queries: set `query` (dict) - example: - "query": { - "metric": "metric_name", - "filters": {"filter1": "value1", "filter2": "value2"} - } - """ + args: + query: see query_single + start: int (unix timestamp) + end: int (unix timestamp) - # validate that one of query or custom_query is set, but not both or neither - if not kwargs.get("query") and not kwargs.get("custom_query"): - raise ValueError("query or custom_query must be set") - if kwargs.get("query") and kwargs.get("custom_query"): - raise ValueError("query and custom_query cannot both be set") + returns: list of dicts with {label: value} format + """ - query_str = urllib.parse.quote( - kwargs["custom_query"] - if kwargs.get("custom_query") - else util.query_to_str(**kwargs["query"]) + query = util.process_query(query) + # prometheus will only return this many frames + max_resolution = 10_000 + # calculating the max step size to get the desired resolution + step = math.ceil((end - start) / max_resolution) + url = ( + f"{self.base_url}/query_range?" + f"query={query}&" + f"start={start}&" + f"end={end}&" + f"step={step}s" ) - - if type == "range": - # prometheus will only return this many frames - max_resolution = 10_000 - # calculating the max step size to get the desired resolution - step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) - url = ( - f"{self.base_url}/query_range?" - f"query={query_str}&" - f"start={kwargs['start']}&" - f"end={kwargs['end']}&" - f"step={step}s" - ) - return await self._query(url) - elif type == "single": - url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" - return await self._query(url) + return await self._query(url) async def _query(self, url: str) -> dict: """Query Prometheus with a query string""" diff --git a/gantry/clients/prometheus/util.py b/gantry/clients/prometheus/util.py index 8bb5e42..eaf768d 100644 --- a/gantry/clients/prometheus/util.py +++ b/gantry/clients/prometheus/util.py @@ -1,11 +1,25 @@ import math import statistics +import urllib.parse class IncompleteData(Exception): pass +def process_query(query: dict | str) -> str: + """ + Processes query into a string that can be used in a URL. + See query_single in prometheus.py for more details on args. 
+ """ + if isinstance(query, dict): + query = query_to_str(**query) + elif not isinstance(query, str): + raise ValueError("query must be a string or dict") + + return urllib.parse.quote(query) + + def query_to_str(metric: str, filters: dict) -> str: """ In: "metric", {key1: value1, key2: value2} From a93a5288cb00ca84911524e392afdc069bf7efb3 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 12:02:01 -0800 Subject: [PATCH 26/27] fix prometheus client types --- gantry/clients/prometheus/prometheus.py | 10 +++++----- gantry/clients/prometheus/util.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py index e6ecaee..acb2dd3 100644 --- a/gantry/clients/prometheus/prometheus.py +++ b/gantry/clients/prometheus/prometheus.py @@ -18,7 +18,7 @@ def __init__(self, base_url: str, auth_cookie: str = ""): self.base_url = base_url - async def query_single(self, query: str | dict, time: int) -> dict: + async def query_single(self, query: str | dict, time: int) -> list: """Query Prometheus for a single value args: @@ -39,7 +39,7 @@ async def query_single(self, query: str | dict, time: int) -> dict: url = f"{self.base_url}/query?query={query}&time={time}" return await self._query(url) - async def query_range(self, query: str | dict, start: int, end: int) -> dict: + async def query_range(self, query: str | dict, start: int, end: int) -> list: """Query Prometheus for a range of values args: @@ -64,7 +64,7 @@ async def query_range(self, query: str | dict, start: int, end: int) -> dict: ) return await self._query(url) - async def _query(self, url: str) -> dict: + async def _query(self, url: str) -> list: """Query Prometheus with a query string""" async with aiohttp.ClientSession(raise_for_status=True) as session: # submit cookie with request @@ -78,7 +78,7 @@ async def _query(self, url: str) -> dict: ) return {} - def prettify_res(self, response: dict) -> dict: + def prettify_res(self, response: dict) -> list: """Process Prometheus response into an arrray of dicts with {label: value}""" result_type = response.get("data", {}).get("resultType") values_dict = { @@ -88,7 +88,7 @@ def prettify_res(self, response: dict) -> dict: if result_type not in values_dict: logging.error(f"Prometheus response type {result_type} not supported") - return {} + return [] return [ {"labels": result["metric"], "values": result[values_dict[result_type]]} diff --git a/gantry/clients/prometheus/util.py b/gantry/clients/prometheus/util.py index eaf768d..e749dae 100644 --- a/gantry/clients/prometheus/util.py +++ b/gantry/clients/prometheus/util.py @@ -29,7 +29,7 @@ def query_to_str(metric: str, filters: dict) -> str: return f"{metric}{{{filters_str}}}" -def process_resources(res: dict) -> dict: +def process_resources(res: list) -> dict: """ Processes the resource limits and requests from a Prometheus response into readable format. @@ -54,7 +54,7 @@ def process_resources(res: dict) -> dict: return processed -def process_usage(res: dict) -> dict: +def process_usage(res: list) -> dict: """ Processes the usage data from a Prometheus response into readable format. This could either be CPU usage or memory usage. 
From 56157abb71bceda23aa804354a96c18882004d81 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 12:05:55 -0800 Subject: [PATCH 27/27] fix flake8 --- gantry/views.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gantry/views.py b/gantry/views.py index 3acb80b..b71a738 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -21,9 +21,7 @@ async def collect_job(request: web.Request) -> web.Response: return web.Response(status=401, text="invalid token") if request.headers.get("X-Gitlab-Event") != "Job Hook": - logging.error( - f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab." - ) + logging.error(f"invalid event type {request.headers.get('X-Gitlab-Event')}") # return 200 so gitlab doesn't disable the webhook -- this is not fatal return web.Response(status=200)
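Illustrative usage sketch (not part of the patch series above): assuming a reachable Prometheus endpoint and an existing build pod, both placeholders below, the modular client introduced in patches 17 and 25 can be driven directly like this:

import asyncio

from gantry.clients.prometheus import PrometheusClient


async def main():
    # placeholder endpoint; in the service the base URL comes from configuration
    prometheus = PrometheusClient("http://localhost:9090/api/v1")

    # job- and node-scoped helpers are exposed as properties on the client
    # (PrometheusJobClient / PrometheusNodeClient from patch 17)
    usage = await prometheus.job.get_usage(
        pod="runner-abc123-project-2-concurrent-0",  # hypothetical pod name
        start=1706200000,  # placeholder unix timestamps
        end=1706203600,
    )
    print(usage["cpu_mean"], usage["mem_max"])


asyncio.run(main())

The URL, pod name, and timestamps here are hypothetical; in the running service these values come from the Gitlab webhook payload handled by fetch_job, and get_usage raises IncompleteData when Prometheus returns no samples for the requested window.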