From 98586b33090cd6c6fc37ca94e6825bb41645bd7a Mon Sep 17 00:00:00 2001 From: Alec Scott Date: Thu, 21 Dec 2023 11:35:07 -0800 Subject: [PATCH 01/27] Add basic GitHub Actions CI --- .github/workflows/ci.yml | 50 +++++++++++++++++++ .github/workflows/requirements/style.txt | 2 + .github/workflows/requirements/unit-tests.txt | 2 + .github/workflows/style.yml | 27 ++++++++++ .github/workflows/unit-tests.yml | 25 ++++++++++ gantry/__init__.py | 0 gantry/__main__.py | 6 +++ 7 files changed, 112 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/requirements/style.txt create mode 100644 .github/workflows/requirements/unit-tests.txt create mode 100644 .github/workflows/style.yml create mode 100644 .github/workflows/unit-tests.yml create mode 100644 gantry/__init__.py create mode 100644 gantry/__main__.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3eda4da --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,50 @@ +name: ci +on: + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ci-${{github.ref}}-${{github.event.pull_request.number || github.run_number}} + cancel-in-progress: true + +jobs: + changes: + runs-on: ubuntu-latest + permissions: + pull-requests: read + outputs: + style: ${{ steps.filter.outputs.style }} + unit-tests: ${{ steps.filter.outputs.unit-tests }} + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # @v2 + if: ${{ github.event_name == 'push' }} + with: + fetch-depth: 0 + + # For pull requests it's not necessary to checkout the code + - uses: dorny/paths-filter@4512585405083f25c027a35db413c2b3b9006d50 + id: filter + with: + filters: | + style: + - '.github/**' + - 'gantry/**' + - 'pyproject.toml' + unit-tests: + - '.github/**' + - 'gantry/**' + - 'pyproject.toml' + + style: + if: ${{ needs.changes.outputs.style == 'true' }} + needs: changes + uses: ./.github/workflows/style.yml + + unit-tests: + if: ${{ needs.changes.outputs.unit-tests == 'true' }} + needs: [changes, style] + uses: ./.github/workflows/unit-tests.yml diff --git a/.github/workflows/requirements/style.txt b/.github/workflows/requirements/style.txt new file mode 100644 index 0000000..dd22bb4 --- /dev/null +++ b/.github/workflows/requirements/style.txt @@ -0,0 +1,2 @@ +black==23.12.0 +flake8==6.1.0 diff --git a/.github/workflows/requirements/unit-tests.txt b/.github/workflows/requirements/unit-tests.txt new file mode 100644 index 0000000..1393afd --- /dev/null +++ b/.github/workflows/requirements/unit-tests.txt @@ -0,0 +1,2 @@ +pytest==7.4.3 +pytest-asyncio==0.23.2 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..d5f7155 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,27 @@ +name: Linting & Style Checks +on: + # This Workflow can be triggered manually + workflow_dispatch: + workflow_call: + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 + + - name: Set up Python 3.11 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c + with: + python-version: '3.11' + cache: 'pip' + cache-dependency-path: '.github/workflows/requirements/style.txt' + + - name: Install Python dependencies + run: | + pip install -r .github/workflows/requirements/style.txt + + - name: Lint and Format Check with Flake8 and Black + run: | + black --diff --check . 
+ flake8 hubcast/ diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000..551403a --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,25 @@ +name: Unit Tests +on: + # This Workflow can be triggered manually + workflow_dispatch: + workflow_call: + +jobs: + ubuntu: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.11'] + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: '.github/workflows/requirements/unit-tests.txt' + - name: Install Python dependencies + run: | + pip install -r .github/workflows/requirements/unit-tests.txt + - name: Run Unit Tests with Pytest + run: | + pytest diff --git a/gantry/__init__.py b/gantry/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gantry/__main__.py b/gantry/__main__.py new file mode 100644 index 0000000..491f8ff --- /dev/null +++ b/gantry/__main__.py @@ -0,0 +1,6 @@ +def main(): + print("Hello World") + + +if __name__ == "__main__": + main() From 57ca71c4e74edf03302582aab7093f7ec12d4a90 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 25 Dec 2023 14:51:56 -0800 Subject: [PATCH 02/27] rough draft of collection functionality --- .envrc | 12 ++ .flake8 | 3 + .gitignore | 5 + db/schema.sql | 55 ++++++++ gantry/tests/test_utils.py | 20 +++ gantry/utils/__init__.py | 0 gantry/utils/collect.py | 281 +++++++++++++++++++++++++++++++++++++ gantry/utils/db.py | 40 ++++++ gantry/utils/gitlab.py | 25 ++++ gantry/utils/misc.py | 26 ++++ gantry/utils/prometheus.py | 75 ++++++++++ pyproject.toml | 4 + spack.yaml | 13 ++ 13 files changed, 559 insertions(+) create mode 100644 .envrc create mode 100644 .flake8 create mode 100644 .gitignore create mode 100644 db/schema.sql create mode 100644 gantry/tests/test_utils.py create mode 100644 gantry/utils/__init__.py create mode 100644 gantry/utils/collect.py create mode 100644 gantry/utils/db.py create mode 100644 gantry/utils/gitlab.py create mode 100644 gantry/utils/misc.py create mode 100644 gantry/utils/prometheus.py create mode 100644 pyproject.toml create mode 100644 spack.yaml diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..5283fcd --- /dev/null +++ b/.envrc @@ -0,0 +1,12 @@ +#------------------------------------------------------------------------ +# Load Development Spack Environment (If Spack is installed.) +# +# Run 'direnv allow' from within the cloned repository to automatically +# load the spack environment when you enter the directory. +#------------------------------------------------------------------------ +if type spack &>/dev/null; then + . $SPACK_ROOT/share/spack/setup-env.sh + spack env activate -d . 
+fi + +dotenv diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..f295e07 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203, E704 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..372e265 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +.env +spack.lock +.spack-env +db/*.db diff --git a/db/schema.sql b/db/schema.sql new file mode 100644 index 0000000..c6fcac0 --- /dev/null +++ b/db/schema.sql @@ -0,0 +1,55 @@ +CREATE TABLE vms ( + id INTEGER PRIMARY KEY, + start INTEGER NOT NULL, + -- VM end is the max of the build end times + hostname TEXT NOT NULL, + cores REAL NOT NULL, + mem REAL NOT NULL, + arch TEXT NOT NULL, + os TEXT NOT NULL, + instance_type TEXT NOT NULL +); + + +CREATE TABLE builds ( + -- TODO do we want an entry here for if the job has been retried? + id INTEGER PRIMARY KEY, + pod TEXT NOT NULL UNIQUE, + vm INTEGER NOT NULL, + start INTEGER NOT NULL, + end INTEGER NOT NULL, + job_id INTEGER NOT NULL, + job_status TEXT NOT NULL, + ref TEXT NOT NULL, + pkg_name TEXT NOT NULL, + pkg_version TEXT NOT NULL, + pkg_variants TEXT NOT NULL, -- can be stored as JSONB in the future? + compiler_name TEXT NOT NULL, + compiler_version TEXT NOT NULL, + arch TEXT NOT NULL, + stack TEXT NOT NULL, + build_jobs INTEGER NOT NULL, + cpu_request REAL NOT NULL, + cpu_limit REAL, -- this can be null + cpu_mean REAL NOT NULL, + cpu_median REAL NOT NULL, + cpu_max REAL NOT NULL, + cpu_min REAL NOT NULL, + cpu_stddev REAL NOT NULL, + mem_request REAL NOT NULL, + mem_limit REAL NOT NULL, + mem_mean REAL NOT NULL, + mem_median REAL NOT NULL, + mem_max REAL NOT NULL, + mem_min REAL NOT NULL, + mem_stddev REAL NOT NULL, + FOREIGN KEY (vm) + REFERENCES vms (id) + ON UPDATE CASCADE + ON DELETE CASCADE +); + +CREATE TABLE ghost_jobs ( + id INTEGER PRIMARY KEY, + job_id INTEGER NOT NULL +); diff --git a/gantry/tests/test_utils.py b/gantry/tests/test_utils.py new file mode 100644 index 0000000..3e83088 --- /dev/null +++ b/gantry/tests/test_utils.py @@ -0,0 +1,20 @@ +import pytest + +from gantry.utils.misc import spec_variants + +# write tests for spec_variants here +# +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on has to equal {} + + +@pytest.fixture +def variant_string(): + return "+adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on" + + +def test_spec_variants(variant_string): + assert spec_variants(variant_string) == { + "adios2": True, + "advanced_debug": False, + "patches": ["02253c7", "acb3805", "b724e6a"], + "use_vtkm": "on", + } diff --git a/gantry/utils/__init__.py b/gantry/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py new file mode 100644 index 0000000..b19dd4d --- /dev/null +++ b/gantry/utils/collect.py @@ -0,0 +1,281 @@ +import json +import logging +import math +import re +import statistics +import sys +from datetime import datetime + +from utils.db import SqliteClient +from utils.gitlab import GitlabClient +from utils.misc import spec_variants +from utils.prometheus import PrometheusClient + + +async def fetch_job(job: dict) -> dict: + # TODO match gitlab webhook payload and process datetimes? 
+ if job["build_status"] not in ("success", "failed"): + return + + if job["build_status"] == "failed": + # TODO implement retry mechanism + return + + job_name_pattern = re.compile(r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)") + job_name_match = job_name_pattern.match(job["build_name"]) + if not job_name_match: + # generate jobs, non build jobs, etc + return + + gitlab = GitlabClient() + prometheus = PrometheusClient() + db = SqliteClient() + + # check if job has already been inserted into the database + db.execute("select job_id from builds where job_id = ?", (job["build_id"],)) + if db.fetchone(): + logging.info(f"job {job['build_id']} already in database") + return + + job_log = await gitlab.job_log(job["build_id"]) + if is_ghost(job_log): + db.insert("ghost_jobs", (None, job["build_id"])) + return + + job["start"] = datetime.fromisoformat(job["build_started_at"]).timestamp() + job["end"] = datetime.fromisoformat(job["build_finished_at"]).timestamp() + + # prometheus is not guaranteed to have data at the exact start and end times + # instead of creating an arbitrary buffer, ask for data in the middle of the job + query_time = (job["end"] + job["start"]) / 2 + + pod_annotations_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": job["build_id"]}, + }, + time=query_time, + ) + + job.update( + { + "pod": pod_annotations_res[0]["labels"]["pod"], + # TODO int? is it guaranteed to be here? + "build_jobs": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_build_jobs" + ], + "arch": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_arch" + ], + "pkg_name": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_pkg_name" + ], + "pkg_version": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_pkg_version" + ], + "pkg_variants": spec_variants( + pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_variants" + ] + ), + "compiler_name": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_compiler_name" + ], + "compiler_version": pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_spec_compiler_version" + ], + "stack": job_name_match.group(6), + } + ) + + job_requests_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": job["pod"]}, + }, + time=query_time, + ) + + job_limits_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": job["pod"]}, + }, + time=query_time, + ) + + mem_usage = process_usage( + await prometheus.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": job["pod"]}, + }, + start=job["start"], + end=job["end"], + ), + job["build_id"], + ) + + cpu_usage = process_usage( + await prometheus.query( + type="range", + custom_query=f"rate(container_cpu_usage_seconds_total{{pod='{job['pod']}', container='build'}}[90s])", + start=job["start"], + end=job["end"], + ), + job["build_id"], + ) + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + vm = await fetch_vm(job_limits_res[0]["labels"]["node"], 
query_time) + requests = process_resources_res(job_requests_res) + limits = process_resources_res(job_limits_res) + + # TODO insert into db here + + return db.insert( + "builds", + ( + None, + job["pod"], + vm, + job["start"], + job["end"], + job["build_id"], + job["build_status"], + job["ref"], + job["pkg_name"], + job["pkg_version"], + # dict to string + json.dumps(job["pkg_variants"]), + job["compiler_name"], + job["compiler_version"], + job["arch"], + job["stack"], + job["build_jobs"], + requests["cpu"]["value"], + # currently not set as of 12-23 + limits.get("cpu", {}).get("value"), + cpu_usage["mean"], + cpu_usage["median"], + cpu_usage["max"], + cpu_usage["min"], + cpu_usage["stddev"], + requests["memory"]["value"], + limits["memory"]["value"], + mem_usage["mean"], + mem_usage["median"], + mem_usage["max"], + mem_usage["min"], + mem_usage["stddev"], + ), + ) + + +async def fetch_vm(hostname: str, query_time: float) -> dict: + prometheus = PrometheusClient() + db = SqliteClient() + vm_start_res = await prometheus.query( + type="single", + query={ + "metric": "kube_node_created", + "filters": {"node": hostname}, + }, + time=query_time, + ) + + vm_start = float(vm_start_res[0]["values"][1]) + + db.execute( + "select id from vms where hostname = ? and start = ?", (hostname, vm_start) + ) + vm_id = db.fetchone() + + if vm_id: + logging.info(f"vm {hostname} already in database with id {vm_id[0]}") + return vm_id[0] + + vm_capacity = process_resources_res( + await prometheus.query( + type="single", + query={ + "metric": "kube_node_status_capacity", + "filters": {"node": hostname}, + }, + time=query_time, + ) + ) + + vm_labels = await prometheus.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": hostname}, + }, + time=query_time, + ) + + return db.insert( + "vms", + ( + None, + vm_start, + hostname, + vm_capacity["cpu"]["value"], + vm_capacity["memory"]["value"], + vm_labels[0]["labels"]["label_kubernetes_io_arch"], + vm_labels[0]["labels"]["label_kubernetes_io_os"], + vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], + ), + ) + + +def is_ghost(log): + return "No need to rebuild" in log + + +def process_resources_res(res: dict) -> dict: + processed = {} + for item in res: + # duplicates are ignored by overwriting the previous entry + processed[item["labels"]["resource"]] = { + "unit": item["labels"]["unit"], + "value": float(item["values"][1]), + } + + return processed + + +def process_usage(res: dict, job_id: int) -> dict: + if not res: + # sometimes prometheus reports no data for a job if the time range is too small + logging.error(f"lack of usage data for job {job_id}") + sys.exit() + + usage = [float(value) for timestamp, value in res[0]["values"]] + + sum_stats = { + "mean": statistics.fmean(usage), + # use pstdev because we have the whole population + "stddev": statistics.pstdev(usage), + "max": max(usage), + "min": min(usage), + "median": statistics.median(usage), + } + + if ( + sum_stats["stddev"] == 0 + or sum_stats["mean"] == 0 + or math.isnan(sum_stats["stddev"]) + ): + logging.error(f"usage data is invalid for job {job_id}") + sys.exit() + + return sum_stats diff --git a/gantry/utils/db.py b/gantry/utils/db.py new file mode 100644 index 0000000..5ea9de5 --- /dev/null +++ b/gantry/utils/db.py @@ -0,0 +1,40 @@ +import os +import sqlite3 + + +class SqliteClient: + def __init__(self): + self.conn = sqlite3.connect(os.environ["DB_FILE"]) + self.cursor = self.conn.cursor() + self.execute("PRAGMA foreign_keys = ON;") + + def 
execute(self, query, params=None): + if params: + self.cursor.execute(query, params) + else: + self.cursor.execute(query) + + def insert(self, table, values): + self.execute( + f"insert into {table} values ({','.join(['?'] * len(values))})", values + ) + self.commit() + return self.cursor.lastrowid + + def fetchall(self): + return self.cursor.fetchall() + + def fetchone(self): + return self.cursor.fetchone() + + def commit(self): + self.conn.commit() + + def close(self): + self.conn.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() diff --git a/gantry/utils/gitlab.py b/gantry/utils/gitlab.py new file mode 100644 index 0000000..4ad652d --- /dev/null +++ b/gantry/utils/gitlab.py @@ -0,0 +1,25 @@ +import logging +import os + +import aiohttp + + +class GitlabClient: + def __init__(self): + self.base_url = os.environ["GITLAB_URL"] + self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]} + + async def request(self, url: str, response_type: str) -> dict: + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=self.headers) as resp: + if resp.status != 200: + logging.error(f"Gitlab query failed with status {resp.status}") + return {} + if response_type == "json": + return await resp.json() + if response_type == "text": + return await resp.text() + + async def job_log(self, id: int) -> str: + url = f"{self.base_url}/jobs/{id}/trace" + return await self.request(url, "text") diff --git a/gantry/utils/misc.py b/gantry/utils/misc.py new file mode 100644 index 0000000..31c67ff --- /dev/null +++ b/gantry/utils/misc.py @@ -0,0 +1,26 @@ +def spec_variants(spec: str) -> dict: + """Given a spec's concrete variants, return a dict of variant name: value.""" + # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on + + # TODO handle errors and invalid inputs + + variants = {} + spec = spec.replace("+", " +") + spec = spec.replace("~", " ~") + parts = spec.split(" ") + + for part in parts: + if "=" in part: + name, value = part.split("=") + # multiple values + if "," in value: + variants[name] = value.split(",") + else: + variants[name] = value + else: + if part.startswith("+"): + variants[part[1:]] = True + elif part.startswith("~"): + variants[part[1:]] = False + + return variants diff --git a/gantry/utils/prometheus.py b/gantry/utils/prometheus.py new file mode 100644 index 0000000..ca1eac1 --- /dev/null +++ b/gantry/utils/prometheus.py @@ -0,0 +1,75 @@ +import logging +import math +import os +import urllib.parse + +import aiohttp + + +class PrometheusClient: + # TODO error handling for unexpected data + # todo retry mechanism for failed requests? 
+ + def __init__(self): + self.base_url = os.environ["PROMETHEUS_URL"] + self.cookies = {"_oauth2_proxy": os.environ["PROMETHEUS_COOKIE"]} + + async def query(self, type: str, **kwargs) -> dict: + # TODO add validation for kwargs and comments + query_str = ( + kwargs["custom_query"] + if kwargs.get("custom_query") + else query_to_str(**kwargs["query"]) + ) + + if type == "range": + # prometheus will only return this many frames + max_resolution = 10_000 + # calculating the max step size to get the desired resolution + step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) + url = f"{self.base_url}/query_range?query={query_str}&start={kwargs['start']}&end={kwargs['end']}&step={step}s" + return await self._query(url) + elif type == "single": + url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" + return await self._query(url) + + async def _query(self, url: str) -> dict: + """Query Prometheus with a query string""" + async with aiohttp.ClientSession() as session: + # submit cookie with request + async with session.get(url, cookies=self.cookies) as resp: + if resp.status != 200: + logging.error(f"Prometheus query failed with status {resp.status}") + return {} + try: + return self.process_response(await resp.json()) + except aiohttp.ContentTypeError: + logging.error( + """Prometheus query failed with unexpected response. + The cookie may have expired.""" + ) + return {} + + def process_response(self, response: dict) -> dict: + """Process Prometheus response into a more usable format""" + result_type = response.get("data", {}).get("resultType") + values_dict = { + "matrix": "values", + "vector": "value", + } + + if result_type not in values_dict: + logging.error(f"Prometheus response type {result_type} not supported") + return {} + + return [ + {"labels": result["metric"], "values": result[values_dict[result_type]]} + for result in response["data"]["result"] + ] + + +def query_to_str(metric: str, filters: dict) -> str: + # TODO add a test for this + # expected output: metric{key1="val1", key2="val2"} + filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) + return urllib.parse.quote(f"{metric}{{{filters_str}}}") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4620ee1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.isort] +profile = "black" +skip_gitignore = true +color_output = true diff --git a/spack.yaml b/spack.yaml new file mode 100644 index 0000000..7249289 --- /dev/null +++ b/spack.yaml @@ -0,0 +1,13 @@ +spack: + specs: + - python + - py-aiohttp + - py-pytest + - py-pytest-asyncio + - py-flake8 + - py-black + - py-isort + - sqlite + view: true + concretizer: + unify: true From c8a39d8cb125ce41f50b09cb522b66c513a064ab Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 25 Dec 2023 15:05:52 -0800 Subject: [PATCH 03/27] Revert "Add basic GitHub Actions CI" This reverts commit 98586b33090cd6c6fc37ca94e6825bb41645bd7a. 
--- .github/workflows/ci.yml | 50 ------------------- .github/workflows/requirements/style.txt | 2 - .github/workflows/requirements/unit-tests.txt | 2 - .github/workflows/style.yml | 27 ---------- .github/workflows/unit-tests.yml | 25 ---------- gantry/__init__.py | 0 gantry/__main__.py | 6 --- 7 files changed, 112 deletions(-) delete mode 100644 .github/workflows/ci.yml delete mode 100644 .github/workflows/requirements/style.txt delete mode 100644 .github/workflows/requirements/unit-tests.txt delete mode 100644 .github/workflows/style.yml delete mode 100644 .github/workflows/unit-tests.yml delete mode 100644 gantry/__init__.py delete mode 100644 gantry/__main__.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 3eda4da..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: ci -on: - push: - branches: - - main - pull_request: - branches: - - main - -concurrency: - group: ci-${{github.ref}}-${{github.event.pull_request.number || github.run_number}} - cancel-in-progress: true - -jobs: - changes: - runs-on: ubuntu-latest - permissions: - pull-requests: read - outputs: - style: ${{ steps.filter.outputs.style }} - unit-tests: ${{ steps.filter.outputs.unit-tests }} - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # @v2 - if: ${{ github.event_name == 'push' }} - with: - fetch-depth: 0 - - # For pull requests it's not necessary to checkout the code - - uses: dorny/paths-filter@4512585405083f25c027a35db413c2b3b9006d50 - id: filter - with: - filters: | - style: - - '.github/**' - - 'gantry/**' - - 'pyproject.toml' - unit-tests: - - '.github/**' - - 'gantry/**' - - 'pyproject.toml' - - style: - if: ${{ needs.changes.outputs.style == 'true' }} - needs: changes - uses: ./.github/workflows/style.yml - - unit-tests: - if: ${{ needs.changes.outputs.unit-tests == 'true' }} - needs: [changes, style] - uses: ./.github/workflows/unit-tests.yml diff --git a/.github/workflows/requirements/style.txt b/.github/workflows/requirements/style.txt deleted file mode 100644 index dd22bb4..0000000 --- a/.github/workflows/requirements/style.txt +++ /dev/null @@ -1,2 +0,0 @@ -black==23.12.0 -flake8==6.1.0 diff --git a/.github/workflows/requirements/unit-tests.txt b/.github/workflows/requirements/unit-tests.txt deleted file mode 100644 index 1393afd..0000000 --- a/.github/workflows/requirements/unit-tests.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytest==7.4.3 -pytest-asyncio==0.23.2 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml deleted file mode 100644 index d5f7155..0000000 --- a/.github/workflows/style.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Linting & Style Checks -on: - # This Workflow can be triggered manually - workflow_dispatch: - workflow_call: - -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 - - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c - with: - python-version: '3.11' - cache: 'pip' - cache-dependency-path: '.github/workflows/requirements/style.txt' - - - name: Install Python dependencies - run: | - pip install -r .github/workflows/requirements/style.txt - - - name: Lint and Format Check with Flake8 and Black - run: | - black --diff --check . 
- flake8 hubcast/ diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml deleted file mode 100644 index 551403a..0000000 --- a/.github/workflows/unit-tests.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: Unit Tests -on: - # This Workflow can be triggered manually - workflow_dispatch: - workflow_call: - -jobs: - ubuntu: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.8', '3.11'] - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c - with: - python-version: ${{ matrix.python-version }} - cache: 'pip' - cache-dependency-path: '.github/workflows/requirements/unit-tests.txt' - - name: Install Python dependencies - run: | - pip install -r .github/workflows/requirements/unit-tests.txt - - name: Run Unit Tests with Pytest - run: | - pytest diff --git a/gantry/__init__.py b/gantry/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gantry/__main__.py b/gantry/__main__.py deleted file mode 100644 index 491f8ff..0000000 --- a/gantry/__main__.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello World") - - -if __name__ == "__main__": - main() From a8dec44c9b0f8d061f9a2cdd7a53ce393cc67afb Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 10 Jan 2024 23:11:56 -0800 Subject: [PATCH 04/27] line breaks --- gantry/utils/collect.py | 6 +++++- gantry/utils/prometheus.py | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py index b19dd4d..6704b01 100644 --- a/gantry/utils/collect.py +++ b/gantry/utils/collect.py @@ -123,7 +123,10 @@ async def fetch_job(job: dict) -> dict: cpu_usage = process_usage( await prometheus.query( type="range", - custom_query=f"rate(container_cpu_usage_seconds_total{{pod='{job['pod']}', container='build'}}[90s])", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{job['pod']}', container='build'}}[90s])" + ), start=job["start"], end=job["end"], ), @@ -257,6 +260,7 @@ def process_usage(res: dict, job_id: int) -> dict: if not res: # sometimes prometheus reports no data for a job if the time range is too small logging.error(f"lack of usage data for job {job_id}") + # TODO throw exception sys.exit() usage = [float(value) for timestamp, value in res[0]["values"]] diff --git a/gantry/utils/prometheus.py b/gantry/utils/prometheus.py index ca1eac1..94ecab8 100644 --- a/gantry/utils/prometheus.py +++ b/gantry/utils/prometheus.py @@ -27,7 +27,13 @@ async def query(self, type: str, **kwargs) -> dict: max_resolution = 10_000 # calculating the max step size to get the desired resolution step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) - url = f"{self.base_url}/query_range?query={query_str}&start={kwargs['start']}&end={kwargs['end']}&step={step}s" + url = ( + f"{self.base_url}/query_range?" 
+ f"query={query_str}&" + f"start={kwargs['start']}&" + f"end={kwargs['end']}&" + f"step={step}s" + ) return await self._query(url) elif type == "single": url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" From 1f3de5ccfb321fdb3b8553823d107d88de7bedaa Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 15 Jan 2024 22:49:15 -0800 Subject: [PATCH 05/27] improvements to collection --- db/schema.sql | 9 +- gantry/utils/collect.py | 219 ++++++++++++++++++++----------------- gantry/utils/db.py | 40 ------- gantry/utils/gitlab.py | 6 +- gantry/utils/misc.py | 17 ++- gantry/utils/prometheus.py | 47 +++++--- 6 files changed, 169 insertions(+), 169 deletions(-) delete mode 100644 gantry/utils/db.py diff --git a/db/schema.sql b/db/schema.sql index c6fcac0..316b132 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,7 +1,6 @@ CREATE TABLE vms ( id INTEGER PRIMARY KEY, - start INTEGER NOT NULL, - -- VM end is the max of the build end times + uuid TEXT NOT NULL, hostname TEXT NOT NULL, cores REAL NOT NULL, mem REAL NOT NULL, @@ -12,7 +11,6 @@ CREATE TABLE vms ( CREATE TABLE builds ( - -- TODO do we want an entry here for if the job has been retried? id INTEGER PRIMARY KEY, pod TEXT NOT NULL UNIQUE, vm INTEGER NOT NULL, @@ -20,17 +18,18 @@ CREATE TABLE builds ( end INTEGER NOT NULL, job_id INTEGER NOT NULL, job_status TEXT NOT NULL, + num_retries INTEGER NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, pkg_version TEXT NOT NULL, - pkg_variants TEXT NOT NULL, -- can be stored as JSONB in the future? + pkg_variants TEXT NOT NULL, compiler_name TEXT NOT NULL, compiler_version TEXT NOT NULL, arch TEXT NOT NULL, stack TEXT NOT NULL, build_jobs INTEGER NOT NULL, cpu_request REAL NOT NULL, - cpu_limit REAL, -- this can be null + cpu_limit REAL, -- this can be null becasue it's currently not set cpu_mean REAL NOT NULL, cpu_median REAL NOT NULL, cpu_max REAL NOT NULL, diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py index 6704b01..702ed34 100644 --- a/gantry/utils/collect.py +++ b/gantry/utils/collect.py @@ -3,22 +3,22 @@ import math import re import statistics -import sys from datetime import datetime -from utils.db import SqliteClient from utils.gitlab import GitlabClient -from utils.misc import spec_variants +from utils.misc import db_insert, spec_variants from utils.prometheus import PrometheusClient -async def fetch_job(job: dict) -> dict: - # TODO match gitlab webhook payload and process datetimes? 
- if job["build_status"] not in ("success", "failed"): - return +class InvalidDataError(Exception): + pass - if job["build_status"] == "failed": - # TODO implement retry mechanism + +async def fetch_job(job: dict, db) -> dict: + gitlab = GitlabClient() + prometheus = PrometheusClient() + + if job["build_status"] not in ("success", "failed"): return job_name_pattern = re.compile(r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)") @@ -27,19 +27,19 @@ async def fetch_job(job: dict) -> dict: # generate jobs, non build jobs, etc return - gitlab = GitlabClient() - prometheus = PrometheusClient() - db = SqliteClient() - # check if job has already been inserted into the database - db.execute("select job_id from builds where job_id = ?", (job["build_id"],)) - if db.fetchone(): - logging.info(f"job {job['build_id']} already in database") - return + async with db.execute( + "select job_id from builds where job_id = ?", (job["build_id"],) + ) as cursor: + if await cursor.fetchone(): + logging.info(f"job {job['build_id']} already in database") + return job_log = await gitlab.job_log(job["build_id"]) if is_ghost(job_log): - db.insert("ghost_jobs", (None, job["build_id"])) + await db.execute( + ("insert into ghost_jobs (name) values (?)"), (job["build_id"],) + ) return job["start"] = datetime.fromisoformat(job["build_started_at"]).timestamp() @@ -61,10 +61,11 @@ async def fetch_job(job: dict) -> dict: job.update( { "pod": pod_annotations_res[0]["labels"]["pod"], - # TODO int? is it guaranteed to be here? - "build_jobs": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_build_jobs" - ], + "build_jobs": int( + pod_annotations_res[0]["labels"][ + "annotation_metrics_spack_job_build_jobs" + ] + ), "arch": pod_annotations_res[0]["labels"][ "annotation_metrics_spack_job_spec_arch" ], @@ -133,88 +134,98 @@ async def fetch_job(job: dict) -> dict: job["build_id"], ) + if job["build_status"] == "failed": + oom_status = prometheus.query( + type="range", + query={ + "metric": "kube_pod_container_status_last_terminated_reason", + "filters": { + "container": "build", + "pod": job["pod"], + "reason": "OOMKilled", + }, + }, + start=job["start"], + end=job["end"] + 10 * 60, # give a 10 minute buffer + ) + # TODO retry the job if OOM, do not return as we still want to save the build + if not oom_status: + return + # instead of needing to fetch the node where the pod ran from kube_pod_info # we can grab it from kube_pod_container_resource_limits # weirdly, it's not available in kube_pod_labels or annotations # https://github.com/kubernetes/kube-state-metrics/issues/1148 - vm = await fetch_vm(job_limits_res[0]["labels"]["node"], query_time) + vm = await fetch_vm(job_limits_res[0]["labels"]["node"], query_time, db) requests = process_resources_res(job_requests_res) limits = process_resources_res(job_limits_res) - # TODO insert into db here - - return db.insert( - "builds", - ( - None, - job["pod"], - vm, - job["start"], - job["end"], - job["build_id"], - job["build_status"], - job["ref"], - job["pkg_name"], - job["pkg_version"], - # dict to string - json.dumps(job["pkg_variants"]), - job["compiler_name"], - job["compiler_version"], - job["arch"], - job["stack"], - job["build_jobs"], - requests["cpu"]["value"], - # currently not set as of 12-23 - limits.get("cpu", {}).get("value"), - cpu_usage["mean"], - cpu_usage["median"], - cpu_usage["max"], - cpu_usage["min"], - cpu_usage["stddev"], - requests["memory"]["value"], - limits["memory"]["value"], - mem_usage["mean"], - mem_usage["median"], - mem_usage["max"], 
- mem_usage["min"], - mem_usage["stddev"], - ), + await db.execute( + *db_insert( + "builds", + ( + None, + job["pod"], + vm, + job["start"], + job["end"], + job["build_id"], + job["build_status"], + job["retries_count"], + job["ref"], + job["pkg_name"], + job["pkg_version"], + json.dumps(job["pkg_variants"]), # dict to string + job["compiler_name"], + job["compiler_version"], + job["arch"], + job["stack"], + job["build_jobs"], + requests["cpu"]["value"], + # currently not set as of 12-23 + limits.get("cpu", {}).get("value"), + cpu_usage["mean"], + cpu_usage["median"], + cpu_usage["max"], + cpu_usage["min"], + cpu_usage["stddev"], + requests["memory"]["value"], + limits["memory"]["value"], + mem_usage["mean"], + mem_usage["median"], + mem_usage["max"], + mem_usage["min"], + mem_usage["stddev"], + ), + ) ) + # vm and build will get saved at the same time to make sure + # we don't accidentally commit a vm without a build + await db.commit() + + return + -async def fetch_vm(hostname: str, query_time: float) -> dict: +async def fetch_vm(hostname: str, query_time: float, db) -> dict: prometheus = PrometheusClient() - db = SqliteClient() - vm_start_res = await prometheus.query( + vm_info = await prometheus.query( type="single", query={ - "metric": "kube_node_created", + "metric": "kube_node_info", "filters": {"node": hostname}, }, time=query_time, ) - vm_start = float(vm_start_res[0]["values"][1]) - - db.execute( - "select id from vms where hostname = ? and start = ?", (hostname, vm_start) - ) - vm_id = db.fetchone() + vm_uuid = vm_info[0]["labels"]["system_uuid"] - if vm_id: - logging.info(f"vm {hostname} already in database with id {vm_id[0]}") - return vm_id[0] + async with db.execute("select id from vms where uuid = ?", (vm_uuid,)) as cursor: + old_vm = await cursor.fetchone() - vm_capacity = process_resources_res( - await prometheus.query( - type="single", - query={ - "metric": "kube_node_status_capacity", - "filters": {"node": hostname}, - }, - time=query_time, - ) - ) + if old_vm: + logging.info(f"vm {hostname} already in database with id {old_vm[0]}") + return old_vm[0] vm_labels = await prometheus.query( type="single", @@ -225,19 +236,26 @@ async def fetch_vm(hostname: str, query_time: float) -> dict: time=query_time, ) - return db.insert( - "vms", - ( - None, - vm_start, - hostname, - vm_capacity["cpu"]["value"], - vm_capacity["memory"]["value"], - vm_labels[0]["labels"]["label_kubernetes_io_arch"], - vm_labels[0]["labels"]["label_kubernetes_io_os"], - vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], - ), - ) + async with db.execute( + *db_insert( + "vms", + ( + None, + vm_uuid, + hostname, + float(vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_cpu"]), + float( + vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_memory"] + ), + vm_labels[0]["labels"]["label_kubernetes_io_arch"], + vm_labels[0]["labels"]["label_kubernetes_io_os"], + vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], + ), + ) + ) as cursor: + vm_id = cursor.lastrowid + + return vm_id def is_ghost(log): @@ -260,14 +278,13 @@ def process_usage(res: dict, job_id: int) -> dict: if not res: # sometimes prometheus reports no data for a job if the time range is too small logging.error(f"lack of usage data for job {job_id}") - # TODO throw exception - sys.exit() + raise InvalidDataError usage = [float(value) for timestamp, value in res[0]["values"]] sum_stats = { "mean": statistics.fmean(usage), - # use pstdev because we have the whole population + # pstdev because we have the 
whole population "stddev": statistics.pstdev(usage), "max": max(usage), "min": min(usage), @@ -280,6 +297,6 @@ def process_usage(res: dict, job_id: int) -> dict: or math.isnan(sum_stats["stddev"]) ): logging.error(f"usage data is invalid for job {job_id}") - sys.exit() + raise InvalidDataError return sum_stats diff --git a/gantry/utils/db.py b/gantry/utils/db.py deleted file mode 100644 index 5ea9de5..0000000 --- a/gantry/utils/db.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import sqlite3 - - -class SqliteClient: - def __init__(self): - self.conn = sqlite3.connect(os.environ["DB_FILE"]) - self.cursor = self.conn.cursor() - self.execute("PRAGMA foreign_keys = ON;") - - def execute(self, query, params=None): - if params: - self.cursor.execute(query, params) - else: - self.cursor.execute(query) - - def insert(self, table, values): - self.execute( - f"insert into {table} values ({','.join(['?'] * len(values))})", values - ) - self.commit() - return self.cursor.lastrowid - - def fetchall(self): - return self.cursor.fetchall() - - def fetchone(self): - return self.cursor.fetchone() - - def commit(self): - self.conn.commit() - - def close(self): - self.conn.close() - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() diff --git a/gantry/utils/gitlab.py b/gantry/utils/gitlab.py index 4ad652d..96dcf0d 100644 --- a/gantry/utils/gitlab.py +++ b/gantry/utils/gitlab.py @@ -1,4 +1,3 @@ -import logging import os import aiohttp @@ -10,11 +9,8 @@ def __init__(self): self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]} async def request(self, url: str, response_type: str) -> dict: - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(raise_for_status=True) as session: async with session.get(url, headers=self.headers) as resp: - if resp.status != 200: - logging.error(f"Gitlab query failed with status {resp.status}") - return {} if response_type == "json": return await resp.json() if response_type == "text": diff --git a/gantry/utils/misc.py b/gantry/utils/misc.py index 31c67ff..0376c67 100644 --- a/gantry/utils/misc.py +++ b/gantry/utils/misc.py @@ -1,15 +1,15 @@ def spec_variants(spec: str) -> dict: - """Given a spec's concrete variants, return a dict of variant name: value.""" + """Given a spec's concrete variants, return a dict of name: value.""" # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on - # TODO handle errors and invalid inputs - variants = {} spec = spec.replace("+", " +") spec = spec.replace("~", " ~") parts = spec.split(" ") for part in parts: + if len(part) < 2: + continue if "=" in part: name, value = part.split("=") # multiple values @@ -24,3 +24,14 @@ def spec_variants(spec: str) -> dict: variants[part[1:]] = False return variants + + +def db_insert(table, values): + """ + Returns an INSERT statement given a table name and tuple of values. + Must provide values for all columns in the table, including the primary key. + """ + return ( + f"insert into {table} values ({','.join(['?'] * (len(values)) )})", + values, + ) diff --git a/gantry/utils/prometheus.py b/gantry/utils/prometheus.py index 94ecab8..b1a6cb2 100644 --- a/gantry/utils/prometheus.py +++ b/gantry/utils/prometheus.py @@ -7,16 +7,34 @@ class PrometheusClient: - # TODO error handling for unexpected data - # todo retry mechanism for failed requests? 
- def __init__(self): self.base_url = os.environ["PROMETHEUS_URL"] self.cookies = {"_oauth2_proxy": os.environ["PROMETHEUS_COOKIE"]} async def query(self, type: str, **kwargs) -> dict: - # TODO add validation for kwargs and comments - query_str = ( + """ + type: "range" or "single" + + for range queries: set `start` and `end` (unix timestamps) + for single queries: set `time` (unix timestamp) + + for custom queries: set `custom_query` (string) + + for metric queries: set `query` (dict) + example: + "query": { + "metric": "metric_name", + "filters": {"filter1": "value1", "filter2": "value2"} + } + """ + + # validate that one of query or custom_query is set, but not both or neither + if not kwargs.get("query") and not kwargs.get("custom_query"): + raise ValueError("query or custom_query must be set") + if kwargs.get("query") and kwargs.get("custom_query"): + raise ValueError("query and custom_query cannot both be set") + + query_str = urllib.parse.quote( kwargs["custom_query"] if kwargs.get("custom_query") else query_to_str(**kwargs["query"]) @@ -41,14 +59,11 @@ async def query(self, type: str, **kwargs) -> dict: async def _query(self, url: str) -> dict: """Query Prometheus with a query string""" - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(raise_for_status=True) as session: # submit cookie with request async with session.get(url, cookies=self.cookies) as resp: - if resp.status != 200: - logging.error(f"Prometheus query failed with status {resp.status}") - return {} try: - return self.process_response(await resp.json()) + return self.prettify_res(await resp.json()) except aiohttp.ContentTypeError: logging.error( """Prometheus query failed with unexpected response. @@ -56,8 +71,8 @@ async def _query(self, url: str) -> dict: ) return {} - def process_response(self, response: dict) -> dict: - """Process Prometheus response into a more usable format""" + def prettify_res(self, response: dict) -> dict: + """Process Prometheus response into an arrray of dicts with {label: value}""" result_type = response.get("data", {}).get("resultType") values_dict = { "matrix": "values", @@ -75,7 +90,9 @@ def process_response(self, response: dict) -> dict: def query_to_str(metric: str, filters: dict) -> str: - # TODO add a test for this - # expected output: metric{key1="val1", key2="val2"} + """ + In: "metric", {key1: value1, key2: value2} + Out: "metric{key1="value1", key2="value2"}" + """ filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) - return urllib.parse.quote(f"{metric}{{{filters_str}}}") + return f"{metric}{{{filters_str}}}" From 130a1f103e8c9fe11a617f4abb5e9163a94519d4 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 15 Jan 2024 22:49:43 -0800 Subject: [PATCH 06/27] aiohttp server basics --- gantry/main.py | 19 +++++++++++++++++++ gantry/views.py | 16 ++++++++++++++++ spack.yaml | 1 + 3 files changed, 36 insertions(+) create mode 100644 gantry/main.py create mode 100644 gantry/views.py diff --git a/gantry/main.py b/gantry/main.py new file mode 100644 index 0000000..496140a --- /dev/null +++ b/gantry/main.py @@ -0,0 +1,19 @@ +import os + +import aiosqlite +from aiohttp import web +from views import routes + + +async def init_db(app: web.Application): + db = await aiosqlite.connect(os.environ["DB_FILE"]) + await db.execute("PRAGMA foreign_keys = ON;") + app["db"] = db + yield + await db.close() + + +app = web.Application() +app.add_routes(routes) +app.cleanup_ctx.append(init_db) +web.run_app(app) diff --git a/gantry/views.py 
b/gantry/views.py new file mode 100644 index 0000000..0820b23 --- /dev/null +++ b/gantry/views.py @@ -0,0 +1,16 @@ +from aiohttp import web +from utils.collect import fetch_job + +routes = web.RouteTableDef() + + +@routes.post("/collect") +async def collect_job(request: web.Request) -> web.Response: + payload = await request.json() + + # TODO validate gitlab token + if request.headers.get("X-Gitlab-Event") != "Job Hook": + return web.Response(status=400, text="invalid event type") + + await fetch_job(payload, request.app["db"]) + return web.Response(status=200) diff --git a/spack.yaml b/spack.yaml index 7249289..44863c0 100644 --- a/spack.yaml +++ b/spack.yaml @@ -7,6 +7,7 @@ spack: - py-flake8 - py-black - py-isort + - py-aiosqlite - sqlite view: true concretizer: From 45d8ef153837da9dfd34b156ba511e3e231326d2 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 18 Jan 2024 00:22:07 -0800 Subject: [PATCH 07/27] refactoring of collection --- db/schema.sql | 6 +- gantry/__main__.py | 21 +- gantry/collection.py | 91 ++++++++ gantry/main.py | 19 -- gantry/models/__init__.py | 3 + gantry/models/build.py | 255 ++++++++++++++++++++++ gantry/models/vm.py | 107 ++++++++++ gantry/tests/test_utils.py | 2 +- gantry/{utils => util}/__init__.py | 0 gantry/util/gitlab.py | 33 +++ gantry/util/misc.py | 59 ++++++ gantry/{utils => util}/prometheus.py | 68 ++++++ gantry/utils/collect.py | 302 --------------------------- gantry/utils/gitlab.py | 21 -- gantry/utils/misc.py | 37 ---- gantry/views.py | 17 +- 16 files changed, 653 insertions(+), 388 deletions(-) create mode 100644 gantry/collection.py delete mode 100644 gantry/main.py create mode 100644 gantry/models/__init__.py create mode 100644 gantry/models/build.py create mode 100644 gantry/models/vm.py rename gantry/{utils => util}/__init__.py (100%) create mode 100644 gantry/util/gitlab.py create mode 100644 gantry/util/misc.py rename gantry/{utils => util}/prometheus.py (66%) delete mode 100644 gantry/utils/collect.py delete mode 100644 gantry/utils/gitlab.py delete mode 100644 gantry/utils/misc.py diff --git a/db/schema.sql b/db/schema.sql index 316b132..6b1c24f 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,6 +1,6 @@ CREATE TABLE vms ( id INTEGER PRIMARY KEY, - uuid TEXT NOT NULL, + uuid TEXT NOT NULL UNIQUE, hostname TEXT NOT NULL, cores REAL NOT NULL, mem REAL NOT NULL, @@ -16,9 +16,9 @@ CREATE TABLE builds ( vm INTEGER NOT NULL, start INTEGER NOT NULL, end INTEGER NOT NULL, - job_id INTEGER NOT NULL, + job_id INTEGER NOT NULL UNIQUE, job_status TEXT NOT NULL, - num_retries INTEGER NOT NULL, + retries INTEGER NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, pkg_version TEXT NOT NULL, diff --git a/gantry/__main__.py b/gantry/__main__.py index 491f8ff..64f408e 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -1,5 +1,24 @@ +import os + +import aiosqlite +from aiohttp import web + +from gantry.views import routes + + +async def init_db(app: web.Application): + db = await aiosqlite.connect(os.environ["DB_FILE"]) + await db.execute("PRAGMA foreign_keys = ON;") + app["db"] = db + yield + await db.close() + + def main(): - print("Hello World") + app = web.Application() + app.add_routes(routes) + app.cleanup_ctx.append(init_db) + web.run_app(app) if __name__ == "__main__": diff --git a/gantry/collection.py b/gantry/collection.py new file mode 100644 index 0000000..9aa8121 --- /dev/null +++ b/gantry/collection.py @@ -0,0 +1,91 @@ +import logging + +import aiosqlite + +from gantry.models import VM, Build +from gantry.util.gitlab import 
GitlabClient +from gantry.util.prometheus import IncompleteData, PrometheusClient + + +async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None: + """ + Fetches a job's information from Prometheus and inserts it into the database. + If there is data missing at any point, the function will still return so the webhook + responds as expected. If an exception is thrown, that behavior was unanticipated by + this program and should be investigated. + + args: + payload: a dictionary containing the information from the Gitlab job hook + db: an active aiosqlite connection + + returns: None in order to accomodate a 200 response for the webhook. + """ + + gitlab = GitlabClient() + prometheus = PrometheusClient() + + build = Build( + status=payload["build_status"], + name=payload["build_name"], + id=payload["build_id"], + start=payload["build_started_at"], + end=payload["build_finished_at"], + retries=payload["retries_count"], + ref=payload["ref"], + ) + + # perform checks to see if we should collect data for this job + if ( + build.status not in ("success",) + or not build.valid_name # is not a build job + or await build.in_db(db) # job already in the database + or await build.is_ghost(db, gitlab) + ): + return + + try: + await build.get_annotations(prometheus) + await build.get_resources(prometheus) + await build.get_usage(prometheus) + vm_id = await fetch_vm(db, prometheus, build.node, build.midpoint) + except IncompleteData as e: + # missing data, skip this job + logging.error(e) + return + + await build.insert(db, vm_id) + # vm and build will get saved at the same time to make sure + # we don't accidentally commit a vm without a build + await db.commit() + + return + + +async def fetch_vm( + db: aiosqlite.Connection, + prometheus: PrometheusClient, + hostname: dict, + query_time: float, +) -> int: + """ + Finds an existing VM in the database or inserts a new one. 
+ + args: + db: an active aiosqlite connection + prometheus: + hostname: the hostname of the VM + query_time: any point during VM runtime, usually grabbed from build + + returns: id of the inserted or existing VM + """ + vm = VM( + hostname=hostname, + query_time=query_time, + ) + + # do not proceed if the VM exists + if existing_vm := await vm.db_id(db, prometheus): + return existing_vm + + await vm.get_labels(prometheus) + return await vm.insert(db) diff --git a/gantry/main.py b/gantry/main.py deleted file mode 100644 index 496140a..0000000 --- a/gantry/main.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -import aiosqlite -from aiohttp import web -from views import routes - - -async def init_db(app: web.Application): - db = await aiosqlite.connect(os.environ["DB_FILE"]) - await db.execute("PRAGMA foreign_keys = ON;") - app["db"] = db - yield - await db.close() - - -app = web.Application() -app.add_routes(routes) -app.cleanup_ctx.append(init_db) -web.run_app(app) diff --git a/gantry/models/__init__.py b/gantry/models/__init__.py new file mode 100644 index 0000000..57e9b66 --- /dev/null +++ b/gantry/models/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .build import Build +from .vm import VM diff --git a/gantry/models/build.py b/gantry/models/build.py new file mode 100644 index 0000000..4289bb8 --- /dev/null +++ b/gantry/models/build.py @@ -0,0 +1,255 @@ +import json +import logging +import re +from datetime import datetime + +import aiosqlite + +from gantry.util.gitlab import GitlabClient +from gantry.util.misc import insert_dict, setattrs, spec_variants +from gantry.util.prometheus import ( + IncompleteData, + PrometheusClient, + process_resources, + process_usage, +) + + +class Build: + def __init__( + self, + status: str, + name: str, + id: int, + start: str, + end: str, + retries: int, + ref: str, + ): + self.status = status + self.name = name + self.id = id + self.start = datetime.fromisoformat(start).timestamp() + self.end = datetime.fromisoformat(end).timestamp() + self.retries = retries + self.ref = ref + + @property + def valid_name(self) -> bool: + """Returns True if the job is a build job, False otherwise.""" + + # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 + # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 + job_name_pattern = re.compile( + r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" + ) + job_name_match = job_name_pattern.match(self.name) + # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack + return bool(job_name_match) + + @property + def midpoint(self) -> float: + """Returns the midpoint of the job in unix time.""" + # prometheus is not guaranteed to have data at the exact start and end times + # instead of creating an arbitrary buffer, ask for data in the middle of the job + return (self.start + self.end) / 2 + + async def is_ghost(self, db: aiosqlite.Connection, gl: GitlabClient) -> bool: + """Returns the job's ghost status.""" + + # prevent duplicate jobs from being inserted into the database + async with db.execute( + "select job_id from ghost_jobs where job_id = ?", (self.id,) + ) as cursor: + if await cursor.fetchone(): + # ghost job is already in the database + return True + + log = await gl.job_log(self.id) + ghost = "No need to rebuild" in log + + if ghost: + await db.execute(("insert into ghost_jobs (name) values (?)"), (self.id,)) + + return ghost + + async def in_db(self, db: aiosqlite.Connection) -> bool: + """Checks if the job is already in the db.""" + + async with db.execute( + "select job_id from builds where job_id = 
?", (self.id,) + ) as cursor: + found = bool(await cursor.fetchone()) + + if found: + logging.warning(f"job {self.id} already in database") + + return found + + async def get_annotations(self, prometheus: PrometheusClient): + """Fetches the annotations and assigns multiple attributes.""" + + annotations_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": self.id}, + }, + time=self.midpoint, + ) + + if not annotations_res: + raise IncompleteData("missing annotations") + + annotations = annotations_res[0]["labels"] + + setattrs( + self, + pod=annotations["pod"], + # if build jobs is not set, defaults to 16 due to spack settings + build_jobs=annotations.get("annotation_metrics_spack_job_build_jobs", 16), + arch=annotations["annotation_metrics_spack_job_spec_arch"], + pkg_name=annotations["annotation_metrics_spack_job_spec_pkg_name"], + pkg_version=annotations["annotation_metrics_spack_job_spec_pkg_version"], + pkg_variants=spec_variants( + annotations["annotation_metrics_spack_job_spec_variants"] + ), + compiler_name=annotations[ + "annotation_metrics_spack_job_spec_compiler_name" + ], + compiler_version=annotations[ + "annotation_metrics_spack_job_spec_compiler_version" + ], + stack="testing" + # stack=job_name_dict["stack"], + ) + + async def get_resources(self, prometheus: PrometheusClient): + """fetches pod requests and limits, and also sets the node hostname""" + requests = process_resources( + await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": self.pod}, + }, + time=self.midpoint, + ), + self.id, + ) + + limits_res = await prometheus.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": self.pod}, + }, + time=self.midpoint, + ) + + if not limits_res: + raise IncompleteData(f"missing limits for job {self.id}") + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + + self.node = limits_res[0]["labels"]["node"] + limits = process_resources(limits_res, self.id) + + setattrs( + self, + cpu_request=requests["cpu"]["value"], + mem_request=requests["memory"]["value"], + cpu_limit=limits.get("cpu", {}).get("value"), + mem_limit=limits["memory"]["value"], + ) + + async def get_usage(self, prometheus: PrometheusClient): + """Sets resource usage attributes.""" + + mem_usage = process_usage( + await prometheus.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": self.pod}, + }, + start=self.start, + end=self.end, + ), + self.id, + ) + + cpu_usage = process_usage( + await prometheus.query( + type="range", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{self.pod}', container='build'}}[90s])" + ), + start=self.start, + end=self.end, + ), + self.id, + ) + + setattrs( + self, + cpu_mean=cpu_usage["mean"], + cpu_median=cpu_usage["median"], + cpu_max=cpu_usage["max"], + cpu_min=cpu_usage["min"], + cpu_stddev=cpu_usage["stddev"], + mem_mean=mem_usage["mean"], + mem_median=mem_usage["median"], + mem_max=mem_usage["max"], + mem_min=mem_usage["min"], + mem_stddev=mem_usage["stddev"], + ) + + async def insert(self, db: aiosqlite.Connection, vm_id: int) 
-> int: + """Inserts the build into the database and returns its id.""" + + async with db.execute( + *insert_dict( + "builds", + { + "pod": self.pod, + "vm": vm_id, + "start": self.start, + "end": self.end, + "job_id": self.id, + "job_status": self.status, + "retries": self.retries, + "ref": self.ref, + "pkg_name": self.pkg_name, + "pkg_version": self.pkg_version, + "pkg_variants": json.dumps(self.pkg_variants), # dict to string + "compiler_name": self.compiler_name, + "compiler_version": self.compiler_version, + "arch": self.arch, + "stack": self.stack, + "build_jobs": self.build_jobs, + "cpu_request": self.cpu_request, + "cpu_limit": self.cpu_limit, + "cpu_mean": self.cpu_mean, + "cpu_median": self.cpu_median, + "cpu_max": self.cpu_max, + "cpu_min": self.cpu_min, + "cpu_stddev": self.cpu_stddev, + "mem_request": self.mem_request, + "mem_limit": self.mem_limit, + "mem_mean": self.mem_mean, + "mem_median": self.mem_median, + "mem_max": self.mem_max, + "mem_min": self.mem_min, + "mem_stddev": self.mem_stddev, + }, + # if the job somehow gets added into the db (pod+id being unique) + # then ignore the insert + ignore=True, + ) + ) as cursor: + return cursor.lastrowid diff --git a/gantry/models/vm.py b/gantry/models/vm.py new file mode 100644 index 0000000..b763a91 --- /dev/null +++ b/gantry/models/vm.py @@ -0,0 +1,107 @@ +import aiosqlite + +from gantry.util.misc import insert_dict, setattrs +from gantry.util.prometheus import IncompleteData, PrometheusClient + +MB_IN_BYTES = 1_000_000 + + +class VM: + def __init__(self, hostname: str, query_time: float): + """ + args: + hostname: the hostname of the VM + query_time: any point during VM runtime, usually grabbed from build + """ + self.hostname = hostname + self.query_time = query_time + + async def db_id( + self, db: aiosqlite.Connection, prometheus: PrometheusClient + ) -> int | None: + """ + Returns the id of the vm if it exists in the database, otherwise returns None. + Also sets the uuid of the vm. 
+ """ + vm_info = await prometheus.query( + type="single", + query={ + "metric": "kube_node_info", + "filters": {"node": self.hostname}, + }, + time=self.query_time, + ) + + if not vm_info: + raise IncompleteData(f"missing vm info for {self.hostname}") + + self.uuid = vm_info[0]["labels"]["system_uuid"] + + # look for the vm in the database + async with db.execute( + "select id from vms where uuid = ?", (self.uuid,) + ) as cursor: + old_vm = await cursor.fetchone() + + if old_vm: + return old_vm[0] + + return None + + async def get_labels(self, prometheus: PrometheusClient): + """Sets multiple attributes of the VM based on its labels.""" + + vm_labels_res = await prometheus.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": self.hostname}, + }, + time=self.query_time, + ) + + if not vm_labels_res: + raise IncompleteData(f"missing vm labels for {self.hostname}") + + labels = vm_labels_res[0]["labels"] + + setattrs( + self, + cores=float(labels["label_karpenter_k8s_aws_instance_cpu"]), + mem=float(labels["label_karpenter_k8s_aws_instance_memory"]), + arch=labels["label_kubernetes_io_arch"], + os=labels["label_kubernetes_io_os"], + instance_type=labels["label_node_kubernetes_io_instance_type"], + ) + + async def insert(self, db: aiosqlite.Connection) -> int: + """Inserts the VM into the database and returns its id.""" + async with db.execute( + *insert_dict( + "vms", + { + "uuid": self.uuid, + "hostname": self.hostname, + "cores": self.cores, + # convert to bytes to be consistent with other resource metrics + "mem": self.mem * MB_IN_BYTES, + "arch": self.arch, + "os": self.os, + "instance_type": self.instance_type, + }, + # deal with races + ignore=True, + ) + ) as cursor: + pk = cursor.lastrowid + + if pk == 0: + # the ignore part of the query was triggered, some other call + # must have inserted the vm before this one + async with db.execute( + "select id from vms where uuid = ?", (self.uuid,) + ) as cursor: + pk_res = await cursor.fetchone() + pk = pk_res[0] + + return pk diff --git a/gantry/tests/test_utils.py b/gantry/tests/test_utils.py index 3e83088..010e08b 100644 --- a/gantry/tests/test_utils.py +++ b/gantry/tests/test_utils.py @@ -1,6 +1,6 @@ import pytest -from gantry.utils.misc import spec_variants +from gantry.util.misc import spec_variants # write tests for spec_variants here # +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on has to equal {} diff --git a/gantry/utils/__init__.py b/gantry/util/__init__.py similarity index 100% rename from gantry/utils/__init__.py rename to gantry/util/__init__.py diff --git a/gantry/util/gitlab.py b/gantry/util/gitlab.py new file mode 100644 index 0000000..6658377 --- /dev/null +++ b/gantry/util/gitlab.py @@ -0,0 +1,33 @@ +import os + +import aiohttp + + +class GitlabClient: + def __init__(self): + self.base_url = os.environ["GITLAB_URL"] + self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_API_TOKEN"]} + + async def _request(self, url: str, response_type: str) -> dict | str: + """ + Helper for requests to the Gitlab API. 
+ + args: + url: the url to request + response_type: the type of response to expect (json or text) + + returns: the response from Gitlab in the specified format + """ + + async with aiohttp.ClientSession(raise_for_status=True) as session: + async with session.get(url, headers=self.headers) as resp: + if response_type == "json": + return await resp.json() + if response_type == "text": + return await resp.text() + + async def job_log(self, job_id: int) -> str: + """Given a job id, returns the log from that job""" + + url = f"{self.base_url}/jobs/{job_id}/trace" + return await self._request(url, "text") diff --git a/gantry/util/misc.py b/gantry/util/misc.py new file mode 100644 index 0000000..2c6a69c --- /dev/null +++ b/gantry/util/misc.py @@ -0,0 +1,59 @@ +def spec_variants(spec: str) -> dict: + """Given a spec's concrete variants, return a dict in name: value format.""" + # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on + + variants = {} + # give some padding to + and ~ so we can split on them + spec = spec.replace("+", " +") + spec = spec.replace("~", " ~") + parts = spec.split(" ") + + for part in parts: + if len(part) < 2: + continue + if "=" in part: + name, value = part.split("=") + if "," in value: + # array of the multiple values + variants[name] = value.split(",") + else: + # string of the single value + variants[name] = value + else: + # anything after the first character is the value + if part.startswith("+"): + variants[part[1:]] = True + elif part.startswith("~"): + variants[part[1:]] = False + + return variants + + +def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: + """ + Crafts an SQLite INSERT statement from a dictionary. + + args: + table: name of the table to insert into + input: dictionary of values to insert + ignore: whether to ignore duplicate entries + + returns: tuple of (query, values) + """ + + columns = ", ".join(input.keys()) + values = ", ".join(["?" for _ in range(len(input))]) + query = f"INSERT INTO {table} ({columns}) VALUES ({values})" + + if ignore: + query = query.replace("INSERT", "INSERT OR IGNORE") + + # using a tuple of values from the dictionary + values_tuple = tuple(input.values()) + return query, values_tuple + + +def setattrs(_self, **kwargs): + """Sets multiple attributes of an object from a dictionary.""" + for k, v in kwargs.items(): + setattr(_self, k, v) diff --git a/gantry/utils/prometheus.py b/gantry/util/prometheus.py similarity index 66% rename from gantry/utils/prometheus.py rename to gantry/util/prometheus.py index b1a6cb2..921069a 100644 --- a/gantry/utils/prometheus.py +++ b/gantry/util/prometheus.py @@ -1,11 +1,16 @@ import logging import math import os +import statistics import urllib.parse import aiohttp +class IncompleteData(Exception): + pass + + class PrometheusClient: def __init__(self): self.base_url = os.environ["PROMETHEUS_URL"] @@ -96,3 +101,66 @@ def query_to_str(metric: str, filters: dict) -> str: """ filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) return f"{metric}{{{filters_str}}}" + + +def process_resources(res: dict, job_id: int) -> dict: + """ + Processes the resource limits and requests from a Prometheus response into + readable format. 
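The two pure helpers added in gantry/util/misc.py above are easy to sanity-check in isolation. A small sketch, with the spec string borrowed from the test comment and the table and column values invented for illustration:

from gantry.util.misc import insert_dict, spec_variants

variants = spec_variants("+adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on")
assert variants == {
    "adios2": True,
    "advanced_debug": False,
    "patches": ["02253c7", "acb3805", "b724e6a"],
    "use_vtkm": "on",
}

# insert_dict builds a parameterized statement straight from a column: value mapping
query, values = insert_dict("vms", {"uuid": "abc-123", "hostname": "node-1"}, ignore=True)
assert query == "INSERT OR IGNORE INTO vms (uuid, hostname) VALUES (?, ?)"
assert values == ("abc-123", "node-1")

Keeping these as plain functions rather than methods is what lets the test suite exercise them without a database or Prometheus connection.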
+ + args: + res: Prometheus response + job_id: job id for error logging + + returns: dict with {resource: {unit: value}} format + """ + + if not res: + raise IncompleteData(f"resource data is missing for job {job_id}") + + processed = {} + for item in res: + # duplicates are ignored by overwriting the previous entry + processed[item["labels"]["resource"]] = { + "unit": item["labels"]["unit"], + "value": float(item["values"][1]), + } + + return processed + + +def process_usage(res: dict, job_id: int) -> dict: + """ + Processes the usage data from a Prometheus response into readable format. + This could either be CPU usage or memory usage. + + args: + res: Prometheus response + job_id: job id for error logging + + returns: dict with {statistic: value} format + """ + + if not res: + # sometimes prometheus reports no data for a job if the time range is too small + raise IncompleteData(f"usage data is missing for job {job_id}") + + usage = [float(value) for timestamp, value in res[0]["values"]] + + sum_stats = { + "mean": statistics.fmean(usage), + # pstdev because we have the whole population + "stddev": statistics.pstdev(usage), + "max": max(usage), + "min": min(usage), + "median": statistics.median(usage), + } + + if ( + sum_stats["stddev"] == 0 + or sum_stats["mean"] == 0 + or math.isnan(sum_stats["stddev"]) + ): + raise IncompleteData(f"usage data is invalid for job {job_id}") + + return sum_stats diff --git a/gantry/utils/collect.py b/gantry/utils/collect.py deleted file mode 100644 index 702ed34..0000000 --- a/gantry/utils/collect.py +++ /dev/null @@ -1,302 +0,0 @@ -import json -import logging -import math -import re -import statistics -from datetime import datetime - -from utils.gitlab import GitlabClient -from utils.misc import db_insert, spec_variants -from utils.prometheus import PrometheusClient - - -class InvalidDataError(Exception): - pass - - -async def fetch_job(job: dict, db) -> dict: - gitlab = GitlabClient() - prometheus = PrometheusClient() - - if job["build_status"] not in ("success", "failed"): - return - - job_name_pattern = re.compile(r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)") - job_name_match = job_name_pattern.match(job["build_name"]) - if not job_name_match: - # generate jobs, non build jobs, etc - return - - # check if job has already been inserted into the database - async with db.execute( - "select job_id from builds where job_id = ?", (job["build_id"],) - ) as cursor: - if await cursor.fetchone(): - logging.info(f"job {job['build_id']} already in database") - return - - job_log = await gitlab.job_log(job["build_id"]) - if is_ghost(job_log): - await db.execute( - ("insert into ghost_jobs (name) values (?)"), (job["build_id"],) - ) - return - - job["start"] = datetime.fromisoformat(job["build_started_at"]).timestamp() - job["end"] = datetime.fromisoformat(job["build_finished_at"]).timestamp() - - # prometheus is not guaranteed to have data at the exact start and end times - # instead of creating an arbitrary buffer, ask for data in the middle of the job - query_time = (job["end"] + job["start"]) / 2 - - pod_annotations_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": job["build_id"]}, - }, - time=query_time, - ) - - job.update( - { - "pod": pod_annotations_res[0]["labels"]["pod"], - "build_jobs": int( - pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_build_jobs" - ] - ), - "arch": pod_annotations_res[0]["labels"][ - 
"annotation_metrics_spack_job_spec_arch" - ], - "pkg_name": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_pkg_name" - ], - "pkg_version": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_pkg_version" - ], - "pkg_variants": spec_variants( - pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_variants" - ] - ), - "compiler_name": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_compiler_name" - ], - "compiler_version": pod_annotations_res[0]["labels"][ - "annotation_metrics_spack_job_spec_compiler_version" - ], - "stack": job_name_match.group(6), - } - ) - - job_requests_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_requests", - "filters": {"container": "build", "pod": job["pod"]}, - }, - time=query_time, - ) - - job_limits_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_limits", - "filters": {"container": "build", "pod": job["pod"]}, - }, - time=query_time, - ) - - mem_usage = process_usage( - await prometheus.query( - type="range", - query={ - "metric": "container_memory_working_set_bytes", - "filters": {"container": "build", "pod": job["pod"]}, - }, - start=job["start"], - end=job["end"], - ), - job["build_id"], - ) - - cpu_usage = process_usage( - await prometheus.query( - type="range", - custom_query=( - f"rate(container_cpu_usage_seconds_total{{" - f"pod='{job['pod']}', container='build'}}[90s])" - ), - start=job["start"], - end=job["end"], - ), - job["build_id"], - ) - - if job["build_status"] == "failed": - oom_status = prometheus.query( - type="range", - query={ - "metric": "kube_pod_container_status_last_terminated_reason", - "filters": { - "container": "build", - "pod": job["pod"], - "reason": "OOMKilled", - }, - }, - start=job["start"], - end=job["end"] + 10 * 60, # give a 10 minute buffer - ) - # TODO retry the job if OOM, do not return as we still want to save the build - if not oom_status: - return - - # instead of needing to fetch the node where the pod ran from kube_pod_info - # we can grab it from kube_pod_container_resource_limits - # weirdly, it's not available in kube_pod_labels or annotations - # https://github.com/kubernetes/kube-state-metrics/issues/1148 - vm = await fetch_vm(job_limits_res[0]["labels"]["node"], query_time, db) - requests = process_resources_res(job_requests_res) - limits = process_resources_res(job_limits_res) - - await db.execute( - *db_insert( - "builds", - ( - None, - job["pod"], - vm, - job["start"], - job["end"], - job["build_id"], - job["build_status"], - job["retries_count"], - job["ref"], - job["pkg_name"], - job["pkg_version"], - json.dumps(job["pkg_variants"]), # dict to string - job["compiler_name"], - job["compiler_version"], - job["arch"], - job["stack"], - job["build_jobs"], - requests["cpu"]["value"], - # currently not set as of 12-23 - limits.get("cpu", {}).get("value"), - cpu_usage["mean"], - cpu_usage["median"], - cpu_usage["max"], - cpu_usage["min"], - cpu_usage["stddev"], - requests["memory"]["value"], - limits["memory"]["value"], - mem_usage["mean"], - mem_usage["median"], - mem_usage["max"], - mem_usage["min"], - mem_usage["stddev"], - ), - ) - ) - - # vm and build will get saved at the same time to make sure - # we don't accidentally commit a vm without a build - await db.commit() - - return - - -async def fetch_vm(hostname: str, query_time: float, db) -> dict: - prometheus = PrometheusClient() - vm_info = await prometheus.query( - 
type="single", - query={ - "metric": "kube_node_info", - "filters": {"node": hostname}, - }, - time=query_time, - ) - - vm_uuid = vm_info[0]["labels"]["system_uuid"] - - async with db.execute("select id from vms where uuid = ?", (vm_uuid,)) as cursor: - old_vm = await cursor.fetchone() - - if old_vm: - logging.info(f"vm {hostname} already in database with id {old_vm[0]}") - return old_vm[0] - - vm_labels = await prometheus.query( - type="single", - query={ - "metric": "kube_node_labels", - "filters": {"node": hostname}, - }, - time=query_time, - ) - - async with db.execute( - *db_insert( - "vms", - ( - None, - vm_uuid, - hostname, - float(vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_cpu"]), - float( - vm_labels[0]["labels"]["label_karpenter_k8s_aws_instance_memory"] - ), - vm_labels[0]["labels"]["label_kubernetes_io_arch"], - vm_labels[0]["labels"]["label_kubernetes_io_os"], - vm_labels[0]["labels"]["label_node_kubernetes_io_instance_type"], - ), - ) - ) as cursor: - vm_id = cursor.lastrowid - - return vm_id - - -def is_ghost(log): - return "No need to rebuild" in log - - -def process_resources_res(res: dict) -> dict: - processed = {} - for item in res: - # duplicates are ignored by overwriting the previous entry - processed[item["labels"]["resource"]] = { - "unit": item["labels"]["unit"], - "value": float(item["values"][1]), - } - - return processed - - -def process_usage(res: dict, job_id: int) -> dict: - if not res: - # sometimes prometheus reports no data for a job if the time range is too small - logging.error(f"lack of usage data for job {job_id}") - raise InvalidDataError - - usage = [float(value) for timestamp, value in res[0]["values"]] - - sum_stats = { - "mean": statistics.fmean(usage), - # pstdev because we have the whole population - "stddev": statistics.pstdev(usage), - "max": max(usage), - "min": min(usage), - "median": statistics.median(usage), - } - - if ( - sum_stats["stddev"] == 0 - or sum_stats["mean"] == 0 - or math.isnan(sum_stats["stddev"]) - ): - logging.error(f"usage data is invalid for job {job_id}") - raise InvalidDataError - - return sum_stats diff --git a/gantry/utils/gitlab.py b/gantry/utils/gitlab.py deleted file mode 100644 index 96dcf0d..0000000 --- a/gantry/utils/gitlab.py +++ /dev/null @@ -1,21 +0,0 @@ -import os - -import aiohttp - - -class GitlabClient: - def __init__(self): - self.base_url = os.environ["GITLAB_URL"] - self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]} - - async def request(self, url: str, response_type: str) -> dict: - async with aiohttp.ClientSession(raise_for_status=True) as session: - async with session.get(url, headers=self.headers) as resp: - if response_type == "json": - return await resp.json() - if response_type == "text": - return await resp.text() - - async def job_log(self, id: int) -> str: - url = f"{self.base_url}/jobs/{id}/trace" - return await self.request(url, "text") diff --git a/gantry/utils/misc.py b/gantry/utils/misc.py deleted file mode 100644 index 0376c67..0000000 --- a/gantry/utils/misc.py +++ /dev/null @@ -1,37 +0,0 @@ -def spec_variants(spec: str) -> dict: - """Given a spec's concrete variants, return a dict of name: value.""" - # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on - - variants = {} - spec = spec.replace("+", " +") - spec = spec.replace("~", " ~") - parts = spec.split(" ") - - for part in parts: - if len(part) < 2: - continue - if "=" in part: - name, value = part.split("=") - # multiple values - if "," in value: - variants[name] = value.split(",") 
- else: - variants[name] = value - else: - if part.startswith("+"): - variants[part[1:]] = True - elif part.startswith("~"): - variants[part[1:]] = False - - return variants - - -def db_insert(table, values): - """ - Returns an INSERT statement given a table name and tuple of values. - Must provide values for all columns in the table, including the primary key. - """ - return ( - f"insert into {table} values ({','.join(['?'] * (len(values)) )})", - values, - ) diff --git a/gantry/views.py b/gantry/views.py index 0820b23..6b11b80 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,16 +1,25 @@ +import os +import json + from aiohttp import web -from utils.collect import fetch_job + +from gantry.collection import fetch_build routes = web.RouteTableDef() @routes.post("/collect") async def collect_job(request: web.Request) -> web.Response: - payload = await request.json() + try: + payload = await request.json() + except json.decoder.JSONDecodeError: + return web.Response(status=400, text="invalid json") + + if request.headers.get("X-Gitlab-Token") != os.environ["GITLAB_WEBHOOK_TOKEN"]: + return web.Response(status=401, text="invalid token") - # TODO validate gitlab token if request.headers.get("X-Gitlab-Event") != "Job Hook": return web.Response(status=400, text="invalid event type") - await fetch_job(payload, request.app["db"]) + await fetch_build(payload, request.app["db"]) return web.Response(status=200) From a1a864b4644f045181fe727b65371615b1b8c6ea Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 22 Jan 2024 17:33:38 -0800 Subject: [PATCH 08/27] isort --- gantry/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index 6b11b80..180fc9e 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,5 +1,5 @@ -import os import json +import os from aiohttp import web From 63165508ac84976c96b359b2646079e0d61ce799 Mon Sep 17 00:00:00 2001 From: Caetano Melone Date: Wed, 24 Jan 2024 00:02:01 -0800 Subject: [PATCH 09/27] don't depend on dotenv for .env sourcing Co-authored-by: Alec Scott --- .envrc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.envrc b/.envrc index 5283fcd..dcea6a5 100644 --- a/.envrc +++ b/.envrc @@ -9,4 +9,9 @@ if type spack &>/dev/null; then spack env activate -d . 
fi -dotenv +#------------------------------------------------------------------------ +# Load Environment Variables from .env (if files exists) +#------------------------------------------------------------------------ +if [ -e .env ]; then + source .env +fi From 0f89fba3fb5b01a7d8d7226370254f1f6245d1f6 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 12:53:49 -0800 Subject: [PATCH 10/27] add stack --- gantry/models/build.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gantry/models/build.py b/gantry/models/build.py index 4289bb8..c34e433 100644 --- a/gantry/models/build.py +++ b/gantry/models/build.py @@ -99,7 +99,7 @@ async def get_annotations(self, prometheus: PrometheusClient): ) if not annotations_res: - raise IncompleteData("missing annotations") + raise IncompleteData(f"missing annotations for job {self.id}") annotations = annotations_res[0]["labels"] @@ -120,8 +120,7 @@ async def get_annotations(self, prometheus: PrometheusClient): compiler_version=annotations[ "annotation_metrics_spack_job_spec_compiler_version" ], - stack="testing" - # stack=job_name_dict["stack"], + stack=annotations["annotation_metrics_spack_ci_stack_name"], ) async def get_resources(self, prometheus: PrometheusClient): From 4e5324d756d1b316bdb96827be6da3e9447d2fb9 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 12:54:05 -0800 Subject: [PATCH 11/27] restructure how clients are initialized --- gantry/__main__.py | 12 ++++++++++++ gantry/collection.py | 10 ++++++---- gantry/util/gitlab.py | 8 +++----- gantry/util/prometheus.py | 12 ++++++++---- gantry/views.py | 4 +++- 5 files changed, 32 insertions(+), 14 deletions(-) diff --git a/gantry/__main__.py b/gantry/__main__.py index 64f408e..c19dae0 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -3,6 +3,8 @@ import aiosqlite from aiohttp import web +from gantry.util.gitlab import GitlabClient +from gantry.util.prometheus import PrometheusClient from gantry.views import routes @@ -14,10 +16,20 @@ async def init_db(app: web.Application): await db.close() +async def init_clients(app: web.Application): + app["gitlab"] = GitlabClient( + os.environ["GITLAB_URL"], os.environ["GITLAB_API_TOKEN"] + ) + app["prometheus"] = PrometheusClient( + os.environ["PROMETHEUS_URL"], os.environ.get("PROMETHEUS_COOKIE", "") + ) + + def main(): app = web.Application() app.add_routes(routes) app.cleanup_ctx.append(init_db) + app.on_startup.append(init_clients) web.run_app(app) diff --git a/gantry/collection.py b/gantry/collection.py index 9aa8121..c651dea 100644 --- a/gantry/collection.py +++ b/gantry/collection.py @@ -7,7 +7,12 @@ from gantry.util.prometheus import IncompleteData, PrometheusClient -async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None: +async def fetch_build( + payload: dict, + db: aiosqlite.Connection, + gitlab: GitlabClient, + prometheus: PrometheusClient, +) -> None: """ Fetches a job's information from Prometheus and inserts it into the database. If there is data missing at any point, the function will still return so the webhook @@ -21,9 +26,6 @@ async def fetch_build(payload: dict, db: aiosqlite.Connection) -> None: returns: None in order to accomodate a 200 response for the webhook. 
""" - gitlab = GitlabClient() - prometheus = PrometheusClient() - build = Build( status=payload["build_status"], name=payload["build_name"], diff --git a/gantry/util/gitlab.py b/gantry/util/gitlab.py index 6658377..7ab672e 100644 --- a/gantry/util/gitlab.py +++ b/gantry/util/gitlab.py @@ -1,12 +1,10 @@ -import os - import aiohttp class GitlabClient: - def __init__(self): - self.base_url = os.environ["GITLAB_URL"] - self.headers = {"PRIVATE-TOKEN": os.environ["GITLAB_API_TOKEN"]} + def __init__(self, base_url: str, api_token: str): + self.base_url = base_url + self.headers = {"PRIVATE-TOKEN": api_token} async def _request(self, url: str, response_type: str) -> dict | str: """ diff --git a/gantry/util/prometheus.py b/gantry/util/prometheus.py index 921069a..a3db981 100644 --- a/gantry/util/prometheus.py +++ b/gantry/util/prometheus.py @@ -1,6 +1,5 @@ import logging import math -import os import statistics import urllib.parse @@ -12,9 +11,14 @@ class IncompleteData(Exception): class PrometheusClient: - def __init__(self): - self.base_url = os.environ["PROMETHEUS_URL"] - self.cookies = {"_oauth2_proxy": os.environ["PROMETHEUS_COOKIE"]} + def __init__(self, base_url: str, auth_cookie: str = ""): + # cookie will only be used if set + if auth_cookie: + self.cookies = {"_oauth2_proxy": auth_cookie} + else: + self.cookies = {} + + self.base_url = base_url async def query(self, type: str, **kwargs) -> dict: """ diff --git a/gantry/views.py b/gantry/views.py index 180fc9e..8967c19 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -21,5 +21,7 @@ async def collect_job(request: web.Request) -> web.Response: if request.headers.get("X-Gitlab-Event") != "Job Hook": return web.Response(status=400, text="invalid event type") - await fetch_build(payload, request.app["db"]) + await fetch_build( + payload, request.app["db"], request.app["gitlab"], request.app["prometheus"] + ) return web.Response(status=200) From 45820b400f6463bfaf7b9d88991ce3fe3634a4a9 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 15:41:21 -0800 Subject: [PATCH 12/27] reorganize files into clients/models/routes --- gantry/__main__.py | 3 +-- gantry/clients/__init__.py | 3 +++ gantry/{util => clients}/gitlab.py | 0 gantry/{util => clients}/prometheus.py | 0 gantry/models/build.py | 4 ++-- gantry/models/vm.py | 2 +- gantry/{ => routes}/collection.py | 4 ++-- gantry/views.py | 2 +- 8 files changed, 10 insertions(+), 8 deletions(-) create mode 100644 gantry/clients/__init__.py rename gantry/{util => clients}/gitlab.py (100%) rename gantry/{util => clients}/prometheus.py (100%) rename gantry/{ => routes}/collection.py (95%) diff --git a/gantry/__main__.py b/gantry/__main__.py index c19dae0..ebb3e34 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -3,8 +3,7 @@ import aiosqlite from aiohttp import web -from gantry.util.gitlab import GitlabClient -from gantry.util.prometheus import PrometheusClient +from gantry.clients import GitlabClient, PrometheusClient from gantry.views import routes diff --git a/gantry/clients/__init__.py b/gantry/clients/__init__.py new file mode 100644 index 0000000..2dbe3f6 --- /dev/null +++ b/gantry/clients/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .gitlab import GitlabClient +from .prometheus import PrometheusClient diff --git a/gantry/util/gitlab.py b/gantry/clients/gitlab.py similarity index 100% rename from gantry/util/gitlab.py rename to gantry/clients/gitlab.py diff --git a/gantry/util/prometheus.py b/gantry/clients/prometheus.py similarity index 100% rename from 
gantry/util/prometheus.py rename to gantry/clients/prometheus.py diff --git a/gantry/models/build.py b/gantry/models/build.py index c34e433..67de8e4 100644 --- a/gantry/models/build.py +++ b/gantry/models/build.py @@ -5,9 +5,9 @@ import aiosqlite -from gantry.util.gitlab import GitlabClient +from gantry.clients.gitlab import GitlabClient from gantry.util.misc import insert_dict, setattrs, spec_variants -from gantry.util.prometheus import ( +from gantry.clients.prometheus import ( IncompleteData, PrometheusClient, process_resources, diff --git a/gantry/models/vm.py b/gantry/models/vm.py index b763a91..59fe864 100644 --- a/gantry/models/vm.py +++ b/gantry/models/vm.py @@ -1,7 +1,7 @@ import aiosqlite from gantry.util.misc import insert_dict, setattrs -from gantry.util.prometheus import IncompleteData, PrometheusClient +from gantry.clients.prometheus import IncompleteData, PrometheusClient MB_IN_BYTES = 1_000_000 diff --git a/gantry/collection.py b/gantry/routes/collection.py similarity index 95% rename from gantry/collection.py rename to gantry/routes/collection.py index c651dea..66d987b 100644 --- a/gantry/collection.py +++ b/gantry/routes/collection.py @@ -3,8 +3,8 @@ import aiosqlite from gantry.models import VM, Build -from gantry.util.gitlab import GitlabClient -from gantry.util.prometheus import IncompleteData, PrometheusClient +from gantry.clients.gitlab import GitlabClient +from gantry.clients.prometheus import IncompleteData, PrometheusClient async def fetch_build( diff --git a/gantry/views.py b/gantry/views.py index 8967c19..b311e8d 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -3,7 +3,7 @@ from aiohttp import web -from gantry.collection import fetch_build +from gantry.routes.collection import fetch_build routes = web.RouteTableDef() From 4f78c3a5c43d28b4b706546bce21d852384f3f63 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Wed, 24 Jan 2024 15:51:00 -0800 Subject: [PATCH 13/27] decouple spec utility functions from misc.py --- gantry/models/build.py | 13 --------- gantry/routes/collection.py | 3 +- gantry/util/misc.py | 55 ------------------------------------- gantry/util/spec.py | 46 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 69 deletions(-) create mode 100644 gantry/util/spec.py diff --git a/gantry/models/build.py b/gantry/models/build.py index 67de8e4..169a6b8 100644 --- a/gantry/models/build.py +++ b/gantry/models/build.py @@ -34,19 +34,6 @@ def __init__( self.retries = retries self.ref = ref - @property - def valid_name(self) -> bool: - """Returns True if the job is a build job, False otherwise.""" - - # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 - # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 - job_name_pattern = re.compile( - r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" - ) - job_name_match = job_name_pattern.match(self.name) - # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack - return bool(job_name_match) - @property def midpoint(self) -> float: """Returns the midpoint of the job in unix time.""" diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 66d987b..b6c48a8 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -5,6 +5,7 @@ from gantry.models import VM, Build from gantry.clients.gitlab import GitlabClient from gantry.clients.prometheus import IncompleteData, PrometheusClient +from gantry.util.spec import valid_build_name async def fetch_build( @@ -39,7 +40,7 @@ async def fetch_build( # perform checks to see if we should collect data for this job 
if ( build.status not in ("success",) - or not build.valid_name # is not a build job + or not valid_build_name(build.name) # is not a build job or await build.in_db(db) # job already in the database or await build.is_ghost(db, gitlab) ): diff --git a/gantry/util/misc.py b/gantry/util/misc.py index 2c6a69c..0ff0892 100644 --- a/gantry/util/misc.py +++ b/gantry/util/misc.py @@ -1,58 +1,3 @@ -def spec_variants(spec: str) -> dict: - """Given a spec's concrete variants, return a dict in name: value format.""" - # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on - - variants = {} - # give some padding to + and ~ so we can split on them - spec = spec.replace("+", " +") - spec = spec.replace("~", " ~") - parts = spec.split(" ") - - for part in parts: - if len(part) < 2: - continue - if "=" in part: - name, value = part.split("=") - if "," in value: - # array of the multiple values - variants[name] = value.split(",") - else: - # string of the single value - variants[name] = value - else: - # anything after the first character is the value - if part.startswith("+"): - variants[part[1:]] = True - elif part.startswith("~"): - variants[part[1:]] = False - - return variants - - -def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: - """ - Crafts an SQLite INSERT statement from a dictionary. - - args: - table: name of the table to insert into - input: dictionary of values to insert - ignore: whether to ignore duplicate entries - - returns: tuple of (query, values) - """ - - columns = ", ".join(input.keys()) - values = ", ".join(["?" for _ in range(len(input))]) - query = f"INSERT INTO {table} ({columns}) VALUES ({values})" - - if ignore: - query = query.replace("INSERT", "INSERT OR IGNORE") - - # using a tuple of values from the dictionary - values_tuple = tuple(input.values()) - return query, values_tuple - - def setattrs(_self, **kwargs): """Sets multiple attributes of an object from a dictionary.""" for k, v in kwargs.items(): diff --git a/gantry/util/spec.py b/gantry/util/spec.py new file mode 100644 index 0000000..9376ece --- /dev/null +++ b/gantry/util/spec.py @@ -0,0 +1,46 @@ +import re + +def spec_variants(spec: str) -> dict: + """Given a spec's concrete variants, return a dict in name: value format.""" + # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on + + variants = {} + # give some padding to + and ~ so we can split on them + spec = spec.replace("+", " +") + spec = spec.replace("~", " ~") + parts = spec.split(" ") + + for part in parts: + if len(part) < 2: + continue + if "=" in part: + name, value = part.split("=") + if "," in value: + # array of the multiple values + variants[name] = value.split(",") + else: + # string of the single value + variants[name] = value + else: + # anything after the first character is the value + if part.startswith("+"): + variants[part[1:]] = True + elif part.startswith("~"): + variants[part[1:]] = False + + return variants + +def valid_build_name(name): + """Returns True if the job is a build job, False otherwise.""" + + # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 + # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 + job_name_pattern = re.compile( + r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" + ) + job_name_match = job_name_pattern.match(name) + # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack + return bool(job_name_match) + + + From fe2449d4a92e1d2ff9116b5af3093bb420fdbc1c Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 
15:08:30 -0800 Subject: [PATCH 14/27] rename vm: node build: job --- db/schema.sql | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/db/schema.sql b/db/schema.sql index 6b1c24f..26352e7 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -1,4 +1,4 @@ -CREATE TABLE vms ( +CREATE TABLE nodes ( id INTEGER PRIMARY KEY, uuid TEXT NOT NULL UNIQUE, hostname TEXT NOT NULL, @@ -10,15 +10,14 @@ CREATE TABLE vms ( ); -CREATE TABLE builds ( +CREATE TABLE jobs ( id INTEGER PRIMARY KEY, pod TEXT NOT NULL UNIQUE, - vm INTEGER NOT NULL, + node INTEGER NOT NULL, start INTEGER NOT NULL, end INTEGER NOT NULL, job_id INTEGER NOT NULL UNIQUE, job_status TEXT NOT NULL, - retries INTEGER NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, pkg_version TEXT NOT NULL, @@ -42,8 +41,8 @@ CREATE TABLE builds ( mem_max REAL NOT NULL, mem_min REAL NOT NULL, mem_stddev REAL NOT NULL, - FOREIGN KEY (vm) - REFERENCES vms (id) + FOREIGN KEY (node) + REFERENCES nodes (id) ON UPDATE CASCADE ON DELETE CASCADE ); From 7b05c8ce0869b4cbbee5b80e7516b796528c873e Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 15:08:50 -0800 Subject: [PATCH 15/27] reorganize functionality around clients rather than models --- gantry/clients/prometheus.py | 207 +++++++++++++++++++++++++++++- gantry/db/__init__.py | 3 + gantry/db/get.py | 41 ++++++ gantry/db/insert.py | 69 ++++++++++ gantry/models/__init__.py | 3 +- gantry/models/build.py | 241 ----------------------------------- gantry/models/job.py | 40 ++++++ gantry/models/vm.py | 107 ---------------- gantry/routes/collection.py | 104 ++++++++++----- gantry/util/misc.py | 4 - gantry/util/spec.py | 17 --- gantry/views.py | 12 +- 12 files changed, 432 insertions(+), 416 deletions(-) create mode 100644 gantry/db/__init__.py create mode 100644 gantry/db/get.py create mode 100644 gantry/db/insert.py delete mode 100644 gantry/models/build.py create mode 100644 gantry/models/job.py delete mode 100644 gantry/models/vm.py delete mode 100644 gantry/util/misc.py diff --git a/gantry/clients/prometheus.py b/gantry/clients/prometheus.py index a3db981..720c8cc 100644 --- a/gantry/clients/prometheus.py +++ b/gantry/clients/prometheus.py @@ -1,3 +1,4 @@ +import json import logging import math import statistics @@ -5,6 +6,8 @@ import aiohttp +from gantry.util.spec import spec_variants + class IncompleteData(Exception): pass @@ -97,6 +100,198 @@ def prettify_res(self, response: dict) -> dict: for result in response["data"]["result"] ] + async def get_job_annotations(self, job_id: int, time: float) -> dict: + """ + args: + job_id: job id + time: when to query (unix timestamp) + returns: dict of annotations + """ + + res = await self.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": job_id}, + }, + time=time, + ) + + if not res: + raise IncompleteData("annotation data is missing") + + annotations = res[0]["labels"] + + return { + "pod": annotations["pod"], + # if build jobs is not set, defaults to 16 due to spack config + "build_jobs": annotations.get( + "annotation_metrics_spack_job_build_jobs", 16 + ), + "arch": annotations["annotation_metrics_spack_job_spec_arch"], + "pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"], + "pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"], + "pkg_variants": json.dumps( + spec_variants(annotations["annotation_metrics_spack_job_spec_variants"]) + ), + "compiler_name": annotations[ + "annotation_metrics_spack_job_spec_compiler_name" 
+ ], + "compiler_version": annotations[ + "annotation_metrics_spack_job_spec_compiler_version" + ], + "stack": annotations["annotation_metrics_spack_ci_stack_name"], + } + + async def get_job_resources(self, pod: str, time: float) -> tuple[dict, str]: + """ + args: + job_id: job id + pod: pod name + time: when to query (unix timestamp) + returns: dict of resources and node hostname + """ + + requests = process_resources( + await self.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + ) + + limits_res = await self.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + + if not limits_res: + raise IncompleteData("missing limits") + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + node = limits_res[0]["labels"]["node"] + limits = process_resources(limits_res) + + return ( + { + "cpu_request": requests["cpu"]["value"], + "mem_request": requests["memory"]["value"], + "cpu_limit": limits.get("cpu", {}).get("value"), + "mem_limit": limits["memory"]["value"], + }, + node, + ) + + async def get_job_usage(self, pod: str, start: float, end: float) -> dict: + """ + Gets resource usage attributes for a job. + + args: + pod: pod name + start: start time (unix timestamp) + end: end time (unix timestamp) + returns: dict of usage stats + """ + + mem_usage = process_usage( + await self.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": pod}, + }, + start=start, + end=end, + ) + ) + + cpu_usage = process_usage( + await self.query( + type="range", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{pod}', container='build'}}[90s])" + ), + start=start, + end=end, + ) + ) + + return { + "cpu_mean": cpu_usage["mean"], + "cpu_median": cpu_usage["median"], + "cpu_max": cpu_usage["max"], + "cpu_min": cpu_usage["min"], + "cpu_stddev": cpu_usage["stddev"], + "mem_mean": mem_usage["mean"], + "mem_median": mem_usage["median"], + "mem_max": mem_usage["max"], + "mem_min": mem_usage["min"], + "mem_stddev": mem_usage["stddev"], + } + + async def get_node_uuid(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node info (UUID as of now) + """ + + res = await self.query( + type="single", + query={ + "metric": "kube_node_info", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise IncompleteData(f"node info is missing. hostname={hostname}") + + return res[0]["labels"]["system_uuid"] + + async def get_node_labels(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node labels + """ + + res = await self.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise IncompleteData(f"node labels are missing. 
hostname={hostname}") + + labels = res[0]["labels"] + + return { + "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), + "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), + "arch": labels["label_kubernetes_io_arch"], + "os": labels["label_kubernetes_io_os"], + "instance_type": labels["label_node_kubernetes_io_instance_type"], + } + def query_to_str(metric: str, filters: dict) -> str: """ @@ -107,20 +302,19 @@ def query_to_str(metric: str, filters: dict) -> str: return f"{metric}{{{filters_str}}}" -def process_resources(res: dict, job_id: int) -> dict: +def process_resources(res: dict) -> dict: """ Processes the resource limits and requests from a Prometheus response into readable format. args: res: Prometheus response - job_id: job id for error logging returns: dict with {resource: {unit: value}} format """ if not res: - raise IncompleteData(f"resource data is missing for job {job_id}") + raise IncompleteData("resource data is missing") processed = {} for item in res: @@ -133,21 +327,20 @@ def process_resources(res: dict, job_id: int) -> dict: return processed -def process_usage(res: dict, job_id: int) -> dict: +def process_usage(res: dict) -> dict: """ Processes the usage data from a Prometheus response into readable format. This could either be CPU usage or memory usage. args: res: Prometheus response - job_id: job id for error logging returns: dict with {statistic: value} format """ if not res: # sometimes prometheus reports no data for a job if the time range is too small - raise IncompleteData(f"usage data is missing for job {job_id}") + raise IncompleteData("usage data is missing") usage = [float(value) for timestamp, value in res[0]["values"]] @@ -165,6 +358,6 @@ def process_usage(res: dict, job_id: int) -> dict: or sum_stats["mean"] == 0 or math.isnan(sum_stats["stddev"]) ): - raise IncompleteData(f"usage data is invalid for job {job_id}") + raise IncompleteData("usage data is invalid") return sum_stats diff --git a/gantry/db/__init__.py b/gantry/db/__init__.py new file mode 100644 index 0000000..dab0a74 --- /dev/null +++ b/gantry/db/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .get import * +from .insert import * diff --git a/gantry/db/get.py b/gantry/db/get.py new file mode 100644 index 0000000..8ef7977 --- /dev/null +++ b/gantry/db/get.py @@ -0,0 +1,41 @@ +import logging + +import aiosqlite + + +async def get_node(db: aiosqlite.Connection, uuid: str) -> int | None: + """return the primary key if found, otherwise return None""" + + async with db.execute("select id from nodes where uuid = ?", (uuid,)) as cursor: + if cur_node := await cursor.fetchone(): + return cur_node[0] + + return None + + +async def job_exists(db: aiosqlite.Connection, job_id: int) -> bool: + """return if the job exists in the database""" + + async with db.execute("select id from jobs where job_id = ?", (job_id,)) as cursor: + if await cursor.fetchone(): + logging.warning( + f""" + job {job_id} already in database. + check why multiple requests are being sent. 
+ """ + ) + return True + + return False + + +async def ghost_exists(db: aiosqlite.Connection, job_id: int) -> bool: + """return if the ghost job exists in the database""" + + async with db.execute( + "select id from ghost_jobs where job_id = ?", (job_id,) + ) as cursor: + if await cursor.fetchone(): + return True + + return False diff --git a/gantry/db/insert.py b/gantry/db/insert.py new file mode 100644 index 0000000..da35620 --- /dev/null +++ b/gantry/db/insert.py @@ -0,0 +1,69 @@ +import aiosqlite + +from gantry.db.get import get_node + + +def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: + """ + crafts an sqlite insert statement from a dictionary. + + args: + table: name of the table to insert into + input: dictionary of values to insert + ignore: whether to ignore duplicate entries + + returns: tuple of (query, values) + """ + + columns = ", ".join(input.keys()) + values = ", ".join(["?" for _ in range(len(input))]) + query = f"INSERT INTO {table} ({columns}) VALUES ({values})" + + if ignore: + query = query.replace("INSERT", "INSERT OR IGNORE") + + # using a tuple of values from the dictionary + values_tuple = tuple(input.values()) + return query, values_tuple + + +async def insert_ghost(db: aiosqlite.Connection, job_id: int) -> None: + """Inserts a ghost job into the database.""" + + await db.execute(("insert into ghost_jobs (name) values (?)"), (job_id,)) + + +async def insert_node(db: aiosqlite.Connection, node: dict) -> int: + """Inserts a node into the database.""" + + async with db.execute( + *insert_dict( + "nodes", + node, + # deal with races + ignore=True, + ) + ) as cursor: + pk = cursor.lastrowid + + if pk == 0: + # the ignore part of the query was triggered, some other call + # must have inserted the node before this one + pk = await get_node(db, node["uuid"]) + + return pk + + +async def insert_job(db: aiosqlite.Connection, job: dict) -> int: + """Inserts a job into the database.""" + + async with db.execute( + *insert_dict( + "jobs", + job, + # if the job somehow gets added into the db (pod+id being unique) + # then ignore the insert + ignore=True, + ) + ) as cursor: + return cursor.lastrowid diff --git a/gantry/models/__init__.py b/gantry/models/__init__.py index 57e9b66..73d8633 100644 --- a/gantry/models/__init__.py +++ b/gantry/models/__init__.py @@ -1,3 +1,2 @@ # flake8: noqa -from .build import Build -from .vm import VM +from .job import Job diff --git a/gantry/models/build.py b/gantry/models/build.py deleted file mode 100644 index 169a6b8..0000000 --- a/gantry/models/build.py +++ /dev/null @@ -1,241 +0,0 @@ -import json -import logging -import re -from datetime import datetime - -import aiosqlite - -from gantry.clients.gitlab import GitlabClient -from gantry.util.misc import insert_dict, setattrs, spec_variants -from gantry.clients.prometheus import ( - IncompleteData, - PrometheusClient, - process_resources, - process_usage, -) - - -class Build: - def __init__( - self, - status: str, - name: str, - id: int, - start: str, - end: str, - retries: int, - ref: str, - ): - self.status = status - self.name = name - self.id = id - self.start = datetime.fromisoformat(start).timestamp() - self.end = datetime.fromisoformat(end).timestamp() - self.retries = retries - self.ref = ref - - @property - def midpoint(self) -> float: - """Returns the midpoint of the job in unix time.""" - # prometheus is not guaranteed to have data at the exact start and end times - # instead of creating an arbitrary buffer, ask for data in the middle of the job - return 
(self.start + self.end) / 2 - - async def is_ghost(self, db: aiosqlite.Connection, gl: GitlabClient) -> bool: - """Returns the job's ghost status.""" - - # prevent duplicate jobs from being inserted into the database - async with db.execute( - "select job_id from ghost_jobs where job_id = ?", (self.id,) - ) as cursor: - if await cursor.fetchone(): - # ghost job is already in the database - return True - - log = await gl.job_log(self.id) - ghost = "No need to rebuild" in log - - if ghost: - await db.execute(("insert into ghost_jobs (name) values (?)"), (self.id,)) - - return ghost - - async def in_db(self, db: aiosqlite.Connection) -> bool: - """Checks if the job is already in the db.""" - - async with db.execute( - "select job_id from builds where job_id = ?", (self.id,) - ) as cursor: - found = bool(await cursor.fetchone()) - - if found: - logging.warning(f"job {self.id} already in database") - - return found - - async def get_annotations(self, prometheus: PrometheusClient): - """Fetches the annotations and assigns multiple attributes.""" - - annotations_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": self.id}, - }, - time=self.midpoint, - ) - - if not annotations_res: - raise IncompleteData(f"missing annotations for job {self.id}") - - annotations = annotations_res[0]["labels"] - - setattrs( - self, - pod=annotations["pod"], - # if build jobs is not set, defaults to 16 due to spack settings - build_jobs=annotations.get("annotation_metrics_spack_job_build_jobs", 16), - arch=annotations["annotation_metrics_spack_job_spec_arch"], - pkg_name=annotations["annotation_metrics_spack_job_spec_pkg_name"], - pkg_version=annotations["annotation_metrics_spack_job_spec_pkg_version"], - pkg_variants=spec_variants( - annotations["annotation_metrics_spack_job_spec_variants"] - ), - compiler_name=annotations[ - "annotation_metrics_spack_job_spec_compiler_name" - ], - compiler_version=annotations[ - "annotation_metrics_spack_job_spec_compiler_version" - ], - stack=annotations["annotation_metrics_spack_ci_stack_name"], - ) - - async def get_resources(self, prometheus: PrometheusClient): - """fetches pod requests and limits, and also sets the node hostname""" - requests = process_resources( - await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_requests", - "filters": {"container": "build", "pod": self.pod}, - }, - time=self.midpoint, - ), - self.id, - ) - - limits_res = await prometheus.query( - type="single", - query={ - "metric": "kube_pod_container_resource_limits", - "filters": {"container": "build", "pod": self.pod}, - }, - time=self.midpoint, - ) - - if not limits_res: - raise IncompleteData(f"missing limits for job {self.id}") - - # instead of needing to fetch the node where the pod ran from kube_pod_info - # we can grab it from kube_pod_container_resource_limits - # weirdly, it's not available in kube_pod_labels or annotations - # https://github.com/kubernetes/kube-state-metrics/issues/1148 - - self.node = limits_res[0]["labels"]["node"] - limits = process_resources(limits_res, self.id) - - setattrs( - self, - cpu_request=requests["cpu"]["value"], - mem_request=requests["memory"]["value"], - cpu_limit=limits.get("cpu", {}).get("value"), - mem_limit=limits["memory"]["value"], - ) - - async def get_usage(self, prometheus: PrometheusClient): - """Sets resource usage attributes.""" - - mem_usage = process_usage( - await prometheus.query( - type="range", - query={ - "metric": 
"container_memory_working_set_bytes", - "filters": {"container": "build", "pod": self.pod}, - }, - start=self.start, - end=self.end, - ), - self.id, - ) - - cpu_usage = process_usage( - await prometheus.query( - type="range", - custom_query=( - f"rate(container_cpu_usage_seconds_total{{" - f"pod='{self.pod}', container='build'}}[90s])" - ), - start=self.start, - end=self.end, - ), - self.id, - ) - - setattrs( - self, - cpu_mean=cpu_usage["mean"], - cpu_median=cpu_usage["median"], - cpu_max=cpu_usage["max"], - cpu_min=cpu_usage["min"], - cpu_stddev=cpu_usage["stddev"], - mem_mean=mem_usage["mean"], - mem_median=mem_usage["median"], - mem_max=mem_usage["max"], - mem_min=mem_usage["min"], - mem_stddev=mem_usage["stddev"], - ) - - async def insert(self, db: aiosqlite.Connection, vm_id: int) -> int: - """Inserts the build into the database and returns its id.""" - - async with db.execute( - *insert_dict( - "builds", - { - "pod": self.pod, - "vm": vm_id, - "start": self.start, - "end": self.end, - "job_id": self.id, - "job_status": self.status, - "retries": self.retries, - "ref": self.ref, - "pkg_name": self.pkg_name, - "pkg_version": self.pkg_version, - "pkg_variants": json.dumps(self.pkg_variants), # dict to string - "compiler_name": self.compiler_name, - "compiler_version": self.compiler_version, - "arch": self.arch, - "stack": self.stack, - "build_jobs": self.build_jobs, - "cpu_request": self.cpu_request, - "cpu_limit": self.cpu_limit, - "cpu_mean": self.cpu_mean, - "cpu_median": self.cpu_median, - "cpu_max": self.cpu_max, - "cpu_min": self.cpu_min, - "cpu_stddev": self.cpu_stddev, - "mem_request": self.mem_request, - "mem_limit": self.mem_limit, - "mem_mean": self.mem_mean, - "mem_median": self.mem_median, - "mem_max": self.mem_max, - "mem_min": self.mem_min, - "mem_stddev": self.mem_stddev, - }, - # if the job somehow gets added into the db (pod+id being unique) - # then ignore the insert - ignore=True, - ) - ) as cursor: - return cursor.lastrowid diff --git a/gantry/models/job.py b/gantry/models/job.py new file mode 100644 index 0000000..64b2f77 --- /dev/null +++ b/gantry/models/job.py @@ -0,0 +1,40 @@ +import re +from datetime import datetime + + +class Job: + def __init__( + self, + status: str, + name: str, + id: int, + start: str, + end: str, + ref: str, + ): + self.status = status + self.name = name + self.id = id + self.start = datetime.fromisoformat(start).timestamp() + self.end = datetime.fromisoformat(end).timestamp() + self.ref = ref + + @property + def midpoint(self) -> float: + """Returns the midpoint of the job in unix time.""" + # prometheus is not guaranteed to have data at the exact start and end times + # instead of creating an arbitrary buffer, ask for data in the middle of the job + return (self.start + self.end) / 2 + + @property + def valid_build_name(self) -> bool: + """validates the job name.""" + + # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 + # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 + job_name_pattern = re.compile( + r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" + ) + job_name_match = job_name_pattern.match(self.name) + # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack + return bool(job_name_match) diff --git a/gantry/models/vm.py b/gantry/models/vm.py deleted file mode 100644 index 59fe864..0000000 --- a/gantry/models/vm.py +++ /dev/null @@ -1,107 +0,0 @@ -import aiosqlite - -from gantry.util.misc import insert_dict, setattrs -from gantry.clients.prometheus import IncompleteData, PrometheusClient - -MB_IN_BYTES = 
1_000_000 - - -class VM: - def __init__(self, hostname: str, query_time: float): - """ - args: - hostname: the hostname of the VM - query_time: any point during VM runtime, usually grabbed from build - """ - self.hostname = hostname - self.query_time = query_time - - async def db_id( - self, db: aiosqlite.Connection, prometheus: PrometheusClient - ) -> int | None: - """ - Returns the id of the vm if it exists in the database, otherwise returns None. - Also sets the uuid of the vm. - """ - vm_info = await prometheus.query( - type="single", - query={ - "metric": "kube_node_info", - "filters": {"node": self.hostname}, - }, - time=self.query_time, - ) - - if not vm_info: - raise IncompleteData(f"missing vm info for {self.hostname}") - - self.uuid = vm_info[0]["labels"]["system_uuid"] - - # look for the vm in the database - async with db.execute( - "select id from vms where uuid = ?", (self.uuid,) - ) as cursor: - old_vm = await cursor.fetchone() - - if old_vm: - return old_vm[0] - - return None - - async def get_labels(self, prometheus: PrometheusClient): - """Sets multiple attributes of the VM based on its labels.""" - - vm_labels_res = await prometheus.query( - type="single", - query={ - "metric": "kube_node_labels", - "filters": {"node": self.hostname}, - }, - time=self.query_time, - ) - - if not vm_labels_res: - raise IncompleteData(f"missing vm labels for {self.hostname}") - - labels = vm_labels_res[0]["labels"] - - setattrs( - self, - cores=float(labels["label_karpenter_k8s_aws_instance_cpu"]), - mem=float(labels["label_karpenter_k8s_aws_instance_memory"]), - arch=labels["label_kubernetes_io_arch"], - os=labels["label_kubernetes_io_os"], - instance_type=labels["label_node_kubernetes_io_instance_type"], - ) - - async def insert(self, db: aiosqlite.Connection) -> int: - """Inserts the VM into the database and returns its id.""" - async with db.execute( - *insert_dict( - "vms", - { - "uuid": self.uuid, - "hostname": self.hostname, - "cores": self.cores, - # convert to bytes to be consistent with other resource metrics - "mem": self.mem * MB_IN_BYTES, - "arch": self.arch, - "os": self.os, - "instance_type": self.instance_type, - }, - # deal with races - ignore=True, - ) - ) as cursor: - pk = cursor.lastrowid - - if pk == 0: - # the ignore part of the query was triggered, some other call - # must have inserted the vm before this one - async with db.execute( - "select id from vms where uuid = ?", (self.uuid,) - ) as cursor: - pk_res = await cursor.fetchone() - pk = pk_res[0] - - return pk diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index b6c48a8..25e119f 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -2,15 +2,17 @@ import aiosqlite -from gantry.models import VM, Build +from gantry import db from gantry.clients.gitlab import GitlabClient from gantry.clients.prometheus import IncompleteData, PrometheusClient -from gantry.util.spec import valid_build_name +from gantry.models import Job +MB_IN_BYTES = 1_000_000 -async def fetch_build( + +async def fetch_job( payload: dict, - db: aiosqlite.Connection, + db_conn: aiosqlite.Connection, gitlab: GitlabClient, prometheus: PrometheusClient, ) -> None: @@ -27,68 +29,100 @@ async def fetch_build( returns: None in order to accomodate a 200 response for the webhook. 
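fetch_job immediately wraps the webhook payload in the Job model introduced above; a short sketch with invented payload values shows the two derived properties the route relies on (the build-name check and the query midpoint):

from gantry.models import Job

job = Job(
    status="success",
    name=(
        "plumed@2.9.0 /i4u7p6u %gcc@11.4.0 "
        "arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1"
    ),
    id=12345,
    start="2024-01-25T00:00:00+00:00",
    end="2024-01-25T00:10:00+00:00",
    ref="develop",
)
assert job.valid_build_name  # matches the spack build-job naming pattern
assert job.midpoint - job.start == 300.0  # halfway through the ten-minute job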
""" - build = Build( + job = Job( status=payload["build_status"], name=payload["build_name"], id=payload["build_id"], start=payload["build_started_at"], end=payload["build_finished_at"], - retries=payload["retries_count"], ref=payload["ref"], ) # perform checks to see if we should collect data for this job if ( - build.status not in ("success",) - or not valid_build_name(build.name) # is not a build job - or await build.in_db(db) # job already in the database - or await build.is_ghost(db, gitlab) + job.status != "success" + or not job.valid_build_name # is not a build job + or await db.job_exists(db_conn, job.id) # job already in the database + or await db.ghost_exists(db_conn, job.id) # ghost already in db ): return + # check if the job is a ghost + job_log = await gitlab.job_log(job.id) + is_ghost = "No need to rebuild" in job_log + if is_ghost: + db.insert_ghost(db_conn, job.id) + return + try: - await build.get_annotations(prometheus) - await build.get_resources(prometheus) - await build.get_usage(prometheus) - vm_id = await fetch_vm(db, prometheus, build.node, build.midpoint) + annotations = await prometheus.get_job_annotations(job.id, job.midpoint) + resources, node_hostname = await prometheus.get_job_resources( + annotations["pod"], job.midpoint + ) + usage = await prometheus.get_job_usage(annotations["pod"], job.start, job.end) + node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint) except IncompleteData as e: # missing data, skip this job - logging.error(e) + logging.error(f"{e} job={job.id}") return - await build.insert(db, vm_id) - # vm and build will get saved at the same time to make sure - # we don't accidentally commit a vm without a build - await db.commit() + await db.insert_job( + db_conn, + { + "node": node_id, + "start": job.start, + "end": job.end, + "job_id": job.id, + "job_status": job.status, + "ref": job.ref, + **annotations, + **resources, + **usage, + }, + ) + + # job and node will get saved at the same time to make sure + # we don't accidentally commit a node without a job + await db_conn.commit() return -async def fetch_vm( - db: aiosqlite.Connection, +async def fetch_node( + db_conn: aiosqlite.Connection, prometheus: PrometheusClient, hostname: dict, query_time: float, ) -> int: """ - Finds an existing VM in the database or inserts a new one. + Finds an existing node in the database or inserts a new one. 
args: db: an active aiosqlite connection prometheus: - hostname: the hostname of the VM - query_time: any point during VM runtime, usually grabbed from build + hostname: the hostname of the node + query_time: any point during node runtime, usually grabbed from job - returns: id of the inserted or existing VM + returns: id of the inserted or existing node """ - vm = VM( - hostname=hostname, - query_time=query_time, - ) - # do not proceed if the VM exists - if existing_vm := await vm.db_id(db, prometheus): - return existing_vm - - await vm.get_labels(prometheus) - return await vm.insert(db) + node_uuid = await prometheus.get_node_uuid(hostname, query_time) + + # do not proceed if the node exists + if existing_node := await db.get_node(db_conn, node_uuid): + return existing_node + + node_labels = await prometheus.get_node_labels(hostname, query_time) + return await db.insert_node( + db_conn, + { + "uuid": node_uuid, + "hostname": hostname, + "cores": node_labels["cores"], + # convert to bytes to be consistent with other resource metrics + "mem": node_labels["mem"] * MB_IN_BYTES, + "arch": node_labels["arch"], + "os": node_labels["os"], + "instance_type": node_labels["instance_type"], + }, + ) diff --git a/gantry/util/misc.py b/gantry/util/misc.py deleted file mode 100644 index 0ff0892..0000000 --- a/gantry/util/misc.py +++ /dev/null @@ -1,4 +0,0 @@ -def setattrs(_self, **kwargs): - """Sets multiple attributes of an object from a dictionary.""" - for k, v in kwargs.items(): - setattr(_self, k, v) diff --git a/gantry/util/spec.py b/gantry/util/spec.py index 9376ece..eb1b33d 100644 --- a/gantry/util/spec.py +++ b/gantry/util/spec.py @@ -1,5 +1,3 @@ -import re - def spec_variants(spec: str) -> dict: """Given a spec's concrete variants, return a dict in name: value format.""" # example: +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on @@ -29,18 +27,3 @@ def spec_variants(spec: str) -> dict: variants[part[1:]] = False return variants - -def valid_build_name(name): - """Returns True if the job is a build job, False otherwise.""" - - # example: plumed@2.9.0 /i4u7p6u %gcc@11.4.0 - # arch=linux-ubuntu20.04-neoverse_v1 E4S ARM Neoverse V1 - job_name_pattern = re.compile( - r"([^/ ]+)@([^/ ]+) /([^%]+) %([^ ]+) ([^ ]+) (.+)" - ) - job_name_match = job_name_pattern.match(name) - # groups: 1: name, 2: version, 3: hash, 4: compiler, 5: arch, 6: stack - return bool(job_name_match) - - - diff --git a/gantry/views.py b/gantry/views.py index b311e8d..d9a0bb4 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,9 +1,10 @@ +import asyncio import json import os from aiohttp import web -from gantry.routes.collection import fetch_build +from gantry.routes.collection import fetch_job routes = web.RouteTableDef() @@ -21,7 +22,12 @@ async def collect_job(request: web.Request) -> web.Response: if request.headers.get("X-Gitlab-Event") != "Job Hook": return web.Response(status=400, text="invalid event type") - await fetch_build( - payload, request.app["db"], request.app["gitlab"], request.app["prometheus"] + # will return immediately, but will not block the event loop + # allowing fetch_job to run in the background + asyncio.ensure_future( + fetch_job( + payload, request.app["db"], request.app["gitlab"], request.app["prometheus"] + ) ) + return web.Response(status=200) From 608c043214686013c58785e1990f9548338be624 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 15:28:29 -0800 Subject: [PATCH 16/27] job_id -> gitlab_id --- db/schema.sql | 5 ++--- gantry/clients/gitlab.py | 4 ++-- 
gantry/clients/prometheus.py | 7 +++---- gantry/db/get.py | 12 +++++++----- gantry/db/insert.py | 4 ++-- gantry/models/job.py | 4 ++-- gantry/routes/collection.py | 16 ++++++++-------- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/db/schema.sql b/db/schema.sql index 26352e7..8104a4e 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -9,14 +9,13 @@ CREATE TABLE nodes ( instance_type TEXT NOT NULL ); - CREATE TABLE jobs ( id INTEGER PRIMARY KEY, pod TEXT NOT NULL UNIQUE, node INTEGER NOT NULL, start INTEGER NOT NULL, end INTEGER NOT NULL, - job_id INTEGER NOT NULL UNIQUE, + gitlab_id INTEGER NOT NULL UNIQUE, job_status TEXT NOT NULL, ref TEXT NOT NULL, pkg_name TEXT NOT NULL, @@ -49,5 +48,5 @@ CREATE TABLE jobs ( CREATE TABLE ghost_jobs ( id INTEGER PRIMARY KEY, - job_id INTEGER NOT NULL + gitlab_id INTEGER NOT NULL ); diff --git a/gantry/clients/gitlab.py b/gantry/clients/gitlab.py index 7ab672e..97f9500 100644 --- a/gantry/clients/gitlab.py +++ b/gantry/clients/gitlab.py @@ -24,8 +24,8 @@ async def _request(self, url: str, response_type: str) -> dict | str: if response_type == "text": return await resp.text() - async def job_log(self, job_id: int) -> str: + async def job_log(self, gl_id: int) -> str: """Given a job id, returns the log from that job""" - url = f"{self.base_url}/jobs/{job_id}/trace" + url = f"{self.base_url}/jobs/{gl_id}/trace" return await self._request(url, "text") diff --git a/gantry/clients/prometheus.py b/gantry/clients/prometheus.py index 720c8cc..87be7ce 100644 --- a/gantry/clients/prometheus.py +++ b/gantry/clients/prometheus.py @@ -100,10 +100,10 @@ def prettify_res(self, response: dict) -> dict: for result in response["data"]["result"] ] - async def get_job_annotations(self, job_id: int, time: float) -> dict: + async def get_job_annotations(self, gl_id: int, time: float) -> dict: """ args: - job_id: job id + gl_id: gitlab job id time: when to query (unix timestamp) returns: dict of annotations """ @@ -112,7 +112,7 @@ async def get_job_annotations(self, job_id: int, time: float) -> dict: type="single", query={ "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": job_id}, + "filters": {"annotation_gitlab_ci_job_id": gl_id}, }, time=time, ) @@ -146,7 +146,6 @@ async def get_job_annotations(self, job_id: int, time: float) -> dict: async def get_job_resources(self, pod: str, time: float) -> tuple[dict, str]: """ args: - job_id: job id pod: pod name time: when to query (unix timestamp) returns: dict of resources and node hostname diff --git a/gantry/db/get.py b/gantry/db/get.py index 8ef7977..c597c3e 100644 --- a/gantry/db/get.py +++ b/gantry/db/get.py @@ -13,14 +13,16 @@ async def get_node(db: aiosqlite.Connection, uuid: str) -> int | None: return None -async def job_exists(db: aiosqlite.Connection, job_id: int) -> bool: +async def job_exists(db: aiosqlite.Connection, gl_id: int) -> bool: """return if the job exists in the database""" - async with db.execute("select id from jobs where job_id = ?", (job_id,)) as cursor: + async with db.execute( + "select id from jobs where gitlab_id = ?", (gl_id,) + ) as cursor: if await cursor.fetchone(): logging.warning( f""" - job {job_id} already in database. + job {gl_id} already in database. check why multiple requests are being sent. 
""" ) @@ -29,11 +31,11 @@ async def job_exists(db: aiosqlite.Connection, job_id: int) -> bool: return False -async def ghost_exists(db: aiosqlite.Connection, job_id: int) -> bool: +async def ghost_exists(db: aiosqlite.Connection, gl_id: int) -> bool: """return if the ghost job exists in the database""" async with db.execute( - "select id from ghost_jobs where job_id = ?", (job_id,) + "select id from ghost_jobs where gitlab_id = ?", (gl_id,) ) as cursor: if await cursor.fetchone(): return True diff --git a/gantry/db/insert.py b/gantry/db/insert.py index da35620..3df157d 100644 --- a/gantry/db/insert.py +++ b/gantry/db/insert.py @@ -27,10 +27,10 @@ def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: return query, values_tuple -async def insert_ghost(db: aiosqlite.Connection, job_id: int) -> None: +async def insert_ghost(db: aiosqlite.Connection, gl_id: int) -> None: """Inserts a ghost job into the database.""" - await db.execute(("insert into ghost_jobs (name) values (?)"), (job_id,)) + await db.execute(("insert into ghost_jobs (gitlab_id) values (?)"), (gl_id,)) async def insert_node(db: aiosqlite.Connection, node: dict) -> int: diff --git a/gantry/models/job.py b/gantry/models/job.py index 64b2f77..3c3a794 100644 --- a/gantry/models/job.py +++ b/gantry/models/job.py @@ -7,14 +7,14 @@ def __init__( self, status: str, name: str, - id: int, + gl_id: int, start: str, end: str, ref: str, ): self.status = status self.name = name - self.id = id + self.gl_id = gl_id self.start = datetime.fromisoformat(start).timestamp() self.end = datetime.fromisoformat(end).timestamp() self.ref = ref diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 25e119f..d5125cd 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -32,7 +32,7 @@ async def fetch_job( job = Job( status=payload["build_status"], name=payload["build_name"], - id=payload["build_id"], + gl_id=payload["build_id"], start=payload["build_started_at"], end=payload["build_finished_at"], ref=payload["ref"], @@ -42,20 +42,20 @@ async def fetch_job( if ( job.status != "success" or not job.valid_build_name # is not a build job - or await db.job_exists(db_conn, job.id) # job already in the database - or await db.ghost_exists(db_conn, job.id) # ghost already in db + or await db.job_exists(db_conn, job.gl_id) # job already in the database + or await db.ghost_exists(db_conn, job.gl_id) # ghost already in db ): return # check if the job is a ghost - job_log = await gitlab.job_log(job.id) + job_log = await gitlab.job_log(job.gl_id) is_ghost = "No need to rebuild" in job_log if is_ghost: - db.insert_ghost(db_conn, job.id) + db.insert_ghost(db_conn, job.gl_id) return try: - annotations = await prometheus.get_job_annotations(job.id, job.midpoint) + annotations = await prometheus.get_job_annotations(job.gl_id, job.midpoint) resources, node_hostname = await prometheus.get_job_resources( annotations["pod"], job.midpoint ) @@ -63,7 +63,7 @@ async def fetch_job( node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint) except IncompleteData as e: # missing data, skip this job - logging.error(f"{e} job={job.id}") + logging.error(f"{e} job={job.gl_id}") return await db.insert_job( @@ -72,7 +72,7 @@ async def fetch_job( "node": node_id, "start": job.start, "end": job.end, - "job_id": job.id, + "gitlab_id": job.gl_id, "job_status": job.status, "ref": job.ref, **annotations, From 11d05fc99ddbe4df1ca68367ef00a02f50b32ce1 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 
2024 16:38:11 -0800 Subject: [PATCH 17/27] make prometheus client more modular --- gantry/clients/prometheus.py | 362 ------------------------ gantry/clients/prometheus/__init__.py | 2 + gantry/clients/prometheus/job.py | 148 ++++++++++ gantry/clients/prometheus/node.py | 58 ++++ gantry/clients/prometheus/prometheus.py | 105 +++++++ gantry/clients/prometheus/util.py | 76 +++++ gantry/routes/collection.py | 13 +- 7 files changed, 396 insertions(+), 368 deletions(-) delete mode 100644 gantry/clients/prometheus.py create mode 100644 gantry/clients/prometheus/__init__.py create mode 100644 gantry/clients/prometheus/job.py create mode 100644 gantry/clients/prometheus/node.py create mode 100644 gantry/clients/prometheus/prometheus.py create mode 100644 gantry/clients/prometheus/util.py diff --git a/gantry/clients/prometheus.py b/gantry/clients/prometheus.py deleted file mode 100644 index 87be7ce..0000000 --- a/gantry/clients/prometheus.py +++ /dev/null @@ -1,362 +0,0 @@ -import json -import logging -import math -import statistics -import urllib.parse - -import aiohttp - -from gantry.util.spec import spec_variants - - -class IncompleteData(Exception): - pass - - -class PrometheusClient: - def __init__(self, base_url: str, auth_cookie: str = ""): - # cookie will only be used if set - if auth_cookie: - self.cookies = {"_oauth2_proxy": auth_cookie} - else: - self.cookies = {} - - self.base_url = base_url - - async def query(self, type: str, **kwargs) -> dict: - """ - type: "range" or "single" - - for range queries: set `start` and `end` (unix timestamps) - for single queries: set `time` (unix timestamp) - - for custom queries: set `custom_query` (string) - - for metric queries: set `query` (dict) - example: - "query": { - "metric": "metric_name", - "filters": {"filter1": "value1", "filter2": "value2"} - } - """ - - # validate that one of query or custom_query is set, but not both or neither - if not kwargs.get("query") and not kwargs.get("custom_query"): - raise ValueError("query or custom_query must be set") - if kwargs.get("query") and kwargs.get("custom_query"): - raise ValueError("query and custom_query cannot both be set") - - query_str = urllib.parse.quote( - kwargs["custom_query"] - if kwargs.get("custom_query") - else query_to_str(**kwargs["query"]) - ) - - if type == "range": - # prometheus will only return this many frames - max_resolution = 10_000 - # calculating the max step size to get the desired resolution - step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) - url = ( - f"{self.base_url}/query_range?" - f"query={query_str}&" - f"start={kwargs['start']}&" - f"end={kwargs['end']}&" - f"step={step}s" - ) - return await self._query(url) - elif type == "single": - url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" - return await self._query(url) - - async def _query(self, url: str) -> dict: - """Query Prometheus with a query string""" - async with aiohttp.ClientSession(raise_for_status=True) as session: - # submit cookie with request - async with session.get(url, cookies=self.cookies) as resp: - try: - return self.prettify_res(await resp.json()) - except aiohttp.ContentTypeError: - logging.error( - """Prometheus query failed with unexpected response. 
- The cookie may have expired.""" - ) - return {} - - def prettify_res(self, response: dict) -> dict: - """Process Prometheus response into an arrray of dicts with {label: value}""" - result_type = response.get("data", {}).get("resultType") - values_dict = { - "matrix": "values", - "vector": "value", - } - - if result_type not in values_dict: - logging.error(f"Prometheus response type {result_type} not supported") - return {} - - return [ - {"labels": result["metric"], "values": result[values_dict[result_type]]} - for result in response["data"]["result"] - ] - - async def get_job_annotations(self, gl_id: int, time: float) -> dict: - """ - args: - gl_id: gitlab job id - time: when to query (unix timestamp) - returns: dict of annotations - """ - - res = await self.query( - type="single", - query={ - "metric": "kube_pod_annotations", - "filters": {"annotation_gitlab_ci_job_id": gl_id}, - }, - time=time, - ) - - if not res: - raise IncompleteData("annotation data is missing") - - annotations = res[0]["labels"] - - return { - "pod": annotations["pod"], - # if build jobs is not set, defaults to 16 due to spack config - "build_jobs": annotations.get( - "annotation_metrics_spack_job_build_jobs", 16 - ), - "arch": annotations["annotation_metrics_spack_job_spec_arch"], - "pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"], - "pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"], - "pkg_variants": json.dumps( - spec_variants(annotations["annotation_metrics_spack_job_spec_variants"]) - ), - "compiler_name": annotations[ - "annotation_metrics_spack_job_spec_compiler_name" - ], - "compiler_version": annotations[ - "annotation_metrics_spack_job_spec_compiler_version" - ], - "stack": annotations["annotation_metrics_spack_ci_stack_name"], - } - - async def get_job_resources(self, pod: str, time: float) -> tuple[dict, str]: - """ - args: - pod: pod name - time: when to query (unix timestamp) - returns: dict of resources and node hostname - """ - - requests = process_resources( - await self.query( - type="single", - query={ - "metric": "kube_pod_container_resource_requests", - "filters": {"container": "build", "pod": pod}, - }, - time=time, - ) - ) - - limits_res = await self.query( - type="single", - query={ - "metric": "kube_pod_container_resource_limits", - "filters": {"container": "build", "pod": pod}, - }, - time=time, - ) - - if not limits_res: - raise IncompleteData("missing limits") - - # instead of needing to fetch the node where the pod ran from kube_pod_info - # we can grab it from kube_pod_container_resource_limits - # weirdly, it's not available in kube_pod_labels or annotations - # https://github.com/kubernetes/kube-state-metrics/issues/1148 - node = limits_res[0]["labels"]["node"] - limits = process_resources(limits_res) - - return ( - { - "cpu_request": requests["cpu"]["value"], - "mem_request": requests["memory"]["value"], - "cpu_limit": limits.get("cpu", {}).get("value"), - "mem_limit": limits["memory"]["value"], - }, - node, - ) - - async def get_job_usage(self, pod: str, start: float, end: float) -> dict: - """ - Gets resource usage attributes for a job. 
- - args: - pod: pod name - start: start time (unix timestamp) - end: end time (unix timestamp) - returns: dict of usage stats - """ - - mem_usage = process_usage( - await self.query( - type="range", - query={ - "metric": "container_memory_working_set_bytes", - "filters": {"container": "build", "pod": pod}, - }, - start=start, - end=end, - ) - ) - - cpu_usage = process_usage( - await self.query( - type="range", - custom_query=( - f"rate(container_cpu_usage_seconds_total{{" - f"pod='{pod}', container='build'}}[90s])" - ), - start=start, - end=end, - ) - ) - - return { - "cpu_mean": cpu_usage["mean"], - "cpu_median": cpu_usage["median"], - "cpu_max": cpu_usage["max"], - "cpu_min": cpu_usage["min"], - "cpu_stddev": cpu_usage["stddev"], - "mem_mean": mem_usage["mean"], - "mem_median": mem_usage["median"], - "mem_max": mem_usage["max"], - "mem_min": mem_usage["min"], - "mem_stddev": mem_usage["stddev"], - } - - async def get_node_uuid(self, hostname: str, time: float) -> dict: - """ - args: - hostname: node hostname - time: time to query (unix timestamp) - returns: dict of node info (UUID as of now) - """ - - res = await self.query( - type="single", - query={ - "metric": "kube_node_info", - "filters": {"node": hostname}, - }, - time=time, - ) - - if not res: - raise IncompleteData(f"node info is missing. hostname={hostname}") - - return res[0]["labels"]["system_uuid"] - - async def get_node_labels(self, hostname: str, time: float) -> dict: - """ - args: - hostname: node hostname - time: time to query (unix timestamp) - returns: dict of node labels - """ - - res = await self.query( - type="single", - query={ - "metric": "kube_node_labels", - "filters": {"node": hostname}, - }, - time=time, - ) - - if not res: - raise IncompleteData(f"node labels are missing. hostname={hostname}") - - labels = res[0]["labels"] - - return { - "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), - "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), - "arch": labels["label_kubernetes_io_arch"], - "os": labels["label_kubernetes_io_os"], - "instance_type": labels["label_node_kubernetes_io_instance_type"], - } - - -def query_to_str(metric: str, filters: dict) -> str: - """ - In: "metric", {key1: value1, key2: value2} - Out: "metric{key1="value1", key2="value2"}" - """ - filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) - return f"{metric}{{{filters_str}}}" - - -def process_resources(res: dict) -> dict: - """ - Processes the resource limits and requests from a Prometheus response into - readable format. - - args: - res: Prometheus response - - returns: dict with {resource: {unit: value}} format - """ - - if not res: - raise IncompleteData("resource data is missing") - - processed = {} - for item in res: - # duplicates are ignored by overwriting the previous entry - processed[item["labels"]["resource"]] = { - "unit": item["labels"]["unit"], - "value": float(item["values"][1]), - } - - return processed - - -def process_usage(res: dict) -> dict: - """ - Processes the usage data from a Prometheus response into readable format. - This could either be CPU usage or memory usage. 
- - args: - res: Prometheus response - - returns: dict with {statistic: value} format - """ - - if not res: - # sometimes prometheus reports no data for a job if the time range is too small - raise IncompleteData("usage data is missing") - - usage = [float(value) for timestamp, value in res[0]["values"]] - - sum_stats = { - "mean": statistics.fmean(usage), - # pstdev because we have the whole population - "stddev": statistics.pstdev(usage), - "max": max(usage), - "min": min(usage), - "median": statistics.median(usage), - } - - if ( - sum_stats["stddev"] == 0 - or sum_stats["mean"] == 0 - or math.isnan(sum_stats["stddev"]) - ): - raise IncompleteData("usage data is invalid") - - return sum_stats diff --git a/gantry/clients/prometheus/__init__.py b/gantry/clients/prometheus/__init__.py new file mode 100644 index 0000000..9234832 --- /dev/null +++ b/gantry/clients/prometheus/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from .prometheus import PrometheusClient diff --git a/gantry/clients/prometheus/job.py b/gantry/clients/prometheus/job.py new file mode 100644 index 0000000..48be608 --- /dev/null +++ b/gantry/clients/prometheus/job.py @@ -0,0 +1,148 @@ +import json + +from gantry.clients.prometheus import util +from gantry.util.spec import spec_variants + + +class PrometheusJobClient: + def __init__(self, client): + self.client = client + + async def get_annotations(self, gl_id: int, time: float) -> dict: + """ + args: + gl_id: gitlab job id + time: when to query (unix timestamp) + returns: dict of annotations + """ + + res = await self.client.query( + type="single", + query={ + "metric": "kube_pod_annotations", + "filters": {"annotation_gitlab_ci_job_id": gl_id}, + }, + time=time, + ) + + if not res: + raise util.IncompleteData("annotation data is missing") + + annotations = res[0]["labels"] + + return { + "pod": annotations["pod"], + # if build jobs is not set, defaults to 16 due to spack config + "build_jobs": annotations.get( + "annotation_metrics_spack_job_build_jobs", 16 + ), + "arch": annotations["annotation_metrics_spack_job_spec_arch"], + "pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"], + "pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"], + "pkg_variants": json.dumps( + spec_variants(annotations["annotation_metrics_spack_job_spec_variants"]) + ), + "compiler_name": annotations[ + "annotation_metrics_spack_job_spec_compiler_name" + ], + "compiler_version": annotations[ + "annotation_metrics_spack_job_spec_compiler_version" + ], + "stack": annotations["annotation_metrics_spack_ci_stack_name"], + } + + async def get_resources(self, pod: str, time: float) -> tuple[dict, str]: + """ + args: + pod: pod name + time: when to query (unix timestamp) + returns: dict of resources and node hostname + """ + + requests = util.process_resources( + await self.client.query( + type="single", + query={ + "metric": "kube_pod_container_resource_requests", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + ) + + limits_res = await self.client.query( + type="single", + query={ + "metric": "kube_pod_container_resource_limits", + "filters": {"container": "build", "pod": pod}, + }, + time=time, + ) + + if not limits_res: + raise util.IncompleteData("missing limits") + + # instead of needing to fetch the node where the pod ran from kube_pod_info + # we can grab it from kube_pod_container_resource_limits + # weirdly, it's not available in kube_pod_labels or annotations + # https://github.com/kubernetes/kube-state-metrics/issues/1148 + node = 
limits_res[0]["labels"]["node"] + limits = util.process_resources(limits_res) + + return ( + { + "cpu_request": requests["cpu"]["value"], + "mem_request": requests["memory"]["value"], + "cpu_limit": limits.get("cpu", {}).get("value"), + "mem_limit": limits["memory"]["value"], + }, + node, + ) + + async def get_usage(self, pod: str, start: float, end: float) -> dict: + """ + Gets resource usage attributes for a job. + + args: + pod: pod name + start: start time (unix timestamp) + end: end time (unix timestamp) + returns: dict of usage stats + """ + + mem_usage = util.process_usage( + await self.client.query( + type="range", + query={ + "metric": "container_memory_working_set_bytes", + "filters": {"container": "build", "pod": pod}, + }, + start=start, + end=end, + ) + ) + + cpu_usage = util.process_usage( + await self.client.query( + type="range", + custom_query=( + f"rate(container_cpu_usage_seconds_total{{" + f"pod='{pod}', container='build'}}[90s])" + ), + start=start, + end=end, + ) + ) + + return { + "cpu_mean": cpu_usage["mean"], + "cpu_median": cpu_usage["median"], + "cpu_max": cpu_usage["max"], + "cpu_min": cpu_usage["min"], + "cpu_stddev": cpu_usage["stddev"], + "mem_mean": mem_usage["mean"], + "mem_median": mem_usage["median"], + "mem_max": mem_usage["max"], + "mem_min": mem_usage["min"], + "mem_stddev": mem_usage["stddev"], + } diff --git a/gantry/clients/prometheus/node.py b/gantry/clients/prometheus/node.py new file mode 100644 index 0000000..13a3f50 --- /dev/null +++ b/gantry/clients/prometheus/node.py @@ -0,0 +1,58 @@ +from gantry.clients.prometheus import util + + +class PrometheusNodeClient: + def __init__(self, client): + self.client = client + + async def get_uuid(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node info (UUID as of now) + """ + + res = await self.client.query( + type="single", + query={ + "metric": "kube_node_info", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise util.IncompleteData(f"node info is missing. hostname={hostname}") + + return res[0]["labels"]["system_uuid"] + + async def get_labels(self, hostname: str, time: float) -> dict: + """ + args: + hostname: node hostname + time: time to query (unix timestamp) + returns: dict of node labels + """ + + res = await self.client.query( + type="single", + query={ + "metric": "kube_node_labels", + "filters": {"node": hostname}, + }, + time=time, + ) + + if not res: + raise util.IncompleteData(f"node labels are missing. 
hostname={hostname}") + + labels = res[0]["labels"] + + return { + "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), + "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), + "arch": labels["label_kubernetes_io_arch"], + "os": labels["label_kubernetes_io_os"], + "instance_type": labels["label_node_kubernetes_io_instance_type"], + } diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py new file mode 100644 index 0000000..8ee06b3 --- /dev/null +++ b/gantry/clients/prometheus/prometheus.py @@ -0,0 +1,105 @@ +import logging +import math +import urllib.parse + +import aiohttp + +from gantry.clients.prometheus import util +from gantry.clients.prometheus.job import PrometheusJobClient +from gantry.clients.prometheus.node import PrometheusNodeClient + + +class PrometheusClient: + def __init__(self, base_url: str, auth_cookie: str = ""): + # cookie will only be used if set + if auth_cookie: + self.cookies = {"_oauth2_proxy": auth_cookie} + else: + self.cookies = {} + + self.base_url = base_url + + async def query(self, type: str, **kwargs) -> dict: + """ + type: "range" or "single" + + for range queries: set `start` and `end` (unix timestamps) + for single queries: set `time` (unix timestamp) + + for custom queries: set `custom_query` (string) + + for metric queries: set `query` (dict) + example: + "query": { + "metric": "metric_name", + "filters": {"filter1": "value1", "filter2": "value2"} + } + """ + + # validate that one of query or custom_query is set, but not both or neither + if not kwargs.get("query") and not kwargs.get("custom_query"): + raise ValueError("query or custom_query must be set") + if kwargs.get("query") and kwargs.get("custom_query"): + raise ValueError("query and custom_query cannot both be set") + + query_str = urllib.parse.quote( + kwargs["custom_query"] + if kwargs.get("custom_query") + else util.query_to_str(**kwargs["query"]) + ) + + if type == "range": + # prometheus will only return this many frames + max_resolution = 10_000 + # calculating the max step size to get the desired resolution + step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) + url = ( + f"{self.base_url}/query_range?" + f"query={query_str}&" + f"start={kwargs['start']}&" + f"end={kwargs['end']}&" + f"step={step}s" + ) + return await self._query(url) + elif type == "single": + url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" + return await self._query(url) + + async def _query(self, url: str) -> dict: + """Query Prometheus with a query string""" + async with aiohttp.ClientSession(raise_for_status=True) as session: + # submit cookie with request + async with session.get(url, cookies=self.cookies) as resp: + try: + return self.prettify_res(await resp.json()) + except aiohttp.ContentTypeError: + logging.error( + """Prometheus query failed with unexpected response. 
+ The cookie may have expired.""" + ) + return {} + + def prettify_res(self, response: dict) -> dict: + """Process Prometheus response into an arrray of dicts with {label: value}""" + result_type = response.get("data", {}).get("resultType") + values_dict = { + "matrix": "values", + "vector": "value", + } + + if result_type not in values_dict: + logging.error(f"Prometheus response type {result_type} not supported") + return {} + + return [ + {"labels": result["metric"], "values": result[values_dict[result_type]]} + for result in response["data"]["result"] + ] + + @property + def job(self): + return PrometheusJobClient(self) + + @property + def node(self): + return PrometheusNodeClient(self) diff --git a/gantry/clients/prometheus/util.py b/gantry/clients/prometheus/util.py new file mode 100644 index 0000000..8bb5e42 --- /dev/null +++ b/gantry/clients/prometheus/util.py @@ -0,0 +1,76 @@ +import math +import statistics + + +class IncompleteData(Exception): + pass + + +def query_to_str(metric: str, filters: dict) -> str: + """ + In: "metric", {key1: value1, key2: value2} + Out: "metric{key1="value1", key2="value2"}" + """ + filters_str = ", ".join([f'{key}="{value}"' for key, value in filters.items()]) + return f"{metric}{{{filters_str}}}" + + +def process_resources(res: dict) -> dict: + """ + Processes the resource limits and requests from a Prometheus response into + readable format. + + args: + res: Prometheus response + + returns: dict with {resource: {unit: value}} format + """ + + if not res: + raise IncompleteData("resource data is missing") + + processed = {} + for item in res: + # duplicates are ignored by overwriting the previous entry + processed[item["labels"]["resource"]] = { + "unit": item["labels"]["unit"], + "value": float(item["values"][1]), + } + + return processed + + +def process_usage(res: dict) -> dict: + """ + Processes the usage data from a Prometheus response into readable format. + This could either be CPU usage or memory usage. 
+ + args: + res: Prometheus response + + returns: dict with {statistic: value} format + """ + + if not res: + # sometimes prometheus reports no data for a job if the time range is too small + raise IncompleteData("usage data is missing") + + usage = [float(value) for timestamp, value in res[0]["values"]] + + sum_stats = { + "mean": statistics.fmean(usage), + # pstdev because we have the whole population + "stddev": statistics.pstdev(usage), + "max": max(usage), + "min": min(usage), + "median": statistics.median(usage), + } + + if ( + sum_stats["stddev"] == 0 + or sum_stats["mean"] == 0 + or math.isnan(sum_stats["stddev"]) + ): + raise IncompleteData("usage data is invalid") + + return sum_stats diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index d5125cd..08831d4 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -4,7 +4,8 @@ from gantry import db from gantry.clients.gitlab import GitlabClient -from gantry.clients.prometheus import IncompleteData, PrometheusClient +from gantry.clients.prometheus import PrometheusClient +from gantry.clients.prometheus.util import IncompleteData from gantry.models import Job MB_IN_BYTES = 1_000_000 @@ -55,11 +56,11 @@ async def fetch_job( return try: - annotations = await prometheus.get_job_annotations(job.gl_id, job.midpoint) - resources, node_hostname = await prometheus.get_job_resources( + annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint) + resources, node_hostname = await prometheus.job.get_resources( annotations["pod"], job.midpoint ) - usage = await prometheus.get_job_usage(annotations["pod"], job.start, job.end) + usage = await prometheus.job.get_usage(annotations["pod"], job.start, job.end) node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint) except IncompleteData as e: # missing data, skip this job @@ -106,13 +107,13 @@ async def fetch_node( returns: id of the inserted or existing node """ - node_uuid = await prometheus.get_node_uuid(hostname, query_time) + node_uuid = await prometheus.node.get_uuid(hostname, query_time) # do not proceed if the node exists if existing_node := await db.get_node(db_conn, node_uuid): return existing_node - node_labels = await prometheus.get_node_labels(hostname, query_time) + node_labels = await prometheus.node.get_labels(hostname, query_time) return await db.insert_node( db_conn, { From 49c1eaf48397f073bea56b7422e0638cd663af1f Mon Sep 17 00:00:00 2001 From: caetano melone Date: Thu, 25 Jan 2024 17:12:24 -0800 Subject: [PATCH 18/27] lessen fatality of not receiving the right hook --- gantry/views.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index d9a0bb4..8363af8 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -1,5 +1,6 @@ import asyncio import json +import logging import os from aiohttp import web @@ -20,7 +21,9 @@ async def collect_job(request: web.Request) -> web.Response: return web.Response(status=401, text="invalid token") if request.headers.get("X-Gitlab-Event") != "Job Hook": - return web.Response(status=400, text="invalid event type") + logging.error(f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab.") + # return 200 so gitlab doesn't disable the webhook -- this is not fatal + return web.Response(status=200) # will return immediately, but will not block the event loop # allowing fetch_job to run in the background From 3bb1ebcb7cf499ff0be043293b843a77f889f29c Mon Sep 17 00:00:00 2001 From: caetano melone Date: 
Thu, 25 Jan 2024 18:25:08 -0800 Subject: [PATCH 19/27] black --- gantry/views.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index 8363af8..821016b 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -21,7 +21,9 @@ async def collect_job(request: web.Request) -> web.Response: return web.Response(status=401, text="invalid token") if request.headers.get("X-Gitlab-Event") != "Job Hook": - logging.error(f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab.") + logging.error( + f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab." + ) # return 200 so gitlab doesn't disable the webhook -- this is not fatal return web.Response(status=200) From 0bfe4e665463011f8c7aefbadd48ae08cc9f3a12 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Tue, 30 Jan 2024 22:30:50 -0800 Subject: [PATCH 20/27] no need to store ghost jobs as they are being collected by KW --- db/schema.sql | 5 ----- gantry/db/insert.py | 6 ------ gantry/routes/collection.py | 1 - 3 files changed, 12 deletions(-) diff --git a/db/schema.sql b/db/schema.sql index 8104a4e..bba3549 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -45,8 +45,3 @@ CREATE TABLE jobs ( ON UPDATE CASCADE ON DELETE CASCADE ); - -CREATE TABLE ghost_jobs ( - id INTEGER PRIMARY KEY, - gitlab_id INTEGER NOT NULL -); diff --git a/gantry/db/insert.py b/gantry/db/insert.py index 3df157d..7564ad9 100644 --- a/gantry/db/insert.py +++ b/gantry/db/insert.py @@ -27,12 +27,6 @@ def insert_dict(table: str, input: dict, ignore=False) -> tuple[str, tuple]: return query, values_tuple -async def insert_ghost(db: aiosqlite.Connection, gl_id: int) -> None: - """Inserts a ghost job into the database.""" - - await db.execute(("insert into ghost_jobs (gitlab_id) values (?)"), (gl_id,)) - - async def insert_node(db: aiosqlite.Connection, node: dict) -> int: """Inserts a node into the database.""" diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 08831d4..1ca6d82 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -52,7 +52,6 @@ async def fetch_job( job_log = await gitlab.job_log(job.gl_id) is_ghost = "No need to rebuild" in job_log if is_ghost: - db.insert_ghost(db_conn, job.gl_id) return try: From 004310646f06db854b114cdb2bc9b05175b4e69b Mon Sep 17 00:00:00 2001 From: caetano melone Date: Sun, 4 Feb 2024 23:21:07 -0800 Subject: [PATCH 21/27] don't try to collect UO-ran jobs [ci skip] --- gantry/routes/collection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index 1ca6d82..1fd7ed7 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -27,7 +27,7 @@ async def fetch_job( payload: a dictionary containing the information from the Gitlab job hook db: an active aiosqlite connection - returns: None in order to accomodate a 200 response for the webhook. + returns: None in order to accommodate a 200 response for the webhook. 
""" job = Job( @@ -43,6 +43,8 @@ async def fetch_job( if ( job.status != "success" or not job.valid_build_name # is not a build job + # uo runners are not in Prometheus + or payload["runner"]["description"].startswith("uo") or await db.job_exists(db_conn, job.gl_id) # job already in the database or await db.ghost_exists(db_conn, job.gl_id) # ghost already in db ): From 53fb639a1080cec9a53d2f75b84e19ef67965647 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:25:30 -0800 Subject: [PATCH 22/27] remove tests [ci skip] --- gantry/tests/test_utils.py | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 gantry/tests/test_utils.py diff --git a/gantry/tests/test_utils.py b/gantry/tests/test_utils.py deleted file mode 100644 index 010e08b..0000000 --- a/gantry/tests/test_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from gantry.util.misc import spec_variants - -# write tests for spec_variants here -# +adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on has to equal {} - - -@pytest.fixture -def variant_string(): - return "+adios2~advanced_debug patches=02253c7,acb3805,b724e6a use_vtkm=on" - - -def test_spec_variants(variant_string): - assert spec_variants(variant_string) == { - "adios2": True, - "advanced_debug": False, - "patches": ["02253c7", "acb3805", "b724e6a"], - "use_vtkm": "on", - } From 9da7248d44d22bc14d74445a3cdb7d9fc0fcd50c Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:32:45 -0800 Subject: [PATCH 23/27] import clients individually [ci skip] --- gantry/__main__.py | 3 ++- gantry/clients/__init__.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gantry/__main__.py b/gantry/__main__.py index ebb3e34..ff25d64 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -3,7 +3,8 @@ import aiosqlite from aiohttp import web -from gantry.clients import GitlabClient, PrometheusClient +from gantry.clients.gitlab import GitlabClient +from gantry.clients.prometheus import PrometheusClient from gantry.views import routes diff --git a/gantry/clients/__init__.py b/gantry/clients/__init__.py index 2dbe3f6..e69de29 100644 --- a/gantry/clients/__init__.py +++ b/gantry/clients/__init__.py @@ -1,3 +0,0 @@ -# flake8: noqa -from .gitlab import GitlabClient -from .prometheus import PrometheusClient From 8e01222f2c3424dc6a2bca6b788d72f942714022 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:33:41 -0800 Subject: [PATCH 24/27] version the API [ci skip] --- gantry/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gantry/views.py b/gantry/views.py index 821016b..3acb80b 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -10,7 +10,7 @@ routes = web.RouteTableDef() -@routes.post("/collect") +@routes.post("/v1/collect") async def collect_job(request: web.Request) -> web.Response: try: payload = await request.json() From fd7ecfa813b52b3b078eaa1c60ad1192d77fcd90 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 11:58:12 -0800 Subject: [PATCH 25/27] break up PrometheusClient.query into query_single and query_range [ci skip] --- gantry/clients/prometheus/job.py | 17 ++---- gantry/clients/prometheus/node.py | 6 +- gantry/clients/prometheus/prometheus.py | 77 ++++++++++++------------- gantry/clients/prometheus/util.py | 14 +++++ 4 files changed, 60 insertions(+), 54 deletions(-) diff --git a/gantry/clients/prometheus/job.py b/gantry/clients/prometheus/job.py index 48be608..9f7162c 100644 --- a/gantry/clients/prometheus/job.py +++ 
b/gantry/clients/prometheus/job.py @@ -16,8 +16,7 @@ async def get_annotations(self, gl_id: int, time: float) -> dict: returns: dict of annotations """ - res = await self.client.query( - type="single", + res = await self.client.query_single( query={ "metric": "kube_pod_annotations", "filters": {"annotation_gitlab_ci_job_id": gl_id}, @@ -60,8 +59,7 @@ async def get_resources(self, pod: str, time: float) -> tuple[dict, str]: """ requests = util.process_resources( - await self.client.query( - type="single", + await self.client.query_single( query={ "metric": "kube_pod_container_resource_requests", "filters": {"container": "build", "pod": pod}, @@ -70,8 +68,7 @@ async def get_resources(self, pod: str, time: float) -> tuple[dict, str]: ) ) - limits_res = await self.client.query( - type="single", + limits_res = await self.client.query_single( query={ "metric": "kube_pod_container_resource_limits", "filters": {"container": "build", "pod": pod}, @@ -111,8 +108,7 @@ async def get_usage(self, pod: str, start: float, end: float) -> dict: """ mem_usage = util.process_usage( - await self.client.query( - type="range", + await self.client.query_range( query={ "metric": "container_memory_working_set_bytes", "filters": {"container": "build", "pod": pod}, @@ -123,9 +119,8 @@ async def get_usage(self, pod: str, start: float, end: float) -> dict: ) cpu_usage = util.process_usage( - await self.client.query( - type="range", - custom_query=( + await self.client.query_range( + query=( f"rate(container_cpu_usage_seconds_total{{" f"pod='{pod}', container='build'}}[90s])" ), diff --git a/gantry/clients/prometheus/node.py b/gantry/clients/prometheus/node.py index 13a3f50..abfb217 100644 --- a/gantry/clients/prometheus/node.py +++ b/gantry/clients/prometheus/node.py @@ -13,8 +13,7 @@ async def get_uuid(self, hostname: str, time: float) -> dict: returns: dict of node info (UUID as of now) """ - res = await self.client.query( - type="single", + res = await self.client.query_single( query={ "metric": "kube_node_info", "filters": {"node": hostname}, @@ -35,8 +34,7 @@ async def get_labels(self, hostname: str, time: float) -> dict: returns: dict of node labels """ - res = await self.client.query( - type="single", + res = await self.client.query_single( query={ "metric": "kube_node_labels", "filters": {"node": hostname}, diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py index 8ee06b3..e6ecaee 100644 --- a/gantry/clients/prometheus/prometheus.py +++ b/gantry/clients/prometheus/prometheus.py @@ -1,6 +1,5 @@ import logging import math -import urllib.parse import aiohttp @@ -19,51 +18,51 @@ def __init__(self, base_url: str, auth_cookie: str = ""): self.base_url = base_url - async def query(self, type: str, **kwargs) -> dict: + async def query_single(self, query: str | dict, time: int) -> dict: + """Query Prometheus for a single value + args: + + query: str or dict + if str, the query string + if dict, the metric and filters + example: + "query": { + "metric": "metric_name", + "filters": {"filter1": "value1", "filter2": "value2"} + } + time: int (unix timestamp) + + returns: dict with {label: value} format """ - type: "range" or "single" - for range queries: set `start` and `end` (unix timestamps) - for single queries: set `time` (unix timestamp) + query = util.process_query(query) + url = f"{self.base_url}/query?query={query}&time={time}" + return await self._query(url) - for custom queries: set `custom_query` (string) + async def query_range(self, query: str | dict, start: int, end: 
int) -> dict: + """Query Prometheus for a range of values - for metric queries: set `query` (dict) - example: - "query": { - "metric": "metric_name", - "filters": {"filter1": "value1", "filter2": "value2"} - } - """ + args: + query: see query_single + start: int (unix timestamp) + end: int (unix timestamp) - # validate that one of query or custom_query is set, but not both or neither - if not kwargs.get("query") and not kwargs.get("custom_query"): - raise ValueError("query or custom_query must be set") - if kwargs.get("query") and kwargs.get("custom_query"): - raise ValueError("query and custom_query cannot both be set") + returns: list of dicts with {label: value} format + """ - query_str = urllib.parse.quote( - kwargs["custom_query"] - if kwargs.get("custom_query") - else util.query_to_str(**kwargs["query"]) + query = util.process_query(query) + # prometheus will only return this many frames + max_resolution = 10_000 + # calculating the max step size to get the desired resolution + step = math.ceil((end - start) / max_resolution) + url = ( + f"{self.base_url}/query_range?" + f"query={query}&" + f"start={start}&" + f"end={end}&" + f"step={step}s" ) - - if type == "range": - # prometheus will only return this many frames - max_resolution = 10_000 - # calculating the max step size to get the desired resolution - step = math.ceil((kwargs["end"] - kwargs["start"]) / max_resolution) - url = ( - f"{self.base_url}/query_range?" - f"query={query_str}&" - f"start={kwargs['start']}&" - f"end={kwargs['end']}&" - f"step={step}s" - ) - return await self._query(url) - elif type == "single": - url = f"{self.base_url}/query?query={query_str}&time={kwargs['time']}" - return await self._query(url) + return await self._query(url) async def _query(self, url: str) -> dict: """Query Prometheus with a query string""" diff --git a/gantry/clients/prometheus/util.py b/gantry/clients/prometheus/util.py index 8bb5e42..eaf768d 100644 --- a/gantry/clients/prometheus/util.py +++ b/gantry/clients/prometheus/util.py @@ -1,11 +1,25 @@ import math import statistics +import urllib.parse class IncompleteData(Exception): pass +def process_query(query: dict | str) -> str: + """ + Processes query into a string that can be used in a URL. + See query_single in prometheus.py for more details on args. 
+ """ + if isinstance(query, dict): + query = query_to_str(**query) + elif not isinstance(query, str): + raise ValueError("query must be a string or dict") + + return urllib.parse.quote(query) + + def query_to_str(metric: str, filters: dict) -> str: """ In: "metric", {key1: value1, key2: value2} From a93a5288cb00ca84911524e392afdc069bf7efb3 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 12:02:01 -0800 Subject: [PATCH 26/27] fix prometheus client types --- gantry/clients/prometheus/prometheus.py | 10 +++++----- gantry/clients/prometheus/util.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gantry/clients/prometheus/prometheus.py b/gantry/clients/prometheus/prometheus.py index e6ecaee..acb2dd3 100644 --- a/gantry/clients/prometheus/prometheus.py +++ b/gantry/clients/prometheus/prometheus.py @@ -18,7 +18,7 @@ def __init__(self, base_url: str, auth_cookie: str = ""): self.base_url = base_url - async def query_single(self, query: str | dict, time: int) -> dict: + async def query_single(self, query: str | dict, time: int) -> list: """Query Prometheus for a single value args: @@ -39,7 +39,7 @@ async def query_single(self, query: str | dict, time: int) -> dict: url = f"{self.base_url}/query?query={query}&time={time}" return await self._query(url) - async def query_range(self, query: str | dict, start: int, end: int) -> dict: + async def query_range(self, query: str | dict, start: int, end: int) -> list: """Query Prometheus for a range of values args: @@ -64,7 +64,7 @@ async def query_range(self, query: str | dict, start: int, end: int) -> dict: ) return await self._query(url) - async def _query(self, url: str) -> dict: + async def _query(self, url: str) -> list: """Query Prometheus with a query string""" async with aiohttp.ClientSession(raise_for_status=True) as session: # submit cookie with request @@ -78,7 +78,7 @@ async def _query(self, url: str) -> dict: ) return {} - def prettify_res(self, response: dict) -> dict: + def prettify_res(self, response: dict) -> list: """Process Prometheus response into an arrray of dicts with {label: value}""" result_type = response.get("data", {}).get("resultType") values_dict = { @@ -88,7 +88,7 @@ def prettify_res(self, response: dict) -> dict: if result_type not in values_dict: logging.error(f"Prometheus response type {result_type} not supported") - return {} + return [] return [ {"labels": result["metric"], "values": result[values_dict[result_type]]} diff --git a/gantry/clients/prometheus/util.py b/gantry/clients/prometheus/util.py index eaf768d..e749dae 100644 --- a/gantry/clients/prometheus/util.py +++ b/gantry/clients/prometheus/util.py @@ -29,7 +29,7 @@ def query_to_str(metric: str, filters: dict) -> str: return f"{metric}{{{filters_str}}}" -def process_resources(res: dict) -> dict: +def process_resources(res: list) -> dict: """ Processes the resource limits and requests from a Prometheus response into readable format. @@ -54,7 +54,7 @@ def process_resources(res: dict) -> dict: return processed -def process_usage(res: dict) -> dict: +def process_usage(res: list) -> dict: """ Processes the usage data from a Prometheus response into readable format. This could either be CPU usage or memory usage. 
From 56157abb71bceda23aa804354a96c18882004d81 Mon Sep 17 00:00:00 2001 From: caetano melone Date: Mon, 5 Feb 2024 12:05:55 -0800 Subject: [PATCH 27/27] fix flake8 --- gantry/views.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gantry/views.py b/gantry/views.py index 3acb80b..b71a738 100644 --- a/gantry/views.py +++ b/gantry/views.py @@ -21,9 +21,7 @@ async def collect_job(request: web.Request) -> web.Response: return web.Response(status=401, text="invalid token") if request.headers.get("X-Gitlab-Event") != "Job Hook": - logging.error( - f"invalid event type {request.headers.get('X-Gitlab-Event')} received from Gitlab." - ) + logging.error(f"invalid event type {request.headers.get('X-Gitlab-Event')}") # return 200 so gitlab doesn't disable the webhook -- this is not fatal return web.Response(status=200)
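Illustrative usage sketch (not part of the patch series above): assuming a reachable Prometheus endpoint and an existing build pod, both placeholders below, the modular client introduced in patches 17 and 25 can be driven directly like this:

import asyncio

from gantry.clients.prometheus import PrometheusClient


async def main():
    # placeholder endpoint; in the service the base URL comes from configuration
    prometheus = PrometheusClient("http://localhost:9090/api/v1")

    # job- and node-scoped helpers are exposed as properties on the client
    # (PrometheusJobClient / PrometheusNodeClient from patch 17)
    usage = await prometheus.job.get_usage(
        pod="runner-abc123-project-2-concurrent-0",  # hypothetical pod name
        start=1706200000,  # placeholder unix timestamps
        end=1706203600,
    )
    print(usage["cpu_mean"], usage["mem_max"])


asyncio.run(main())

The URL, pod name, and timestamps here are hypothetical; in the running service these values come from the Gitlab webhook payload handled by fetch_job, and get_usage raises IncompleteData when Prometheus returns no samples for the requested window.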