
Collection API #3

Merged · 28 commits · Feb 12, 2024
Changes from 22 commits
Commits
98586b3
Add basic GitHub Actions CI
alecbcs Dec 21, 2023
57ca71c
rough draft of collection functionality
cmelone Dec 25, 2023
c8a39d8
Revert "Add basic GitHub Actions CI"
cmelone Dec 25, 2023
a1346b4
Merge branch 'develop' into add/collection-func
cmelone Dec 25, 2023
a8dec44
line breaks
cmelone Jan 11, 2024
1f3de5c
improvements to collection
cmelone Jan 16, 2024
130a1f1
aiohttp server basics
cmelone Jan 16, 2024
45d8ef1
refactoring of collection
cmelone Jan 18, 2024
a1a864b
isort
cmelone Jan 23, 2024
6316550
don't depend on dotenv for .env sourcing
cmelone Jan 24, 2024
0f89fba
add stack
cmelone Jan 24, 2024
4e5324d
restructure how clients are initialized
cmelone Jan 24, 2024
45820b4
reorganize files into clients/models/routes
cmelone Jan 24, 2024
4f78c3a
decouple spec utility functions from misc.py
cmelone Jan 24, 2024
fe2449d
rename vm: node build: job
cmelone Jan 25, 2024
7b05c8c
reorganize functionality around clients rather than models
cmelone Jan 25, 2024
608c043
job_id -> gitlab_id
cmelone Jan 25, 2024
11d05fc
make prometheus client more modular
cmelone Jan 26, 2024
49c1eaf
lessen fatality of not receiving the right hook
cmelone Jan 26, 2024
3bb1ebc
black
cmelone Jan 26, 2024
0bfe4e6
no need to store ghost jobs as they are being collected by KW
cmelone Jan 31, 2024
0043106
don't try to collect UO-ran jobs [ci skip]
cmelone Feb 5, 2024
53fb639
remove tests [ci skip]
cmelone Feb 5, 2024
9da7248
import clients individually [ci skip]
cmelone Feb 5, 2024
8e01222
version the API [ci skip]
cmelone Feb 5, 2024
fd7ecfa
break up PrometheusClient.query into query_single and query_range [ci…
cmelone Feb 5, 2024
a93a528
fix prometheus client types
cmelone Feb 5, 2024
56157ab
fix flake8
cmelone Feb 5, 2024
17 changes: 17 additions & 0 deletions .envrc
@@ -0,0 +1,17 @@
#------------------------------------------------------------------------
# Load Development Spack Environment (If Spack is installed.)
#
# Run 'direnv allow' from within the cloned repository to automatically
# load the spack environment when you enter the directory.
#------------------------------------------------------------------------
if type spack &>/dev/null; then
. $SPACK_ROOT/share/spack/setup-env.sh
spack env activate -d .
fi

#------------------------------------------------------------------------
# Load Environment Variables from .env (if file exists)
#------------------------------------------------------------------------
if [ -e .env ]; then
source .env
fi
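
For local development, the .env file sourced above supplies the variables that gantry/__main__.py (below) reads. A minimal example (the variable names come from this PR; every value is an illustrative placeholder):

# .env (gitignored; all values below are placeholders)
export DB_FILE=db/gantry.db
export GITLAB_URL=https://gitlab.example.com/api/v4/projects/123
export GITLAB_API_TOKEN=glpat-xxxxxxxxxxxxxxxxxxxx
export PROMETHEUS_URL=https://prometheus.example.com
export PROMETHEUS_COOKIE=""  # optional; the code falls back to an empty string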
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203, E704
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
__pycache__
.env
spack.lock
.spack-env
db/*.db
47 changes: 47 additions & 0 deletions db/schema.sql
@@ -0,0 +1,47 @@
CREATE TABLE nodes (
id INTEGER PRIMARY KEY,
uuid TEXT NOT NULL UNIQUE,
hostname TEXT NOT NULL,
cores REAL NOT NULL,
mem REAL NOT NULL,
arch TEXT NOT NULL,
os TEXT NOT NULL,
instance_type TEXT NOT NULL
);

CREATE TABLE jobs (
id INTEGER PRIMARY KEY,
pod TEXT NOT NULL UNIQUE,
node INTEGER NOT NULL,
start INTEGER NOT NULL,
end INTEGER NOT NULL,
gitlab_id INTEGER NOT NULL UNIQUE,
job_status TEXT NOT NULL,
ref TEXT NOT NULL,
pkg_name TEXT NOT NULL,
pkg_version TEXT NOT NULL,
pkg_variants TEXT NOT NULL,
compiler_name TEXT NOT NULL,
compiler_version TEXT NOT NULL,
arch TEXT NOT NULL,
stack TEXT NOT NULL,
build_jobs INTEGER NOT NULL,
cpu_request REAL NOT NULL,
    cpu_limit REAL, -- this can be null because it's currently not set
cpu_mean REAL NOT NULL,
cpu_median REAL NOT NULL,
cpu_max REAL NOT NULL,
cpu_min REAL NOT NULL,
cpu_stddev REAL NOT NULL,
mem_request REAL NOT NULL,
mem_limit REAL NOT NULL,
mem_mean REAL NOT NULL,
mem_median REAL NOT NULL,
mem_max REAL NOT NULL,
mem_min REAL NOT NULL,
mem_stddev REAL NOT NULL,
FOREIGN KEY (node)
REFERENCES nodes (id)
ON UPDATE CASCADE
ON DELETE CASCADE
);
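
Note that SQLite ignores foreign keys unless PRAGMA foreign_keys = ON is issued on each connection, which is why gantry/__main__.py (below) enables it right after connecting. A quick sketch of the cascade behavior against a database created from this schema (the file path and inserted values are placeholders):

import asyncio

import aiosqlite


async def demo():
    db = await aiosqlite.connect("db/gantry.db")  # placeholder path
    await db.execute("PRAGMA foreign_keys = ON;")  # required for the cascade below
    await db.execute(
        "INSERT INTO nodes (uuid, hostname, cores, mem, arch, os, instance_type)"
        " VALUES (?, ?, ?, ?, ?, ?, ?)",
        ("abc-123", "node-1", 8.0, 16384.0, "amd64", "linux", "m5.2xlarge"),
    )
    # deleting a node now also deletes any jobs that reference it
    # (ON DELETE CASCADE on jobs.node)
    await db.execute("DELETE FROM nodes WHERE hostname = ?", ("node-1",))
    await db.commit()
    await db.close()


asyncio.run(demo())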
32 changes: 31 additions & 1 deletion gantry/__main__.py
@@ -1,5 +1,35 @@
import os

import aiosqlite
from aiohttp import web

from gantry.clients import GitlabClient, PrometheusClient
from gantry.views import routes


async def init_db(app: web.Application):
db = await aiosqlite.connect(os.environ["DB_FILE"])
await db.execute("PRAGMA foreign_keys = ON;")
app["db"] = db
yield
await db.close()


async def init_clients(app: web.Application):
app["gitlab"] = GitlabClient(
os.environ["GITLAB_URL"], os.environ["GITLAB_API_TOKEN"]
)
app["prometheus"] = PrometheusClient(
os.environ["PROMETHEUS_URL"], os.environ.get("PROMETHEUS_COOKIE", "")
)


def main():
print("Hello World")
app = web.Application()
app.add_routes(routes)
app.cleanup_ctx.append(init_db)
app.on_startup.append(init_clients)
web.run_app(app)


if __name__ == "__main__":
    main()
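
The cleanup_ctx entry treats init_db as an async generator: everything before the yield runs at startup, everything after it runs at shutdown, so the SQLite connection is closed cleanly. Handlers then reach shared state through the application, roughly like this hypothetical example (the real routes live in gantry/views, which this diff view doesn't show):

from aiohttp import web

routes = web.RouteTableDef()


# hypothetical endpoint for illustration; not part of this PR
@routes.get("/v1/health")
async def health(request: web.Request) -> web.Response:
    db = request.app["db"]  # the aiosqlite connection stored by init_db
    await db.execute("SELECT 1")
    return web.json_response({"ok": True})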
3 changes: 3 additions & 0 deletions gantry/clients/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa
from .gitlab import GitlabClient
from .prometheus import PrometheusClient
31 changes: 31 additions & 0 deletions gantry/clients/gitlab.py
@@ -0,0 +1,31 @@
import aiohttp


class GitlabClient:
def __init__(self, base_url: str, api_token: str):
self.base_url = base_url
self.headers = {"PRIVATE-TOKEN": api_token}

async def _request(self, url: str, response_type: str) -> dict | str:
"""
Helper for requests to the Gitlab API.

args:
url: the url to request
response_type: the type of response to expect (json or text)

returns: the response from Gitlab in the specified format
"""

async with aiohttp.ClientSession(raise_for_status=True) as session:
async with session.get(url, headers=self.headers) as resp:
if response_type == "json":
return await resp.json()
if response_type == "text":
return await resp.text()

async def job_log(self, gl_id: int) -> str:
"""Given a job id, returns the log from that job"""

url = f"{self.base_url}/jobs/{gl_id}/trace"
return await self._request(url, "text")
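
A minimal way to exercise the client outside the server, with placeholder URL, token, and job id (the real values come from GITLAB_URL and GITLAB_API_TOKEN in __main__.py):

import asyncio

from gantry.clients import GitlabClient


async def main():
    gitlab = GitlabClient(
        "https://gitlab.example.com/api/v4/projects/123",  # placeholder base URL
        "glpat-xxxxxxxxxxxxxxxxxxxx",  # placeholder token
    )
    log = await gitlab.job_log(12345)  # placeholder job id
    print(log[-500:])  # tail of the build log


asyncio.run(main())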
2 changes: 2 additions & 0 deletions gantry/clients/prometheus/__init__.py
@@ -0,0 +1,2 @@
# flake8: noqa
from .prometheus import PrometheusClient
148 changes: 148 additions & 0 deletions gantry/clients/prometheus/job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import json

from gantry.clients.prometheus import util
from gantry.util.spec import spec_variants


class PrometheusJobClient:
def __init__(self, client):
self.client = client

async def get_annotations(self, gl_id: int, time: float) -> dict:
"""
args:
gl_id: gitlab job id
time: when to query (unix timestamp)
returns: dict of annotations
"""

res = await self.client.query(
type="single",
query={
"metric": "kube_pod_annotations",
"filters": {"annotation_gitlab_ci_job_id": gl_id},
},
time=time,
)

if not res:
raise util.IncompleteData("annotation data is missing")

annotations = res[0]["labels"]

return {
"pod": annotations["pod"],
# if build jobs is not set, defaults to 16 due to spack config
"build_jobs": annotations.get(
"annotation_metrics_spack_job_build_jobs", 16
),
"arch": annotations["annotation_metrics_spack_job_spec_arch"],
"pkg_name": annotations["annotation_metrics_spack_job_spec_pkg_name"],
"pkg_version": annotations["annotation_metrics_spack_job_spec_pkg_version"],
"pkg_variants": json.dumps(
spec_variants(annotations["annotation_metrics_spack_job_spec_variants"])
),
"compiler_name": annotations[
"annotation_metrics_spack_job_spec_compiler_name"
],
"compiler_version": annotations[
"annotation_metrics_spack_job_spec_compiler_version"
],
"stack": annotations["annotation_metrics_spack_ci_stack_name"],
}

async def get_resources(self, pod: str, time: float) -> tuple[dict, str]:
"""
args:
pod: pod name
time: when to query (unix timestamp)
returns: dict of resources and node hostname
"""

requests = util.process_resources(
await self.client.query(
type="single",
query={
"metric": "kube_pod_container_resource_requests",
"filters": {"container": "build", "pod": pod},
},
time=time,
)
)

limits_res = await self.client.query(
type="single",
query={
"metric": "kube_pod_container_resource_limits",
"filters": {"container": "build", "pod": pod},
},
time=time,
)

if not limits_res:
raise util.IncompleteData("missing limits")

# instead of needing to fetch the node where the pod ran from kube_pod_info
# we can grab it from kube_pod_container_resource_limits
# weirdly, it's not available in kube_pod_labels or annotations
# https://github.com/kubernetes/kube-state-metrics/issues/1148
node = limits_res[0]["labels"]["node"]
limits = util.process_resources(limits_res)

return (
{
"cpu_request": requests["cpu"]["value"],
"mem_request": requests["memory"]["value"],
"cpu_limit": limits.get("cpu", {}).get("value"),
"mem_limit": limits["memory"]["value"],
},
node,
)

async def get_usage(self, pod: str, start: float, end: float) -> dict:
"""
Gets resource usage attributes for a job.

args:
pod: pod name
start: start time (unix timestamp)
end: end time (unix timestamp)
returns: dict of usage stats
"""

mem_usage = util.process_usage(
await self.client.query(
type="range",
query={
"metric": "container_memory_working_set_bytes",
"filters": {"container": "build", "pod": pod},
},
start=start,
end=end,
)
)

cpu_usage = util.process_usage(
await self.client.query(
type="range",
custom_query=(
f"rate(container_cpu_usage_seconds_total{{"
f"pod='{pod}', container='build'}}[90s])"
),
start=start,
end=end,
)
)

return {
"cpu_mean": cpu_usage["mean"],
"cpu_median": cpu_usage["median"],
"cpu_max": cpu_usage["max"],
"cpu_min": cpu_usage["min"],
"cpu_stddev": cpu_usage["stddev"],
"mem_mean": mem_usage["mean"],
"mem_median": mem_usage["median"],
"mem_max": mem_usage["max"],
"mem_min": mem_usage["min"],
"mem_stddev": mem_usage["stddev"],
}
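
Taken together, collecting one job is a three-step flow: get_annotations identifies the pod and its spec, get_resources yields requests/limits plus the node hostname, and get_usage summarizes the time series between the job's start and end. A rough sketch of that flow (assuming the client wiring from __main__.py; the collect_job name is illustrative, not part of this PR):

from gantry.clients import PrometheusClient
from gantry.clients.prometheus.job import PrometheusJobClient


async def collect_job(
    prometheus: PrometheusClient, gl_id: int, start: float, end: float
) -> tuple[dict, str]:
    job = PrometheusJobClient(prometheus)
    annotations = await job.get_annotations(gl_id, time=end)
    resources, node = await job.get_resources(annotations["pod"], time=end)
    usage = await job.get_usage(annotations["pod"], start=start, end=end)
    # one flat record per job, matching the columns of db/schema.sql;
    # `node` feeds the node lookup in prometheus/node.py below
    return {**annotations, **resources, **usage}, node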
58 changes: 58 additions & 0 deletions gantry/clients/prometheus/node.py
@@ -0,0 +1,58 @@
from gantry.clients.prometheus import util


class PrometheusNodeClient:
def __init__(self, client):
self.client = client

    async def get_uuid(self, hostname: str, time: float) -> str:
"""
args:
hostname: node hostname
time: time to query (unix timestamp)
        returns: the node's system UUID (the only node info collected for now)
"""

res = await self.client.query(
type="single",
query={
"metric": "kube_node_info",
"filters": {"node": hostname},
},
time=time,
)

if not res:
raise util.IncompleteData(f"node info is missing. hostname={hostname}")

return res[0]["labels"]["system_uuid"]

async def get_labels(self, hostname: str, time: float) -> dict:
"""
args:
hostname: node hostname
time: time to query (unix timestamp)
returns: dict of node labels
"""

res = await self.client.query(
type="single",
query={
"metric": "kube_node_labels",
"filters": {"node": hostname},
},
time=time,
)

if not res:
raise util.IncompleteData(f"node labels are missing. hostname={hostname}")

labels = res[0]["labels"]

return {
"cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]),
"mem": float(labels["label_karpenter_k8s_aws_instance_memory"]),
"arch": labels["label_kubernetes_io_arch"],
"os": labels["label_kubernetes_io_os"],
"instance_type": labels["label_node_kubernetes_io_instance_type"],
}
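
The node client composes the same way, keyed by the hostname that get_resources discovered. A sketch under the same assumptions (collect_node is illustrative):

from gantry.clients import PrometheusClient
from gantry.clients.prometheus.node import PrometheusNodeClient


async def collect_node(
    prometheus: PrometheusClient, hostname: str, time: float
) -> dict:
    node = PrometheusNodeClient(prometheus)
    uuid = await node.get_uuid(hostname, time)
    labels = await node.get_labels(hostname, time)
    # one row for the nodes table in db/schema.sql
    return {"uuid": uuid, "hostname": hostname, **labels}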