Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect baseline job costs #132

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions gantry/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ async def apply_migrations(db: aiosqlite.Connection):
# and not inadvertently added to the migrations folder
("001_initial.sql", 1),
("002_spec_index.sql", 2),
("003_job_cost.sql", 3),
]

# apply migrations that have not been applied
Expand All @@ -45,6 +46,8 @@ async def apply_migrations(db: aiosqlite.Connection):
async def init_db(app: web.Application):
db = await aiosqlite.connect(os.environ["DB_FILE"])
await apply_migrations(db)
# ensure foreign key constraints are enabled
await db.execute("PRAGMA foreign_keys = ON")
app["db"] = db
yield
await db.close()
Expand Down
67 changes: 67 additions & 0 deletions gantry/clients/prometheus/job.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json

import aiosqlite

from gantry.clients.prometheus import util
from gantry.util.spec import spec_variants

Expand Down Expand Up @@ -152,3 +154,68 @@ async def get_usage(self, pod: str, start: float, end: float) -> dict:
"mem_min": mem_usage["min"],
"mem_stddev": mem_usage["stddev"],
}

async def get_costs(
self,
db: aiosqlite.Connection,
start: float,
end: float,
node_id: int,
) -> dict:
"""
Calculates the costs associated with a job.

args:
db: a database connection
resources: job requests and limits
usage: job memory and cpu usage
start: job start time
end: job end time
node_id: the node that the job ran on

returns:
dict of: job_cost_instance (cost of the instance over the job's lifetime)
"""
costs = {}
async with db.execute(
"""
select capacity_type, instance_type, zone
from nodes where id = ?
""",
(node_id,),
) as cursor:
node = await cursor.fetchone()

if not node:
# this is a temporary condition that will happen during the transition
# to collecting
raise util.IncompleteData(
f"node instance metadata is missing from db. node={node_id}"
)

capacity_type, instance_type, zone = node

# spot instance prices can change, so we avg the cost over the job's runtime
instance_costs = await self.client.query_range(
query={
"metric": "karpenter_cloudprovider_instance_type_offering_price_estimate", # noqa: E501
"filters": {
"capacity_type": capacity_type,
"instance_type": instance_type,
"zone": zone,
},
},
start=start,
end=end,
)

if not instance_costs:
raise util.IncompleteData(f"node cost is missing. node={node_id}")

instance_costs = [float(value) for _, value in instance_costs[0]["values"]]
# average hourly cost of the instance over the job's lifetime
instance_cost = sum(instance_costs) / len(instance_costs)
# compute cost: hourly instance cost * job runtime in hours
node_cost = instance_cost * ((end - start) / 3600) # seconds to hours
costs["job_cost_instance"] = node_cost
return costs
2 changes: 2 additions & 0 deletions gantry/clients/prometheus/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,6 @@ async def get_labels(self, hostname: str, time: float) -> dict:
"arch": labels["label_kubernetes_io_arch"],
"os": labels["label_kubernetes_io_os"],
"instance_type": labels["label_node_kubernetes_io_instance_type"],
"capacity_type": labels["label_karpenter_sh_capacity_type"],
"zone": labels["label_topology_kubernetes_io_zone"],
}
5 changes: 5 additions & 0 deletions gantry/routes/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ async def fetch_job(
)
usage = await prometheus.job.get_usage(annotations["pod"], job.start, job.end)
node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint)
costs = await prometheus.job.get_costs(db_conn, job.start, job.end, node_id)

except aiohttp.ClientError as e:
logger.error(f"Request failed: {e}")
return
Expand All @@ -93,6 +95,7 @@ async def fetch_job(
**annotations,
**resources,
**usage,
**costs,
},
)

Expand Down Expand Up @@ -139,5 +142,7 @@ async def fetch_node(
"arch": node_labels["arch"],
"os": node_labels["os"],
"instance_type": node_labels["instance_type"],
"capacity_type": node_labels["capacity_type"],
"zone": node_labels["zone"],
},
)
5 changes: 3 additions & 2 deletions gantry/tests/defs/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@

# used to compare successful insertions
# run SELECT * FROM table_name WHERE id = 1; from python sqlite api and grab fetchone() result
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485)
INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge')
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 0.19083333333333333)
INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge', 'us-east-1c', 'spot')

# these were obtained by executing the respective queries to Prometheus and capturing the JSON output
# or the raw output of PrometheusClient._query
Expand All @@ -32,6 +32,7 @@
VALID_CPU_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'container': 'build', 'cpu': 'total', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117145, '0.2483743618267752'], [1706117146, '0.25650526138466395'], [1706117147, '0.26463616094255266'], [1706117148, '0.2727670605004414'], [1706117149, '0.28089796005833007'], [1706117150, '0.2890288596162188'], [1706117151, '0.2971597591741076'], [1706117357, '3.7319005481816236'], [1706117358, '3.7319005481816236'], [1706117359, '3.7319005481816236'], [1706117360, '3.7319005481816245'], [1706117361, '3.7319005481816245'], [1706118420, '4.128116379389054']]}]}}
VALID_NODE_INFO = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_node_info', 'container': 'kube-state-metrics', 'container_runtime_version': 'containerd://1.7.2', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'internal_ip': '192.168.86.107', 'job': 'kube-state-metrics', 'kernel_version': '5.10.205-195.804.amzn2.x86_64', 'kubelet_version': 'v1.27.9-eks-5e0fdde', 'kubeproxy_version': 'v1.27.9-eks-5e0fdde', 'namespace': 'monitoring', 'node': 'ip-192-168-86-107.ec2.internal', 'os_image': 'Amazon Linux 2', 'pod': 'kube-prometheus-stack-kube-state-metrics-dbd66d8c7-6ftw8', 'provider_id': 'aws:///us-east-1c/i-0fe9d9c99fdb3631d', 'service': 'kube-prometheus-stack-kube-state-metrics', 'system_uuid': 'ec253b04-b1dc-f08b-acac-e23df83b3602'}, 'value': [1706117733, '1']}]}}
VALID_NODE_LABELS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_node_labels', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'label_beta_kubernetes_io_arch': 'amd64', 'label_beta_kubernetes_io_instance_type': 'i3en.6xlarge', 'label_beta_kubernetes_io_os': 'linux', 'label_failure_domain_beta_kubernetes_io_region': 'us-east-1', 'label_failure_domain_beta_kubernetes_io_zone': 'us-east-1c', 'label_k8s_io_cloud_provider_aws': 'ceb9f9cc8e47252a6f7fe7d6bded2655', 'label_karpenter_k8s_aws_instance_category': 'i', 'label_karpenter_k8s_aws_instance_cpu': '24', 'label_karpenter_k8s_aws_instance_encryption_in_transit_supported': 'true', 'label_karpenter_k8s_aws_instance_family': 'i3en', 'label_karpenter_k8s_aws_instance_generation': '3', 'label_karpenter_k8s_aws_instance_hypervisor': 'nitro', 'label_karpenter_k8s_aws_instance_local_nvme': '15000', 'label_karpenter_k8s_aws_instance_memory': '196608', 'label_karpenter_k8s_aws_instance_network_bandwidth': '25000', 'label_karpenter_k8s_aws_instance_pods': '234', 'label_karpenter_k8s_aws_instance_size': '6xlarge', 'label_karpenter_sh_capacity_type': 'spot', 'label_karpenter_sh_initialized': 'true', 'label_karpenter_sh_provisioner_name': 'glr-x86-64-v4', 'label_kubernetes_io_arch': 'amd64', 'label_kubernetes_io_hostname': 'ip-192-168-86-107.ec2.internal', 'label_kubernetes_io_os': 'linux', 'label_node_kubernetes_io_instance_type': 'i3en.6xlarge', 'label_spack_io_pipeline': 'true', 'label_spack_io_x86_64': 'v4', 'label_topology_ebs_csi_aws_com_zone': 'us-east-1c', 'label_topology_kubernetes_io_region': 'us-east-1', 'label_topology_kubernetes_io_zone': 'us-east-1c', 'namespace': 'monitoring', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'kube-prometheus-stack-kube-state-metrics-dbd66d8c7-6ftw8', 'service': 'kube-prometheus-stack-kube-state-metrics'}, 'value': [1706117733, '1']}]}}
VALID_NODE_COST = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'karpenter_cloudprovider_instance_type_offering_price_estimate', 'capacity_type': 'spot', 'container': 'controller', 'endpoint': 'http-metrics', 'instance': '192.168.240.113:8000', 'instance_type': 'i3en.6xlarge', 'job': 'karpenter', 'namespace': 'karpenter', 'pod': 'karpenter-8488f7f6dc-ml7q8', 'region': 'us-east-1', 'service': 'karpenter', 'zone': 'us-east-1c'}, 'values': [[1723838829, '0.5']]}]}}

# modified version of VALID_MEMORY_USAGE to make the mean/stddev 0
INVALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '0']]}]}}
2 changes: 1 addition & 1 deletion gantry/tests/defs/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# fmt: off

# valid input into insert_node
NODE_INSERT_DICT = {"uuid": "ec253b04-b1dc-f08b-acac-e23df83b3602", "hostname": "ip-192-168-86-107.ec2.internal", "cores": 24.0, "mem": 196608000000.0, "arch": "amd64", "os": "linux", "instance_type": "i3en.6xlarge"}
NODE_INSERT_DICT = {"uuid": "ec253b04-b1dc-f08b-acac-e23df83b3602", "hostname": "ip-192-168-86-107.ec2.internal", "cores": 24.0, "mem": 196608000000.0, "arch": "amd64", "os": "linux", "instance_type": "i3en.6xlarge", "zone": "us-east-1c", "capacity_type": "spot"}
2 changes: 1 addition & 1 deletion gantry/tests/sql/insert_job.sql
Original file line number Diff line number Diff line change
@@ -1 +1 @@
INSERT INTO jobs VALUES(1,'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z',2,1706117046,1706118420,9892514,'success','pr42264_bugfix/mathomp4/hdf5-appleclang15','gmsh','4.8.4','{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',16,0.75,NULL,4.12532286694540495,3.15805864677520409,11.6038107294648877,0.248374361826775191,3.34888880339475214,2000000000.0,48000000000.0,1649868862.72588062,999763968.0,5679742976.0,2785280.0,1378705563.21018671);
INSERT INTO jobs VALUES(1,'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z',2,1706117046,1706118420,9892514,'success','pr42264_bugfix/mathomp4/hdf5-appleclang15','gmsh','4.8.4','{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',16,0.75,NULL,4.12532286694540495,3.15805864677520409,11.6038107294648877,0.248374361826775191,3.34888880339475214,2000000000.0,48000000000.0,1649868862.72588062,999763968.0,5679742976.0,2785280.0,1378705563.21018671,0.19083333333333333);
2 changes: 1 addition & 1 deletion gantry/tests/sql/insert_node.sql
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
--- primary key is set to 2 to set up the test that checks for race conditions
INSERT INTO nodes VALUES(2,'ec253b04-b1dc-f08b-acac-e23df83b3602','ip-192-168-86-107.ec2.internal',24.0,196608000000.0,'amd64','linux','i3en.6xlarge');
INSERT INTO nodes VALUES(2,'ec253b04-b1dc-f08b-acac-e23df83b3602','ip-192-168-86-107.ec2.internal',24.0,196608000000.0,'amd64','linux','i3en.6xlarge','us-east-1c','spot');
Loading