Skip to content

Commit

Permalink
Collect node spot instance costs
Browse files Browse the repository at this point in the history
adds the following fields to the db schema:

- capacity_type: spot or on-demand
- zone: aws zone
- hourly_cost: price per hour of the node's instance type

Using this information, we can better assess the costs associated with each CI job. I will follow up with another PR that incorporates a cost-per-job figure into gantry, but this is all that is needed before we start running predictions.
  • Loading branch information
cmelone committed Aug 20, 2024
1 parent 77e9f04 commit 0c534d6
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 5 deletions.
3 changes: 3 additions & 0 deletions gantry/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ async def apply_migrations(db: aiosqlite.Connection):
# they are applied in the correct order
# and not inadvertently added to the migrations folder
("001_initial.sql", 1),
("002_node_cost.sql", 2),
]

# apply migrations that have not been applied
Expand All @@ -44,6 +45,8 @@ async def apply_migrations(db: aiosqlite.Connection):
async def init_db(app: web.Application):
    """Open the SQLite database, migrate it, and expose it as ``app["db"]``.

    This is an async generator: the code before ``yield`` is the setup
    phase and the code after it is the teardown phase.
    NOTE(review): presumably registered as an aiohttp cleanup context;
    the registration is not visible here -- confirm.

    Args:
        app: application object that receives the open connection under
            the ``"db"`` key.

    Raises:
        KeyError: if the ``DB_FILE`` environment variable is not set.
    """
    db = await aiosqlite.connect(os.environ["DB_FILE"])
    try:
        await apply_migrations(db)
        # ensure foreign key constraints are enabled
        # (done after the migrations because a migration script may toggle
        # this pragma itself, e.g. 002_node_cost.sql turns it off and on)
        await db.execute("PRAGMA foreign_keys = ON")
        app["db"] = db
        yield
    finally:
        # close the connection on normal teardown AND when setup fails,
        # so a failed migration does not leave the connection open
        await db.close()
Expand Down
24 changes: 23 additions & 1 deletion gantry/clients/prometheus/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,32 @@ async def get_labels(self, hostname: str, time: float) -> dict:

labels = res[0]["labels"]

return {
node_labels = {
"cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]),
"mem": float(labels["label_karpenter_k8s_aws_instance_memory"]),
"arch": labels["label_kubernetes_io_arch"],
"os": labels["label_kubernetes_io_os"],
"instance_type": labels["label_node_kubernetes_io_instance_type"],
"zone": labels["label_topology_kubernetes_io_zone"],
"capacity_type": labels["label_karpenter_sh_capacity_type"],
}

# get hourly cost
res = await self.client.query_single(
query={
"metric": "karpenter_cloudprovider_instance_type_price_estimate",
"filters": {
"capacity_type": node_labels["capacity_type"],
"instance_type": node_labels["instance_type"],
"zone": node_labels["zone"],
},
},
time=time,
)

if not res:
raise util.IncompleteData(f"node cost is missing. hostname={hostname}")

node_labels["hourly_cost"] = float(res[0]["values"][1])

return node_labels
3 changes: 3 additions & 0 deletions gantry/routes/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,8 @@ async def fetch_node(
"arch": node_labels["arch"],
"os": node_labels["os"],
"instance_type": node_labels["instance_type"],
"zone": node_labels["zone"],
"capacity_type": node_labels["capacity_type"],
"hourly_cost": node_labels["hourly_cost"],
},
)
3 changes: 2 additions & 1 deletion gantry/tests/defs/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# used to compare successful insertions
# run SELECT * FROM table_name WHERE id = 1; from python sqlite api and grab fetchone() result
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485)
INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge')
INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge', 'us-east-1c', 'spot', 0.5)

# these were obtained by executing the respective queries to Prometheus and capturing the JSON output
# or the raw output of PrometheusClient._query
Expand All @@ -32,6 +32,7 @@
VALID_CPU_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'container': 'build', 'cpu': 'total', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117145, '0.2483743618267752'], [1706117146, '0.25650526138466395'], [1706117147, '0.26463616094255266'], [1706117148, '0.2727670605004414'], [1706117149, '0.28089796005833007'], [1706117150, '0.2890288596162188'], [1706117151, '0.2971597591741076'], [1706117357, '3.7319005481816236'], [1706117358, '3.7319005481816236'], [1706117359, '3.7319005481816236'], [1706117360, '3.7319005481816245'], [1706117361, '3.7319005481816245'], [1706118420, '4.128116379389054']]}]}}
VALID_NODE_INFO = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_node_info', 'container': 'kube-state-metrics', 'container_runtime_version': 'containerd://1.7.2', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'internal_ip': '192.168.86.107', 'job': 'kube-state-metrics', 'kernel_version': '5.10.205-195.804.amzn2.x86_64', 'kubelet_version': 'v1.27.9-eks-5e0fdde', 'kubeproxy_version': 'v1.27.9-eks-5e0fdde', 'namespace': 'monitoring', 'node': 'ip-192-168-86-107.ec2.internal', 'os_image': 'Amazon Linux 2', 'pod': 'kube-prometheus-stack-kube-state-metrics-dbd66d8c7-6ftw8', 'provider_id': 'aws:///us-east-1c/i-0fe9d9c99fdb3631d', 'service': 'kube-prometheus-stack-kube-state-metrics', 'system_uuid': 'ec253b04-b1dc-f08b-acac-e23df83b3602'}, 'value': [1706117733, '1']}]}}
VALID_NODE_LABELS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_node_labels', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'label_beta_kubernetes_io_arch': 'amd64', 'label_beta_kubernetes_io_instance_type': 'i3en.6xlarge', 'label_beta_kubernetes_io_os': 'linux', 'label_failure_domain_beta_kubernetes_io_region': 'us-east-1', 'label_failure_domain_beta_kubernetes_io_zone': 'us-east-1c', 'label_k8s_io_cloud_provider_aws': 'ceb9f9cc8e47252a6f7fe7d6bded2655', 'label_karpenter_k8s_aws_instance_category': 'i', 'label_karpenter_k8s_aws_instance_cpu': '24', 'label_karpenter_k8s_aws_instance_encryption_in_transit_supported': 'true', 'label_karpenter_k8s_aws_instance_family': 'i3en', 'label_karpenter_k8s_aws_instance_generation': '3', 'label_karpenter_k8s_aws_instance_hypervisor': 'nitro', 'label_karpenter_k8s_aws_instance_local_nvme': '15000', 'label_karpenter_k8s_aws_instance_memory': '196608', 'label_karpenter_k8s_aws_instance_network_bandwidth': '25000', 'label_karpenter_k8s_aws_instance_pods': '234', 'label_karpenter_k8s_aws_instance_size': '6xlarge', 'label_karpenter_sh_capacity_type': 'spot', 'label_karpenter_sh_initialized': 'true', 'label_karpenter_sh_provisioner_name': 'glr-x86-64-v4', 'label_kubernetes_io_arch': 'amd64', 'label_kubernetes_io_hostname': 'ip-192-168-86-107.ec2.internal', 'label_kubernetes_io_os': 'linux', 'label_node_kubernetes_io_instance_type': 'i3en.6xlarge', 'label_spack_io_pipeline': 'true', 'label_spack_io_x86_64': 'v4', 'label_topology_ebs_csi_aws_com_zone': 'us-east-1c', 'label_topology_kubernetes_io_region': 'us-east-1', 'label_topology_kubernetes_io_zone': 'us-east-1c', 'namespace': 'monitoring', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'kube-prometheus-stack-kube-state-metrics-dbd66d8c7-6ftw8', 'service': 'kube-prometheus-stack-kube-state-metrics'}, 'value': [1706117733, '1']}]}}
VALID_NODE_COST = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'karpenter_cloudprovider_instance_type_price_estimate', 'capacity_type': 'spot', 'container': 'controller', 'endpoint': 'http-metrics', 'instance': '192.168.240.113:8000', 'instance_type': 'i3en.6xlarge', 'job': 'karpenter', 'namespace': 'karpenter', 'pod': 'karpenter-8488f7f6dc-ml7q8', 'region': 'us-east-1', 'service': 'karpenter', 'zone': 'us-east-1c'}, 'value': [1723838829, '0.5']}]}}

# modified version of VALID_MEMORY_USAGE to make the mean/stddev 0
INVALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '0']]}]}}
2 changes: 1 addition & 1 deletion gantry/tests/defs/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# fmt: off

# valid input into insert_node
NODE_INSERT_DICT = {"uuid": "ec253b04-b1dc-f08b-acac-e23df83b3602", "hostname": "ip-192-168-86-107.ec2.internal", "cores": 24.0, "mem": 196608000000.0, "arch": "amd64", "os": "linux", "instance_type": "i3en.6xlarge"}
NODE_INSERT_DICT = {"uuid": "ec253b04-b1dc-f08b-acac-e23df83b3602", "hostname": "ip-192-168-86-107.ec2.internal", "cores": 24.0, "mem": 196608000000.0, "arch": "amd64", "os": "linux", "instance_type": "i3en.6xlarge", "zone": "us-east-1c", "capacity_type": "spot", "hourly_cost": 0.5}
2 changes: 1 addition & 1 deletion gantry/tests/sql/insert_node.sql
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
--- primary key is set to 2 to set up the test that checks for race conditions
INSERT INTO nodes VALUES(2,'ec253b04-b1dc-f08b-acac-e23df83b3602','ip-192-168-86-107.ec2.internal',24.0,196608000000.0,'amd64','linux','i3en.6xlarge');
INSERT INTO nodes VALUES(2,'ec253b04-b1dc-f08b-acac-e23df83b3602','ip-192-168-86-107.ec2.internal',24.0,196608000000.0,'amd64','linux','i3en.6xlarge','us-east-1c','spot', 0.5);
2 changes: 1 addition & 1 deletion gantry/tests/sql/insert_samples.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
INSERT INTO nodes VALUES(6789,'ec2c47a0-7e9b-cfa3-9ad4-ac227ade598d','ip-192-168-202-150.ec2.internal',32.0,131072000000.0,'amd64','linux','m5.8xlarge');
INSERT INTO nodes VALUES(6789,'ec2c47a0-7e9b-cfa3-9ad4-ac227ade598d','ip-192-168-202-150.ec2.internal',32.0,131072000000.0,'amd64','linux','m5.8xlarge','us-east-1c','spot', 0.5);
INSERT INTO jobs VALUES(6781,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi1',6789,1708919572.983000041,1708924744.811000108,101502092,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,9.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9652098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419);
INSERT INTO jobs VALUES(6782,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi2',6789,1708919572.983000041,1708924744.811000108,101502093,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,10.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9958098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419);
INSERT INTO jobs VALUES(6783,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi3',6789,1708919572.983000041,1708924744.811000108,101502094,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,11.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9158098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419);
Expand Down
1 change: 1 addition & 0 deletions gantry/tests/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"job_cpu_usage": defs.VALID_CPU_USAGE,
"node_info": defs.VALID_NODE_INFO,
"node_labels": defs.VALID_NODE_LABELS,
"node_cost": defs.VALID_NODE_COST,
}


Expand Down
30 changes: 30 additions & 0 deletions migrations/002_node_cost.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- Migration 002: rebuild the nodes table with three additional columns
-- (zone, capacity_type, hourly_cost)

-- temporarily disable foreign key constraints
-- (presumably so that dropping nodes below does not trip constraints from
-- tables that reference it -- verify against 001_initial.sql)
-- NOTE(review): per the SQLite docs this pragma is a no-op inside an open
-- transaction -- confirm the migration runner executes this script outside one
PRAGMA foreign_keys = OFF;

-- create tmp table: same columns as the existing nodes table plus the new ones
CREATE TABLE nodes_tmp (
id INTEGER PRIMARY KEY,
uuid TEXT NOT NULL UNIQUE,
hostname TEXT NOT NULL,
cores REAL NOT NULL,
mem REAL NOT NULL,
arch TEXT NOT NULL,
os TEXT NOT NULL,
instance_type TEXT NOT NULL,
-- new columns below
zone TEXT NOT NULL,
capacity_type TEXT NOT NULL,
hourly_cost REAL NOT NULL
);

-- this table rebuild is needed because the new columns are declared
-- NOT NULL without a schema-level DEFAULT, while rows that already exist
-- still need some value
-- ALTER TABLE ... ADD COLUMN only accepts a NOT NULL column when it also
-- declares a non-NULL DEFAULT, so it cannot express this directly

-- copy data from nodes to nodes_tmp
-- '', '', and 0 are the placeholder values backfilled into existing rows
-- for zone, capacity_type, and hourly_cost
INSERT INTO nodes_tmp SELECT id, uuid, hostname, cores, mem, arch, os, instance_type, '', '', 0 FROM nodes;
-- swap the rebuilt table in under the original name
DROP TABLE nodes;
ALTER TABLE nodes_tmp RENAME TO nodes;

-- re-enable foreign key constraints for the rest of this connection
PRAGMA foreign_keys = ON;

0 comments on commit 0c534d6

Please sign in to comment.