diff --git a/gantry/__main__.py b/gantry/__main__.py index ab66f86..3a4fe79 100644 --- a/gantry/__main__.py +++ b/gantry/__main__.py @@ -27,6 +27,7 @@ async def apply_migrations(db: aiosqlite.Connection): # they are applied in the correct order # and not inadvertently added to the migrations folder ("001_initial.sql", 1), + ("002_node_cost.sql", 2), ] # apply migrations that have not been applied @@ -44,6 +45,8 @@ async def apply_migrations(db: aiosqlite.Connection): async def init_db(app: web.Application): db = await aiosqlite.connect(os.environ["DB_FILE"]) await apply_migrations(db) + # ensure foreign key constraints are enabled + await db.execute("PRAGMA foreign_keys = ON") app["db"] = db yield await db.close() diff --git a/gantry/clients/prometheus/node.py b/gantry/clients/prometheus/node.py index abfb217..3bd29cd 100644 --- a/gantry/clients/prometheus/node.py +++ b/gantry/clients/prometheus/node.py @@ -47,10 +47,32 @@ async def get_labels(self, hostname: str, time: float) -> dict: labels = res[0]["labels"] - return { + node_labels = { "cores": float(labels["label_karpenter_k8s_aws_instance_cpu"]), "mem": float(labels["label_karpenter_k8s_aws_instance_memory"]), "arch": labels["label_kubernetes_io_arch"], "os": labels["label_kubernetes_io_os"], "instance_type": labels["label_node_kubernetes_io_instance_type"], + "zone": labels["label_topology_kubernetes_io_zone"], + "capacity_type": labels["label_karpenter_sh_capacity_type"], } + + # get hourly cost + res = await self.client.query_single( + query={ + "metric": "karpenter_cloudprovider_instance_type_price_estimate", + "filters": { + "capacity_type": node_labels["capacity_type"], + "instance_type": node_labels["instance_type"], + "zone": node_labels["zone"], + }, + }, + time=time, + ) + + if not res: + raise util.IncompleteData(f"node cost is missing. 
hostname={hostname}") + + node_labels["hourly_cost"] = float(res[0]["values"][1]) + + return node_labels diff --git a/gantry/routes/collection.py b/gantry/routes/collection.py index aed7d47..0ce6205 100644 --- a/gantry/routes/collection.py +++ b/gantry/routes/collection.py @@ -133,5 +133,8 @@ async def fetch_node( "arch": node_labels["arch"], "os": node_labels["os"], "instance_type": node_labels["instance_type"], + "zone": node_labels["zone"], + "capacity_type": node_labels["capacity_type"], + "hourly_cost": node_labels["hourly_cost"], }, ) diff --git a/gantry/tests/defs/collection.py b/gantry/tests/defs/collection.py index d419d79..af64d58 100644 --- a/gantry/tests/defs/collection.py +++ b/gantry/tests/defs/collection.py @@ -21,7 +21,7 @@ # used to compare successful insertions # run SELECT * FROM table_name WHERE id = 1; from python sqlite api and grab fetchone() result INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485) -INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge') +INSERTED_NODE = (1, 
'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge', 'us-east-1c', 'spot', 0.5) # these were obtained by executing the respective queries to Prometheus and capturing the JSON output # or the raw output of PrometheusClient._query @@ -32,6 +32,7 @@ VALID_CPU_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'container': 'build', 'cpu': 'total', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117145, '0.2483743618267752'], [1706117146, '0.25650526138466395'], [1706117147, '0.26463616094255266'], [1706117148, '0.2727670605004414'], [1706117149, '0.28089796005833007'], [1706117150, '0.2890288596162188'], [1706117151, '0.2971597591741076'], [1706117357, '3.7319005481816236'], [1706117358, '3.7319005481816236'], [1706117359, '3.7319005481816236'], [1706117360, '3.7319005481816245'], [1706117361, '3.7319005481816245'], [1706118420, '4.128116379389054']]}]}} VALID_NODE_INFO = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_node_info', 'container': 'kube-state-metrics', 'container_runtime_version': 'containerd://1.7.2', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'internal_ip': '192.168.86.107', 'job': 'kube-state-metrics', 'kernel_version': '5.10.205-195.804.amzn2.x86_64', 'kubelet_version': 
'v1.27.9-eks-5e0fdde', 'kubeproxy_version': 'v1.27.9-eks-5e0fdde', 'namespace': 'monitoring', 'node': 'ip-192-168-86-107.ec2.internal', 'os_image': 'Amazon Linux 2', 'pod': 'kube-prometheus-stack-kube-state-metrics-dbd66d8c7-6ftw8', 'provider_id': 'aws:///us-east-1c/i-0fe9d9c99fdb3631d', 'service': 'kube-prometheus-stack-kube-state-metrics', 'system_uuid': 'ec253b04-b1dc-f08b-acac-e23df83b3602'}, 'value': [1706117733, '1']}]}} VALID_NODE_LABELS = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'kube_node_labels', 'container': 'kube-state-metrics', 'endpoint': 'http', 'instance': '192.168.164.84:8080', 'job': 'kube-state-metrics', 'label_beta_kubernetes_io_arch': 'amd64', 'label_beta_kubernetes_io_instance_type': 'i3en.6xlarge', 'label_beta_kubernetes_io_os': 'linux', 'label_failure_domain_beta_kubernetes_io_region': 'us-east-1', 'label_failure_domain_beta_kubernetes_io_zone': 'us-east-1c', 'label_k8s_io_cloud_provider_aws': 'ceb9f9cc8e47252a6f7fe7d6bded2655', 'label_karpenter_k8s_aws_instance_category': 'i', 'label_karpenter_k8s_aws_instance_cpu': '24', 'label_karpenter_k8s_aws_instance_encryption_in_transit_supported': 'true', 'label_karpenter_k8s_aws_instance_family': 'i3en', 'label_karpenter_k8s_aws_instance_generation': '3', 'label_karpenter_k8s_aws_instance_hypervisor': 'nitro', 'label_karpenter_k8s_aws_instance_local_nvme': '15000', 'label_karpenter_k8s_aws_instance_memory': '196608', 'label_karpenter_k8s_aws_instance_network_bandwidth': '25000', 'label_karpenter_k8s_aws_instance_pods': '234', 'label_karpenter_k8s_aws_instance_size': '6xlarge', 'label_karpenter_sh_capacity_type': 'spot', 'label_karpenter_sh_initialized': 'true', 'label_karpenter_sh_provisioner_name': 'glr-x86-64-v4', 'label_kubernetes_io_arch': 'amd64', 'label_kubernetes_io_hostname': 'ip-192-168-86-107.ec2.internal', 'label_kubernetes_io_os': 'linux', 'label_node_kubernetes_io_instance_type': 'i3en.6xlarge', 'label_spack_io_pipeline': 'true', 
'label_spack_io_x86_64': 'v4', 'label_topology_ebs_csi_aws_com_zone': 'us-east-1c', 'label_topology_kubernetes_io_region': 'us-east-1', 'label_topology_kubernetes_io_zone': 'us-east-1c', 'namespace': 'monitoring', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'kube-prometheus-stack-kube-state-metrics-dbd66d8c7-6ftw8', 'service': 'kube-prometheus-stack-kube-state-metrics'}, 'value': [1706117733, '1']}]}} +VALID_NODE_COST = {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'__name__': 'karpenter_cloudprovider_instance_type_price_estimate', 'capacity_type': 'spot', 'container': 'controller', 'endpoint': 'http-metrics', 'instance': '192.168.240.113:8000', 'instance_type': 'i3en.6xlarge', 'job': 'karpenter', 'namespace': 'karpenter', 'pod': 'karpenter-8488f7f6dc-ml7q8', 'region': 'us-east-1', 'service': 'karpenter', 'zone': 'us-east-1c'}, 'value': [1723838829, '0.5']}]}} # modified version of VALID_MEMORY_USAGE to make the mean/stddev 0 INVALID_MEMORY_USAGE = {'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'container_memory_working_set_bytes', 'container': 'build', 'endpoint': 'https-metrics', 'id': '/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-podd7aa13e0_998c_4f21_b1d6_62781f4980b0.slice/cri-containerd-48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1.scope', 'image': 'ghcr.io/spack/ubuntu20.04-runner-amd64-gcc-11.4:2023.08.01', 'instance': '192.168.86.107:10250', 'job': 'kubelet', 'metrics_path': '/metrics/cadvisor', 'name': '48a5e9e7d46655e73ba119fa16b65fa94ceed23c55157db8269b0b12f18f55d1', 'namespace': 'pipeline', 'node': 'ip-192-168-86-107.ec2.internal', 'pod': 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 'service': 'kube-prometheus-stack-kubelet'}, 'values': [[1706117115, '0']]}]}} diff --git a/gantry/tests/defs/db.py b/gantry/tests/defs/db.py index d4e5290..23824a2 100644 --- a/gantry/tests/defs/db.py +++ b/gantry/tests/defs/db.py @@ -2,4 +2,4 @@ # fmt: 
off # valid input into insert_node -NODE_INSERT_DICT = {"uuid": "ec253b04-b1dc-f08b-acac-e23df83b3602", "hostname": "ip-192-168-86-107.ec2.internal", "cores": 24.0, "mem": 196608000000.0, "arch": "amd64", "os": "linux", "instance_type": "i3en.6xlarge"} +NODE_INSERT_DICT = {"uuid": "ec253b04-b1dc-f08b-acac-e23df83b3602", "hostname": "ip-192-168-86-107.ec2.internal", "cores": 24.0, "mem": 196608000000.0, "arch": "amd64", "os": "linux", "instance_type": "i3en.6xlarge", "zone": "us-east-1c", "capacity_type": "spot", "hourly_cost": 0.5} diff --git a/gantry/tests/sql/insert_node.sql b/gantry/tests/sql/insert_node.sql index cad50ee..165bf75 100644 --- a/gantry/tests/sql/insert_node.sql +++ b/gantry/tests/sql/insert_node.sql @@ -1,2 +1,2 @@ --- primary key is set to 2 to set up the test that checks for race conditions -INSERT INTO nodes VALUES(2,'ec253b04-b1dc-f08b-acac-e23df83b3602','ip-192-168-86-107.ec2.internal',24.0,196608000000.0,'amd64','linux','i3en.6xlarge'); \ No newline at end of file +INSERT INTO nodes VALUES(2,'ec253b04-b1dc-f08b-acac-e23df83b3602','ip-192-168-86-107.ec2.internal',24.0,196608000000.0,'amd64','linux','i3en.6xlarge','us-east-1c','spot', 0.5); \ No newline at end of file diff --git a/gantry/tests/sql/insert_samples.sql b/gantry/tests/sql/insert_samples.sql index d017ebe..bcaa8c3 100644 --- a/gantry/tests/sql/insert_samples.sql +++ b/gantry/tests/sql/insert_samples.sql @@ -1,4 +1,4 @@ -INSERT INTO nodes VALUES(6789,'ec2c47a0-7e9b-cfa3-9ad4-ac227ade598d','ip-192-168-202-150.ec2.internal',32.0,131072000000.0,'amd64','linux','m5.8xlarge'); +INSERT INTO nodes VALUES(6789,'ec2c47a0-7e9b-cfa3-9ad4-ac227ade598d','ip-192-168-202-150.ec2.internal',32.0,131072000000.0,'amd64','linux','m5.8xlarge','us-east-1c','spot', 0.5); INSERT INTO jobs VALUES(6781,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi1',6789,1708919572.983000041,1708924744.811000108,101502092,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": 
false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,9.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9652098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419); INSERT INTO jobs VALUES(6782,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi2',6789,1708919572.983000041,1708924744.811000108,101502093,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,10.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9958098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419); INSERT INTO jobs VALUES(6783,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi3',6789,1708919572.983000041,1708924744.811000108,101502094,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, 
"openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,11.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9158098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419); diff --git a/gantry/tests/test_collection.py b/gantry/tests/test_collection.py index 926a161..c274f6b 100644 --- a/gantry/tests/test_collection.py +++ b/gantry/tests/test_collection.py @@ -19,6 +19,7 @@ "job_cpu_usage": defs.VALID_CPU_USAGE, "node_info": defs.VALID_NODE_INFO, "node_labels": defs.VALID_NODE_LABELS, + "node_cost": defs.VALID_NODE_COST, } diff --git a/migrations/002_node_cost.sql b/migrations/002_node_cost.sql new file mode 100644 index 0000000..b1d948e --- /dev/null +++ b/migrations/002_node_cost.sql @@ -0,0 +1,30 @@ +-- temporarily disable foreign key constraints +PRAGMA foreign_keys = OFF; + +-- create tmp table +CREATE TABLE nodes_tmp ( + id INTEGER PRIMARY KEY, + uuid TEXT NOT NULL UNIQUE, + hostname TEXT NOT NULL, + cores REAL NOT NULL, + mem REAL NOT NULL, + arch TEXT NOT NULL, + os TEXT NOT NULL, + instance_type TEXT NOT NULL, + -- new columns below + zone TEXT NOT NULL, + capacity_type TEXT NOT NULL, + hourly_cost REAL NOT NULL +); + +-- this approach is needed because we want to add new columns +-- with a not null constraint but no column default, while backfilling existing rows +-- ALTER TABLE ADD COLUMN only accepts NOT NULL together with a non-null DEFAULT + +-- copy data from nodes to nodes_tmp +-- '', '', and 0 are the placeholder values backfilled for zone, capacity_type, and hourly_cost +INSERT INTO nodes_tmp SELECT id, uuid, hostname, cores, mem, arch, os, instance_type, '', '', 0 FROM nodes; +DROP TABLE nodes; +ALTER TABLE nodes_tmp RENAME TO nodes; + +PRAGMA foreign_keys = ON;