diff --git a/openshift_metrics/openshift_prometheus_metrics.py b/openshift_metrics/openshift_prometheus_metrics.py index 70494e8..8daaf70 100755 --- a/openshift_metrics/openshift_prometheus_metrics.py +++ b/openshift_metrics/openshift_prometheus_metrics.py @@ -27,7 +27,7 @@ # For GPU requests, we don't need to exclude unscheduled pods because the join on node will eliminate those as unscheduled # pods don't have a node value -GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} * on(node) group_left(label_nvidia_com_gpu_product) kube_node_labels' +GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} * on(node) group_left(label_nvidia_com_gpu_product, label_nvidia_com_gpu_machine) kube_node_labels' def main(): diff --git a/openshift_metrics/tests/test_utils.py b/openshift_metrics/tests/test_utils.py index 8b469ba..11edcf0 100644 --- a/openshift_metrics/tests/test_utils.py +++ b/openshift_metrics/tests/test_utils.py @@ -416,17 +416,20 @@ def test_merge_metrics_not_empty_with_gpu(self): 0: { "cpu": 10, "gpu_request": 1, - "gpu_type": "Tesla-V100-PCIE-32GB" + "gpu_type": "Tesla-V100-PCIE-32GB", + "gpu_resource": "nvidia.com/gpu", }, 60: { "cpu": 15, "gpu_request": 1, - "gpu_type": "Tesla-V100-PCIE-32GB" + "gpu_type": "Tesla-V100-PCIE-32GB", + "gpu_resource": "nvidia.com/gpu", }, 120: { "cpu": 20, "gpu_request": 2, - "gpu_type": "Tesla-V100-PCIE-32GB" + "gpu_type": "Tesla-V100-PCIE-32GB", + "gpu_resource": "nvidia.com/gpu", }, } }, @@ -791,23 +794,25 @@ def test_write_metrics_log(self, mock_gna): test_metrics_dict = { "pod1": { "namespace": "namespace1", - "gpu_type": utils.NO_GPU, "metrics": { 0: { "cpu_request": 10, "memory_request": 1048576, - "duration": 120 + "duration": 120, + "node": "wrk-1", + "node_model": "Dell", }, 120: { "cpu_request": 20, "memory_request": 1048576, - "duration": 60 + "duration": 60, + "node": "wrk-2", + "node_model": "Lenovo" } } }, "pod2": { "namespace": "namespace1", - "gpu_type": utils.NO_GPU, "metrics": { 0: { "cpu_request": 20, @@ -828,7 +833,6 @@ def test_write_metrics_log(self, mock_gna): }, "pod3": { "namespace": "namespace2", - "gpu_type": utils.NO_GPU, "metrics": { 0: { "cpu_request": 45, @@ -839,7 +843,6 @@ def test_write_metrics_log(self, mock_gna): }, "pod4": { # this results in 0.5 SU "namespace": "namespace2", - "gpu_type": utils.NO_GPU, "metrics": { 0: { "cpu_request": 0.5, @@ -850,14 +853,14 @@ def test_write_metrics_log(self, mock_gna): }, } - expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,Memory Request (GiB),Determining Resource,SU Type,SU Count\n" - "namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10.0\n" - "namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20.0\n" - "namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n" - "namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25.0\n" - "namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n" - "namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45.0\n" - "namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,No GPU,2.0,CPU,OpenShift CPU,0.5\n") + expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,GPU Resource,Node,Node Model,Memory Request (GiB),Determining Resource,SU Type,SU Count\n" + "namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,,,wrk-1,Dell,0.001,CPU,OpenShift CPU,10.0\n" + "namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,,,wrk-2,Lenovo,0.001,CPU,OpenShift CPU,20.0\n" + "namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20.0\n" + "namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,25.0\n" + "namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20.0\n" + "namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,,,Unknown Node,Unknown Model,0.0977,CPU,OpenShift CPU,45.0\n" + "namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,,,Unknown Node,Unknown Model,2.0,CPU,OpenShift CPU,0.5\n") with tempfile.NamedTemporaryFile(mode="w+") as tmp: utils.write_metrics_by_pod(test_metrics_dict, tmp.name) @@ -928,6 +931,7 @@ def test_write_metrics_log(self, mock_gna): "memory_request": 8 * 2**30, "gpu_request": 1, "gpu_type": utils.GPU_A100, + "gpu_resource": utils.WHOLE_GPU, "duration": 172700 # little under 48 hours, expect to be rounded up in the output }, } @@ -941,6 +945,7 @@ def test_write_metrics_log(self, mock_gna): "memory_request": 8 * 2**30, "gpu_request": 1, "gpu_type": utils.GPU_A100_SXM4, + "gpu_resource": utils.WHOLE_GPU, "duration": 172800 }, } @@ -961,79 +966,103 @@ def test_write_metrics_log(self, mock_gna): class TestGetServiceUnit(TestCase): def test_cpu_only(self): - su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None) + su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None, None) self.assertEqual(su_type, utils.SU_CPU) self.assertEqual(su_count, 4) self.assertEqual(determining_resource, "CPU") def test_known_gpu(self): - su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100) + su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_A100_GPU) self.assertEqual(su_count, 1) self.assertEqual(determining_resource, "GPU") def test_known_gpu_A100_SXM4(self): - su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4) + su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_A100_SXM4_GPU) self.assertEqual(su_count, 1) self.assertEqual(determining_resource, "GPU") def test_known_gpu_high_cpu(self): - su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100) + su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_A100_GPU) self.assertEqual(su_count, 3) self.assertEqual(determining_resource, "CPU") def test_known_gpu_high_memory(self): - su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100) + su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_A100_GPU) self.assertEqual(su_count, 2) self.assertEqual(determining_resource, "RAM") def test_known_gpu_low_cpu_memory(self): - su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100) + su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_A100_GPU) self.assertEqual(su_count, 1) self.assertEqual(determining_resource, "GPU") def test_unknown_gpu(self): - su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type") + su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU) + self.assertEqual(su_type, utils.SU_UNKNOWN_GPU) + self.assertEqual(su_count, 1) + self.assertEqual(determining_resource, "GPU") + + def test_known_gpu_zero_count(self): + su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_UNKNOWN_GPU) + self.assertEqual(su_count, 0) + self.assertEqual(determining_resource, "GPU") + + def test_known_mig_gpu(self): + su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB) + self.assertEqual(su_type, utils.SU_UNKNOWN_MIG_GPU) self.assertEqual(su_count, 1) self.assertEqual(determining_resource, "GPU") + def test_known_gpu_unknown_resource(self): + su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB") + self.assertEqual(su_type, utils.SU_UNKNOWN_GPU) + self.assertEqual(su_count, 0) + self.assertEqual(determining_resource, "GPU") + + def test_unknown_gpu_known_resource(self): + su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB) + self.assertEqual(su_type, utils.SU_UNKNOWN_GPU) + self.assertEqual(su_count, 0) + self.assertEqual(determining_resource, "GPU") + def test_zero_memory(self): - su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None) + su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None, None) self.assertEqual(su_type, utils.SU_UNKNOWN) self.assertEqual(su_count, 0) self.assertEqual(determining_resource, "CPU") def test_zero_cpu(self): - su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None) + su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None, None) self.assertEqual(su_type, utils.SU_UNKNOWN) self.assertEqual(su_count, 0) self.assertEqual(determining_resource, "CPU") def test_memory_dominant(self): - su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None) + su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None, None) self.assertEqual(su_type, utils.SU_CPU) self.assertEqual(su_count, 16) self.assertEqual(determining_resource, "RAM") def test_fractional_su_cpu_dominant(self): - su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None) + su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None, None) self.assertEqual(su_type, utils.SU_CPU) self.assertEqual(su_count, 0.5) self.assertEqual(determining_resource, "CPU") def test_fractional_su_memory_dominant(self): - su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None) + su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None, None) self.assertEqual(su_type, utils.SU_CPU) self.assertEqual(su_count, 0.25) self.assertEqual(determining_resource, "RAM") def test_known_gpu_fractional_cpu_memory(self): - su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100) + su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU) self.assertEqual(su_type, utils.SU_A100_GPU) self.assertEqual(su_count, 1) self.assertEqual(determining_resource, "GPU") diff --git a/openshift_metrics/utils.py b/openshift_metrics/utils.py index 1adafe9..f79c9fb 100755 --- a/openshift_metrics/utils.py +++ b/openshift_metrics/utils.py @@ -29,7 +29,14 @@ GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB" GPU_V100 = "Tesla-V100-PCIE-32GB" GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE" -NO_GPU = "No GPU" + +# GPU Resource - MIG Geometries +# A100 Strategies +MIG_1G_5GB = "nvidia.com/mig-1g.5gb" +MIG_2G_10GB = "nvidia.com/mig-2g.10gb" +MIG_3G_20GB = "nvidia.com/mig-3g.20gb" +WHOLE_GPU = "nvidia.com/gpu" + # SU Types SU_CPU = "OpenShift CPU" @@ -37,6 +44,7 @@ SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4" SU_V100_GPU = "OpenShift GPUV100" SU_UNKNOWN_GPU = "OpenShift Unknown GPU" +SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU" SU_UNKNOWN = "Openshift Unknown" RATE = { @@ -160,18 +168,15 @@ def get_namespace_attributes(): return namespaces_dict -def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type): +def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource): """ Returns the type of service unit, the count, and the determining resource """ su_type = SU_UNKNOWN su_count = 0 - if gpu_type == NO_GPU: - gpu_type = None - # pods that requested a specific GPU but weren't scheduled may report 0 GPU - if gpu_type is not None and gpu_count == 0: + if gpu_resource is not None and gpu_count == 0: return SU_UNKNOWN_GPU, 0, "GPU" # pods in weird states @@ -182,7 +187,12 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type): GPU_A100: SU_A100_GPU, GPU_A100_SXM4: SU_A100_SXM4_GPU, GPU_V100: SU_V100_GPU, - GPU_UNKNOWN_TYPE: SU_UNKNOWN_GPU, + } + + A100_SXM4_MIG = { + MIG_1G_5GB: SU_UNKNOWN_MIG_GPU, + MIG_2G_10GB: SU_UNKNOWN_MIG_GPU, + MIG_3G_20GB: SU_UNKNOWN_MIG_GPU, } # GPU count for some configs is -1 for math reasons, in reality it is 0 @@ -192,13 +202,18 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type): SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245}, SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192}, SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64}, + SU_UNKNOWN_MIG_GPU: {"gpu": 1, "cpu": 8, "ram": 64}, SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1}, } - if gpu_type is None and gpu_count == 0: + if gpu_resource is None and gpu_count == 0: su_type = SU_CPU - else: + elif gpu_type is not None and gpu_resource == WHOLE_GPU: su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU) + elif gpu_type == GPU_A100_SXM4: # for MIG GPU of type A100_SXM4 + su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU) + else: + return SU_UNKNOWN_GPU, 0, "GPU" cpu_multiplier = cpu_count / su_config[su_type]["cpu"] gpu_multiplier = gpu_count / su_config[su_type]["gpu"] @@ -230,14 +245,20 @@ def merge_metrics(metric_name, metric_list, output_dict): for metric in metric_list: pod = metric["metric"]["pod"] namespace = metric["metric"]["namespace"] + node = metric["metric"].get("node") + + gpu_type = None + gpu_resource = None + node_model = None + unique_name = namespace + "+" + pod if unique_name not in output_dict: - output_dict[unique_name] = {"namespace": metric["metric"]["namespace"], "metrics": {}} + output_dict[unique_name] = {"namespace": namespace, "metrics": {}} if metric_name == "gpu_request": gpu_type = metric["metric"].get("label_nvidia_com_gpu_product", GPU_UNKNOWN_TYPE) - else: - gpu_type = None + gpu_resource = metric["metric"].get("resource") + node_model = metric["metric"].get("label_nvidia_com_gpu_machine") for value in metric["values"]: epoch_time = value[0] @@ -246,6 +267,12 @@ def merge_metrics(metric_name, metric_list, output_dict): output_dict[unique_name]["metrics"][epoch_time][metric_name] = value[1] if gpu_type: output_dict[unique_name]["metrics"][epoch_time]['gpu_type'] = gpu_type + if gpu_resource: + output_dict[unique_name]["metrics"][epoch_time]['gpu_resource'] = gpu_resource + if node_model: + output_dict[unique_name]["metrics"][epoch_time]['node_model'] = node_model + if node: + output_dict[unique_name]["metrics"][epoch_time]['node'] = node return output_dict @@ -402,9 +429,10 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month): cpu_request = float(pod_metric_dict.get("cpu_request", 0)) gpu_request = float(pod_metric_dict.get("gpu_request", 0)) gpu_type = pod_metric_dict.get("gpu_type") + gpu_resource = pod_metric_dict.get("gpu_resource") memory_request = float(pod_metric_dict.get("memory_request", 0)) / 2**30 - _, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type) + _, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type, gpu_resource) if gpu_type == GPU_A100: metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours @@ -465,6 +493,9 @@ def write_metrics_by_pod(metrics_dict, file_name): "CPU Request", "GPU Request", "GPU Type", + "GPU Resource", + "Node", + "Node Model", "Memory Request (GiB)", "Determining Resource", "SU Type", @@ -489,10 +520,13 @@ def write_metrics_by_pod(metrics_dict, file_name): duration = round(float(pod_metric_dict["duration"]) / 3600, 4) cpu_request = pod_metric_dict.get("cpu_request", 0) gpu_request = pod_metric_dict.get("gpu_request", 0) - gpu_type = pod_metric_dict.get("gpu_type", NO_GPU) + gpu_type = pod_metric_dict.get("gpu_type") + gpu_resource = pod_metric_dict.get("gpu_resource") + node = pod_metric_dict.get("node", "Unknown Node") + node_model = pod_metric_dict.get("node_model", "Unknown Model") memory_request = round(float(pod_metric_dict.get("memory_request", 0)) / 2**30, 4) su_type, su_count, determining_resource = get_service_unit( - float(cpu_request), memory_request, float(gpu_request), gpu_type + float(cpu_request), memory_request, float(gpu_request), gpu_type, gpu_resource ) info_list = [ @@ -506,6 +540,9 @@ def write_metrics_by_pod(metrics_dict, file_name): cpu_request, gpu_request, gpu_type, + gpu_resource, + node, + node_model, memory_request, determining_resource, su_type,