diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go index 311855cf34..62375030da 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go @@ -45,6 +45,7 @@ const ( Kubernetes = "kubernetes" Region = "region" SubnetId = "subnet_id" + RuntimeTagOverride = "DEFAULT" NeuronExecutionErrorsAggregatedMetric = containerinsightscommon.NeuronExecutionErrors + "_total" NeuronDeviceHardwareEccEventsAggregatedMetric = containerinsightscommon.NeuronDeviceHardwareEccEvents + "_total" ) @@ -121,7 +122,7 @@ func (md *AwsNeuronMetricModifier) ModifyMetric(originalMetric pmetric.Metric, m } // Neuron metrics sent by the neuron monitor don't have any units so we add them in the agent. addUnit(originalMetric) - prefixCoreAndDeviceLabels(originalMetric) + updateCoreDeviceRuntimeLabels(originalMetric) resetStaleDatapoints(originalMetric) originalMetricName := originalMetric.Name() @@ -248,7 +249,7 @@ func (md *AwsNeuronMetricModifier) extractDatapointsAsMetricsAndAggregate(origin // This method prefixes NeuronCore and NeuronDevice values with `core` and `device` respectively // to make the attribute values more verbose -func prefixCoreAndDeviceLabels(originalMetric pmetric.Metric) { +func updateCoreDeviceRuntimeLabels(originalMetric pmetric.Metric) { dps := originalMetric.Sum().DataPoints() for i := 0; i < dps.Len(); i++ { dp := dps.At(i) @@ -257,6 +258,7 @@ func prefixCoreAndDeviceLabels(originalMetric pmetric.Metric) { dp.Attributes().PutStr(attributeKey, attributeValuePrefix+value.Str()) } } + dp.Attributes().PutStr(RuntimeTag, RuntimeTagOverride) } } @@ -313,7 +315,7 @@ func resetStaleDatapoints(originalMetric pmetric.Metric) { dp := dps.At(i) if dp.ValueType() == pmetric.NumberDataPointValueTypeEmpty || dp.Flags().NoRecordedValue() { dp.SetDoubleValue(dp.DoubleValue()) - dp.Attributes().PutStr(RuntimeTag, "default") + dp.Attributes().PutStr(RuntimeTag, RuntimeTagOverride) dp.SetFlags(dp.Flags().WithNoRecordedValue(false)) } } diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go index 9bff85917a..c9c0de0bca 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go @@ -21,7 +21,7 @@ var staticAttributes = map[string]any{ NodeName: "dummyAttribute", AvailabilityZone: "dummyAttribute", Kubernetes: "dummyAttribute", - RuntimeTag: "dummyAttribute", + RuntimeTag: RuntimeTagOverride, SubnetId: "dummyAttribute", } var staticTimestamp = pcommon.NewTimestampFromTime(time.Date(2023, time.March, 12, 11, 0, 0, 0, time.UTC)) @@ -88,13 +88,13 @@ func TestMetricModifierForExecutionErrorMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionErrors: metricsList.At(0), - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -111,12 +111,12 @@ func TestMetricModifierForExecutionStatusMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionStatus: metricsList.At(0), - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -174,21 +174,21 @@ func TestMetricModifierForNeuronDeviceEccEventMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -202,11 +202,11 @@ func TestMetricModifierForNeuronDeviceEccEventMetric_PodNameMissing(t *testing.T expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -251,20 +251,20 @@ func TestListWithMultipleMetrics(t *testing.T) { "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), - - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1", StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), + + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), @@ -272,21 +272,21 @@ func TestListWithMultipleMetrics(t *testing.T) { "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, MemoryLocation: "neuron_device"}}, []float64{2}, pmetric.MetricTypeSum, Bytes), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1", EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) } diff --git a/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go b/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go index e7868be616..dedc259432 100644 --- a/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go +++ b/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go @@ -105,7 +105,6 @@ var PodNeuronLabelFilter = map[string]map[string]interface{}{ containerinsightscommon.K8sLabelsKey: nil, }, internal.Region: nil, - internal.RuntimeTag: nil, internal.SubnetId: nil, internal.NeuronCore: nil, containerinsightscommon.MetricType: nil, @@ -132,7 +131,6 @@ var ContainerNeuronLabelFilter = map[string]map[string]interface{}{ containerinsightscommon.K8sLabelsKey: nil, }, internal.Region: nil, - internal.RuntimeTag: nil, internal.SubnetId: nil, internal.NeuronCore: nil, containerinsightscommon.MetricType: nil, @@ -152,7 +150,6 @@ var NodeNeuronLabelFilter = map[string]map[string]interface{}{ containerinsightscommon.K8sLabelsKey: nil, }, internal.Region: nil, - internal.RuntimeTag: nil, internal.SubnetId: nil, internal.NeuronCore: nil, containerinsightscommon.MetricType: nil, diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index 30fdcfa646..d60a409504 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -228,12 +228,15 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { }), wantMetricCnt: 2, want: []map[string]string{ + // neuron_execution_latency { "ClusterName": "cluster", "Drop": "val", "percentile": "p50", + "runtime_tag": "DEFAULT", "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", }, + // node_neuron_execution_latency { "ClusterName": "cluster", "Type": "NodeAWSNeuron", @@ -256,19 +259,20 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { }), wantMetricCnt: 7, want: []map[string]string{ + // neuroncore_memory_usage_constants { "ClusterName": "cluster", "Drop": "val", - "runtime_tag": "10", + "runtime_tag": "DEFAULT", "NeuronCore": "core0", "NeuronDevice": "device0", "PodName": "testPod", "ContainerName": "testContainer", "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", }, + // container_neuroncore_memory_usage_constants { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "ContainerAWSNeuronCore", @@ -276,26 +280,26 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { "ContainerName": "testContainer", "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", }, + // pod_neuroncore_memory_usage_constants { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "PodAWSNeuronCore", "PodName": "testPod", "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", }, + // node_neuroncore_memory_usage_constants { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "NodeAWSNeuronCore", "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", }, + // container_neuroncore_memory_usage_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "ContainerAWSNeuronCore", @@ -303,18 +307,18 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { "ContainerName": "testContainer", "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", }, + // pod_neuroncore_memory_usage_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "PodAWSNeuronCore", "PodName": "testPod", "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", }, + // node_neuroncore_memory_usage_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "NodeAWSNeuronCore", @@ -335,25 +339,26 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { }), wantMetricCnt: 3, want: []map[string]string{ + // neuroncore_memory_usage_constants { "ClusterName": "cluster", "Drop": "val", - "runtime_tag": "10", + "runtime_tag": "DEFAULT", "NeuronCore": "core0", "NeuronDevice": "device0", "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", }, + // node_neuroncore_memory_usage_constants { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "NodeAWSNeuronCore", "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", }, + // node_neuroncore_memory_usage_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "NodeAWSNeuronCore", @@ -377,10 +382,11 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { }), wantMetricCnt: 7, want: []map[string]string{ + // neurondevice_hw_ecc_events { "ClusterName": "cluster", "Drop": "val", - "runtime_tag": "10", + "runtime_tag": "DEFAULT", "NeuronCore": "core0", "NeuronDevice": "device0", "event_type": "mem_ecc_corrected", @@ -388,9 +394,9 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { "PodName": "testPod", "ContainerName": "testContainer", }, + // container_neurondevice_hw_ecc_events_mem_ecc_corrected { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "ContainerAWSNeuronDevice", @@ -398,26 +404,26 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { "PodName": "testPod", "ContainerName": "testContainer", }, + // pod_neurondevice_hw_ecc_events_mem_ecc_corrected { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "PodAWSNeuronDevice", "kubernetes": "{\"host\":\"test\"}", "PodName": "testPod", }, + // node_neurondevice_hw_ecc_events_mem_ecc_corrected { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "NodeAWSNeuronDevice", "kubernetes": "{\"host\":\"test\"}", }, + // container_neurondevice_hw_ecc_events_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "ContainerAWSNeuronDevice", @@ -425,18 +431,18 @@ func TestProcessMetricsForNeuronMetrics(t *testing.T) { "PodName": "testPod", "ContainerName": "testContainer", }, + // pod_neurondevice_hw_ecc_events_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "PodAWSNeuronDevice", "kubernetes": "{\"host\":\"test\"}", "PodName": "testPod", }, + // node_neurondevice_hw_ecc_events_total { "ClusterName": "cluster", - "runtime_tag": "10", "NeuronCore": "core0", "NeuronDevice": "device0", "Type": "NodeAWSNeuronDevice",