From 515ffd0e2573acc8ea869acc5e9d99ff6395b50d Mon Sep 17 00:00:00 2001 From: yangfeiyu20102011 Date: Mon, 11 Nov 2024 15:19:09 +0800 Subject: [PATCH] koordlet: export host application cpu and memory usage for prometheus Signed-off-by: yangfeiyu20102011 --- pkg/koordlet/metrics/host_application.go | 59 +++++++++++++++++++ pkg/koordlet/metrics/internal_metrics.go | 1 + pkg/koordlet/metrics/metrics_test.go | 32 ++++++++++ .../hostapplication/host_app_collector.go | 11 +++- 4 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 pkg/koordlet/metrics/host_application.go diff --git a/pkg/koordlet/metrics/host_application.go b/pkg/koordlet/metrics/host_application.go new file mode 100644 index 000000000..fc5efa2a9 --- /dev/null +++ b/pkg/koordlet/metrics/host_application.go @@ -0,0 +1,59 @@ +/* +Copyright 2024 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + hostApplicationName = "host_application_name" + priorityClass = "priority_class" + qos = "qos" +) + +var ( + HostApplicationResourceUsage = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: KoordletSubsystem, + Name: "host_application_resource_usage", + Help: "Host application resource usage collected by koordlet", + }, []string{NodeKey, hostApplicationName, ResourceKey, priorityClass, qos}) + + HostApplicationCollectors = []prometheus.Collector{ + HostApplicationResourceUsage, + } +) + +func ResetHostApplicationResourceUsage() { + HostApplicationResourceUsage.Reset() +} + +func RecordHostApplicationResourceUsage(resourceName string, hostAppSpec *slov1alpha1.HostApplicationSpec, value float64) { + if hostAppSpec == nil { + return + } + labels := genNodeLabels() + if labels == nil { + return + } + labels[hostApplicationName] = hostAppSpec.Name + labels[ResourceKey] = resourceName + labels[priorityClass] = string(hostAppSpec.Priority) + labels[qos] = string(hostAppSpec.QoS) + HostApplicationResourceUsage.With(labels).Set(value) +} diff --git a/pkg/koordlet/metrics/internal_metrics.go b/pkg/koordlet/metrics/internal_metrics.go index d34393f81..00d5d4f83 100644 --- a/pkg/koordlet/metrics/internal_metrics.go +++ b/pkg/koordlet/metrics/internal_metrics.go @@ -44,4 +44,5 @@ func init() { internalMustRegister(ResourceExecutorCollector...) internalMustRegister(KubeletStubCollector...) internalMustRegister(RuntimeHookCollectors...) + internalMustRegister(HostApplicationCollectors...) } diff --git a/pkg/koordlet/metrics/metrics_test.go b/pkg/koordlet/metrics/metrics_test.go index 531f1d77b..75312a79c 100644 --- a/pkg/koordlet/metrics/metrics_test.go +++ b/pkg/koordlet/metrics/metrics_test.go @@ -412,3 +412,35 @@ func TestRuntimeHookCollector(t *testing.T) { RecordRuntimeHookReconcilerInvokedDurationMilliSeconds("pod", "cpu.cfs_quota_us", testErr, 5.0) }) } + +func TestHostApplicationCollectors(t *testing.T) { + type resourceUsage struct { + cpu float64 + mem float64 + } + testingNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{}, + }, + } + testingHostApplication := &slov1alpha1.HostApplicationSpec{ + Name: "test-app", + QoS: apiext.QoSBE, + Priority: apiext.PriorityBatch, + } + testingResourceUsage := resourceUsage{ + cpu: 33740549972770, + mem: 7806574592, + } + + t.Run("test", func(t *testing.T) { + Register(testingNode) + defer Register(nil) + + RecordHostApplicationResourceUsage(string(corev1.ResourceCPU), testingHostApplication, testingResourceUsage.cpu) + RecordHostApplicationResourceUsage(string(corev1.ResourceMemory), testingHostApplication, testingResourceUsage.mem) + + ResetHostApplicationResourceUsage() + }) +} diff --git a/pkg/koordlet/metricsadvisor/collectors/hostapplication/host_app_collector.go b/pkg/koordlet/metricsadvisor/collectors/hostapplication/host_app_collector.go index 23da8c317..ce69ee898 100644 --- a/pkg/koordlet/metricsadvisor/collectors/hostapplication/host_app_collector.go +++ b/pkg/koordlet/metricsadvisor/collectors/hostapplication/host_app_collector.go @@ -21,11 +21,13 @@ import ( gocache "github.com/patrickmn/go-cache" "go.uber.org/atomic" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" "github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache" + "github.com/koordinator-sh/koordinator/pkg/koordlet/metrics" "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework" "github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor" "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" @@ -92,9 +94,10 @@ func (h *hostAppCollector) collectHostAppResUsed() { return } count := 0 - metrics := make([]metriccache.MetricSample, 0) + resourceMetrics := make([]metriccache.MetricSample, 0) allCPUUsageCores := metriccache.Point{Timestamp: timeNow(), Value: 0} allMemoryUsage := metriccache.Point{Timestamp: timeNow(), Value: 0} + metrics.ResetHostApplicationResourceUsage() for _, hostApp := range nodeSLO.Spec.HostApplications { collectTime := timeNow() cgroupDir := util.GetHostAppCgroupRelativePath(&hostApp) @@ -139,7 +142,9 @@ func (h *hostAppCollector) collectHostAppResUsed() { return } - metrics = append(metrics, cpuUsageMetric, memUsageMetric) + metrics.RecordHostApplicationResourceUsage(string(corev1.ResourceCPU), &hostApp, cpuUsageValue) + metrics.RecordHostApplicationResourceUsage(string(corev1.ResourceMemory), &hostApp, float64(memoryUsageValue)) + resourceMetrics = append(resourceMetrics, cpuUsageMetric, memUsageMetric) klog.V(6).Infof("collect host application %v finished, metric cpu=%v, memory=%v", hostApp.Name, cpuUsageValue, memoryUsageValue) count++ allCPUUsageCores.Value += cpuUsageValue @@ -147,7 +152,7 @@ func (h *hostAppCollector) collectHostAppResUsed() { } appender := h.appendableDB.Appender() - if err := appender.Append(metrics); err != nil { + if err := appender.Append(resourceMetrics); err != nil { klog.Warningf("Append host application metrics error: %v", err) return }