From 4eee4d0ee242e39a27b6707396337958863d4bf9 Mon Sep 17 00:00:00 2001 From: Hengzhe Date: Sat, 26 Oct 2024 06:34:07 +0900 Subject: [PATCH] [YUNIKORN-2908] metrics not removed when a queue is removed --- pkg/metrics/init.go | 9 +++++++++ pkg/metrics/queue.go | 15 +++++++++++++++ pkg/metrics/queue_test.go | 17 +++++++++++++++++ pkg/scheduler/objects/queue.go | 5 +++++ 4 files changed, 46 insertions(+) diff --git a/pkg/metrics/init.go b/pkg/metrics/init.go index ce4780145..ec370abf3 100644 --- a/pkg/metrics/init.go +++ b/pkg/metrics/init.go @@ -84,6 +84,15 @@ func GetQueueMetrics(name string) *QueueMetrics { return queueMetrics } +func RemoveQueueMetrics(name string) { + m.lock.Lock() + defer m.lock.Unlock() + if metrics, ok := m.queues[name]; ok { + metrics.UnregisterMetrics() + delete(m.queues, name) + } +} + func GetEventMetrics() *EventMetrics { return m.event } diff --git a/pkg/metrics/queue.go b/pkg/metrics/queue.go index fc1181646..d8917b850 100644 --- a/pkg/metrics/queue.go +++ b/pkg/metrics/queue.go @@ -132,6 +132,21 @@ func InitQueueMetrics(name string) *QueueMetrics { return q } +func (m *QueueMetrics) UnregisterMetrics() { + var queueMetricsList = []prometheus.Collector{ + m.appMetricsLabel, + m.appMetricsSubsystem, + m.containerMetrics, + m.resourceMetricsLabel, + m.resourceMetricsSubsystem, + } + + // Unregister the metrics + for _, metric := range queueMetricsList { + prometheus.Unregister(metric) + } +} + func (m *QueueMetrics) incQueueApplications(state string) { m.appMetricsLabel.WithLabelValues(state).Inc() m.appMetricsSubsystem.WithLabelValues(state).Inc() diff --git a/pkg/metrics/queue_test.go b/pkg/metrics/queue_test.go index 9a9678645..69199f70a 100644 --- a/pkg/metrics/queue_test.go +++ b/pkg/metrics/queue_test.go @@ -246,6 +246,23 @@ func TestQueuePreemptingResourceMetrics(t *testing.T) { verifyResourceMetrics(t, "preempting", "cpu") } +func TestRemoveQueueMetrics(t *testing.T) { + testQueueName := "root.test" + qm = GetQueueMetrics(testQueueName) + defer unregisterQueueMetrics() + + qm.SetQueueAllocatedResourceMetrics("cpu", 1) + assert.Assert(t, m.queues[testQueueName] != nil) + RemoveQueueMetrics(testQueueName) + assert.Assert(t, m.queues[testQueueName] == nil) + metrics, err := prometheus.DefaultGatherer.Gather() + assert.NilError(t, err) + for _, metric := range metrics { + assert.Assert(t, metric.GetName() != "yunikorn_queue_resource", + "Queue metrics are not cleaned up") + } +} + func getQueueMetrics() *QueueMetrics { return InitQueueMetrics("root.test") } diff --git a/pkg/scheduler/objects/queue.go b/pkg/scheduler/objects/queue.go index 6ac9dc16e..5b62ca5c1 100644 --- a/pkg/scheduler/objects/queue.go +++ b/pkg/scheduler/objects/queue.go @@ -986,6 +986,7 @@ func (sq *Queue) RemoveQueue() bool { return false } log.Log(log.SchedQueue).Info("removing queue", zap.String("queue", sq.QueuePath)) + sq.removeMetrics() // root is always managed and is the only queue with a nil parent: no need to guard sq.parent.removeChildQueue(sq.Name) sq.queueEvents.SendRemoveQueueEvent(sq.QueuePath, sq.isManaged) @@ -1696,6 +1697,10 @@ func (sq *Queue) updatePreemptingResourceMetrics() { } } +func (sq *Queue) removeMetrics() { + metrics.RemoveQueueMetrics(sq.QueuePath) +} + func (sq *Queue) String() string { sq.RLock() defer sq.RUnlock()