Skip to content

✨ Allow a user to provide a custom metric provider for firing metrics #3213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions pkg/controller/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"

"github.com/prometheus/client_golang/prometheus"
internalmetrics "sigs.k8s.io/controller-runtime/pkg/internal/metrics"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Prometheus collectors backing the default controller metrics provider.
var (
	// reconcileTotal counts reconciliations per controller. The "controller"
	// label carries the controller name and the "result" label carries the
	// reconcile outcome: success, error, requeue or requeue_after.
	reconcileTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "controller_runtime_reconcile_total",
			Help: "Total number of reconciliations per controller",
		},
		[]string{"controller", "result"},
	)

	// reconcileErrors counts the errors returned by the Reconciler, per
	// controller.
	reconcileErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "controller_runtime_reconcile_errors_total",
			Help: "Total number of reconciliation errors per controller",
		},
		[]string{"controller"},
	)

	// terminalReconcileErrors counts the terminal errors returned by the
	// Reconciler, per controller.
	terminalReconcileErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "controller_runtime_terminal_reconcile_errors_total",
			Help: "Total number of terminal reconciliation errors per controller",
		},
		[]string{"controller"},
	)

	// reconcilePanics counts the panics raised by the Reconciler, per
	// controller.
	reconcilePanics = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "controller_runtime_reconcile_panics_total",
			Help: "Total number of reconciliation panics per controller",
		},
		[]string{"controller"},
	)

	// reconcileTime records how long each reconciliation takes, in seconds,
	// per controller.
	reconcileTime = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "controller_runtime_reconcile_time_seconds",
			Help: "Length of time per reconciliation per controller",
			// Classic bucket upper bounds, grouped here as sub-second,
			// one to five seconds, and six to sixty seconds.
			Buckets: []float64{
				0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9,
				1.0, 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5,
				6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60,
			},
			NativeHistogramBucketFactor:     1.1,
			NativeHistogramMaxBucketNumber:  100,
			NativeHistogramMinResetDuration: time.Hour,
		},
		[]string{"controller"},
	)

	// workerCount holds the maximum number of concurrent reconciles per
	// controller.
	workerCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "controller_runtime_max_concurrent_reconciles",
			Help: "Maximum number of concurrent reconciles per controller",
		},
		[]string{"controller"},
	)

	// activeWorkers holds the number of workers currently in use per
	// controller.
	activeWorkers = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "controller_runtime_active_workers",
			Help: "Number of currently used workers per controller",
		},
		[]string{"controller"},
	)
)

// ControllerMetricsProvider is an interface that provides the metrics fired by
// controllers. Each method returns the handle for one controller metric, so an
// implementation can back the metrics with something other than Prometheus.
type ControllerMetricsProvider interface {
// ReconcileTotal returns the counter that holds the total number of
// reconciliations per controller. It is updated with two labels: the
// controller label is the controller name and the result label is the
// reconcile result, i.e. success, error, requeue or requeue_after.
ReconcileTotal() internalmetrics.CounterMetric
// ReconcileErrors returns the counter that holds the total number of
// errors from the Reconciler.
ReconcileErrors() internalmetrics.CounterMetric
// TerminalReconcileErrors returns the counter that holds the total number
// of terminal errors from the Reconciler.
TerminalReconcileErrors() internalmetrics.CounterMetric
// ReconcilePanics returns the counter that holds the total number of
// panics from the Reconciler.
ReconcilePanics() internalmetrics.CounterMetric
// ReconcileTime returns the histogram that keeps track of the duration
// of reconciliations.
ReconcileTime() internalmetrics.HistogramMetric
// WorkerCount returns the gauge that holds the maximum number of
// concurrent reconciles per controller.
WorkerCount() internalmetrics.GaugeMetric
// ActiveWorkers returns the gauge that holds the number of active
// workers per controller.
ActiveWorkers() internalmetrics.GaugeMetric
}

// PrometheusProvider is the Prometheus-backed ControllerMetricsProvider. It
// holds the collector vectors that its accessor methods wrap in adapters.
//
// NOTE(review): only controller metric accessors are defined on this type in
// this file; no leader election metric methods are visible here — confirm
// whether it is also meant to act as a LeaderElectionMetricsProvider.
type PrometheusProvider struct {
	// Counters for reconcile totals, errors, terminal errors and panics.
	reconcileTotal, reconcileErrors, terminalReconcileErrors, reconcilePanics *prometheus.CounterVec

	// Histogram of reconcile durations.
	reconcileTime *prometheus.HistogramVec

	// Gauges for the configured and the currently active worker counts.
	workerCount, activeWorkers *prometheus.GaugeVec
}

// NewPrometheusProvider creates a PrometheusProvider wired to this package's
// Prometheus collectors. Every provider it returns shares the same
// package-level collectors.
func NewPrometheusProvider() *PrometheusProvider {
	provider := new(PrometheusProvider)

	provider.reconcileTotal = reconcileTotal
	provider.reconcileErrors = reconcileErrors
	provider.terminalReconcileErrors = terminalReconcileErrors
	provider.reconcilePanics = reconcilePanics
	provider.reconcileTime = reconcileTime
	provider.workerCount = workerCount
	provider.activeWorkers = activeWorkers

	return provider
}

// ReconcileTotal wraps the reconcile-total counter vector in an adapter that
// fulfills the CounterMetric interface. A new adapter is created on each call.
func (p PrometheusProvider) ReconcileTotal() internalmetrics.CounterMetric {
	adapter := internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcileTotal}
	return &adapter
}

// ReconcileErrors wraps the reconcile-errors counter vector in an adapter that
// fulfills the CounterMetric interface. A new adapter is created on each call.
func (p PrometheusProvider) ReconcileErrors() internalmetrics.CounterMetric {
	adapter := internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcileErrors}
	return &adapter
}

// TerminalReconcileErrors wraps the terminal-reconcile-errors counter vector in
// an adapter that fulfills the CounterMetric interface. A new adapter is
// created on each call.
func (p PrometheusProvider) TerminalReconcileErrors() internalmetrics.CounterMetric {
	adapter := internalmetrics.PrometheusCounterAdapter{CounterVec: p.terminalReconcileErrors}
	return &adapter
}

// ReconcilePanics wraps the reconcile-panics counter vector in an adapter that
// fulfills the CounterMetric interface. A new adapter is created on each call.
func (p PrometheusProvider) ReconcilePanics() internalmetrics.CounterMetric {
	adapter := internalmetrics.PrometheusCounterAdapter{CounterVec: p.reconcilePanics}
	return &adapter
}

// ReconcileTime wraps the reconcile-time histogram vector in an adapter that
// fulfills the HistogramMetric interface. A new adapter is created on each
// call.
func (p PrometheusProvider) ReconcileTime() internalmetrics.HistogramMetric {
	adapter := internalmetrics.PrometheusHistogramAdapter{HistogramVec: p.reconcileTime}
	return &adapter
}

// WorkerCount wraps the worker-count gauge vector in an adapter that fulfills
// the GaugeMetric interface. A new adapter is created on each call.
func (p PrometheusProvider) WorkerCount() internalmetrics.GaugeMetric {
	adapter := internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.workerCount}
	return &adapter
}

// ActiveWorkers wraps the active-workers gauge vector in an adapter that
// fulfills the GaugeMetric interface. A new adapter is created on each call.
func (p PrometheusProvider) ActiveWorkers() internalmetrics.GaugeMetric {
	adapter := internalmetrics.PrometheusGaugeAdapter{GaugeVec: p.activeWorkers}
	return &adapter
}

func init() {
metrics.Registry.MustRegister(
reconcileTotal,
reconcileErrors,
terminalReconcileErrors,
reconcilePanics,
reconcileTime,
workerCount,
activeWorkers,
)
}

// controllerMetricsProvider is the package-wide provider returned by
// GetControllerMetricsProvider. It starts out as a PrometheusProvider and is
// replaced by SetControllerMetricsProvider. Access is not guarded by any lock.
var controllerMetricsProvider ControllerMetricsProvider = NewPrometheusProvider()

// SetControllerMetricsProvider replaces the provider used to expose controller
// metrics. The PrometheusProvider is used by default if the provider is never
// overridden.
//
// Passing a nil provider restores the default PrometheusProvider instead of
// storing nil: callers invoke methods directly on the value returned by
// GetControllerMetricsProvider, so a stored nil would panic on the next metric
// update.
//
// The assignment is not synchronized, so the provider should be set before any
// controller starts reporting metrics.
func SetControllerMetricsProvider(provider ControllerMetricsProvider) {
	if provider == nil {
		provider = NewPrometheusProvider()
	}
	controllerMetricsProvider = provider
}

// GetControllerMetricsProvider returns the provider that controller
// reconciliation currently uses to fire its metrics: the PrometheusProvider by
// default, or whatever was last passed to SetControllerMetricsProvider.
func GetControllerMetricsProvider() ControllerMetricsProvider {
	current := controllerMetricsProvider
	return current
}
2 changes: 1 addition & 1 deletion pkg/controller/priorityqueue/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

"k8s.io/client-go/util/workqueue"
"k8s.io/utils/clock"
"sigs.k8s.io/controller-runtime/pkg/internal/metrics"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// This file is mostly a copy of unexported code from
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/priorityqueue/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"sync"

"k8s.io/client-go/util/workqueue"
"sigs.k8s.io/controller-runtime/pkg/internal/metrics"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

func newFakeMetricsProvider() *fakeMetricsProvider {
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/priorityqueue/priorityqueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
"k8s.io/utils/clock"
"k8s.io/utils/ptr"

"sigs.k8s.io/controller-runtime/pkg/internal/metrics"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// AddOpts describes the options for adding items to the queue.
Expand Down Expand Up @@ -56,7 +56,7 @@ func New[T comparable](name string, o ...Opt[T]) PriorityQueue[T] {
}

if opts.MetricProvider == nil {
opts.MetricProvider = metrics.WorkqueueMetricsProvider{}
opts.MetricProvider = metrics.PrometheusWorkqueueMetricsProvider{}
}

pq := &priorityqueue[T]{
Expand Down
50 changes: 26 additions & 24 deletions pkg/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ import (
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/client-go/util/workqueue"
"sigs.k8s.io/controller-runtime/pkg/controller/metrics"

"sigs.k8s.io/controller-runtime/pkg/controller/priorityqueue"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"
Expand Down Expand Up @@ -101,7 +101,7 @@ type Controller[request comparable] struct {
func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) {
defer func() {
if r := recover(); r != nil {
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Inc()
metrics.GetControllerMetricsProvider().ReconcilePanics().Inc(map[string]string{labelKeyController: c.Name})

if c.RecoverPanic == nil || *c.RecoverPanic {
for _, fn := range utilruntime.PanicHandlers {
Expand Down Expand Up @@ -294,30 +294,32 @@ func (c *Controller[request]) processNextWorkItem(ctx context.Context) bool {
// period.
defer c.Queue.Done(obj)

ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Add(1)
defer ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Add(-1)
metrics.GetControllerMetricsProvider().ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, 1)
defer metrics.GetControllerMetricsProvider().ActiveWorkers().Add(map[string]string{labelKeyController: c.Name}, -1)

c.reconcileHandler(ctx, obj, priority)
return true
}

const (
labelError = "error"
labelRequeueAfter = "requeue_after"
labelRequeue = "requeue"
labelSuccess = "success"
labelKeyController = "controller"
labelKeyResult = "result"
labelError = "error"
labelRequeueAfter = "requeue_after"
labelRequeue = "requeue"
labelSuccess = "success"
)

func (c *Controller[request]) initMetrics() {
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Add(0)
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0)
ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles))
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError}, 0)
metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter}, 0)
metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue}, 0)
metrics.GetControllerMetricsProvider().ReconcileTotal().Add(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess}, 0)
metrics.GetControllerMetricsProvider().ReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0)
metrics.GetControllerMetricsProvider().TerminalReconcileErrors().Add(map[string]string{labelKeyController: c.Name}, 0)
metrics.GetControllerMetricsProvider().ReconcilePanics().Add(map[string]string{labelKeyController: c.Name}, 0)
metrics.GetControllerMetricsProvider().WorkerCount().Set(map[string]string{labelKeyController: c.Name}, float64(c.MaxConcurrentReconciles))
metrics.GetControllerMetricsProvider().ActiveWorkers().Set(map[string]string{labelKeyController: c.Name}, 0)
}

func (c *Controller[request]) reconcileHandler(ctx context.Context, req request, priority int) {
Expand All @@ -341,12 +343,12 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request,
switch {
case err != nil:
if errors.Is(err, reconcile.TerminalError(nil)) {
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Inc()
metrics.GetControllerMetricsProvider().TerminalReconcileErrors().Inc(map[string]string{"controller": c.Name})
} else {
c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req)
}
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc()
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Inc()
metrics.GetControllerMetricsProvider().ReconcileErrors().Inc(map[string]string{labelKeyController: c.Name})
metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelError})
if !result.IsZero() {
log.Info("Warning: Reconciler returned both a non-zero result and a non-nil error. The result will always be ignored if the error is non-nil and the non-nil error causes requeuing with exponential backoff. For more details, see: https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile#Reconciler")
}
Expand All @@ -359,17 +361,17 @@ func (c *Controller[request]) reconcileHandler(ctx context.Context, req request,
// to result.RequestAfter
c.Queue.Forget(req)
c.Queue.AddWithOpts(priorityqueue.AddOpts{After: result.RequeueAfter, Priority: priority}, req)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Inc()
metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeueAfter})
case result.Requeue: //nolint: staticcheck // We have to handle it until it is removed
log.V(5).Info("Reconcile done, requeueing")
c.Queue.AddWithOpts(priorityqueue.AddOpts{RateLimited: true, Priority: priority}, req)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Inc()
metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelRequeue})
default:
log.V(5).Info("Reconcile successful")
// Finally, if no error occurs we Forget this item so it does not
// get queued again until another change happens.
c.Queue.Forget(req)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Inc()
metrics.GetControllerMetricsProvider().ReconcileTotal().Inc(map[string]string{labelKeyController: c.Name, labelKeyResult: labelSuccess})
}
}

Expand All @@ -380,7 +382,7 @@ func (c *Controller[request]) GetLogger() logr.Logger {

// updateMetrics updates prometheus metrics within the controller.
func (c *Controller[request]) updateMetrics(reconcileTime time.Duration) {
ctrlmetrics.ReconcileTime.WithLabelValues(c.Name).Observe(reconcileTime.Seconds())
metrics.GetControllerMetricsProvider().ReconcileTime().Observe(map[string]string{labelKeyController: c.Name}, reconcileTime.Seconds())
}

// ReconcileIDFromContext gets the reconcileID from the current context.
Expand Down
Loading
Loading