Skip to content

Commit 55e7cf1

Browse files
gotjosh and alexweav authored
Alerting: Introduce Metric Aggregation starting with Silences (grafana#62512)
* Alerting: Introduce Metric Aggregation starting with Silences --------- Co-authored-by: Alexander Weaver <[email protected]>
1 parent 138575c commit 55e7cf1

File tree

4 files changed

+126
-16
lines changed

4 files changed

+126
-16
lines changed

go.mod

+3
Original file line numberDiff line numberDiff line change
@@ -420,3 +420,6 @@ replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-aler
420420
replace google.golang.org/grpc => google.golang.org/grpc v1.45.0
421421

422422
replace google.golang.org/genproto => google.golang.org/genproto v0.0.0-20220421151946-72621c1f0bd3
423+
424+
// Remove this once https://github.com/grafana/dskit/pull/258 is merged.
425+
replace github.com/grafana/dskit => github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -1247,14 +1247,14 @@ github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad
12471247
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
12481248
github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc=
12491249
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
1250+
github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27 h1:rWMt8wsjGjzT/6AX6/Ie0JTA0CNZzzbDfup34lSJnTw=
1251+
github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27/go.mod h1:ulYLLoSd71AWIjxgifLO86Lndx82Yj+IcV+fFnh8tkI=
12501252
github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0 h1:BzkQNnj+eevX30EMqJiUS1w3CPoGc8kp7pDf/ari/4Y=
12511253
github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0/go.mod h1:NoSLbfmUwE+omWFReFrLtbtOItmvTbuQERJ6XFYp9ME=
12521254
github.com/grafana/codejen v0.0.3 h1:tAWxoTUuhgmEqxJPOLtJoxlPBbMULFwKFOcRsPRPXDw=
12531255
github.com/grafana/codejen v0.0.3/go.mod h1:zmwwM/DRyQB7pfuBjTWII3CWtxcXh8LTwAYGfDfpR6s=
12541256
github.com/grafana/cuetsy v0.1.5 h1:mnFwAXdbqCsyL8r7kkdUMJ4kOAR26cxIPmrZj7JzTeY=
12551257
github.com/grafana/cuetsy v0.1.5/go.mod h1:4KWkUOslwvRTpEv7wdQG0jDFTuJmU+0L9x0h4kWxa2A=
1256-
github.com/grafana/dskit v0.0.0-20230126115530-71478074eab8 h1:5nqLvzKugVUb9sCQkKuOPecRshawSrbHsXyGxBkTBus=
1257-
github.com/grafana/dskit v0.0.0-20230126115530-71478074eab8/go.mod h1:zj+5BNZAVmQafV583uLTAOzRr963KPdEm4d6NPmtbwg=
12581258
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036 h1:GplhUk6Xes5JIhUUrggPcPBhOn+eT8+WsHiebvq7GgA=
12591259
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU=
12601260
github.com/grafana/grafana-aws-sdk v0.12.0 h1:eUjFdFZeZE+nyu/RMRz+qFxTBew69ToLBrbRhTbjkfM=

pkg/services/ngalert/metrics/multi_org_alertmanager.go

+121-4
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,31 @@
11
package metrics
22

33
import (
4+
"fmt"
5+
"strconv"
6+
7+
"github.com/grafana/grafana/pkg/infra/log"
8+
9+
"github.com/grafana/dskit/metrics"
410
"github.com/prometheus/client_golang/prometheus"
511
"github.com/prometheus/client_golang/prometheus/promauto"
612
)
713

814
type MultiOrgAlertmanager struct {
9-
Registerer prometheus.Registerer
15+
Registerer prometheus.Registerer
16+
registries *metrics.TenantRegistries
17+
1018
ActiveConfigurations prometheus.Gauge
1119
DiscoveredConfigurations prometheus.Gauge
12-
registries *OrgRegistries
20+
21+
aggregatedMetrics *AlertmanagerAggregatedMetrics
1322
}
1423

1524
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
16-
return &MultiOrgAlertmanager{
25+
registries := metrics.NewTenantRegistries(log.New("ngalert.multiorg.alertmanager.metrics")) //TODO: Should this be here? Probably not.
26+
moa := &MultiOrgAlertmanager{
1727
Registerer: r,
18-
registries: NewOrgRegistries(),
28+
registries: registries,
1929
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
2030
Namespace: Namespace,
2131
Subsystem: Subsystem,
@@ -28,5 +38,112 @@ func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanag
2838
Name: "active_configurations",
2939
Help: "The number of active Alertmanager configurations.",
3040
}),
41+
aggregatedMetrics: NewAlertmanagerAggregatedMetrics(registries),
42+
}
43+
44+
// These metrics use a different registration method as the struct itself represents a custom collector.
45+
// There's no way to "auto-register" a collector.
46+
r.MustRegister(moa.aggregatedMetrics)
47+
48+
return moa
49+
}
50+
51+
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
52+
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
53+
moa.registries.RemoveTenantRegistry(strconv.FormatInt(id, 10), false)
54+
}
55+
56+
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
57+
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
58+
sid := strconv.FormatInt(id, 10)
59+
reg := moa.registries.GetRegistryForTenant(sid)
60+
if reg != nil {
61+
return reg
62+
}
63+
64+
result := prometheus.NewRegistry()
65+
moa.registries.AddTenantRegistry(sid, result)
66+
67+
return result
68+
}
69+
70+
// AlertmanagerAggregatedMetrics are metrics collected directly from the registry.
71+
// Unlike metrics.Alertmanager they are not called within this codebase hence the need for direct collection.
72+
type AlertmanagerAggregatedMetrics struct {
73+
registries *metrics.TenantRegistries
74+
75+
// exported metrics, gathered from Alertmanager Silences
76+
silencesGCDuration *prometheus.Desc
77+
silencesSnapshotDuration *prometheus.Desc
78+
silencesSnapshotSize *prometheus.Desc
79+
silencesQueriesTotal *prometheus.Desc
80+
silencesQueryErrorsTotal *prometheus.Desc
81+
silencesQueryDuration *prometheus.Desc
82+
silences *prometheus.Desc
83+
silencesPropagatedMessagesTotal *prometheus.Desc
84+
}
85+
86+
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
87+
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
88+
registries: registries,
89+
90+
silencesGCDuration: prometheus.NewDesc(
91+
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
92+
"Duration of the last silence garbage collection cycle.",
93+
nil, nil),
94+
silencesSnapshotDuration: prometheus.NewDesc(
95+
fmt.Sprintf("%s_%s_silences_snapshot_duration_seconds", Namespace, Subsystem),
96+
"Duration of the last silence snapshot.",
97+
nil, nil),
98+
silencesSnapshotSize: prometheus.NewDesc(
99+
fmt.Sprintf("%s_%s_silences_snapshot_size_bytes", Namespace, Subsystem),
100+
"Size of the last silence snapshot in bytes.",
101+
nil, nil),
102+
silencesQueriesTotal: prometheus.NewDesc(
103+
fmt.Sprintf("%s_%s_silences_queries_total", Namespace, Subsystem),
104+
"How many silence queries were received.",
105+
nil, nil),
106+
silencesQueryErrorsTotal: prometheus.NewDesc(
107+
fmt.Sprintf("%s_%s_silences_query_errors_total", Namespace, Subsystem),
108+
"How many silence received queries did not succeed.",
109+
nil, nil),
110+
silencesQueryDuration: prometheus.NewDesc(
111+
fmt.Sprintf("%s_%s_silences_query_duration_seconds", Namespace, Subsystem),
112+
"Duration of silence query evaluation.",
113+
nil, nil),
114+
silencesPropagatedMessagesTotal: prometheus.NewDesc(
115+
fmt.Sprintf("%s_%s_silences_gossip_messages_propagated_total", Namespace, Subsystem),
116+
"Number of received gossip messages that have been further gossiped.",
117+
nil, nil),
118+
silences: prometheus.NewDesc(
119+
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
120+
"How many silences by state.",
121+
[]string{"org", "state"}, nil),
31122
}
123+
124+
return aggregatedMetrics
125+
}
126+
127+
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
128+
out <- a.silencesGCDuration
129+
out <- a.silencesSnapshotDuration
130+
out <- a.silencesSnapshotSize
131+
out <- a.silencesQueriesTotal
132+
out <- a.silencesQueryErrorsTotal
133+
out <- a.silencesQueryDuration
134+
out <- a.silencesPropagatedMessagesTotal
135+
out <- a.silences
136+
}
137+
138+
// Collect builds the metric families of every tenant registry and sends the
// silence metrics to out under this collector's descriptors. The source
// metrics are looked up by their un-prefixed "alertmanager_silences_*" names.
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
	perTenant := a.registries.BuildMetricFamiliesPerTenant()

	// Summaries.
	perTenant.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
	perTenant.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")

	// Gauge.
	perTenant.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")

	// Query counters and the query duration histogram.
	perTenant.SendSumOfCounters(out, a.silencesQueriesTotal, "alertmanager_silences_queries_total")
	perTenant.SendSumOfCounters(out, a.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
	perTenant.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")

	// Gossip counter.
	perTenant.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")

	// The only metric sent per tenant; "state" is the label passed through
	// from the source metric.
	perTenant.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
}

pkg/services/ngalert/metrics/ngalert.go

-10
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,3 @@ func (ng *NGAlert) GetAPIMetrics() *API {
5757
func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
5858
return ng.multiOrgAlertmanagerMetrics
5959
}
60-
61-
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
62-
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
63-
moa.registries.RemoveOrgRegistry(id)
64-
}
65-
66-
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
67-
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
68-
return moa.registries.GetOrCreateOrgRegistry(id)
69-
}

0 commit comments

Comments
 (0)