1
1
package metrics
2
2
3
3
import (
4
+ "fmt"
5
+ "strconv"
6
+
7
+ "github.com/grafana/grafana/pkg/infra/log"
8
+
9
+ "github.com/grafana/dskit/metrics"
4
10
"github.com/prometheus/client_golang/prometheus"
5
11
"github.com/prometheus/client_golang/prometheus/promauto"
6
12
)
7
13
8
14
type MultiOrgAlertmanager struct {
9
- Registerer prometheus.Registerer
15
+ Registerer prometheus.Registerer
16
+ registries * metrics.TenantRegistries
17
+
10
18
ActiveConfigurations prometheus.Gauge
11
19
DiscoveredConfigurations prometheus.Gauge
12
- registries * OrgRegistries
20
+
21
+ aggregatedMetrics * AlertmanagerAggregatedMetrics
13
22
}
14
23
15
24
func NewMultiOrgAlertmanagerMetrics (r prometheus.Registerer ) * MultiOrgAlertmanager {
16
- return & MultiOrgAlertmanager {
25
+ registries := metrics .NewTenantRegistries (log .New ("ngalert.multiorg.alertmanager.metrics" )) //TODO: Should this be here? Probably not.
26
+ moa := & MultiOrgAlertmanager {
17
27
Registerer : r ,
18
- registries : NewOrgRegistries () ,
28
+ registries : registries ,
19
29
DiscoveredConfigurations : promauto .With (r ).NewGauge (prometheus.GaugeOpts {
20
30
Namespace : Namespace ,
21
31
Subsystem : Subsystem ,
@@ -28,5 +38,112 @@ func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanag
28
38
Name : "active_configurations" ,
29
39
Help : "The number of active Alertmanager configurations." ,
30
40
}),
41
+ aggregatedMetrics : NewAlertmanagerAggregatedMetrics (registries ),
42
+ }
43
+
44
+ // These metrics use a different registration method as the struct itself represents a custom collector.
45
+ // There's no way to "auto-register" a collector.
46
+ r .MustRegister (moa .aggregatedMetrics )
47
+
48
+ return moa
49
+ }
50
+
51
+ // RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
52
+ func (moa * MultiOrgAlertmanager ) RemoveOrgRegistry (id int64 ) {
53
+ moa .registries .RemoveTenantRegistry (strconv .FormatInt (id , 10 ), false )
54
+ }
55
+
56
+ // GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
57
+ func (moa * MultiOrgAlertmanager ) GetOrCreateOrgRegistry (id int64 ) prometheus.Registerer {
58
+ sid := strconv .FormatInt (id , 10 )
59
+ reg := moa .registries .GetRegistryForTenant (sid )
60
+ if reg != nil {
61
+ return reg
62
+ }
63
+
64
+ result := prometheus .NewRegistry ()
65
+ moa .registries .AddTenantRegistry (sid , result )
66
+
67
+ return result
68
+ }
69
+
70
+ // AlertmanagerAggregatedMetrics are metrics collected directly from the registry.
71
+ // Unlike metrics.Alertmanager they are not called within this codebase hence the need for direct collection.
72
+ type AlertmanagerAggregatedMetrics struct {
73
+ registries * metrics.TenantRegistries
74
+
75
+ // exported metrics, gathered from Alertmanager Silences
76
+ silencesGCDuration * prometheus.Desc
77
+ silencesSnapshotDuration * prometheus.Desc
78
+ silencesSnapshotSize * prometheus.Desc
79
+ silencesQueriesTotal * prometheus.Desc
80
+ silencesQueryErrorsTotal * prometheus.Desc
81
+ silencesQueryDuration * prometheus.Desc
82
+ silences * prometheus.Desc
83
+ silencesPropagatedMessagesTotal * prometheus.Desc
84
+ }
85
+
86
+ func NewAlertmanagerAggregatedMetrics (registries * metrics.TenantRegistries ) * AlertmanagerAggregatedMetrics {
87
+ aggregatedMetrics := & AlertmanagerAggregatedMetrics {
88
+ registries : registries ,
89
+
90
+ silencesGCDuration : prometheus .NewDesc (
91
+ fmt .Sprintf ("%s_%s_silences_gc_duration_seconds" , Namespace , Subsystem ),
92
+ "Duration of the last silence garbage collection cycle." ,
93
+ nil , nil ),
94
+ silencesSnapshotDuration : prometheus .NewDesc (
95
+ fmt .Sprintf ("%s_%s_silences_snapshot_duration_seconds" , Namespace , Subsystem ),
96
+ "Duration of the last silence snapshot." ,
97
+ nil , nil ),
98
+ silencesSnapshotSize : prometheus .NewDesc (
99
+ fmt .Sprintf ("%s_%s_silences_snapshot_size_bytes" , Namespace , Subsystem ),
100
+ "Size of the last silence snapshot in bytes." ,
101
+ nil , nil ),
102
+ silencesQueriesTotal : prometheus .NewDesc (
103
+ fmt .Sprintf ("%s_%s_silences_queries_total" , Namespace , Subsystem ),
104
+ "How many silence queries were received." ,
105
+ nil , nil ),
106
+ silencesQueryErrorsTotal : prometheus .NewDesc (
107
+ fmt .Sprintf ("%s_%s_silences_query_errors_total" , Namespace , Subsystem ),
108
+ "How many silence received queries did not succeed." ,
109
+ nil , nil ),
110
+ silencesQueryDuration : prometheus .NewDesc (
111
+ fmt .Sprintf ("%s_%s_silences_query_duration_seconds" , Namespace , Subsystem ),
112
+ "Duration of silence query evaluation." ,
113
+ nil , nil ),
114
+ silencesPropagatedMessagesTotal : prometheus .NewDesc (
115
+ fmt .Sprintf ("%s_%s_silences_gossip_messages_propagated_total" , Namespace , Subsystem ),
116
+ "Number of received gossip messages that have been further gossiped." ,
117
+ nil , nil ),
118
+ silences : prometheus .NewDesc (
119
+ fmt .Sprintf ("%s_%s_silences" , Namespace , Subsystem ),
120
+ "How many silences by state." ,
121
+ []string {"org" , "state" }, nil ),
31
122
}
123
+
124
+ return aggregatedMetrics
125
+ }
126
+
127
+ func (a * AlertmanagerAggregatedMetrics ) Describe (out chan <- * prometheus.Desc ) {
128
+ out <- a .silencesGCDuration
129
+ out <- a .silencesSnapshotDuration
130
+ out <- a .silencesSnapshotSize
131
+ out <- a .silencesQueriesTotal
132
+ out <- a .silencesQueryErrorsTotal
133
+ out <- a .silencesQueryDuration
134
+ out <- a .silencesPropagatedMessagesTotal
135
+ out <- a .silences
136
+ }
137
+
138
+ func (a * AlertmanagerAggregatedMetrics ) Collect (out chan <- prometheus.Metric ) {
139
+ data := a .registries .BuildMetricFamiliesPerTenant ()
140
+
141
+ data .SendSumOfSummaries (out , a .silencesGCDuration , "alertmanager_silences_gc_duration_seconds" )
142
+ data .SendSumOfSummaries (out , a .silencesSnapshotDuration , "alertmanager_silences_snapshot_duration_seconds" )
143
+ data .SendSumOfGauges (out , a .silencesSnapshotSize , "alertmanager_silences_snapshot_size_bytes" )
144
+ data .SendSumOfCounters (out , a .silencesQueriesTotal , "alertmanager_silences_queries_total" )
145
+ data .SendSumOfCounters (out , a .silencesQueryErrorsTotal , "alertmanager_silences_query_errors_total" )
146
+ data .SendSumOfHistograms (out , a .silencesQueryDuration , "alertmanager_silences_query_duration_seconds" )
147
+ data .SendSumOfCounters (out , a .silencesPropagatedMessagesTotal , "alertmanager_silences_gossip_messages_propagated_total" )
148
+ data .SendSumOfGaugesPerTenantWithLabels (out , a .silences , "alertmanager_silences" , "state" )
32
149
}
0 commit comments