From ffd51fc693533899c84c6b4330fede5729fa1a8c Mon Sep 17 00:00:00 2001 From: Lukasz Mierzwa Date: Wed, 18 Oct 2023 11:22:22 +0100 Subject: [PATCH] More strict checks in alerts/template --- docs/changelog.md | 48 +++++++++++++------ internal/checks/alerts_template.go | 61 +++++++++++++++++++++++-- internal/checks/alerts_template_test.go | 58 +++++++++++++++++++++++ 3 files changed, 149 insertions(+), 18 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 28d5fa3d..900d9dd0 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,25 @@ # Changelog +## v0.47.0 + +### Added + +- `alerts/template` check can now report problems with alerting rules + when trying to use templates on a query that doesn't produce any labels + at all. + For example when using [`vector(...)`](https://prometheus.io/docs/prometheus/latest/querying/functions/#vector) function: + + {% raw %} + + ```yaml + - alert: DeadMansSwitch + expr: vector(1) + annotations: + summary: Deadman's switch on {{ $labels.instance }} is firing + ``` + + {% endraw %} + ## v0.46.0 ### Added @@ -115,7 +135,7 @@ ```yaml - record: my:sum expr: sum(http_requests_total) - + - alert: my alert expr: rate(my:sum[5m]) ``` @@ -195,7 +215,7 @@ snooze them instead of disabling forever. The difference between `# pint disable ...` and `# pint snooze ...` comments is that - the snooze comment must include a timestamp. Selected check will be disabled *until* + the snooze comment must include a timestamp. Selected check will be disabled _until_ that timestamp. Timestamp must either use [RFC3339](https://www.rfc-editor.org/rfc/rfc3339) syntax or `YYYY-MM-DD` (if you don't care about time and want to snooze until given date). @@ -640,7 +660,7 @@ ### Changed - The way `pint` sends API requests to Prometheus was changed to improve performance. - + First change is that each `prometheus` server definition in `pint` config file can now accept optional `concurrency` field (defaults to 16) that sets a limit on how many concurrent requests can that server receive. There is a new metric that @@ -800,10 +820,10 @@ ```yaml groups: - - name: example - rules: - - record: ... - expr: ... + - name: example + rules: + - record: ... + expr: ... ``` Previous releases were only looking for individual rules so `groups` object @@ -815,7 +835,7 @@ ```yaml parser { - relaxed = ["(.*)"] + relaxed = ["(.*)"] } ``` @@ -974,7 +994,7 @@ ### Added -- Added `pint_last_run_time_seconds` and `pint_rules_parsed_total` metrics when running `pint watch`. +- Added `pint_last_run_time_seconds` and `pint_rules_parsed_total` metrics when running `pint watch`. ### Changed @@ -997,7 +1017,7 @@ ### Added - Added `promql/regexp` check that will warn about unnecessary regexp matchers. -- Added `pint_prometheus_queries_total` and `pint_prometheus_query_errors_total` +- Added `pint_prometheus_queries_total` and `pint_prometheus_query_errors_total` metric when running `pint watch`. ## v0.10.1 @@ -1090,7 +1110,7 @@ ### Fixed - `promql/rate`, `query/series` and `promql/vector_matching` checks were not enabled - for all defined `prometheus {}` blocks unless there was at least one `rule {}` block. + for all defined `prometheus {}` blocks unless there was at least one `rule {}` block. - `annotation` based `match` blocks didn't work correctly. ## v0.6.6 @@ -1224,7 +1244,7 @@ - alert: Foo expr: absent(foo{env="prod"}) annotations: - summary: 'foo metric is missing for job {{ $labels.job }}' + summary: "foo metric is missing for job {{ $labels.job }}" ``` {% endraw %} @@ -1291,7 +1311,7 @@ - alert: Foo expr: count(up) without(instance) == 0 annotations: - summary: 'foo is down on {{ $labels.instance }}' + summary: "foo is down on {{ $labels.instance }}" ``` {% endraw %} @@ -1302,7 +1322,7 @@ ### Fixed -- Fixed file descriptor leak due to missing file `Close()` #69. +- Fixed file descriptor leak due to missing file `Close()` #69. ## v0.1.4 diff --git a/internal/checks/alerts_template.go b/internal/checks/alerts_template.go index bd2647bd..21b22b36 100644 --- a/internal/checks/alerts_template.go +++ b/internal/checks/alerts_template.go @@ -101,6 +101,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ aggrs := utils.HasOuterAggregation(rule.AlertingRule.Expr.Query) absentCalls := utils.HasOuterAbsent(rule.AlertingRule.Expr.Query) + vectors := utils.HasVectorSelector(rule.AlertingRule.Expr.Query) var safeLabels []string for _, be := range binaryExprs(rule.AlertingRule.Expr.Query) { @@ -179,6 +180,19 @@ func (c TemplateCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ }) } } + + labelNames := getTemplateLabels(label.Key.Value, label.Value.Value) + if len(labelNames) > 0 && len(vectors) == 0 { + for _, name := range labelNames { + problems = append(problems, Problem{ + Fragment: fmt.Sprintf("%s: %s", label.Key.Value, label.Value.Value), + Lines: mergeLines(label.Lines(), rule.AlertingRule.Expr.Lines()), + Reporter: c.Reporter(), + Text: fmt.Sprintf("template is using %q label but the query doesn't produce any labels", name), + Severity: Bug, + }) + } + } } } @@ -228,6 +242,19 @@ func (c TemplateCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ } } + labelNames := getTemplateLabels(annotation.Key.Value, annotation.Value.Value) + if len(labelNames) > 0 && len(vectors) == 0 { + for _, name := range labelNames { + problems = append(problems, Problem{ + Fragment: fmt.Sprintf("%s: %s", annotation.Key.Value, annotation.Value.Value), + Lines: mergeLines(annotation.Lines(), rule.AlertingRule.Expr.Lines()), + Reporter: c.Reporter(), + Text: fmt.Sprintf("template is using %q label but the query doesn't produce any labels", name), + Severity: Bug, + }) + } + } + if hasValue(annotation.Key.Value, annotation.Value.Value) && !hasHumanize(annotation.Key.Value, annotation.Value.Value) { for _, problem := range c.checkHumanizeIsNeeded(rule.AlertingRule.Expr.Query) { problems = append(problems, Problem{ @@ -458,7 +485,7 @@ func getVariables(node parse.Node) (vars [][]string) { return vars } -func checkMetricLabels(msg, name, text string, metricLabels []string, excludeLabels bool, safeLabels []string) (msgs []string) { +func findTemplateVariables(name, text string) (vars [][]string, aliases aliasMap, ok bool) { t, err := textTemplate. New(name). Funcs(templateFuncMap). @@ -466,16 +493,42 @@ func checkMetricLabels(msg, name, text string, metricLabels []string, excludeLab Parse(strings.Join(append(templateDefs, text), "")) if err != nil { // no need to double report errors - return nil + return vars, aliases, false } - aliases := aliasMap{aliases: map[string]map[string]struct{}{}} - vars := [][]string{} + aliases.aliases = map[string]map[string]struct{}{} for _, node := range t.Root.Nodes { getAliases(node, &aliases) vars = append(vars, getVariables(node)...) } + return vars, aliases, true +} + +func getTemplateLabels(name, text string) (names []string) { + vars, aliases, ok := findTemplateVariables(name, text) + if !ok { + return nil + } + + labelsAliases := aliases.varAliases(".Labels") + for _, v := range vars { + for _, a := range labelsAliases { + if len(v) > 1 && v[0] == a { + names = append(names, v[1]) + } + } + } + + return names +} + +func checkMetricLabels(msg, name, text string, metricLabels []string, excludeLabels bool, safeLabels []string) (msgs []string) { + vars, aliases, ok := findTemplateVariables(name, text) + if !ok { + return nil + } + done := map[string]struct{}{} labelsAliases := aliases.varAliases(".Labels") for _, v := range vars { diff --git a/internal/checks/alerts_template_test.go b/internal/checks/alerts_template_test.go index 69ca9507..6611f67a 100644 --- a/internal/checks/alerts_template_test.go +++ b/internal/checks/alerts_template_test.go @@ -1017,6 +1017,64 @@ func TestTemplateCheck(t *testing.T) { prometheus: noProm, problems: noProblems, }, + { + description: "annotation label from vector(0)", + content: "- alert: DeadMansSwitch\n expr: vector(1)\n annotations:\n summary: 'Deadmans switch on {{ $labels.instance }} is firing'\n", + checker: newTemplateCheck, + prometheus: noProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: `summary: Deadmans switch on {{ $labels.instance }} is firing`, + Lines: []int{2, 4}, + Reporter: checks.TemplateCheckName, + Text: `template is using "instance" label but the query doesn't produce any labels`, + Severity: checks.Bug, + }, + } + }, + }, + { + description: "labels label from vector(0)", + content: "- alert: DeadMansSwitch\n expr: vector(1)\n labels:\n summary: 'Deadmans switch on {{ $labels.instance }} is firing'\n", + checker: newTemplateCheck, + prometheus: noProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: `summary: Deadmans switch on {{ $labels.instance }} is firing`, + Lines: []int{2, 4}, + Reporter: checks.TemplateCheckName, + Text: `template is using "instance" label but the query doesn't produce any labels`, + Severity: checks.Bug, + }, + } + }, + }, + { + description: "annotation label from number", + content: "- alert: DeadMansSwitch\n expr: 1 > bool 0\n annotations:\n summary: 'Deadmans switch on {{ $labels.instance }} / {{ $labels.job }} is firing'\n", + checker: newTemplateCheck, + prometheus: noProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: `summary: Deadmans switch on {{ $labels.instance }} / {{ $labels.job }} is firing`, + Lines: []int{2, 4}, + Reporter: checks.TemplateCheckName, + Text: `template is using "instance" label but the query doesn't produce any labels`, + Severity: checks.Bug, + }, + { + Fragment: `summary: Deadmans switch on {{ $labels.instance }} / {{ $labels.job }} is firing`, + Lines: []int{2, 4}, + Reporter: checks.TemplateCheckName, + Text: `template is using "job" label but the query doesn't produce any labels`, + Severity: checks.Bug, + }, + } + }, + }, } runTests(t, testCases) }