diff --git a/docs/changelog.md b/docs/changelog.md index cac01f47..62ade3fb 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -6,6 +6,8 @@ - [promql/regexp](checks/promql/regexp.md) check will now look for smelly regexp selectors. See check docs for details. +- [promql/range_query](checks/promql/range_query.md) now allows to configure a custom maximum + duration for range queries - #1064. ## v0.64.1 diff --git a/docs/checks/promql/range_query.md b/docs/checks/promql/range_query.md index 484a392c..0cb58c4f 100644 --- a/docs/checks/promql/range_query.md +++ b/docs/checks/promql/range_query.md @@ -14,8 +14,8 @@ By default Prometheus keeps [15 days of data](https://prometheus.io/docs/prometh this can be customised by setting time or disk space limits. There are two main ways of configuring retention limits in Prometheus: -* time based - Prometheus will keep last N days of metrics -* disk based - Prometheus will try to use up to N bytes of disk space. +- time based - Prometheus will keep last N days of metrics +- disk based - Prometheus will try to use up to N bytes of disk space. Pint will ignore any disk space limits, since that doesn't tell us what the effective time retention is. @@ -34,13 +34,30 @@ getting results of a `avg_over_time(foo[40d])` you are getting the average value of `foo` in the last 40 days, but in reality you're only getting an average value in the last 30 days, and you cannot get any more than that. +You can also configure your own maximum allowed range duration if you want +to ensure that all queries are never requesting more than allowed range. +This can be done by adding a configuration rule as below. + ## Configuration -This check doesn't have any configuration options. +Syntax: + +```js +range_query { + max = "2h" + comment = "..." + severity = "bug|warning|info" +} +``` + +- `max` - duration for the maximum allowed query range. +- `comment` - set a custom comment that will be added to reported problems. +- `severity` - set custom severity for reported issues, defaults to `warning`. ## How to enable it -This check is enabled by default for all configured Prometheus servers. +This check is enabled by default for all configured Prometheus servers and will +validate that queries don't use ranges longer than configured Prometheus retention. Example: @@ -64,6 +81,19 @@ prometheus "dev" { } ``` +Additionally you can configure an extra rule that will enforce a custom maximum +query range duration: + +```js +rule { + range_query { + max = "4h" + comment = "You cannot use range queries with range more than 4h" + severity = "bug" + } +} +``` + ## How to disable it You can disable this check globally by adding this config block: @@ -102,6 +132,20 @@ Example: # pint disable promql/range_query(prod) ``` +To disable a custom maximum range duration rule use: + +```yaml +# pint disable promql/range_query($duration) +``` + +Where `$duration` is the value of `max` option in `range_query` rule. + +Example: + +```yaml +# pint disable promql/range_query(4h) +``` + ## How to snooze it You can disable this check until given time by adding a comment to it. Example: @@ -111,6 +155,6 @@ You can disable this check until given time by adding a comment to it. Example: ``` Where `$TIMESTAMP` is either use [RFC3339](https://www.rfc-editor.org/rfc/rfc3339) -formatted or `YYYY-MM-DD`. -Adding this comment will disable `promql/range_query` *until* `$TIMESTAMP`, after that +formatted or `YYYY-MM-DD`. +Adding this comment will disable `promql/range_query` _until_ `$TIMESTAMP`, after that check will be re-enabled. diff --git a/internal/checks/promql_range_query.go b/internal/checks/promql_range_query.go index 87590cf2..d34d14c7 100644 --- a/internal/checks/promql_range_query.go +++ b/internal/checks/promql_range_query.go @@ -9,6 +9,7 @@ import ( promParser "github.com/prometheus/prometheus/promql/parser" "github.com/cloudflare/pint/internal/discovery" + "github.com/cloudflare/pint/internal/output" "github.com/cloudflare/pint/internal/parser" "github.com/cloudflare/pint/internal/promapi" ) @@ -17,12 +18,15 @@ const ( RangeQueryCheckName = "promql/range_query" ) -func NewRangeQueryCheck(prom *promapi.FailoverGroup) RangeQueryCheck { - return RangeQueryCheck{prom: prom} +func NewRangeQueryCheck(prom *promapi.FailoverGroup, limit time.Duration, comment string, severity Severity) RangeQueryCheck { + return RangeQueryCheck{prom: prom, limit: limit, comment: comment, severity: severity} } type RangeQueryCheck struct { - prom *promapi.FailoverGroup + prom *promapi.FailoverGroup + limit time.Duration + comment string + severity Severity } func (c RangeQueryCheck) Meta() CheckMeta { @@ -38,6 +42,9 @@ func (c RangeQueryCheck) Meta() CheckMeta { } func (c RangeQueryCheck) String() string { + if c.limit > 0 { + return fmt.Sprintf("%s(%s)", RangeQueryCheckName, output.HumanizeDuration(c.limit)) + } return fmt.Sprintf("%s(%s)", RangeQueryCheckName, c.prom.Name()) } @@ -47,11 +54,26 @@ func (c RangeQueryCheck) Reporter() string { func (c RangeQueryCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Rule, _ []discovery.Entry) (problems []Problem) { expr := rule.Expr() - if expr.SyntaxError != nil { return problems } + if c.limit > 0 { + for _, problem := range c.checkNode(ctx, expr.Query, c.limit, fmt.Sprintf("%s is the maximum allowed range query.", model.Duration(c.limit))) { + problems = append(problems, Problem{ + Lines: expr.Value.Lines, + Reporter: c.Reporter(), + Text: problem.text, + Details: maybeComment(c.comment), + Severity: c.severity, + }) + } + } + + if c.prom == nil || len(problems) > 0 { + return problems + } + flags, err := c.prom.Flags(ctx) if err != nil { text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning) @@ -84,7 +106,7 @@ func (c RangeQueryCheck) Check(ctx context.Context, _ discovery.Path, rule parse retention = time.Hour * 24 * 15 } - for _, problem := range c.checkNode(ctx, expr.Query, retention, flags.URI) { + for _, problem := range c.checkNode(ctx, expr.Query, retention, fmt.Sprintf("%s is configured to only keep %s of metrics history.", promText(c.prom.Name(), flags.URI), model.Duration(retention))) { problems = append(problems, Problem{ Lines: expr.Value.Lines, Reporter: c.Reporter(), @@ -96,20 +118,20 @@ func (c RangeQueryCheck) Check(ctx context.Context, _ discovery.Path, rule parse return problems } -func (c RangeQueryCheck) checkNode(ctx context.Context, node *parser.PromQLNode, retention time.Duration, uri string) (problems []exprProblem) { +func (c RangeQueryCheck) checkNode(ctx context.Context, node *parser.PromQLNode, retention time.Duration, reason string) (problems []exprProblem) { if n, ok := node.Expr.(*promParser.MatrixSelector); ok { if n.Range > retention { problems = append(problems, exprProblem{ expr: node.Expr.String(), - text: fmt.Sprintf("`%s` selector is trying to query Prometheus for %s worth of metrics, but %s is configured to only keep %s of metrics history.", - node.Expr, model.Duration(n.Range), promText(c.prom.Name(), uri), model.Duration(retention)), + text: fmt.Sprintf("`%s` selector is trying to query Prometheus for %s worth of metrics, but %s", + node.Expr, model.Duration(n.Range), reason), severity: Warning, }) } } for _, child := range node.Children { - problems = append(problems, c.checkNode(ctx, child, retention, uri)...) + problems = append(problems, c.checkNode(ctx, child, retention, reason)...) } return problems diff --git a/internal/checks/promql_range_query_test.go b/internal/checks/promql_range_query_test.go index 27fc4575..dc1bb47f 100644 --- a/internal/checks/promql_range_query_test.go +++ b/internal/checks/promql_range_query_test.go @@ -3,6 +3,7 @@ package checks_test import ( "fmt" "testing" + "time" "github.com/cloudflare/pint/internal/checks" "github.com/cloudflare/pint/internal/parser" @@ -10,7 +11,11 @@ import ( ) func newRangeQueryCheck(prom *promapi.FailoverGroup) checks.RuleChecker { - return checks.NewRangeQueryCheck(prom) + return checks.NewRangeQueryCheck(prom, 0, "", checks.Fatal) +} + +func newRangeQueryCheckWithLimit(prom *promapi.FailoverGroup) checks.RuleChecker { + return checks.NewRangeQueryCheck(prom, time.Hour*4, "some text", checks.Bug) } func retentionToLow(name, uri, metric, qr, retention string) string { @@ -214,6 +219,33 @@ func TestRangeQueryCheck(t *testing.T) { }, }, }, + { + description: "limit / 3h", + content: "- record: foo\n expr: rate(foo[3h])\n", + checker: newRangeQueryCheckWithLimit, + prometheus: noProm, + problems: noProblems, + }, + { + description: "limit / 5h", + content: "- record: foo\n expr: rate(foo[5h])\n", + checker: newRangeQueryCheckWithLimit, + prometheus: noProm, + problems: func(_ string) []checks.Problem { + return []checks.Problem{ + { + Lines: parser.LineRange{ + First: 2, + Last: 2, + }, + Reporter: "promql/range_query", + Text: "`foo[5h]` selector is trying to query Prometheus for 5h worth of metrics, but 4h is the maximum allowed range query.", + Details: "Rule comment: some text", + Severity: checks.Bug, + }, + } + }, + }, } runTests(t, testCases) } diff --git a/internal/config/config.go b/internal/config/config.go index 45e800f0..a53519c9 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -134,7 +134,7 @@ func (cfg *Config) GetChecksForRule(ctx context.Context, gen *PrometheusGenerato }) allChecks = append(allChecks, checkMeta{ name: checks.RangeQueryCheckName, - check: checks.NewRangeQueryCheck(p), + check: checks.NewRangeQueryCheck(p, 0, "", checks.Warning), tags: p.Tags(), }) allChecks = append(allChecks, checkMeta{ diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 3e2fb757..86e3fae9 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -2277,6 +2277,14 @@ func TestConfigErrors(t *testing.T) { }`, err: "unknown severity: xxx", }, + { + config: `rule { + range_query { + max = "abc" + } +}`, + err: `not a valid duration string: "abc"`, + }, } dir := t.TempDir() diff --git a/internal/config/range_query.go b/internal/config/range_query.go new file mode 100644 index 00000000..64ac9e0c --- /dev/null +++ b/internal/config/range_query.go @@ -0,0 +1,41 @@ +package config + +import ( + "errors" + + "github.com/cloudflare/pint/internal/checks" +) + +type RangeQuerySettings struct { + Max string `hcl:"max" json:"max"` + Comment string `hcl:"comment,optional" json:"comment,omitempty"` + Severity string `hcl:"severity,optional" json:"severity,omitempty"` +} + +func (s RangeQuerySettings) validate() error { + if s.Max != "" { + dur, err := parseDuration(s.Max) + if err != nil { + return err + } + if dur == 0 { + return errors.New("range_query max value cannot be zero") + } + } + + if s.Severity != "" { + if _, err := checks.ParseSeverity(s.Severity); err != nil { + return err + } + } + + return nil +} + +func (s RangeQuerySettings) getSeverity(fallback checks.Severity) checks.Severity { + if s.Severity != "" { + sev, _ := checks.ParseSeverity(s.Severity) + return sev + } + return fallback +} diff --git a/internal/config/range_query_test.go b/internal/config/range_query_test.go new file mode 100644 index 00000000..2b8cf903 --- /dev/null +++ b/internal/config/range_query_test.go @@ -0,0 +1,49 @@ +package config + +import ( + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestRangeQuerySettings(t *testing.T) { + type testCaseT struct { + err error + conf RangeQuerySettings + } + + testCases := []testCaseT{ + { + conf: RangeQuerySettings{ + Max: "foo", + }, + err: errors.New(`not a valid duration string: "foo"`), + }, + { + conf: RangeQuerySettings{ + Max: "0h", + }, + err: errors.New("range_query max value cannot be zero"), + }, + { + conf: RangeQuerySettings{ + Max: "1d", + Severity: "bag", + }, + err: errors.New("unknown severity: bag"), + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%v", tc.conf), func(t *testing.T) { + err := tc.conf.validate() + if err == nil || tc.err == nil { + require.Equal(t, err, tc.err) + } else { + require.EqualError(t, err, tc.err.Error()) + } + }) + } +} diff --git a/internal/config/rule.go b/internal/config/rule.go index c46c9c55..3a5a3e3f 100644 --- a/internal/config/rule.go +++ b/internal/config/rule.go @@ -23,6 +23,7 @@ type Rule struct { Alerts *AlertsSettings `hcl:"alerts,block" json:"alerts,omitempty"` For *ForSettings `hcl:"for,block" json:"for,omitempty"` KeepFiringFor *ForSettings `hcl:"keep_firing_for,block" json:"keep_firing_for,omitempty"` + RangeQuery *RangeQuerySettings `hcl:"range_query,block" json:"range_query,omitempty"` Reject []RejectSettings `hcl:"reject,block" json:"reject,omitempty"` RuleLink []RuleLinkSettings `hcl:"link,block" json:"link,omitempty"` RuleName []RuleNameSettings `hcl:"name,block" json:"name,omitempty"` @@ -101,6 +102,12 @@ func (rule Rule) validate() (err error) { } } + if rule.RangeQuery != nil { + if err = rule.RangeQuery.validate(); err != nil { + return err + } + } + return nil } @@ -288,6 +295,15 @@ func (rule Rule) resolveChecks(ctx context.Context, path string, r parser.Rule, }) } + if rule.RangeQuery != nil { + severity := rule.RangeQuery.getSeverity(checks.Warning) + limit, _ := parseDuration(rule.RangeQuery.Max) + enabled = append(enabled, checkMeta{ + name: checks.CostCheckName, + check: checks.NewRangeQueryCheck(nil, limit, rule.RangeQuery.Comment, severity), + }) + } + return enabled }