Skip to content

Commit a4ebd34

Browse files
authored
Change error code and configuration of resource-based limiter (#6771)
* Change status code from 429 to 503 Signed-off-by: Justin Jung <[email protected]> * Change configurations Signed-off-by: Justin Jung <[email protected]> * Nit Signed-off-by: Justin Jung <[email protected]> * Nit Signed-off-by: Justin Jung <[email protected]> * Changelog Signed-off-by: Justin Jung <[email protected]> * Revert changelog Signed-off-by: Justin Jung <[email protected]> --------- Signed-off-by: Justin Jung <[email protected]>
1 parent c9fa217 commit a4ebd34

File tree

11 files changed

+187
-135
lines changed

11 files changed

+187
-135
lines changed

docs/blocks-storage/store-gateway.md

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -349,20 +349,28 @@ store_gateway:
349349
# CLI flag: -store-gateway.disabled-tenants
350350
[disabled_tenants: <string> | default = ""]
351351

352-
instance_limits:
353-
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
354-
# rejecting new query request (across all tenants) in percentage, between 0
355-
# and 1. monitored_resources config must include the resource type. 0 to
356-
# disable.
357-
# CLI flag: -store-gateway.instance-limits.cpu-utilization
358-
[cpu_utilization: <float> | default = 0]
359-
360-
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
361-
# rejecting new query request (across all tenants) in percentage, between 0
362-
# and 1. monitored_resources config must include the resource type. 0 to
363-
# disable.
364-
# CLI flag: -store-gateway.instance-limits.heap-utilization
365-
[heap_utilization: <float> | default = 0]
352+
query_protection:
353+
rejection:
354+
# EXPERIMENTAL: Enable query rejection feature, where the component return
355+
# 503 to all incoming query requests when the configured thresholds are
356+
# breached.
357+
# CLI flag: -store-gateway.query-protection.rejection.enabled
358+
[enabled: <boolean> | default = false]
359+
360+
threshold:
361+
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
362+
# rejecting new query request (across all tenants) in percentage,
363+
# between 0 and 1. monitored_resources config must include the resource
364+
# type. 0 to disable.
365+
# CLI flag: -store-gateway.query-protection.rejection.threshold.cpu-utilization
366+
[cpu_utilization: <float> | default = 0]
367+
368+
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
369+
# rejecting new query request (across all tenants) in percentage,
370+
# between 0 and 1. monitored_resources config must include the resource
371+
# type. 0 to disable.
372+
# CLI flag: -store-gateway.query-protection.rejection.threshold.heap-utilization
373+
[heap_utilization: <float> | default = 0]
366374

367375
hedged_request:
368376
# If true, hedged requests are applied to object store calls. It can help

docs/configuration/config-file-reference.md

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3208,20 +3208,6 @@ lifecycler:
32083208
[upload_compacted_blocks_enabled: <boolean> | default = true]
32093209
32103210
instance_limits:
3211-
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
3212-
# rejecting new query request (across all tenants) in percentage, between 0
3213-
# and 1. monitored_resources config must include the resource type. 0 to
3214-
# disable.
3215-
# CLI flag: -ingester.instance-limits.cpu-utilization
3216-
[cpu_utilization: <float> | default = 0]
3217-
3218-
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
3219-
# rejecting new query request (across all tenants) in percentage, between 0
3220-
# and 1. monitored_resources config must include the resource type. 0 to
3221-
# disable.
3222-
# CLI flag: -ingester.instance-limits.heap-utilization
3223-
[heap_utilization: <float> | default = 0]
3224-
32253211
# Max ingestion rate (samples/sec) that ingester will accept. This limit is
32263212
# per-ingester, not per-tenant. Additional push requests will be rejected.
32273213
# Current ingestion rate is computed as exponentially weighted moving average,
@@ -3280,6 +3266,29 @@ instance_limits:
32803266
# If enabled, the metadata API returns all metadata regardless of the limits.
32813267
# CLI flag: -ingester.skip-metadata-limits
32823268
[skip_metadata_limits: <boolean> | default = true]
3269+
3270+
query_protection:
3271+
rejection:
3272+
# EXPERIMENTAL: Enable query rejection feature, where the component return
3273+
# 503 to all incoming query requests when the configured thresholds are
3274+
# breached.
3275+
# CLI flag: -ingester.query-protection.rejection.enabled
3276+
[enabled: <boolean> | default = false]
3277+
3278+
threshold:
3279+
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
3280+
# rejecting new query request (across all tenants) in percentage, between
3281+
# 0 and 1. monitored_resources config must include the resource type. 0 to
3282+
# disable.
3283+
# CLI flag: -ingester.query-protection.rejection.threshold.cpu-utilization
3284+
[cpu_utilization: <float> | default = 0]
3285+
3286+
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
3287+
# rejecting new query request (across all tenants) in percentage, between
3288+
# 0 and 1. monitored_resources config must include the resource type. 0 to
3289+
# disable.
3290+
# CLI flag: -ingester.query-protection.rejection.threshold.heap-utilization
3291+
[heap_utilization: <float> | default = 0]
32833292
```
32843293

32853294
### `ingester_client_config`
@@ -5901,20 +5910,28 @@ sharding_ring:
59015910
# CLI flag: -store-gateway.disabled-tenants
59025911
[disabled_tenants: <string> | default = ""]
59035912
5904-
instance_limits:
5905-
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
5906-
# rejecting new query request (across all tenants) in percentage, between 0
5907-
# and 1. monitored_resources config must include the resource type. 0 to
5908-
# disable.
5909-
# CLI flag: -store-gateway.instance-limits.cpu-utilization
5910-
[cpu_utilization: <float> | default = 0]
5911-
5912-
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
5913-
# rejecting new query request (across all tenants) in percentage, between 0
5914-
# and 1. monitored_resources config must include the resource type. 0 to
5915-
# disable.
5916-
# CLI flag: -store-gateway.instance-limits.heap-utilization
5917-
[heap_utilization: <float> | default = 0]
5913+
query_protection:
5914+
rejection:
5915+
# EXPERIMENTAL: Enable query rejection feature, where the component return
5916+
# 503 to all incoming query requests when the configured thresholds are
5917+
# breached.
5918+
# CLI flag: -store-gateway.query-protection.rejection.enabled
5919+
[enabled: <boolean> | default = false]
5920+
5921+
threshold:
5922+
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
5923+
# rejecting new query request (across all tenants) in percentage, between
5924+
# 0 and 1. monitored_resources config must include the resource type. 0 to
5925+
# disable.
5926+
# CLI flag: -store-gateway.query-protection.rejection.threshold.cpu-utilization
5927+
[cpu_utilization: <float> | default = 0]
5928+
5929+
# EXPERIMENTAL: Max heap utilization that this ingester can reach before
5930+
# rejecting new query request (across all tenants) in percentage, between
5931+
# 0 and 1. monitored_resources config must include the resource type. 0 to
5932+
# disable.
5933+
# CLI flag: -store-gateway.query-protection.rejection.threshold.heap-utilization
5934+
[heap_utilization: <float> | default = 0]
59185935
59195936
hedged_request:
59205937
# If true, hedged requests are applied to object store calls. It can help with

docs/configuration/v1-guarantees.md

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,8 @@ Currently experimental features are:
123123
- Query-frontend: dynamic query splits
124124
- `querier.max-shards-per-query` (int) CLI flag
125125
- `querier.max-fetched-data-duration-per-query` (duration) CLI flag
126-
- Ingester/Store-Gateway: Resource-based throttling
127-
- `-ingester.instance-limits.cpu-utilization`
128-
- `-ingester.instance-limits.heap-utilization`
129-
- `-store-gateway.instance-limits.cpu-utilization`
130-
- `-store-gateway.instance-limits.heap-utilization`
126+
- Ingester/Store-Gateway: Query rejection
127+
- `-ingester.query-protection.rejection`
128+
- `-store-gateway.query-protection.rejection`
131129
- Distributor/Ingester: Stream push connection
132-
- Enable stream push connection between distributor and ingester by setting `-distributor.use-stream-push=true` on Distributor.
130+
- Enable stream push connection between distributor and ingester by setting `-distributor.use-stream-push=true` on Distributor.

docs/guides/protecting-cortex-from-heavy-queries.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,13 @@ For example, the following configuration will start throttling query requests if
4848
```
4949
target: ingester
5050
monitored_resources: cpu,heap
51-
instance_limits:
52-
cpu_utilization: 0.8
53-
heap_utilization: 0.8
51+
ingester:
52+
query_protection:
53+
rejection:
54+
enabled: true
55+
threshold:
56+
cpu_utilization: 0.8
57+
heap_utilization: 0.8
5458
```
5559

56-
See https://cortexmetrics.io/docs/configuration/configuration-file/:~:text=instance_limits for details.
60+
See https://cortexmetrics.io/docs/configuration/configuration-file/#:~:text=query_protection for details.

integration/resource_based_limiter_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ func Test_ResourceBasedLimiter_shouldStartWithoutError(t *testing.T) {
2929

3030
// Start Cortex components.
3131
ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), mergeFlags(flags, map[string]string{
32-
"-ingester.instance-limits.cpu-utilization": "0.8",
33-
"-ingester.instance-limits.heap-utilization": "0.8",
32+
"-ingester.query-protection.rejection.threshold.cpu-utilization": "0.8",
33+
"-ingester.query-protection.rejection.threshold.heap-utilization": "0.8",
3434
}), "")
3535
storeGateway := e2ecortex.NewStoreGateway("store-gateway", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), mergeFlags(flags, map[string]string{
36-
"-store-gateway.instance-limits.cpu-utilization": "0.8",
37-
"-store-gateway.instance-limits.heap-utilization": "0.8",
36+
"-store-gateway.query-protection.rejection.threshold.cpu-utilization": "0.8",
37+
"-store-gateway.query-protection.rejection.threshold.heap-utilization": "0.8",
3838
}), "")
3939
require.NoError(t, s.StartAndWaitReady(ingester, storeGateway))
4040
}

pkg/configs/instance_limits.go

Lines changed: 0 additions & 40 deletions
This file was deleted.

pkg/configs/query_protection.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package configs
2+
3+
import (
4+
"errors"
5+
"flag"
6+
"strings"
7+
8+
"github.com/cortexproject/cortex/pkg/util/flagext"
9+
"github.com/cortexproject/cortex/pkg/util/resource"
10+
)
11+
12+
// QueryProtection groups the settings that protect a component from query
// load. At present it only holds the query rejection settings.
type QueryProtection struct {
	// Rejection carries an explicit yaml tag so the key is declared the same
	// way as in the nested structs; the json tag is kept for compatibility.
	Rejection rejection `yaml:"rejection" json:"rejection"`
}

// rejection configures the query rejection feature: whether it is enabled
// and the resource utilization thresholds it uses.
type rejection struct {
	// Enabled turns query rejection on or off.
	Enabled bool `yaml:"enabled"`
	// Threshold holds the per-resource utilization thresholds.
	Threshold threshold `yaml:"threshold"`
}

// threshold holds the resource utilization thresholds, each expressed as a
// fraction between 0 and 1 (enforced by QueryProtection.Validate). Per the
// flag help text, 0 disables the corresponding threshold.
type threshold struct {
	// CPUUtilization is the CPU utilization threshold.
	CPUUtilization float64 `yaml:"cpu_utilization"`
	// HeapUtilization is the heap utilization threshold.
	HeapUtilization float64 `yaml:"heap_utilization"`
}
25+
26+
func (cfg *QueryProtection) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
27+
f.BoolVar(&cfg.Rejection.Enabled, prefix+"query-protection.rejection.enabled", false, "EXPERIMENTAL: Enable query rejection feature, where the component return 503 to all incoming query requests when the configured thresholds are breached.")
28+
f.Float64Var(&cfg.Rejection.Threshold.CPUUtilization, prefix+"query-protection.rejection.threshold.cpu-utilization", 0, "EXPERIMENTAL: Max CPU utilization that this ingester can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
29+
f.Float64Var(&cfg.Rejection.Threshold.HeapUtilization, prefix+"query-protection.rejection.threshold.heap-utilization", 0, "EXPERIMENTAL: Max heap utilization that this ingester can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
30+
}
31+
32+
func (cfg *QueryProtection) Validate(monitoredResources flagext.StringSliceCSV) error {
33+
thresholdCfg := cfg.Rejection.Threshold
34+
if thresholdCfg.CPUUtilization > 1 || thresholdCfg.CPUUtilization < 0 {
35+
return errors.New("cpu_utilization must be between 0 and 1")
36+
}
37+
38+
if thresholdCfg.CPUUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.CPU)) {
39+
return errors.New("monitored_resources config must include \"cpu\" as well")
40+
}
41+
42+
if thresholdCfg.HeapUtilization > 1 || thresholdCfg.HeapUtilization < 0 {
43+
return errors.New("heap_utilization must be between 0 and 1")
44+
}
45+
46+
if thresholdCfg.HeapUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.Heap)) {
47+
return errors.New("monitored_resources config must include \"heap\" as well")
48+
}
49+
50+
return nil
51+
}

pkg/configs/instance_limits_test.go renamed to pkg/configs/query_protection_test.go

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,51 +9,71 @@ import (
99

1010
func Test_Validate(t *testing.T) {
1111
for name, tc := range map[string]struct {
12-
instanceLimits InstanceLimits
12+
queryProtection QueryProtection
1313
monitoredResources []string
1414
err error
1515
}{
1616
"correct config should pass validation": {
17-
instanceLimits: InstanceLimits{
18-
CPUUtilization: 0.5,
19-
HeapUtilization: 0.5,
17+
queryProtection: QueryProtection{
18+
Rejection: rejection{
19+
Threshold: threshold{
20+
CPUUtilization: 0.5,
21+
HeapUtilization: 0.5,
22+
},
23+
},
2024
},
2125
monitoredResources: []string{"cpu", "heap"},
2226
err: nil,
2327
},
2428
"utilization config less than 0 should fail validation": {
25-
instanceLimits: InstanceLimits{
26-
CPUUtilization: -0.5,
27-
HeapUtilization: 0.5,
29+
queryProtection: QueryProtection{
30+
Rejection: rejection{
31+
Threshold: threshold{
32+
CPUUtilization: -0.5,
33+
HeapUtilization: 0.5,
34+
},
35+
},
2836
},
2937
monitoredResources: []string{"cpu", "heap"},
3038
err: errors.New("cpu_utilization must be between 0 and 1"),
3139
},
3240
"utilization config greater than 1 should fail validation": {
33-
instanceLimits: InstanceLimits{
34-
CPUUtilization: 0.5,
35-
HeapUtilization: 1.5,
41+
queryProtection: QueryProtection{
42+
Rejection: rejection{
43+
Threshold: threshold{
44+
CPUUtilization: 0.5,
45+
HeapUtilization: 1.5,
46+
},
47+
},
3648
},
3749
monitoredResources: []string{"cpu", "heap"},
3850
err: errors.New("heap_utilization must be between 0 and 1"),
3951
},
4052
"missing cpu in monitored_resources config should fail validation": {
41-
instanceLimits: InstanceLimits{
42-
CPUUtilization: 0.5,
53+
queryProtection: QueryProtection{
54+
Rejection: rejection{
55+
Threshold: threshold{
56+
CPUUtilization: 0.5,
57+
},
58+
},
4359
},
4460
monitoredResources: []string{"heap"},
4561
err: errors.New("monitored_resources config must include \"cpu\" as well"),
4662
},
4763
"missing heap in monitored_resources config should fail validation": {
48-
instanceLimits: InstanceLimits{
49-
HeapUtilization: 0.5,
64+
queryProtection: QueryProtection{
65+
Rejection: rejection{
66+
Threshold: threshold{
67+
HeapUtilization: 0.5,
68+
},
69+
},
5070
},
5171
monitoredResources: []string{"cpu"},
5272
err: errors.New("monitored_resources config must include \"heap\" as well"),
5373
},
5474
} {
5575
t.Run(name, func(t *testing.T) {
56-
err := tc.instanceLimits.Validate(tc.monitoredResources)
76+
err := tc.queryProtection.Validate(tc.monitoredResources)
5777
if tc.err != nil {
5878
require.Errorf(t, err, tc.err.Error())
5979
} else {

0 commit comments

Comments
 (0)