Skip to content

Commit 2811849

Browse files
authored
ResourceMonitor: Change configs (#6782)
* Update resource monitor config Signed-off-by: Justin Jung <[email protected]> * Doc Signed-off-by: Justin Jung <[email protected]> * Apply configs to NewMonitor Signed-off-by: Justin Jung <[email protected]> * Changelog Signed-off-by: Justin Jung <[email protected]> * Nit Signed-off-by: Justin Jung <[email protected]> * Revert "Changelog" This reverts commit 7126c59. Signed-off-by: Justin Jung <[email protected]> * Nit - config description Signed-off-by: Justin Jung <[email protected]> * Nit Signed-off-by: Justin Jung <[email protected]> --------- Signed-off-by: Justin Jung <[email protected]>
1 parent a4ebd34 commit 2811849

File tree

9 files changed

+130
-47
lines changed

9 files changed

+130
-47
lines changed

docs/configuration/config-file-reference.md

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,21 @@ Where default_value is the value to use if the environment variable is undefined
6868
# CLI flag: -http.prefix
6969
[http_prefix: <string> | default = "/api/prom"]
7070

71-
# Comma-separated list of resources to monitor. Supported values are cpu and
72-
# heap, which tracks metrics from github.com/prometheus/procfs and
73-
# runtime/metrics that are close estimates. Empty string to disable.
74-
# CLI flag: -monitored.resources
75-
[monitored_resources: <string> | default = ""]
71+
resource_monitor:
72+
# Comma-separated list of resources to monitor. Supported values are cpu and
73+
# heap, which tracks metrics from github.com/prometheus/procfs and
74+
# runtime/metrics that are close estimates. Empty string to disable.
75+
# CLI flag: -resource-monitor.resources
76+
[resources: <string> | default = ""]
77+
78+
# Update interval of resource monitor. Must be greater than 0.
79+
# CLI flag: -resource-monitor.interval
80+
[interval: <duration> | default = 100ms]
81+
82+
# Interval to calculate average CPU rate. Must be greater than resource
83+
# monitor interval.
84+
# CLI flag: -resource-monitor.cpu-rate-interval
85+
[cpu_rate_interval: <duration> | default = 1m]
7686

7787
api:
7888
# Use GZIP compression for API responses. Some endpoints serve large YAML or

integration/resource_based_limiter_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ func Test_ResourceBasedLimiter_shouldStartWithoutError(t *testing.T) {
1919
defer s.Close()
2020

2121
flags := mergeFlags(BlocksStorageFlags(), map[string]string{
22-
"-monitored.resources": "cpu,heap",
22+
"-resource-monitor.resources": "cpu,heap",
2323
})
2424

2525
// Start dependencies.

pkg/configs/resource_monitor.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package configs
2+
3+
import (
4+
"flag"
5+
"fmt"
6+
"time"
7+
8+
"github.com/cortexproject/cortex/pkg/util/flagext"
9+
"github.com/cortexproject/cortex/pkg/util/resource"
10+
)
11+
12+
type ResourceMonitor struct {
13+
Resources flagext.StringSliceCSV `yaml:"resources"`
14+
Interval time.Duration `yaml:"interval"`
15+
CPURateInterval time.Duration `yaml:"cpu_rate_interval"`
16+
}
17+
18+
func (cfg *ResourceMonitor) RegisterFlags(f *flag.FlagSet) {
19+
cfg.Resources = []string{}
20+
21+
f.Var(&cfg.Resources, "resource-monitor.resources", "Comma-separated list of resources to monitor. "+
22+
"Supported values are cpu and heap, which tracks metrics from github.com/prometheus/procfs and runtime/metrics "+
23+
"that are close estimates. Empty string to disable.")
24+
f.DurationVar(&cfg.Interval, "resource-monitor.interval", 100*time.Millisecond, "Update interval of resource monitor. Must be greater than 0.")
25+
f.DurationVar(&cfg.CPURateInterval, "resource-monitor.cpu-rate-interval", time.Minute, "Interval to calculate average CPU rate. Must be greater than resource monitor interval.")
26+
}
27+
28+
func (cfg *ResourceMonitor) Validate() error {
29+
for _, r := range cfg.Resources {
30+
switch resource.Type(r) {
31+
case resource.CPU, resource.Heap:
32+
default:
33+
if len(r) > 0 {
34+
return fmt.Errorf("unsupported resource type to monitor: %s", r)
35+
}
36+
}
37+
}
38+
39+
if cfg.Interval <= 0 {
40+
return fmt.Errorf("resource monitor interval must be greater than zero")
41+
}
42+
43+
if cfg.CPURateInterval < cfg.Interval {
44+
return fmt.Errorf("resource monitor cpu rate interval cannot be smaller than resource monitor interval")
45+
}
46+
47+
return nil
48+
}

pkg/cortex/cortex.go

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,11 @@ var (
9090

9191
// Config is the root config for Cortex.
9292
type Config struct {
93-
Target flagext.StringSliceCSV `yaml:"target"`
94-
AuthEnabled bool `yaml:"auth_enabled"`
95-
PrintConfig bool `yaml:"-"`
96-
HTTPPrefix string `yaml:"http_prefix"`
97-
MonitoredResources flagext.StringSliceCSV `yaml:"monitored_resources"`
93+
Target flagext.StringSliceCSV `yaml:"target"`
94+
AuthEnabled bool `yaml:"auth_enabled"`
95+
PrintConfig bool `yaml:"-"`
96+
HTTPPrefix string `yaml:"http_prefix"`
97+
ResourceMonitor configs.ResourceMonitor `yaml:"resource_monitor"`
9898

9999
ExternalQueryable prom_storage.Queryable `yaml:"-"`
100100
ExternalPusher ruler.Pusher `yaml:"-"`
@@ -147,11 +147,6 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) {
147147
f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.")
148148
f.StringVar(&c.HTTPPrefix, "http.prefix", "/api/prom", "HTTP path prefix for Cortex API.")
149149

150-
c.MonitoredResources = []string{}
151-
f.Var(&c.MonitoredResources, "monitored.resources", "Comma-separated list of resources to monitor. "+
152-
"Supported values are cpu and heap, which tracks metrics from github.com/prometheus/procfs and runtime/metrics "+
153-
"that are close estimates. Empty string to disable.")
154-
155150
c.API.RegisterFlags(f)
156151
c.registerServerFlagsWithChangedDefaultValues(f)
157152
c.Distributor.RegisterFlags(f)
@@ -170,6 +165,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) {
170165
c.ParquetConverter.RegisterFlags(f)
171166
c.StoreGateway.RegisterFlags(f)
172167
c.TenantFederation.RegisterFlags(f)
168+
c.ResourceMonitor.RegisterFlags(f)
173169

174170
c.Ruler.RegisterFlags(f)
175171
c.RulerStorage.RegisterFlags(f)
@@ -211,6 +207,9 @@ func (c *Config) Validate(log log.Logger) error {
211207
if err := c.LimitsConfig.Validate(c.Distributor.ShardByAllLabels); err != nil {
212208
return errors.Wrap(err, "invalid limits config")
213209
}
210+
if err := c.ResourceMonitor.Validate(); err != nil {
211+
return errors.Wrap(err, "invalid resource-monitor config")
212+
}
214213
if err := c.Distributor.Validate(c.LimitsConfig); err != nil {
215214
return errors.Wrap(err, "invalid distributor config")
216215
}
@@ -226,7 +225,7 @@ func (c *Config) Validate(log log.Logger) error {
226225
if err := c.QueryRange.Validate(c.Querier); err != nil {
227226
return errors.Wrap(err, "invalid query_range config")
228227
}
229-
if err := c.StoreGateway.Validate(c.LimitsConfig, c.MonitoredResources); err != nil {
228+
if err := c.StoreGateway.Validate(c.LimitsConfig, c.ResourceMonitor.Resources); err != nil {
230229
return errors.Wrap(err, "invalid store-gateway config")
231230
}
232231
if err := c.Compactor.Validate(c.LimitsConfig); err != nil {
@@ -239,24 +238,14 @@ func (c *Config) Validate(log log.Logger) error {
239238
return errors.Wrap(err, "invalid alertmanager config")
240239
}
241240

242-
if err := c.Ingester.Validate(c.MonitoredResources); err != nil {
241+
if err := c.Ingester.Validate(c.ResourceMonitor.Resources); err != nil {
243242
return errors.Wrap(err, "invalid ingester config")
244243
}
245244

246245
if err := c.Tracing.Validate(); err != nil {
247246
return errors.Wrap(err, "invalid tracing config")
248247
}
249248

250-
for _, r := range c.MonitoredResources {
251-
switch resource.Type(r) {
252-
case resource.CPU, resource.Heap:
253-
default:
254-
if len(r) > 0 {
255-
return fmt.Errorf("unsupported resource type to monitor: %s", r)
256-
}
257-
}
258-
}
259-
260249
return nil
261250
}
262251

pkg/cortex/cortex_test.go

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"github.com/cortexproject/cortex/pkg/alertmanager"
2525
"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
26+
"github.com/cortexproject/cortex/pkg/configs"
2627
"github.com/cortexproject/cortex/pkg/cortex/storage"
2728
"github.com/cortexproject/cortex/pkg/frontend/v1/frontendv1pb"
2829
"github.com/cortexproject/cortex/pkg/ingester"
@@ -170,16 +171,45 @@ func TestConfigValidation(t *testing.T) {
170171
name: "should fail validation for invalid resource to monitor",
171172
getTestConfig: func() *Config {
172173
configuration := newDefaultConfig()
173-
configuration.MonitoredResources = []string{"wrong"}
174+
configuration.ResourceMonitor = configs.ResourceMonitor{
175+
Resources: []string{"wrong"},
176+
}
174177
return configuration
175178
},
176179
expectedError: fmt.Errorf("unsupported resource type to monitor: %s", "wrong"),
177180
},
181+
{
182+
name: "should fail validation for invalid resource to monitor - 2",
183+
getTestConfig: func() *Config {
184+
configuration := newDefaultConfig()
185+
configuration.ResourceMonitor = configs.ResourceMonitor{
186+
Interval: -1,
187+
}
188+
return configuration
189+
},
190+
expectedError: fmt.Errorf("resource monitor interval must be greater than zero"),
191+
},
192+
{
193+
name: "should fail validation for invalid resource to monitor - 3",
194+
getTestConfig: func() *Config {
195+
configuration := newDefaultConfig()
196+
configuration.ResourceMonitor = configs.ResourceMonitor{
197+
Interval: time.Second,
198+
CPURateInterval: time.Millisecond,
199+
}
200+
return configuration
201+
},
202+
expectedError: fmt.Errorf("resource monitor cpu rate interval cannot be smaller than resource monitor interval"),
203+
},
178204
{
179205
name: "should not fail validation for valid resources to monitor",
180206
getTestConfig: func() *Config {
181207
configuration := newDefaultConfig()
182-
configuration.MonitoredResources = []string{"cpu", "heap"}
208+
configuration.ResourceMonitor = configs.ResourceMonitor{
209+
Resources: []string{"cpu", "heap"},
210+
Interval: time.Second,
211+
CPURateInterval: time.Minute,
212+
}
183213
return configuration
184214
},
185215
expectedError: nil,

pkg/cortex/modules.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -786,14 +786,14 @@ func (t *Cortex) initQueryScheduler() (services.Service, error) {
786786
}
787787

788788
func (t *Cortex) initResourceMonitor() (services.Service, error) {
789-
if t.Cfg.MonitoredResources.String() == "" || len(t.Cfg.MonitoredResources) == 0 {
789+
if t.Cfg.ResourceMonitor.Resources.String() == "" || len(t.Cfg.ResourceMonitor.Resources) == 0 {
790790
return nil, nil
791791
}
792792

793-
util_log.WarnExperimentalUse(fmt.Sprintf("resource monitor for [%s]", t.Cfg.MonitoredResources.String()))
793+
util_log.WarnExperimentalUse(fmt.Sprintf("resource monitor for [%s]", t.Cfg.ResourceMonitor.Resources.String()))
794794

795795
containerLimits := make(map[resource.Type]float64)
796-
for _, res := range t.Cfg.MonitoredResources {
796+
for _, res := range t.Cfg.ResourceMonitor.Resources {
797797
switch resource.Type(res) {
798798
case resource.CPU:
799799
containerLimits[resource.Type(res)] = float64(runtime.GOMAXPROCS(0))
@@ -805,7 +805,7 @@ func (t *Cortex) initResourceMonitor() (services.Service, error) {
805805
}
806806

807807
var err error
808-
t.ResourceMonitor, err = resource.NewMonitor(containerLimits, prometheus.DefaultRegisterer)
808+
t.ResourceMonitor, err = resource.NewMonitor(containerLimits, t.Cfg.ResourceMonitor.Interval, t.Cfg.ResourceMonitor.CPURateInterval, prometheus.DefaultRegisterer)
809809
return t.ResourceMonitor, err
810810
}
811811

pkg/cortex/modules_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/stretchr/testify/require"
1313
"github.com/weaveworks/common/server"
1414

15+
"github.com/cortexproject/cortex/pkg/configs"
1516
"github.com/cortexproject/cortex/pkg/cortexpb"
1617
)
1718

@@ -237,7 +238,9 @@ func Test_initResourceMonitor_shouldFailOnInvalidResource(t *testing.T) {
237238
cortex := &Cortex{
238239
Server: &server.Server{},
239240
Cfg: Config{
240-
MonitoredResources: []string{"invalid"},
241+
ResourceMonitor: configs.ResourceMonitor{
242+
Resources: []string{"invalid"},
243+
},
241244
},
242245
}
243246

pkg/util/resource/monitor.go

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ import (
1515
const (
1616
CPU Type = "cpu"
1717
Heap Type = "heap"
18-
19-
monitorInterval = 100 * time.Millisecond
20-
dataPointsToAvg = 50
2118
)
2219

2320
type Type string
@@ -33,31 +30,32 @@ type Monitor struct {
3330
scanners map[Type]scanner
3431
containerLimit map[Type]float64
3532
utilization map[Type]float64
33+
interval time.Duration
3634

3735
// Variables to calculate average CPU utilization
3836
index int
39-
cpuRates [dataPointsToAvg]float64
40-
cpuIntervals [dataPointsToAvg]float64
37+
cpuRates []float64
38+
cpuIntervals []float64
4139
totalCPU float64
4240
totalInterval float64
4341
lastCPU float64
4442
lastUpdate time.Time
43+
cpuDataPoints int
4544

4645
lock sync.RWMutex
4746
}
4847

49-
func NewMonitor(limits map[Type]float64, registerer prometheus.Registerer) (*Monitor, error) {
48+
func NewMonitor(limits map[Type]float64, interval, cpuRateInterval time.Duration, registerer prometheus.Registerer) (*Monitor, error) {
5049
m := &Monitor{
5150
containerLimit: limits,
5251
scanners: make(map[Type]scanner),
5352
utilization: make(map[Type]float64),
54-
55-
cpuRates: [dataPointsToAvg]float64{},
56-
cpuIntervals: [dataPointsToAvg]float64{},
53+
interval: interval,
5754

5855
lock: sync.RWMutex{},
5956
}
6057

58+
m.interval = interval
6159
m.Service = services.NewBasicService(nil, m.running, nil)
6260

6361
for resType, limit := range limits {
@@ -68,6 +66,10 @@ func NewMonitor(limits map[Type]float64, registerer prometheus.Registerer) (*Mon
6866
case CPU:
6967
scannerFunc = newCPUScanner
7068
gaugeFunc = m.GetCPUUtilization
69+
70+
m.cpuDataPoints = int(cpuRateInterval.Nanoseconds() / interval.Nanoseconds())
71+
m.cpuRates = make([]float64, m.cpuDataPoints)
72+
m.cpuIntervals = make([]float64, m.cpuDataPoints)
7173
case Heap:
7274
scannerFunc = newHeapScanner
7375
gaugeFunc = m.GetHeapUtilization
@@ -92,7 +94,7 @@ func NewMonitor(limits map[Type]float64, registerer prometheus.Registerer) (*Mon
9294
}
9395

9496
func (m *Monitor) running(ctx context.Context) error {
95-
ticker := time.NewTicker(monitorInterval)
97+
ticker := time.NewTicker(m.interval)
9698
defer ticker.Stop()
9799

98100
for {
@@ -141,7 +143,7 @@ func (m *Monitor) storeCPUUtilization(cpuTime float64) {
141143

142144
m.lastCPU = cpuTime
143145
m.lastUpdate = now
144-
m.index = (m.index + 1) % dataPointsToAvg
146+
m.index = (m.index + 1) % m.cpuDataPoints
145147

146148
if m.totalInterval > 0 && m.containerLimit[CPU] > 0 {
147149
m.utilization[CPU] = m.totalCPU / m.totalInterval / m.containerLimit[CPU]

pkg/util/resource/monitor_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@ package resource
22

33
import (
44
"testing"
5+
"time"
56

67
"github.com/prometheus/client_golang/prometheus"
78
"github.com/stretchr/testify/require"
89
)
910

1011
func Test_Monitor(t *testing.T) {
11-
m, err := NewMonitor(map[Type]float64{}, prometheus.DefaultRegisterer)
12+
m, err := NewMonitor(map[Type]float64{}, time.Second, time.Minute, prometheus.DefaultRegisterer)
1213

1314
m.scanners[CPU] = &noopScanner{}
1415
m.containerLimit[CPU] = 1

0 commit comments

Comments
 (0)