diff --git a/CHANGELOG.md b/CHANGELOG.md index a05ea36636b..fc5d7b616e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ * [CHANGE] Distributor: Replace `-distributor.retry-after-header.max-backoff-exponent` and `-distributor.retry-after-header.base-seconds` with `-distributor.retry-after-header.min-backoff` and `-distributor.retry-after-header.max-backoff` for easier configuration. #8694 * [CHANGE] Ingester: increase the default inactivity timeout of active series (`-ingester.active-series-metrics-idle-timeout`) from `10m` to `20m`. #8975 * [CHANGE] Distributor: Remove `-distributor.enable-otlp-metadata-storage` flag, which was deprecated in version 2.12. #9069 +* [CHANGE] Ruler: Remove the experimental `-ruler.drain-notification-queue-on-shutdown` option. The ruler now always drains outstanding alert notifications when shutting down. #9115 * [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #8422 #8430 #8454 #8455 #8360 #8490 #8508 #8577 #8660 #8671 #8677 #8747 #8850 #8872 #8838 #8911 #8909 #8923 #8924 #8925 #8932 #8933 #8934 #8962 #8986 #8993 #8995 #9017 #9018 #9008 #9120 #9121 * [FEATURE] Experimental Kafka-based ingest storage. 
#6888 #6894 #6929 #6940 #6951 #6974 #6982 #7029 #7030 #7091 #7142 #7147 #7148 #7153 #7160 #7193 #7349 #7376 #7388 #7391 #7393 #7394 #7402 #7404 #7423 #7424 #7437 #7486 #7503 #7508 #7540 #7621 #7682 #7685 #7694 #7695 #7696 #7697 #7701 #7733 #7734 #7741 #7752 #7838 #7851 #7871 #7877 #7880 #7882 #7887 #7891 #7925 #7955 #7967 #8031 #8063 #8077 #8088 #8135 #8176 #8184 #8194 #8216 #8217 #8222 #8233 #8503 #8542 #8579 #8657 #8686 #8688 #8703 #8706 #8708 #8738 #8750 #8778 #8808 #8809 #8841 #8842 #8845 #8853 #8886 #8988 * What it is: diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index dbd5bfaaaea..14cc5fe9463 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -11804,17 +11804,6 @@ "fieldValue": null, "fieldDefaultValue": null }, - { - "kind": "field", - "name": "drain_notification_queue_on_shutdown", - "required": false, - "desc": "Drain all outstanding alert notifications when shutting down. If false, any outstanding alert notifications are dropped when shutting down.", - "fieldValue": null, - "fieldDefaultValue": false, - "fieldFlag": "ruler.drain-notification-queue-on-shutdown", - "fieldType": "boolean", - "fieldCategory": "experimental" - }, { "kind": "field", "name": "for_outage_tolerance", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index e56162f4b03..31075017563 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2623,8 +2623,6 @@ Usage of ./cmd/mimir/mimir: Override the expected name on the server certificate. -ruler.disabled-tenants comma-separated-list-of-strings Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding. - -ruler.drain-notification-queue-on-shutdown - [experimental] Drain all outstanding alert notifications when shutting down. 
If false, any outstanding alert notifications are dropped when shutting down. -ruler.enable-api Enable the ruler config API. (default true) -ruler.enabled-tenants comma-separated-list-of-strings diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 3eacb471196..7a57d69a4dc 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -1964,11 +1964,6 @@ alertmanager_client: # CLI flag: -ruler.alertmanager-client.basic-auth-password [basic_auth_password: <string> | default = ""] -# (experimental) Drain all outstanding alert notifications when shutting down. -# If false, any outstanding alert notifications are dropped when shutting down. -# CLI flag: -ruler.drain-notification-queue-on-shutdown -[drain_notification_queue_on_shutdown: <boolean> | default = false] - # (advanced) Max time to tolerate outage for restoring "for" state of alert. 
# CLI flag: -ruler.for-outage-tolerance [for_outage_tolerance: <duration> | default = 1h] diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 40d304ee64a..c02fc40c834 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -299,7 +299,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie var err error if n, err = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, - DrainOnShutdown: r.cfg.DrainNotificationQueueOnShutdown, + DrainOnShutdown: true, Registerer: reg, Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) { // Note: The passed-in context comes from the Prometheus notifier diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index fd73713f589..ac0766e84f5 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -303,11 +303,10 @@ func TestDefaultMultiTenantManager_WaitsToDrainPendingNotificationsOnShutdown(t user1Group1 := createRuleGroup("group-1", user, createRecordingRule("count:metric_1", "count(metric_1)")) cfg := Config{ - RulePath: t.TempDir(), - AlertmanagerURL: server.URL, - NotificationQueueCapacity: 1000, - NotificationTimeout: 10 * time.Second, - DrainNotificationQueueOnShutdown: true, + RulePath: t.TempDir(), + AlertmanagerURL: server.URL, + NotificationQueueCapacity: 1000, + NotificationTimeout: 10 * time.Second, } m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil) require.NoError(t, err) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 482802966f4..c527fcf67e2 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -110,8 +110,6 @@ type Config struct { NotificationTimeout time.Duration `yaml:"notification_timeout" category:"advanced"` // Client configs for interacting with the Alertmanager Notifier NotifierConfig `yaml:"alertmanager_client"` - // Enable draining the pending alert notification queue when shutting down. 
- DrainNotificationQueueOnShutdown bool `yaml:"drain_notification_queue_on_shutdown" category:"experimental"` // Max time to tolerate outage for restoring "for" state of alert. OutageTolerance time.Duration `yaml:"for_outage_tolerance" category:"advanced"` @@ -182,7 +180,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.") f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.") f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.") - f.BoolVar(&cfg.DrainNotificationQueueOnShutdown, "ruler.drain-notification-queue-on-shutdown", false, "Drain all outstanding alert notifications when shutting down. If false, any outstanding alert notifications are dropped when shutting down.") f.StringVar(&cfg.RulePath, "ruler.rule-path", "./data-ruler/", "Directory to store temporary rule files loaded by the Prometheus rule managers. This directory is not required to be persisted between restarts.") f.BoolVar(&cfg.EnableAPI, "ruler.enable-api", true, "Enable the ruler config API.")