diff --git a/charts/tempo-mixin/Chart.yaml b/charts/tempo-mixin/Chart.yaml index c39b051d..605479c4 100644 --- a/charts/tempo-mixin/Chart.yaml +++ b/charts/tempo-mixin/Chart.yaml @@ -27,7 +27,7 @@ keywords: - tempo - monitoring-mixin - portefaix -version: 1.4.0 +version: 1.4.1 appVersion: 2.0.0 maintainers: @@ -51,5 +51,5 @@ annotations: fingerprint: C39918B3EBDE35C23B8D0B8E5F99269A6FCA437C url: https://keybase.io/nlamirault/pgp_keys.asc artifacthub.io/changes: | - - kind: changed - description: Includes additionalLabels and additionalAnnotations on configmaps + - kind: fixed + description: YAML indentation diff --git a/charts/tempo-mixin/templates/alerts.yaml b/charts/tempo-mixin/templates/alerts.yaml index d66c2236..ee2573ac 100644 --- a/charts/tempo-mixin/templates/alerts.yaml +++ b/charts/tempo-mixin/templates/alerts.yaml @@ -13,145 +13,145 @@ metadata: {{- end }} spec: groups: - - name: tempo_alerts - rules: - - alert: TempoRequestLatency - annotations: - message: | - {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestLatency - expr: | - cluster_namespace_job_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|debug_pprof"} > 3 - for: 15m - labels: - severity: critical - - alert: TempoCompactorUnhealthy - annotations: - message: There are {{`{{`}} printf "%f" $value {{`}}`}} unhealthy compactor(s). - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy - expr: | - max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="compactor", namespace=~".*"}) > 0 - for: 15m - labels: - severity: critical - - alert: TempoDistributorUnhealthy - annotations: - message: There are {{`{{`}} printf "%f" $value {{`}}`}} unhealthy distributor(s). - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy - expr: | - max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="distributor", namespace=~".*"}) > 0 - for: 15m - labels: - severity: warning - - alert: TempoCompactionsFailing - annotations: - message: Greater than 2 compactions have failed in the past hour. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactionsFailing - expr: | - sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[1h])) > 2 and - sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[5m])) > 0 - for: 5m - labels: - severity: critical - - alert: TempoIngesterFlushesUnhealthy - annotations: - message: Greater than 2 flush retries have occurred in the past hour. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing - expr: | - sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[1h])) > 2 and - sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0 - for: 5m - labels: - severity: warning - - alert: TempoIngesterFlushesFailing - annotations: - message: Greater than 2 flush retries have failed in the past hour. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing - expr: | - sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > 2 and - sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0 - for: 5m - labels: - severity: critical - - alert: TempoPollsFailing - annotations: - message: Greater than 2 polls have failed in the past hour. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPollsFailing - expr: | - sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[1h])) > 2 and - sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[5m])) > 0 - labels: - severity: critical - - alert: TempoTenantIndexFailures - annotations: - message: Greater than 2 tenant index failures in the past hour. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures - expr: | - sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[1h])) > 2 and - sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[5m])) > 0 - labels: - severity: critical - - alert: TempoNoTenantIndexBuilders - annotations: - message: No tenant index builders for tenant {{`{{`}} $labels.tenant {{`}}`}}. Tenant index will quickly become stale. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders - expr: | - sum by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_builder{}) == 0 and - max by (cluster, namespace) (tempodb_blocklist_length{}) > 0 - for: 5m - labels: - severity: critical - - alert: TempoTenantIndexTooOld - annotations: - message: Tenant index age is 600 seconds old for tenant {{`{{`}} $labels.tenant {{`}}`}}. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld - expr: | - max by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_age_seconds{}) > 600 - for: 5m - labels: - severity: critical - - alert: TempoBadOverrides - annotations: - message: '{{`{{`}} $labels.job {{`}}`}} failed to reload overrides.' - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoBadOverrides - expr: | - sum(tempo_runtime_config_last_reload_successful{namespace=~".*"} == 0) by (cluster, namespace, job) - for: 15m - labels: - severity: warning - - alert: TempoProvisioningTooManyWrites - annotations: - message: Ingesters in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} are receiving more data/second than desired, add more ingesters. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoProvisioningTooManyWrites - expr: | - avg by (cluster, namespace) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[1m])) / 1024 / 1024 > 30 - for: 15m - labels: - severity: warning - - alert: TempoCompactorsTooManyOutstandingBlocks - annotations: - message: There are too many outstanding compaction blocks in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} for tenant {{`{{`}} $labels.tenant {{`}}`}}, increase compactor's CPU or add more compactors. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorsTooManyOutstandingBlocks - expr: | - sum by (cluster, namespace, tenant) (tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}) / ignoring(tenant) group_left count(tempo_build_info{container="compactor", namespace=~".*"}) by (cluster, namespace) > 100 - for: 6h - labels: - severity: warning - - alert: TempoCompactorsTooManyOutstandingBlocks - annotations: - message: There are too many outstanding compaction blocks in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} for tenant {{`{{`}} $labels.tenant {{`}}`}}, increase compactor's CPU or add more compactors. - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorsTooManyOutstandingBlocks - expr: | - sum by (cluster, namespace, tenant) (tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}) / ignoring(tenant) group_left count(tempo_build_info{container="compactor", namespace=~".*"}) by (cluster, namespace) > 250 - for: 24h - labels: - severity: critical - - alert: TempoIngesterReplayErrors - annotations: - message: Tempo ingester has encountered errors while replaying a block on startup in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} for tenant {{`{{`}} $labels.tenant {{`}}`}} - runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors - expr: | - sum by (cluster, namespace, tenant) (increase(tempo_ingester_replay_errors_total{namespace=~".*"}[5m])) > 0 - for: 5m - labels: - severity: critical + - name: tempo_alerts + rules: + - alert: TempoRequestLatency + annotations: + message: | + {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestLatency + expr: | + cluster_namespace_job_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|debug_pprof"} > 3 + for: 15m + labels: + severity: critical + - alert: TempoCompactorUnhealthy + annotations: + message: There are {{`{{`}} printf "%f" $value {{`}}`}} unhealthy compactor(s). + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy + expr: | + max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="compactor", namespace=~".*"}) > 0 + for: 15m + labels: + severity: critical + - alert: TempoDistributorUnhealthy + annotations: + message: There are {{`{{`}} printf "%f" $value {{`}}`}} unhealthy distributor(s). + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy + expr: | + max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="distributor", namespace=~".*"}) > 0 + for: 15m + labels: + severity: warning + - alert: TempoCompactionsFailing + annotations: + message: Greater than 2 compactions have failed in the past hour. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactionsFailing + expr: | + sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[1h])) > 2 and + sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[5m])) > 0 + for: 5m + labels: + severity: critical + - alert: TempoIngesterFlushesUnhealthy + annotations: + message: Greater than 2 flush retries have occurred in the past hour. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing + expr: | + sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[1h])) > 2 and + sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0 + for: 5m + labels: + severity: warning + - alert: TempoIngesterFlushesFailing + annotations: + message: Greater than 2 flush retries have failed in the past hour. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing + expr: | + sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > 2 and + sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0 + for: 5m + labels: + severity: critical + - alert: TempoPollsFailing + annotations: + message: Greater than 2 polls have failed in the past hour. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPollsFailing + expr: | + sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[1h])) > 2 and + sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[5m])) > 0 + labels: + severity: critical + - alert: TempoTenantIndexFailures + annotations: + message: Greater than 2 tenant index failures in the past hour. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures + expr: | + sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[1h])) > 2 and + sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[5m])) > 0 + labels: + severity: critical + - alert: TempoNoTenantIndexBuilders + annotations: + message: No tenant index builders for tenant {{`{{`}} $labels.tenant {{`}}`}}. Tenant index will quickly become stale. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders + expr: | + sum by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_builder{}) == 0 and + max by (cluster, namespace) (tempodb_blocklist_length{}) > 0 + for: 5m + labels: + severity: critical + - alert: TempoTenantIndexTooOld + annotations: + message: Tenant index age is 600 seconds old for tenant {{`{{`}} $labels.tenant {{`}}`}}. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld + expr: | + max by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_age_seconds{}) > 600 + for: 5m + labels: + severity: critical + - alert: TempoBadOverrides + annotations: + message: '{{`{{`}} $labels.job {{`}}`}} failed to reload overrides.' + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoBadOverrides + expr: | + sum(tempo_runtime_config_last_reload_successful{namespace=~".*"} == 0) by (cluster, namespace, job) + for: 15m + labels: + severity: warning + - alert: TempoProvisioningTooManyWrites + annotations: + message: Ingesters in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} are receiving more data/second than desired, add more ingesters. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoProvisioningTooManyWrites + expr: | + avg by (cluster, namespace) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[1m])) / 1024 / 1024 > 30 + for: 15m + labels: + severity: warning + - alert: TempoCompactorsTooManyOutstandingBlocks + annotations: + message: There are too many outstanding compaction blocks in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} for tenant {{`{{`}} $labels.tenant {{`}}`}}, increase compactor's CPU or add more compactors. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorsTooManyOutstandingBlocks + expr: | + sum by (cluster, namespace, tenant) (tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}) / ignoring(tenant) group_left count(tempo_build_info{container="compactor", namespace=~".*"}) by (cluster, namespace) > 100 + for: 6h + labels: + severity: warning + - alert: TempoCompactorsTooManyOutstandingBlocks + annotations: + message: There are too many outstanding compaction blocks in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} for tenant {{`{{`}} $labels.tenant {{`}}`}}, increase compactor's CPU or add more compactors. + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorsTooManyOutstandingBlocks + expr: | + sum by (cluster, namespace, tenant) (tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}) / ignoring(tenant) group_left count(tempo_build_info{container="compactor", namespace=~".*"}) by (cluster, namespace) > 250 + for: 24h + labels: + severity: critical + - alert: TempoIngesterReplayErrors + annotations: + message: Tempo ingester has encountered errors while replaying a block on startup in {{`{{`}} $labels.cluster {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} for tenant {{`{{`}} $labels.tenant {{`}}`}} + runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors + expr: | + sum by (cluster, namespace, tenant) (increase(tempo_ingester_replay_errors_total{namespace=~".*"}[5m])) > 0 + for: 5m + labels: + severity: critical diff --git a/charts/tempo-mixin/templates/rules.yaml b/charts/tempo-mixin/templates/rules.yaml index ff31a06d..23238196 100644 --- a/charts/tempo-mixin/templates/rules.yaml +++ b/charts/tempo-mixin/templates/rules.yaml @@ -13,17 +13,17 @@ metadata: {{- end }} spec: groups: - - name: tempo_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile - - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds:avg - - expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate - - expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate + - name: tempo_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile + - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds:avg + - expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate + - expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate