-
Notifications
You must be signed in to change notification settings - Fork 136
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Co-authored-by: Christopher Martin <[email protected]>
- Loading branch information
1 parent
d4548ad
commit 286cfd5
Showing
2 changed files
with
100 additions
and
1 deletion.
There are no files selected for viewing
99 changes: 99 additions & 0 deletions
99
deployment/scheduler/templates/scheduler-prometheusrule.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
{{- if .Values.scheduler.prometheus.enabled }} | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: PrometheusRule | ||
metadata: | ||
name: {{ include "armada-scheduler.name" . }} | ||
namespace: {{ .Release.Namespace }} | ||
labels: | ||
{{- include "armada-scheduler.labels.all" . | nindent 4 -}} | ||
{{- if .Values.scheduler.prometheus.labels }} | ||
{{- toYaml .Values.scheduler.prometheus.labels | nindent 4 -}} | ||
{{- end }} | ||
spec: | ||
groups: | ||
- name: armada-server-metrics | ||
interval: {{ .Values.scheduler.prometheus.scrapeInterval }} | ||
rules: | ||
# Per-node failures. | ||
- record: node:armada_scheduler_failed_jobs | ||
expr: sum by (node) (armada_scheduler_job_state_counter_by_node{state="failed"}) | ||
# Per-cluster failures. | ||
- record: cluster_category_subCategory:armada_scheduler_failed_jobs | ||
expr: sum by (cluster, category, subCategory) (armada_scheduler_error_classification_by_node) | ||
# Per-queue failures. | ||
- record: queue_category_subCategory:armada_scheduler_failed_jobs | ||
expr: sum by (queue, category, subCategory) (job_error_classification_by_queue) | ||
# Per-node successes. | ||
- record: node:armada_scheduler_succeeded_jobs | ||
expr: sum by (node) (armada_scheduler_job_state_counter_by_node{state="succeeded"}) | ||
# Per-cluster successes. | ||
- record: cluster_category_subCategory:armada_scheduler_succeeded_jobs | ||
expr: sum by (cluster, category, subCategory) (armada_scheduler_job_state_counter_by_node{state="succeeded"}) | ||
# Per-queue successes. | ||
- record: queue_category_subCategory:armada_scheduler_succeeded_jobs | ||
expr: sum by (queue) (job_state_counter_by_queue{state="succeeded"}) | ||
# Per-node failures increase. | ||
# increase(sum... is safe here, since all metrics that make up the sum reset at the same time. | ||
- record: node:armada_scheduler_failed_jobs:increase1m | ||
expr: increase(node:armada_scheduler_job_state_counter_by_queue{state="failed"}[1m:]) | ||
- record: node:armada_scheduler_failed_jobs:increase10m | ||
expr: increase(node:armada_scheduler_job_state_counter_by_queue{state="failed"}[10m:]) | ||
- record: node:armada_scheduler_failed_jobs:increase1h | ||
expr: increase(node:armada_scheduler_job_state_counter_by_queue{state="failed"}[10m:]) | ||
# Per-cluster failures increase. | ||
- record: cluster_category_subCategory:armada_scheduler_failed_jobs:increase1m | ||
expr: increase(cluster_category_subCategory:armada_scheduler_failed_jobs[1m:]) | ||
- record: cluster_category_subCategory:armada_scheduler_failed_jobs:increase10m | ||
expr: increase(cluster_category_subCategory:armada_scheduler_failed_jobs[10m:]) | ||
- record: cluster_category_subCategory:armada_scheduler_failed_jobs:increase1h | ||
expr: increase(cluster_category_subCategory:armada_scheduler_failed_jobs[1h:]) | ||
# Per-queue failures increase. | ||
- record: queue_category_subCategory:armada_scheduler_failed_jobs:increase1m | ||
expr: increase(queue_category_subCategory:armada_scheduler_failed_jobs[1m:]) | ||
- record: queue_category_subCategory:armada_scheduler_failed_jobs:increase10m | ||
expr: increase(queue_category_subCategory:armada_scheduler_failed_jobs[10m:]) | ||
- record: queue_category_subCategory:armada_scheduler_failed_jobs:increase1h | ||
expr: increase(queue_category_subCategory:armada_scheduler_failed_jobs[1h:]) | ||
# Per-node successes increase. | ||
- record: node:armada_scheduler_succeeded_jobs:increase1m | ||
expr: increase(node:armada_scheduler_succeeded_jobs[1m:]) | ||
- record: node:armada_scheduler_succeeded_jobs:increase10m | ||
expr: increase(node:armada_scheduler_succeeded_jobs[10m:]) | ||
- record: node:armada_scheduler_succeeded_jobs:increase1h | ||
expr: increase(node:armada_scheduler_succeeded_jobs[1h:]) | ||
# Per-cluster successes increase. | ||
- record: cluster_category_subCategory:armada_scheduler_succeeded_jobs:increase1m | ||
expr: increase(cluster_category_subCategory:armada_scheduler_succeeded_jobs[1m:]) | ||
- record: cluster_category_subCategory:armada_scheduler_succeeded_jobs:increase10m | ||
expr: increase(cluster_category_subCategory:armada_scheduler_succeeded_jobs[10m:]) | ||
- record: cluster_category_subCategory:armada_scheduler_succeeded_jobs:increase1h | ||
expr: increase(cluster_category_subCategory:armada_scheduler_succeeded_jobs[1h:]) | ||
# Per-queue successes increase. | ||
- record: queue_category_subCategory:armada_scheduler_succeeded_jobs:increase1m | ||
expr: increase(queue_category_subCategory:armada_scheduler_succeeded_jobs[1m:]) | ||
- record: queue_category_subCategory:armada_scheduler_succeeded_jobs:increase10m | ||
expr: increase(queue_category_subCategory:armada_scheduler_succeeded_jobs[1m:]) | ||
- record: queue_category_subCategory:armada_scheduler_succeeded_jobs:increase1h | ||
expr: increase(queue_category_subCategory:armada_scheduler_succeeded_jobs[1m:]) | ||
# Per-node failure rates. | ||
- record: node:armada_scheduler_failed_rate_jobs:increase1m | ||
expr: sum by(node) (node:armada_scheduler_failed_jobs:increase1m) / on(node) group_left() ((sum by(node) (node:armada_scheduler_failed_jobs:increase1m)) + (sum by(node) (node:armada_scheduler_succeeded_jobs:increase1m))) | ||
- record: node:armada_scheduler_failed_rate_jobs:increase10m | ||
expr: sum by(node) (node:armada_scheduler_failed_jobs:increase10m) / on(node) group_left() ((sum by(node) (node:armada_scheduler_failed_jobs:increase10m)) + (sum by(node) (node:armada_scheduler_succeeded_jobs:increase10m))) | ||
- record: node:armada_scheduler_failed_rate_jobs:increase1h | ||
expr: sum by(node) (node:armada_scheduler_failed_jobs:increase1h) / on(node) group_left() ((sum by(node) (node:armada_scheduler_failed_jobs:increase1h)) + (sum by(node) (node:armada_scheduler_succeeded_jobs:increase1h))) | ||
# Per-cluster failure rates. | ||
- record: cluster_category_subCategory:armada_scheduler_failed_rate_jobs:increase1m | ||
expr: sum by(cluster, category, subCategory) (cluster_category_subCategory:armada_scheduler_failed_jobs:increase1m) / on(cluster) group_left() ((sum by(cluster) (cluster_category_subCategory:armada_scheduler_failed_jobs:increase1m)) + (sum by(cluster) (cluster_category_subCategory:armada_scheduler_succeeded_jobs:increase1m))) | ||
- record: cluster_category_subCategory:armada_scheduler_failed_rate_jobs:increase10m | ||
expr: sum by(cluster, category, subCategory) (cluster_category_subCategory:armada_scheduler_failed_jobs:increase10m) / on(cluster) group_left() ((sum by(cluster) (cluster_category_subCategory:armada_scheduler_failed_jobs:increase10m)) + (sum by(cluster) (cluster_category_subCategory:armada_scheduler_succeeded_jobs:increase10m))) | ||
- record: cluster_category_subCategory:armada_scheduler_failed_rate_jobs:increase1h | ||
expr: sum by(cluster, category, subCategory) (cluster_category_subCategory:armada_scheduler_failed_jobs:increase1h) / on(cluster) group_left() ((sum by(cluster) (cluster_category_subCategory:armada_scheduler_failed_jobs:increase1h)) + (sum by(cluster) (cluster_category_subCategory:armada_scheduler_succeeded_jobs:increase1h))) | ||
# Per-queue failure rates. | ||
- record: queue_category_subCategory:armada_scheduler_failed_rate_jobs:increase1m | ||
expr: sum by(queue, category, subCategory) (queue_category_subCategory:armada_scheduler_failed_jobs:increase1m) / on(queue) group_left() ((sum by(queue) (queue_category_subCategory:armada_scheduler_failed_jobs:increase1m)) + (sum by(queue) (queue_category_subCategory:armada_scheduler_succeeded_jobs:increase1m))) | ||
- record: queue_category_subCategory:armada_scheduler_failed_rate_jobs:increase10m | ||
expr: sum by(queue, category, subCategory) (queue_category_subCategory:armada_scheduler_failed_jobs:increase10m) / on(queue) group_left() ((sum by(queue) (queue_category_subCategory:armada_scheduler_failed_jobs:increase10m)) + (sum by(queue) (queue_category_subCategory:armada_scheduler_succeeded_jobs:increase10m))) | ||
- record: queue_category_subCategory:armada_scheduler_failed_rate_jobs:increase1h | ||
expr: sum by(queue, category, subCategory) (queue_category_subCategory:armada_scheduler_failed_jobs:increase1h) / on(queue) group_left() ((sum by(queue) (queue_category_subCategory:armada_scheduler_failed_jobs:increase1h)) + (sum by(queue) (queue_category_subCategory:armada_scheduler_succeeded_jobs:increase1h))) | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters