From 6a27021f95b14534420487cb3ea4934fcb118901 Mon Sep 17 00:00:00 2001 From: Ashish Jaiswal Date: Mon, 18 Nov 2024 14:56:47 +0530 Subject: [PATCH] bug fix: fixed the env for puppet-agent status monitoring, since default env is master --- .../prometheus-linuxaid/rules/puppet.yaml | 6 +- .../prometheus-linuxaid/tests/puppet.yaml | 58 +++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/argocd-helm-charts/prometheus-linuxaid/rules/puppet.yaml b/argocd-helm-charts/prometheus-linuxaid/rules/puppet.yaml index 2e8128220..2f72470d8 100644 --- a/argocd-helm-charts/prometheus-linuxaid/rules/puppet.yaml +++ b/argocd-helm-charts/prometheus-linuxaid/rules/puppet.yaml @@ -2,7 +2,7 @@ groups: - name: monitor::system::puppet::last_run rules: - alert: monitor::system::puppet::last_run - expr: time() - puppet_report{environment="production", host!~".+.niwyocdmk2"} >= 86400 * 1 + expr: time() - puppet_report{environment="master", host!~".+.niwyocdmk2"} >= 86400 * 1 for: 24h labels: alert_id: 'monitor::system::puppet::last_run' @@ -15,7 +15,7 @@ groups: This could be due to any number of things, e.g. Puppet crashing or hanging, or cron/systemd timers being broken. - alert: monitor::system::puppet::total_resources - expr: count(puppet_status{state="failed",environment="production"} == 1) > 5 + expr: count(puppet_status{state="failed",environment="master"} == 1) > 5 for: 48h labels: alert_id: 'monitor::system::puppet::total_resources' @@ -27,7 +27,7 @@ groups: You can get the list of nodes on prom dashboard by running this query ```sh - sum by (host) (puppet_status{environment="production",state="failed"} == 1) + sum by (host) (puppet_status{environment="master",state="failed"} == 1) ``` Puppetboard for the corresponding node should show that the past diff --git a/argocd-helm-charts/prometheus-linuxaid/tests/puppet.yaml b/argocd-helm-charts/prometheus-linuxaid/tests/puppet.yaml index 62aadb1c2..632cbff5a 100644 --- a/argocd-helm-charts/prometheus-linuxaid/tests/puppet.yaml +++ b/argocd-helm-charts/prometheus-linuxaid/tests/puppet.yaml @@ -7,59 +7,59 @@ rule_files: tests: - interval: 1m input_series: - - series: 'puppet_report{host="atat.enableit",environment="production"}' + - series: 'puppet_report{host="atat.enableit",environment="master"}' values: 0x3000 - - series: 'puppet_status{state="failed",host="atat01.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat01.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat02.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat02.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat03.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat03.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat04.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat04.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat05.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat05.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat06.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat06.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat07.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat07.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat08.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat08.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat09.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat09.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat10.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat10.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat11.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat11.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat12.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat12.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat13.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat13.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat14.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat14.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat15.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat15.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat16.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat16.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat17.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat17.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat18.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat18.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat19.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat19.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat20.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat20.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat21.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat21.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat22.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat22.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat23.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat23.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat24.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat24.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat25.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat25.enableit",environment="master"}' values: 1x3000 - - series: 'puppet_status{state="failed",host="atat26.enableit",environment="production"}' + - series: 'puppet_status{state="failed",host="atat26.enableit",environment="master"}' values: 1x3000 # 2 days alert_rule_test: @@ -71,7 +71,7 @@ tests: severity: warning host: atat.enableit alert_id: 'monitor::system::puppet::last_run' - environment: production + environment: master exp_annotations: summary: 'Puppet has not run on **atat.enableit** for more than 2d 2h 0m 0s' description: |-2 @@ -91,7 +91,7 @@ tests: You can get the list of nodes on prom dashboard by running this query ```sh - sum by (host) (puppet_status{environment="production",state="failed"} == 1) + sum by (host) (puppet_status{environment="master",state="failed"} == 1) ``` Puppetboard for the corresponding node should show that the past