From 36bdbcad064d5d52827edec3b9cd62d020fa67c7 Mon Sep 17 00:00:00 2001 From: mphanias Date: Sun, 7 Jan 2024 17:29:43 +0530 Subject: [PATCH] OM157 - additional node-exporter alerts added few more alerts related to node-exporter --- config/prometheus/node_exporter_alerts.yml | 504 ++++++++++++++++++ .../node_exporter_alerts_rules.template | 502 +++++++++++++++++ .../templates/node_exporter_config_data.json | 92 +++- 3 files changed, 1097 insertions(+), 1 deletion(-) diff --git a/config/prometheus/node_exporter_alerts.yml b/config/prometheus/node_exporter_alerts.yml index f6373c2..56a67f3 100644 --- a/config/prometheus/node_exporter_alerts.yml +++ b/config/prometheus/node_exporter_alerts.yml @@ -272,3 +272,507 @@ groups: description: "PageSwapOut(move data from RAM to swap space on disk to free up space in physical memory) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + - alert: HostMemoryFillingUpWarn(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[ 1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryFillingUpCritical(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[ 1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureWarn(Rate) + expr: 
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureCritical(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpWarn(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 15% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpCritical(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 30% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpWarn(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + 
summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk Read latency is increasing (read operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyCritical(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk Read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyWarn(Rate) + expr: 
rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + 
annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance, cpu) sum by (instance, cpu)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance, cpu) sum by (instance, cpu)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU Utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance, cpu) sum by (instance, cpu)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance, cpu) sum by (instance, cpu)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}" + + - alert: HostContextSwitchingWarn(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 1000 + for: 1m + labels: + severity: warn + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 1000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostContextSwitchingCritical(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 2000 + for: 1m + labels: + severity: critical + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 2000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsWarn(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s]) > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s]) > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: 
"Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s]) > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s]) > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 80 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInRateWarn + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapInRateCritical + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateWarn + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." 
+ + - alert: HostSwapOutRateCritical + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostDiskReadIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
+ + - alert: HostDiskReadIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
+ + - alert: HostDiskWriteIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostRateUnusualNetworkThroughputInWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 20/ < -20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably 
receiving data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Host) + expr: 100 - (((sum by 
(instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 20/ < -20 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 30/ < -30 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + 
for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 20/ < -20 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 30/ < -30 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateCritical(Host) + expr: 100 - 
(((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading less data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + 
annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by 
(instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + diff --git a/config/prometheus/templates/node_exporter_alerts_rules.template b/config/prometheus/templates/node_exporter_alerts_rules.template index a3907d8..953cbdb 100644 --- a/config/prometheus/templates/node_exporter_alerts_rules.template +++ b/config/prometheus/templates/node_exporter_alerts_rules.template @@ -271,4 +271,506 @@ groups: summary: "{% raw %}Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" description: "PageSwapOut(move data from RAM to swap space on disk 
to free up space in physical memory) value exceeds {{ node_critical_swapPages_count }} on host {% raw %}{{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + - alert: HostMemoryFillingUpWarn(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[ {{ node_rate_memory_fill_duration }}]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > {{ node_warn_rate_memory_fill_pct }} + for: {{ node_rate_memory_fill_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Node memory is filling up (> {{ node_warn_rate_memory_fill_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostMemoryFillingUpCritical(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[ {{ node_rate_memory_fill_duration }}]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > {{ node_critical_rate_memory_fill_pct }} + for: {{ node_rate_memory_fill_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Node memory is filling up (> {{ node_critical_rate_memory_fill_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostMemoryUnderMemoryPressureWarn(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[{{ node_memory_rate_underpressure_duration }}]) > {{ node_warn_rate_memorypressure }} + for: {{ node_memory_rate_underpressure_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "High rate of major page faults {% 
raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostMemoryUnderMemoryPressureCritical(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[{{ node_memory_rate_underpressure_duration }}]) > {{ node_critical_rate_memorypressure }} + for: {{ node_memory_rate_underpressure_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "High rate of major page faults {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostDiskSpaceFillingUpWarn(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[{{ node_warn_Disk_fill_duration }}]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > {{ node_warn_rate_disk_fill_pct }} + for: {{ node_warn_Disk_fill_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is crossing (> {{ node_warn_rate_disk_fill_pct }}% ) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + - alert: HostDiskSpaceFillingUpCritical(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[{{ node_critical_Disk_fill_duration }}]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > {{ node_critical_rate_disk_fill_pct }} + for: {{ node_critical_Disk_fill_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is crossing (> {{ node_critical_rate_disk_fill_pct }}% ) {% raw %}on host {{ $labels.instance }} of cluster {{ 
$labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostInodesFillingUpWarn(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[{{ node_rate_Inodes_warn_fill_duration }}])) / node_filesystem_files{job="node-exporter"} * 100 > {{ node_warn_rate_inodes_fill_pct }} + for: {{ node_rate_Inodes_warn_fill_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is running out of available inodes (> {{ node_warn_rate_inodes_fill_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostInodesFillingUpCritical(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[{{ node_rate_Inodes_critical_fill_duration }}])) / node_filesystem_files{job="node-exporter"} * 100 > {{ node_critical_rate_inodes_fill_pct }} + for: {{ node_rate_Inodes_critical_fill_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is running out of available inodes (> {{ node_critical_rate_inodes_fill_pct }}%) {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskReadLatencyWarn(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[{{ node_rate_unusual_diskRead_latency_warn_duration }}]) / rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskRead_latency_warn_duration }}]) > {{ node_warn_rate_unusual_diskReadlatency_time }} and rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskRead_latency_warn_duration }}]) > 0 + for: {{ node_rate_unusual_diskRead_latency_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% 
raw %}Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk Read latency is increasing (read operations > {{ node_warn_rate_unusual_diskReadlatency_time }}s) {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostUnusualDiskReadLatencyCritical(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[{{ node_rate_unusual_diskRead_latency_critical_duration }}]) / rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskRead_latency_critical_duration }}]) > {{ node_critical_rate_unusual_diskReadlatency_time }} and rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskRead_latency_critical_duration }}]) > 0 + for: {{ node_rate_unusual_diskRead_latency_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk Read latency is increasing (read operations > {{ node_critical_rate_unusual_diskReadlatency_time }}s) {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostUnusualDiskWriteLatencyWarn(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[{{ node_rate_unusual_diskWrite_latency_warn_duration }}]) / rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskWrite_latency_warn_duration }}]) > {{ node_warn_rate_unusual_diskWritelatency_time }} and rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskWrite_latency_warn_duration }}]) > 0 + for: {{ node_rate_unusual_diskWrite_latency_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ 
$labels.cluster_name }}{% endraw %}" + description: "Disk write latency is increasing (write operations > {{ node_warn_rate_unusual_diskWritelatency_time }}s){% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostUnusualDiskWriteLatencyCritical(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[{{ node_rate_unusual_diskWrite_latency_critical_duration }}]) / rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskWrite_latency_critical_duration }}]) > {{ node_critical_rate_unusual_diskWritelatency_time }} and rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_unusual_diskWrite_latency_critical_duration }}]) > 0 + for: {{ node_rate_unusual_diskWrite_latency_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk write latency is increasing (write operations > {{ node_critical_rate_unusual_diskWritelatency_time }}s) {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateCPUutilizationWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[{{ node_rate_warn_cpu_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_warn_cpu_threshold_duration }}])))) * 100 > {{ node_warn_rate_highCPU_pct }} + for: {{ node_rate_warn_cpu_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is > {{ node_warn_rate_highCPU_pct }}% {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ 
$value }}{% endraw %}" + + - alert: HostRateCPUutilizationCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[{{ node_rate_critical_cpu_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_critical_cpu_threshold_duration }}])))) * 100 > {{ node_critical_rate_highCPU_pct }} + for: {{ node_rate_critical_cpu_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is > {{ node_critical_rate_highCPU_pct }}% {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateCPUutilizationWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[{{ node_rate_warn_cpu_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_warn_cpu_threshold_duration }}])))) * 100 > {{ node_warn_rate_highCPU_pct }} + for: {{ node_rate_warn_cpu_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is > {{ node_warn_rate_highCPU_pct }}% {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateCPUutilizationCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[{{ node_rate_critical_cpu_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_critical_cpu_threshold_duration }}])))) * 100 > {{ node_critical_rate_highCPU_pct 
}} + for: {{ node_rate_critical_cpu_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host high CPU Utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is > {{ node_critical_rate_highCPU_pct }}% {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostCpuStealRate(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[{{ node_rate_cpu_steal_warn_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_cpu_steal_warn_threshold_duration }}])))) * 100 > {{ node_warn_rate_cpu_steal_pct }} + for: {{ node_rate_cpu_steal_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is > {{ node_warn_rate_cpu_steal_pct }}% {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostCpuStealRate(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[{{ node_rate_cpu_steal_critical_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_cpu_steal_critical_threshold_duration }}])))) * 100 > {{ node_critical_rate_cpu_steal_pct }} + for: {{ node_rate_cpu_steal_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is > {{ node_critical_rate_cpu_steal_pct }}% {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}{% endraw %}" + + - alert: HostCpuStealRate(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[{{ node_rate_cpu_steal_warn_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_cpu_steal_warn_threshold_duration }}])))) * 100 > {{ node_warn_rate_cpu_steal_pct }} + for: {{ node_rate_cpu_steal_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is > {{ node_warn_rate_cpu_steal_pct }}% {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostCpuStealRate(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[{{ node_rate_cpu_steal_critical_threshold_duration }}])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[{{ node_rate_cpu_steal_critical_threshold_duration }}])))) * 100 > {{ node_critical_rate_cpu_steal_pct }} + for: {{ node_rate_cpu_steal_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is > {{ node_critical_rate_cpu_steal_pct }}% {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}{% endraw %}" + + - alert: HostContextSwitchingWarn(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[{{ node_context_switching_warn_threshold_duration }}])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > {{ node_rate_contextswitching_warn_count }} + for: {{ node_context_switching_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Context switching is increasing (> {{ node_rate_contextswitching_warn_count }} /s) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostContextSwitchingCritical(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[{{ node_context_switching_critical_threshold_duration }}])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > {{ node_rate_contextswitching_critical_count }} + for: {{ 
node_context_switching_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Context switching is increasing (> {{ node_rate_contextswitching_critical_count }} /s) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkReceiveErrorsWarn(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[{{ node_Rate_network_receiveError_warn_duration }}]) / rate(node_network_receive_packets_total{job="node-exporter"}[{{ node_Rate_network_receiveError_warn_duration }}]) > {{ node_rate_network_err_warn }} + for: {{ node_Rate_network_receiveError_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostNetworkReceiveErrorsCritical(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[{{ node_Rate_network_receiveError_critical_duration }}]) / rate(node_network_receive_packets_total{job="node-exporter"}[{{ node_Rate_network_receiveError_critical_duration }}]) > {{ node_rate_network_err_critical }} + for: {{ node_Rate_network_receiveError_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: 
HostNetworkTransmitErrorsWarn(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[{{ node_Rate_network_transmitError_warn_duration }}]) / rate(node_network_transmit_packets_total{job="node-exporter"}[{{ node_Rate_network_transmitError_warn_duration }}]) > {{ node_rate_network_err_warn }} + for: {{ node_Rate_network_transmitError_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkTransmitErrorsCritical(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[{{ node_Rate_network_transmitError_critical_duration }}]) / rate(node_network_transmit_packets_total{job="node-exporter"}[{{ node_Rate_network_transmitError_critical_duration }}]) > {{ node_rate_network_err_critical }} + for: {{ node_Rate_network_transmitError_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkInterfaceSaturatedWarn(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_interface_saturated_warn_duration }}]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_interface_saturated_warn_duration }}])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > {{ node_warn_network_interface_saturation }} + for: {{ node_rate_network_interface_saturated_warn_duration }} + 
labels: + severity: warn + annotations: + summary: "{% raw %}Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkInterfaceSaturatedCritical(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_interface_saturated_critical_duration }}]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_interface_saturated_critical_duration }}])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > {{ node_critical_network_interface_saturation }} + for: {{ node_rate_network_interface_saturated_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostSwapInRateWarn + expr: rate(node_vmstat_pswpin{job="node-exporter"}[{{ node_rate_swapin_warn_duration }}]) > {{ node_rate_swapPages_warn_count }} + for: {{ node_rate_swapin_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapIn rate exceeds {{ node_rate_swapPages_warn_count }} {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current rate is {{ $value }}.{% endraw %}" + + - alert: HostSwapInRateCritical + expr: rate(node_vmstat_pswpin{job="node-exporter"}[{{ node_rate_swapin_critical_duration }}]) > {{ node_rate_swapPages_critical_count }} + for: {{ node_rate_swapin_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapIn rate exceeds {{ node_rate_swapPages_critical_count }} {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}.{% endraw %}" + + - alert: HostSwapOutRateWarn + expr: rate(node_vmstat_pswpout{job="node-exporter"}[{{ node_rate_swapOut_warn_duration }}]) > {{ node_rate_swapPages_warn_count }} + for: {{ node_rate_swapOut_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapOut rate exceeds {{ node_rate_swapPages_warn_count }} {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}.{% endraw %}" + + - alert: HostSwapOutRateCritical + expr: rate(node_vmstat_pswpout{job="node-exporter"}[{{ node_rate_swapOut_critical_duration }}]) > {{ node_rate_swapPages_critical_count }} + for: {{ node_rate_swapOut_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapOut rate exceeds {{ node_rate_swapPages_critical_count }} {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current rate is {{ $value }}.{% endraw %}" + + - alert: HostDiskReadIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_diskread_iops_warn_threshold_duration }}])) > {{ node_rate_disk_read_iops_warn_value }} + for: {{ node_rate_diskread_iops_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk read IOPS rate (> {{ node_rate_disk_read_iops_warn_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskReadIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_diskread_iops_critical_threshold_duration }}])) > {{ node_rate_disk_read_iops_critical_value }} + for: {{ node_rate_diskread_iops_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk read IOPS rate (> {{ node_rate_disk_read_iops_critical_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskReadIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_diskread_iops_warn_threshold_duration }}])) > {{ node_rate_diskread_iops_device_warn_value }} + for: {{ node_rate_diskread_iops_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk read IOPS rate (> {{ node_rate_diskread_iops_device_warn_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskReadIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[{{ node_rate_diskread_iops_critical_threshold_duration }}])) > {{ node_rate_diskread_iops_device_critical_value }} + for: {{ node_rate_diskread_iops_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk read IOPS rate (> {{ node_rate_diskread_iops_device_critical_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskWriteIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_diskwrite_iops_warn_threshold_duration }}])) > {{ node_rate_disk_write_iops_warn_value }} + for: {{ node_rate_diskwrite_iops_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk write IOPS rate (> {{ node_rate_disk_write_iops_warn_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskWriteIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_diskwrite_iops_critical_threshold_duration }}])) > {{ node_rate_disk_write_iops_critical_value }} + for: {{ node_rate_diskwrite_iops_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk write IOPS rate (> {{ node_rate_disk_write_iops_critical_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskWriteIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_diskwrite_iops_warn_threshold_duration }}])) > {{ node_rate_diskwrite_iops_device_warn_value }} + for: {{ node_rate_diskwrite_iops_warn_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk write IOPS rate (> {{ node_rate_diskwrite_iops_device_warn_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + + - alert: HostDiskWriteIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[{{ node_rate_diskwrite_iops_critical_threshold_duration }}])) > {{ node_rate_diskwrite_iops_device_critical_value }} + for: {{ node_rate_diskwrite_iops_critical_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Sustained high disk write IOPS rate (> {{ node_rate_diskwrite_iops_device_critical_value }}) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current value is {{ $value }}.{% endraw %}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) > {{ node_rate_unusal_increases_network_warn_pct }} or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) < {{ node_rate_unusal_decreases_network_warn_pct }} + for: {{ node_rate_network_throughput_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably receiving data (> {{ node_rate_unusal_increases_network_warn_pct }}/ < {{ node_rate_unusal_decreases_network_warn_pct }}%) {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}])))) > {{ node_rate_unusal_increases_network_critical_pct }} or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ 
node_rate_network_throughput_critical_duration }}])))) < {{ node_rate_unusal_decreases_network_critical_pct }} + for: {{ node_rate_network_throughput_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably receiving data (> {{ node_rate_unusal_increases_network_critical_pct }}/ < {{ node_rate_unusal_decreases_network_critical_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) > {{ node_rate_unusal_increases_network_warn_pct }} or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) < {{ node_rate_unusal_decreases_network_warn_pct }} + for: {{ node_rate_network_throughput_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably receiving data (> {{ node_rate_unusal_increases_network_warn_pct }}/ < {{ node_rate_unusal_decreases_network_warn_pct }} %) {% raw %} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: 
HostRateUnusualNetworkThroughputInCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}])))) > {{ node_rate_unusal_increases_network_critical_pct }} or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}])))) < {{ node_rate_unusal_decreases_network_critical_pct }} + for: {{ node_rate_network_throughput_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably receiving data (> {{ node_rate_unusal_increases_network_critical_pct }}/ < {{ node_rate_unusal_decreases_network_critical_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) > {{ node_rate_unusal_increases_network_warn_pct }} or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by 
(instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) < {{ node_rate_unusal_decreases_network_warn_pct }} + for: {{ node_rate_network_throughput_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably sending data (> {{ node_rate_unusal_increases_network_warn_pct }}/ < {{ node_rate_unusal_decreases_network_warn_pct }} %) {% raw %}on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}])))) > {{ node_rate_unusal_increases_network_critical_pct }} or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}])))) < {{ node_rate_unusal_decreases_network_critical_pct }} + for: {{ node_rate_network_throughput_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably sending data (> {{ node_rate_unusal_increases_network_critical_pct }}/ < {{ node_rate_unusal_decreases_network_critical_pct }} %) {% raw %} on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" +
+ - alert: HostRateUnusualNetworkThroughputOutWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) > {{ node_rate_unusal_increases_network_warn_pct }} or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_warn_duration }}])))) < {{ node_rate_unusal_decreases_network_warn_pct }} + for: {{ node_rate_network_throughput_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably sending data (> {{ node_rate_unusal_increases_network_warn_pct }}/ < {{ node_rate_unusal_decreases_network_warn_pct }} %) {% raw %}on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}])))) > {{ node_rate_unusal_increases_network_critical_pct }} or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_value_network_throughput_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[{{ node_rate_network_throughput_critical_duration }}]))))
< {{ node_rate_unusal_decreases_network_critical_pct }} + for: {{ node_rate_network_throughput_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Host network interfaces are probably sending data (> {{ node_rate_unusal_increases_network_critical_pct }}/ < {{ node_rate_unusal_decreases_network_critical_pct }} %) {% raw %} on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %}" + + - alert: HostUnusualDiskReadRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_warn_duration }}])))) > {{ node_rate_unusal_increases_diskRate_warn_pct }} or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_warn_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_warn_pct }} + for: {{ node_rate_unusual_diskread_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably reading data (> {{ node_rate_unusal_increases_diskRate_warn_pct }}/ < {{ node_rate_unusal_decreases_diskRate_warn_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskReadRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ 
node_rate_unusual_diskread_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_critical_duration }}])))) > {{ node_rate_unusal_increases_diskRate_critical_pct }} or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_critical_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_critical_pct }} + for: {{ node_rate_unusual_diskread_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably reading data (> {{ node_rate_unusal_increases_diskRate_critical_pct }}/ < {{ node_rate_unusal_decreases_diskRate_critical_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskReadRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_warn_duration }}])))) > {{ node_rate_unusal_increases_diskRate_warn_pct }} or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_warn_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_warn_pct }} + for: {{ node_rate_unusual_diskread_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk
read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably reading data (> {{ node_rate_unusal_increases_diskRate_warn_pct }}/ < {{ node_rate_unusal_decreases_diskRate_warn_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskReadRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_critical_duration }}])))) > {{ node_rate_unusal_increases_diskRate_critical_pct }} or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskread_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskread_critical_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_critical_pct }} + for: {{ node_rate_unusual_diskread_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably reading data (> {{ node_rate_unusal_increases_diskRate_critical_pct }}/ < {{ node_rate_unusal_decreases_diskRate_critical_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskWriteRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ 
node_rate_increases_unusual_diskwrite_warn_duration }}])))) > {{ node_rate_unusal_increases_diskRate_warn_pct }} or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_warn_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_warn_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_warn_pct }} + for: {{ node_rate_unusual_diskwrite_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably writing data (> {{ node_rate_unusal_increases_diskRate_warn_pct }}/ < {{ node_rate_unusal_decreases_diskRate_warn_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskWriteRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_critical_duration }}])))) > {{ node_rate_unusal_increases_diskRate_critical_pct }} or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_critical_duration }}]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_critical_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_critical_pct }} + for: {{ node_rate_unusual_diskwrite_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably writing data (> {{ 
node_rate_unusal_increases_diskRate_critical_pct }}/ < {{ node_rate_unusal_decreases_diskRate_critical_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskWriteRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_warn_duration }}])))) > {{ node_rate_unusal_increases_diskRate_warn_pct }} or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_warn_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_warn_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_warn_pct }} + for: {{ node_rate_unusual_diskwrite_warn_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably writing data (> {{ node_rate_unusal_increases_diskRate_warn_pct }}/ < {{ node_rate_unusal_decreases_diskRate_warn_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskWriteRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_critical_duration }}])))) > {{ node_rate_unusal_increases_diskRate_critical_pct }} or 100 - (((sum by (instance, 
device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_unusual_diskwrite_critical_duration }}]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[{{ node_rate_increases_unusual_diskwrite_critical_duration }}])))) < {{ node_rate_unusal_decreases_diskRate_critical_pct }} + for: {{ node_rate_unusual_diskwrite_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk rate is probably writing data (> {{ node_rate_unusal_increases_diskRate_critical_pct }}/ < {{ node_rate_unusal_decreases_diskRate_critical_pct }} %) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" diff --git a/config/prometheus/templates/node_exporter_config_data.json b/config/prometheus/templates/node_exporter_config_data.json index 29c61ad..e79036e 100644 --- a/config/prometheus/templates/node_exporter_config_data.json +++ b/config/prometheus/templates/node_exporter_config_data.json @@ -39,6 +39,96 @@ "node_critical_network_interface_saturation": "0.9", "node_critical_swapPages_count": "10", "node_critical_swap_in_duration": "1m", - "node_critical_swap_out_duration": "1m" + "node_critical_swap_out_duration": "1m", + + "node_memory_rate_underpressure_duration": "1m", + "node_warn_rate_memorypressure": "3", + "node_warn_rate_memory_fill_pct": "15", + "node_rate_memory_fill_duration": "1m", + "node_warn_Disk_fill_duration": "1m", + "node_warn_rate_disk_fill_pct": "15", + "node_rate_Inodes_warn_fill_duration": "1m", + "node_warn_rate_inodes_fill_pct": "20", + "node_rate_unusual_diskRead_latency_warn_duration": "1m", + "node_warn_rate_unusual_diskReadlatency_time": "0.05", + "node_warn_rate_unusual_diskWritelatency_time": "0.05", + "node_rate_unusual_diskWrite_latency_warn_duration": "1m", + 
"node_rate_warn_cpu_threshold_duration": "1m", + "node_warn_rate_highCPU_pct": "20", + "node_rate_cpu_steal_warn_threshold_duration": "1m", + "node_warn_rate_cpu_steal_pct": "5", + "node_context_switching_warn_threshold_duration": "1m", + "node_rate_contextswitching_warn_count": "1000", + "node_Rate_network_receiveError_warn_duration": "30s", + "node_rate_network_err_warn": "3", + "node_Rate_network_transmitError_warn_duration": "30s", + "node_rate_network_interface_saturated_warn_duration": "1m", + "node_warn_network_interface_saturation": "80", + "node_rate_swapin_warn_duration": "1m", + "node_rate_swapPages_warn_count": "5", + "node_rate_swapOut_warn_duration": "1m", + "node_rate_diskread_iops_warn_threshold_duration": "1m", + "node_rate_diskwrite_iops_warn_threshold_duration": "1m", + "node_rate_disk_read_iops_warn_value": "300", + "node_rate_diskread_iops_device_warn_value": "100", + "node_rate_disk_write_iops_warn_value": "300", + "node_rate_diskwrite_iops_device_warn_value": "100", + + "node_rate_network_throughput_warn_duration": "1m", + "node_rate_value_network_throughput_warn_duration": "30s", + "node_rate_unusal_increases_network_warn_pct": "20", + "node_rate_increases_unusual_diskread_warn_duration": "1m", + "node_rate_unusual_diskread_warn_duration": "30s", + "node_rate_unusal_increases_diskRate_warn_pct": "20", + "node_rate_unusual_diskwrite_warn_duration": "30s", + "node_rate_increases_unusual_diskwrite_warn_duration": "1m", + "node_rate_unusal_decreases_network_warn_pct": "-20", + "node_rate_unusal_decreases_diskRate_warn_pct": "-20", + + + "node_critical_rate_memorypressure": "5", + "node_critical_rate_memory_fill_pct": "30", + "node_critical_Disk_fill_duration": "1m", + "node_critical_rate_disk_fill_pct": "30", + "node_rate_Inodes_critical_fill_duration": "1m", + "node_critical_rate_inodes_fill_pct": "30", + "node_rate_unusual_diskRead_latency_critical_duration": "1m", + "node_critical_rate_unusual_diskReadlatency_time": "0.1", + 
"node_critical_rate_unusual_diskWritelatency_time": "0.1", + "node_rate_unusual_diskWrite_latency_critical_duration": "1m", + "node_rate_critical_cpu_threshold_duration": "1m", + "node_critical_rate_highCPU_pct": "30", + "node_rate_cpu_steal_critical_threshold_duration": "1m", + "node_critical_rate_cpu_steal_pct": "8", + "node_context_switching_critical_threshold_duration": "1m", + "node_rate_contextswitching_critical_count": "2000", + "node_Rate_network_receiveError_critical_duration": "30s", + "node_rate_network_err_critical": "5", + "node_Rate_network_transmitError_critical_duration": "30s", + "node_rate_network_interface_saturated_critical_duration": "1m", + "node_critical_network_interface_saturation": "90", + "node_rate_swapin_critical_duration": "1m", + "node_rate_swapPages_critical_count": "10", + "node_rate_swapOut_critical_duration": "1m", + "node_rate_diskread_iops_critical_threshold_duration": "1m", + "node_rate_diskwrite_iops_critical_threshold_duration": "1m", + "node_rate_disk_read_iops_critical_value": "500", + "node_rate_diskread_iops_device_critical_value": "250", + "node_rate_disk_write_iops_critical_value": "500", + "node_rate_diskwrite_iops_device_critical_value": "250", + + "node_rate_network_throughput_critical_duration": "1m", + "node_rate_value_network_throughput_critical_duration": "30s", + "node_rate_unusal_increases_network_critical_pct": "30", + "node_rate_unusal_increases_diskRate_critical_pct": "30", + "node_rate_increases_unusual_diskread_critical_duration": "1m", + "node_rate_unusual_diskread_critical_duration": "30s", + "node_rate_unusual_diskwrite_critical_duration": "30s", + "node_rate_increases_unusual_diskwrite_critical_duration": "1m", + "node_rate_unusal_decreases_network_critical_pct": "-30", + "node_rate_unusal_decreases_diskRate_critical_pct": "-30", + + "node_rate_value_unusual_diskread_warn_duration": "30s", + "node_rate_value_unusual_diskwrite_warn_duration": "30s" } \ No newline at end of file