-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added node-exporter template files
- Loading branch information
Showing
2 changed files
with
280 additions
and
0 deletions.
There are no files selected for viewing
236 changes: 236 additions & 0 deletions
236
config/prometheus/templates/node-exporter_alerts_rules.template
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
groups: | ||
- name: node_exporter_alerts | ||
rules: | ||
# Fires when Prometheus fails to scrape the node-exporter job (up == 0)
# continuously for the configured duration.
- alert: NodeExporterDownCritical
  expr: up{job="node-exporter"} == 0
  for: {{ node_down_critical_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Node {{ $labels.instance }} is down{% endraw %}"
    # The duration variable is a Prometheus duration that already carries its
    # unit (e.g. 1m), so no extra unit word is appended after it.
    description: "{% raw %}Failed to scrape {{ $labels.job }} on {{ $labels.instance }} {% endraw %}for more than {{ node_down_critical_duration }}. Node seems down."
|
||
# Warning-level memory pressure: percentage of memory in use
# (100 minus the MemAvailable/MemTotal percentage) above the warn threshold.
- alert: HostMemoryFillingUpWarn
  annotations:
    summary: '{% raw %}Host memory filling up (instance {{ $labels.instance }}){% endraw %}'
    description: 'Node memory is filling up (> {{ node_warn_memory_pct }}%) {% raw %}VALUE = {{ $value }}{% endraw %} '
  labels:
    severity: warn
  for: {{ node_warn_outofmemory_duration }}
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    100 - (node_memory_MemAvailable_bytes{job="node-exporter"}
    / node_memory_MemTotal_bytes{job="node-exporter"} * 100 )
    > {{ node_warn_memory_pct }}
|
||
# Warning-level disk usage: percentage of filesystem space used above the
# warn threshold, restricted to filesystems that are not mounted read-only.
- alert: HostDiskSpaceFillingUpWarn
  # The "and ON (...)" clause keeps only series whose matching
  # node_filesystem_readonly sample equals 0.
  expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > {{ node_warn_OutOfdisk_pct }} and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0
  for: {{ node_warn_outOfDisk_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host disk space is filling up (instance {{ $labels.instance }}){% endraw %}"
    # Message completed: the original sentence was missing the word "full".
    description: "Disk is almost full (> {{ node_warn_OutOfdisk_pct }}%) {% raw %}VALUE = {{ $value }}{% endraw %} "
|
||
# Warning-level inode usage: percentage of inodes used above the warn
# threshold, restricted to filesystems that are not mounted read-only.
- alert: HostInodesFillingUpWarn
  annotations:
    summary: '{% raw %}Host inodes filling Up (instance {{ $labels.instance }}){% endraw %}'
    description: 'Disk is almost running out of available inodes (> {{ node_warn_OutOfInodes_pct }}%) {% raw %}VALUE = {{ $value }} {% endraw %}'
  labels:
    severity: warn
  for: {{ node_warn_outOfInodes_duration }}
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    100 - node_filesystem_files_free{job="node-exporter"}
    / node_filesystem_files{job="node-exporter"} * 100
    > {{ node_warn_OutOfInodes_pct }}
    and ON (instance, device, mountpoint)
    node_filesystem_readonly{job="node-exporter"} == 0
|
||
# Warns when the average time per completed disk read over the last
# 5 minutes exceeds the warn latency threshold (seconds).
- alert: HostUnusualDiskReadLatencyWarn
  # Both metrics are cumulative counters: dividing the raw values gives the
  # average since boot, which hides current latency. rate() over a window
  # measures recent behaviour instead. The final clause skips devices with no
  # reads in the window so the division never evaluates 0/0.
  expr: >-
    rate(node_disk_read_time_seconds_total{job="node-exporter"}[5m])
    / rate(node_disk_reads_completed_total{job="node-exporter"}[5m])
    > {{ node_warn_unusual_disklatency_time }}
    and rate(node_disk_reads_completed_total{job="node-exporter"}[5m]) > 0
  for: {{ node_warn_unusual_diskRead_latency_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host unusual disk read latency (instance {{ $labels.instance }}){% endraw %}"
    description: "Disk latency is increasing (read operations > {{ node_warn_unusual_disklatency_time }}s) {% raw %}VALUE = {{ $value }}{% endraw %} "
|
||
# Warns when the average time per completed disk write over the last
# 5 minutes exceeds the warn latency threshold (seconds).
- alert: HostUnusualDiskWriteLatencyWarn
  # Both metrics are cumulative counters: dividing the raw values gives the
  # average since boot, which hides current latency. rate() over a window
  # measures recent behaviour instead. The final clause skips devices with no
  # writes in the window so the division never evaluates 0/0.
  expr: >-
    rate(node_disk_write_time_seconds_total{job="node-exporter"}[5m])
    / rate(node_disk_writes_completed_total{job="node-exporter"}[5m])
    > {{ node_warn_unusual_disklatency_time }}
    and rate(node_disk_writes_completed_total{job="node-exporter"}[5m]) > 0
  for: {{ node_warn_unusual_diskWrite_latency_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host unusual disk write latency (instance {{ $labels.instance }}){% endraw %}"
    description: "Disk latency is increasing (write operations > {{ node_warn_unusual_disklatency_time }}s) {% raw %}VALUE = {{ $value }}{% endraw %}"
|
||
# Warns when the share of non-idle CPU time per instance, measured over the
# last 5 minutes, exceeds the warn percentage.
- alert: HostHighCpuUtilizationWarn
  # node_cpu_seconds_total is a cumulative counter; the raw ratio is the
  # utilisation averaged since boot and barely moves under a load spike.
  # rate() on both sides yields current utilisation.
  expr: >-
    sum by (instance) (rate(node_cpu_seconds_total{mode!="idle", job="node-exporter"}[5m]))
    / on(instance) group_left
    sum by (instance) (rate(node_cpu_seconds_total{job="node-exporter"}[5m]))
    * 100 > {{ node_warn_highCPU_pct }}
  for: {{ node_high_cpuload_threshold_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host high CPU load (instance {{ $labels.instance }}){% endraw %}"
    description: "CPU utilization is almost > {{ node_warn_highCPU_pct }}% {% raw %}VALUE = {{ $value }}{% endraw %}"
|
||
# Warns when the share of CPU time in "steal" mode per instance, measured
# over the last 5 minutes, exceeds the warn percentage.
- alert: HostCpuStealNoisyNeighborWarn
  # node_cpu_seconds_total is a cumulative counter; the raw ratio is the
  # steal share averaged since boot. rate() on both sides yields the current
  # steal share.
  expr: >-
    sum by (instance) (rate(node_cpu_seconds_total{mode="steal", job="node-exporter"}[5m]))
    / on(instance) group_left
    sum by (instance) (rate(node_cpu_seconds_total{job="node-exporter"}[5m]))
    * 100 > {{ node_warn_cpu_steal_pct }}
  for: {{ node_cpu_steal_threshold_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host CPU steal noisy neighbor (instance {{ $labels.instance }}){% endraw %}"
    description: "CPU steal is > {{ node_warn_cpu_steal_pct }}%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. {% raw %}VALUE = {{ $value }}{% endraw %} "
|
||
# Warns when the percentage of received packets that were errors, measured
# over the last 5 minutes, exceeds the warn threshold.
- alert: HostNetworkReceiveErrorsWarn
  # Both metrics are cumulative counters; without rate() the ratio is the
  # lifetime error percentage and does not reflect a current burst.
  expr: >-
    (rate(node_network_receive_errs_total{job="node-exporter"}[5m])
    / rate(node_network_receive_packets_total{job="node-exporter"}[5m]))
    * 100 > {{ node_warn_network_err }}
  for: {{ node_network_receiveError_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }}){% endraw %}"
    # The expression value is a percentage of packets, not an error count,
    # so the message reports it as such.
    description: "{% raw %}Instance interface has a receive error rate of {{ $value }}% of packets VALUE = {{ $value }} {% endraw %}"
|
||
# Warns when the percentage of transmitted packets that were errors, measured
# over the last 5 minutes, exceeds the warn threshold.
- alert: HostNetworkTransmitErrorsWarn
  # Both metrics are cumulative counters; without rate() the ratio is the
  # lifetime error percentage and does not reflect a current burst.
  expr: >-
    (rate(node_network_transmit_errs_total{job="node-exporter"}[5m])
    / rate(node_network_transmit_packets_total{job="node-exporter"}[5m]))
    * 100 > {{ node_warn_network_err }}
  for: {{ node_network_transmitError_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }}){% endraw %}"
    # The expression value is a percentage of packets, not an error count,
    # so the message reports it as such.
    description: "{% raw %}Instance has a transmit error rate of {{ $value }}% of packets VALUE = {{ $value }}{% endraw %}"
|
||
# Warns when combined receive + transmit throughput over the last 5 minutes
# exceeds the warn fraction of the interface's reported link speed.
- alert: HostNetworkInterfaceSaturatedWarn
  # The byte metrics are cumulative counters while node_network_speed_bytes
  # is bytes per second. Dividing raw totals by speed is not a utilisation
  # fraction and stays above the threshold forever once enough traffic has
  # passed. rate() converts the counters to bytes per second first.
  expr: >-
    (rate(node_network_receive_bytes_total{job="node-exporter"}[5m])
    + rate(node_network_transmit_bytes_total{job="node-exporter"}[5m]))
    / node_network_speed_bytes{job="node-exporter"}
    > {{ node_warn_network_interface_saturation }}
  for: {{ node_network_interface_saturated_duration }}
  labels:
    severity: warn
  annotations:
    # The sibling network-error rules identify the NIC via the "device"
    # label; "interface" is not used anywhere else in this rule set.
    summary: "{% raw %}Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }}){% endraw %}"
    description: "The network interface is getting overloaded (> {{ node_warn_network_interface_saturation }}) {% raw %}{{ $value }}. VALUE = {{ $value }}{% endraw %}"
|
||
# Warns when node_timex_sync_status has been 0 at some point within the
# look-back window and node_timex_maxerror_seconds is at or above the
# configured limit.
- alert: HostClockNotSynchronisingWarn
  annotations:
    summary: '{% raw %}Host clock not synchronising (instance {{ $labels.instance }}){% endraw %}'
    description: '{% raw %}Clock not synchronising. VALUE = {{ $value }}{% endraw %}'
  labels:
    severity: warn
  # The same duration variable drives both the range selector in the
  # expression and the pending period below.
  for: {{ node_clock_notsync_duration }}
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    min_over_time(node_timex_sync_status{job="node-exporter"}[{{ node_clock_notsync_duration }}]) == 0
    and node_timex_maxerror_seconds{job="node-exporter"} >= {{ node_warn_clock_duration }}
|
||
# Warns when the swap-in rate (pages per second, averaged over the last
# 5 minutes) exceeds the warn threshold.
- alert: HostSwapInWarn
  # node_vmstat_pswpin is a cumulative counter of pages swapped in.
  # Comparing the raw value to a threshold fires permanently once that many
  # pages have ever been swapped in; rate() alerts on ongoing swap activity.
  expr: rate(node_vmstat_pswpin{job="node-exporter"}[5m]) > {{ node_warn_swapPages_count }}
  for: {{ node_warn_swap_in_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}PageSwap in value is too high on {{ $labels.instance }}{% endraw %}"
    description: "PageSwap in value exceeds {{ node_warn_swapPages_count }}. Current value is {% raw %}{{ $value }}.{% endraw %}"
|
||
# Warns when the swap-out rate (pages per second, averaged over the last
# 5 minutes) exceeds the warn threshold.
- alert: HostSwapOutWarn
  # node_vmstat_pswpout is a cumulative counter of pages swapped out.
  # Comparing the raw value to a threshold fires permanently once that many
  # pages have ever been swapped out; rate() alerts on ongoing swap activity.
  expr: rate(node_vmstat_pswpout{job="node-exporter"}[5m]) > {{ node_warn_swapPages_count }}
  for: {{ node_warn_swap_out_duration }}
  labels:
    severity: warn
  annotations:
    summary: "{% raw %}PageSwap out value is too high on {{ $labels.instance }}{% endraw %}"
    description: "PageSwap out value exceeds {{ node_warn_swapPages_count }}. Current value is {% raw %}{{ $value }}.{% endraw %}"
|
||
# Critical-level memory pressure: percentage of memory in use
# (100 minus the MemAvailable/MemTotal percentage) above the critical
# threshold.
- alert: HostMemoryFillingUpCritical
  annotations:
    summary: '{% raw %}Host memory filling up (instance {{ $labels.instance }}){% endraw %}'
    description: 'Node memory is filling up (> {{ node_critical_memory_pct }}%) {% raw %}VALUE = {{ $value }}{% endraw %} '
  labels:
    severity: critical
  for: {{ node_critical_outofmemory_duration }}
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    100 - (node_memory_MemAvailable_bytes{job="node-exporter"}
    / node_memory_MemTotal_bytes{job="node-exporter"} * 100 )
    > {{ node_critical_memory_pct }}
|
||
# Critical-level disk usage: percentage of filesystem space used above the
# critical threshold, restricted to filesystems that are not mounted
# read-only.
- alert: HostDiskSpaceFillingUpCritical
  # The "and ON (...)" clause keeps only series whose matching
  # node_filesystem_readonly sample equals 0.
  expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > {{ node_critical_OutOfdisk_pct }} and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0
  for: {{ node_critical_outOfDisk_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host disk space is filling up (instance {{ $labels.instance }}){% endraw %}"
    # Message completed: the original sentence was missing the word "full".
    description: "Disk is almost full (> {{ node_critical_OutOfdisk_pct }}%) {% raw %}VALUE = {{ $value }}{% endraw %} "
|
||
# Critical-level inode usage: percentage of inodes used above the critical
# threshold, restricted to filesystems that are not mounted read-only.
- alert: HostInodesFillingUpCritical
  annotations:
    summary: '{% raw %}Host inodes filling Up (instance {{ $labels.instance }}){% endraw %}'
    description: 'Disk is almost running out of available inodes (> {{ node_critical_OutOfInodes_pct }}%) {% raw %}VALUE = {{ $value }} {% endraw %}'
  labels:
    severity: critical
  for: {{ node_critical_outOfInodes_duration }}
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    100 - node_filesystem_files_free{job="node-exporter"}
    / node_filesystem_files{job="node-exporter"} * 100
    > {{ node_critical_OutOfInodes_pct }}
    and ON (instance, device, mountpoint)
    node_filesystem_readonly{job="node-exporter"} == 0
|
||
# Critical alert when the average time per completed disk read over the last
# 5 minutes exceeds the critical latency threshold (seconds).
- alert: HostUnusualDiskReadLatencyCritical
  # Both metrics are cumulative counters: dividing the raw values gives the
  # average since boot, which hides current latency. rate() over a window
  # measures recent behaviour instead. The final clause skips devices with no
  # reads in the window so the division never evaluates 0/0.
  expr: >-
    rate(node_disk_read_time_seconds_total{job="node-exporter"}[5m])
    / rate(node_disk_reads_completed_total{job="node-exporter"}[5m])
    > {{ node_critical_unusual_disklatency_time }}
    and rate(node_disk_reads_completed_total{job="node-exporter"}[5m]) > 0
  for: {{ node_critical_unusual_diskRead_latency_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host unusual disk read latency (instance {{ $labels.instance }}){% endraw %}"
    description: "Disk latency is increasing (read operations > {{ node_critical_unusual_disklatency_time }}s) {% raw %}VALUE = {{ $value }}{% endraw %} "
|
||
# Critical alert when the average time per completed disk write over the
# last 5 minutes exceeds the critical latency threshold (seconds).
- alert: HostUnusualDiskWriteLatencyCritical
  # Both metrics are cumulative counters: dividing the raw values gives the
  # average since boot, which hides current latency. rate() over a window
  # measures recent behaviour instead. The final clause skips devices with no
  # writes in the window so the division never evaluates 0/0.
  expr: >-
    rate(node_disk_write_time_seconds_total{job="node-exporter"}[5m])
    / rate(node_disk_writes_completed_total{job="node-exporter"}[5m])
    > {{ node_critical_unusual_disklatency_time }}
    and rate(node_disk_writes_completed_total{job="node-exporter"}[5m]) > 0
  for: {{ node_critical_unusual_diskWrite_latency_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host unusual disk write latency (instance {{ $labels.instance }}){% endraw %}"
    description: "Disk latency is increasing (write operations > {{ node_critical_unusual_disklatency_time }}s) {% raw %}VALUE = {{ $value }}{% endraw %}"
|
||
# Critical alert when the share of non-idle CPU time per instance, measured
# over the last 5 minutes, exceeds the critical percentage.
- alert: HostHighCpuUtilizationCritical
  # node_cpu_seconds_total is a cumulative counter; the raw ratio is the
  # utilisation averaged since boot and barely moves under a load spike.
  # rate() on both sides yields current utilisation.
  expr: >-
    sum by (instance) (rate(node_cpu_seconds_total{mode!="idle", job="node-exporter"}[5m]))
    / on(instance) group_left
    sum by (instance) (rate(node_cpu_seconds_total{job="node-exporter"}[5m]))
    * 100 > {{ node_critical_highCPU_pct }}
  for: {{ node_high_cpuload_threshold_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host high CPU load (instance {{ $labels.instance }}){% endraw %}"
    description: "CPU utilization is almost > {{ node_critical_highCPU_pct }}% {% raw %}VALUE = {{ $value }}{% endraw %}"
|
||
# Critical alert when the share of CPU time in "steal" mode per instance,
# measured over the last 5 minutes, exceeds the critical percentage.
- alert: HostCpuStealNoisyNeighborCritical
  # node_cpu_seconds_total is a cumulative counter; the raw ratio is the
  # steal share averaged since boot. rate() on both sides yields the current
  # steal share.
  expr: >-
    sum by (instance) (rate(node_cpu_seconds_total{mode="steal", job="node-exporter"}[5m]))
    / on(instance) group_left
    sum by (instance) (rate(node_cpu_seconds_total{job="node-exporter"}[5m]))
    * 100 > {{ node_critical_cpu_steal_pct }}
  for: {{ node_cpu_steal_threshold_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host CPU steal noisy neighbor (instance {{ $labels.instance }}){% endraw %}"
    description: "CPU steal is > {{ node_critical_cpu_steal_pct }}%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. {% raw %}VALUE = {{ $value }}{% endraw %} "
|
||
# Critical alert when the percentage of received packets that were errors,
# measured over the last 5 minutes, exceeds the critical threshold.
- alert: HostNetworkReceiveErrorsCritical
  # Both metrics are cumulative counters; without rate() the ratio is the
  # lifetime error percentage and does not reflect a current burst.
  expr: >-
    (rate(node_network_receive_errs_total{job="node-exporter"}[5m])
    / rate(node_network_receive_packets_total{job="node-exporter"}[5m]))
    * 100 > {{ node_critical_network_err }}
  for: {{ node_network_receiveError_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }}){% endraw %}"
    # The expression value is a percentage of packets, not an error count,
    # so the message reports it as such.
    description: "{% raw %}Instance interface has a receive error rate of {{ $value }}% of packets VALUE = {{ $value }} {% endraw %}"
|
||
# Critical alert when the percentage of transmitted packets that were errors,
# measured over the last 5 minutes, exceeds the critical threshold.
- alert: HostNetworkTransmitErrorsCritical
  # Both metrics are cumulative counters; without rate() the ratio is the
  # lifetime error percentage and does not reflect a current burst.
  expr: >-
    (rate(node_network_transmit_errs_total{job="node-exporter"}[5m])
    / rate(node_network_transmit_packets_total{job="node-exporter"}[5m]))
    * 100 > {{ node_critical_network_err }}
  for: {{ node_network_transmitError_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }}){% endraw %}"
    # The expression value is a percentage of packets, not an error count,
    # so the message reports it as such.
    description: "{% raw %}Instance has a transmit error rate of {{ $value }}% of packets VALUE = {{ $value }}{% endraw %}"
|
||
# Critical alert when combined receive + transmit throughput over the last
# 5 minutes exceeds the critical fraction of the interface's reported link
# speed.
- alert: HostNetworkInterfaceSaturatedCritical
  # The byte metrics are cumulative counters while node_network_speed_bytes
  # is bytes per second. Dividing raw totals by speed is not a utilisation
  # fraction and stays above the threshold forever once enough traffic has
  # passed. rate() converts the counters to bytes per second first.
  expr: >-
    (rate(node_network_receive_bytes_total{job="node-exporter"}[5m])
    + rate(node_network_transmit_bytes_total{job="node-exporter"}[5m]))
    / node_network_speed_bytes{job="node-exporter"}
    > {{ node_critical_network_interface_saturation }}
  for: {{ node_network_interface_saturated_duration }}
  labels:
    severity: critical
  annotations:
    # The sibling network-error rules identify the NIC via the "device"
    # label; "interface" is not used anywhere else in this rule set.
    summary: "{% raw %}Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }}){% endraw %}"
    description: "The network interface is getting overloaded (> {{ node_critical_network_interface_saturation }}) {% raw %}{{ $value }}. VALUE = {{ $value }}{% endraw %}"
|
||
# Critical alert when the swap-in rate (pages per second, averaged over the
# last 5 minutes) exceeds the critical threshold.
# NOTE(review): the warn-level counterpart is named HostSwapInWarn; this name
# lacks the Host prefix. Left unchanged because alert names may be referenced
# by routing or silences elsewhere - confirm before renaming.
- alert: SwapInCritical
  # node_vmstat_pswpin is a cumulative counter of pages swapped in.
  # Comparing the raw value to a threshold fires permanently once that many
  # pages have ever been swapped in; rate() alerts on ongoing swap activity.
  expr: rate(node_vmstat_pswpin{job="node-exporter"}[5m]) > {{ node_critical_swapPages_count }}
  for: {{ node_critical_swap_in_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}PageSwap in value is too high on {{ $labels.instance }}{% endraw %}"
    description: "PageSwap in value exceeds {{ node_critical_swapPages_count }}. Current value is {% raw %}{{ $value }}.{% endraw %}"
|
||
# Critical alert when the swap-out rate (pages per second, averaged over the
# last 5 minutes) exceeds the critical threshold.
# NOTE(review): the warn-level counterpart is named HostSwapOutWarn; this
# name lacks the Host prefix. Left unchanged because alert names may be
# referenced by routing or silences elsewhere - confirm before renaming.
- alert: SwapOutCritical
  # node_vmstat_pswpout is a cumulative counter of pages swapped out.
  # Comparing the raw value to a threshold fires permanently once that many
  # pages have ever been swapped out; rate() alerts on ongoing swap activity.
  expr: rate(node_vmstat_pswpout{job="node-exporter"}[5m]) > {{ node_critical_swapPages_count }}
  for: {{ node_critical_swap_out_duration }}
  labels:
    severity: critical
  annotations:
    summary: "{% raw %}PageSwap out value is too high on {{ $labels.instance }}{% endraw %}"
    description: "PageSwap out value exceeds {{ node_critical_swapPages_count }}. Current value is {% raw %}{{ $value }}.{% endraw %}"
44 changes: 44 additions & 0 deletions
44
config/prometheus/templates/node_exporter_config_data.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
{ | ||
"node_down_critical_duration": "1m", | ||
"node_warn_memory_pct": "70", | ||
"node_warn_outofmemory_duration": "1m", | ||
"node_warn_outOfDisk_duration": "1m", | ||
"node_warn_OutOfdisk_pct": "70", | ||
"node_warn_OutOfInodes_pct": "70", | ||
"node_warn_outOfInodes_duration": "1m", | ||
"node_warn_unusual_disklatency_time": "0.1", | ||
"node_warn_unusual_diskWrite_latency_duration": "1m", | ||
"node_warn_unusual_diskRead_latency_duration": "1m", | ||
"node_high_cpuload_threshold_duration": "30s", | ||
"node_warn_highCPU_pct": "70", | ||
"node_cpu_steal_threshold_duration": "30s", | ||
"node_warn_cpu_steal_pct": "3", | ||
"node_network_receiveError_duration": "1m", | ||
"node_warn_network_err": "3", | ||
"node_network_transmitError_duration": "1m", | ||
"node_warn_network_interface_saturation": "0.8", | ||
"node_network_interface_saturated_duration": "1m", | ||
"node_clock_notsync_duration": "2m", | ||
"node_warn_clock_duration": "16", | ||
"node_warn_swapPages_count": "5", | ||
"node_warn_swap_in_duration": "1m", | ||
"node_warn_swap_out_duration": "1m", | ||
|
||
"node_critical_memory_pct": "90", | ||
"node_critical_outofmemory_duration": "1m", | ||
"node_critical_outOfDisk_duration": "1m", | ||
"node_critical_OutOfdisk_pct": "90", | ||
"node_critical_OutOfInodes_pct": "90", | ||
"node_critical_outOfInodes_duration": "1m", | ||
"node_critical_unusual_disklatency_time": "0.5", | ||
"node_critical_unusual_diskWrite_latency_duration": "1m", | ||
"node_critical_unusual_diskRead_latency_duration": "1m", | ||
"node_critical_highCPU_pct": "90", | ||
"node_critical_cpu_steal_pct": "5", | ||
"node_critical_network_err": "5", | ||
"node_critical_network_interface_saturation": "0.9", | ||
"node_critical_swapPages_count": "10", | ||
"node_critical_swap_in_duration": "1m", | ||
"node_critical_swap_out_duration": "1m" | ||
|
||
} |