Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Feature/improve prometheus alerting #37

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion external_version.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ gnats_version: "1.0.2"

# prometheus exporters
redis_exporter_version: "0.12.2"
jmx_prometheus_javaagent_version: "0.1.0"
jmx_prometheus_javaagent_version: "0.3.0"
node_exporter_version: "0.14.0"
2 changes: 1 addition & 1 deletion hosts.template
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ gnats_version=1.0.2

# prometheus exporters
redis_exporter_version=0.12.2
jmx_prometheus_javaagent_version=0.1.0
jmx_prometheus_javaagent_version=0.3.0
node_exporter_version=0.14.0

[monitoring:children]
Expand Down
2 changes: 1 addition & 1 deletion roles/alertmanager/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
- name: install alertmanager
copy:
src: "{{ dist_directory }}/ext/alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager"
dest: /usr/local/sbin/alertmanager
dest: /usr/local/alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager
mode: 0711

- name: install service for alertmanager
Expand Down
15 changes: 12 additions & 3 deletions roles/alertmanager/templates/alertmanager.yml.j2
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
# Global SMTP settings used by Alertmanager when sending notification e-mails.
global:
smtp_from: '[email protected]'
smtp_smarthost: 'localhost:25'
smtp_from: '[email protected]'
smtp_smarthost: 'mail.gandi.net:587'
smtp_auth_username: '[email protected]'
# NOTE(review): this file is a Jinja2 template (alertmanager.yml.j2). An inline
# `!vault` tag is presumably only decrypted when Ansible itself loads the YAML
# (vars files / inventory), not when a template is rendered -- confirm. If so,
# the ciphertext below would be written verbatim into alertmanager.yml; the
# usual approach is to keep the vaulted value in a vars file and reference it
# here as a quoted Jinja2 variable.
smtp_auth_password: !vault |
$ANSIBLE_VAULT;1.1;AES256
39323731323662373133663263653334643766373562663238653661333963323362336238383034
3032613435376263643130636438353339323465613763640a303561633162356361333136386664
61316631333162386430343935373132393437656234636331613230663362373932356465323865
3636373534363331660a666165656362316335376464376565323239653031353739623831306537
6637
smtp_require_tls: true

route:
group_by: ['alertname', 'service']
Expand All @@ -16,4 +25,4 @@ route:
receivers:
- name: 'team-ops'
email_configs:
- to: 'ops@caliopen.org'
- to: 'alert@caliopen.org'
2 changes: 1 addition & 1 deletion roles/cassandra/templates/cassandra-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -312,4 +312,4 @@ JVM_OPTS="$JVM_OPTS $JVM_EXTRA_OPTS"

# Add JMX prometheus exporter
JMX_EXPORTER_DIR="/etc/cassandra"
# JVM_OPTS="$JVM_OPTS -javaagent:$JMX_EXPORTER_DIR/jmx_prometheus_javaagent-{{ jmx_prometheus_javaagent_version }}.jar=7070:$JMX_EXPORTER_DIR/cassandra_exporter.yaml"
JVM_OPTS="$JVM_OPTS -javaagent:$JMX_EXPORTER_DIR/jmx_prometheus_javaagent-{{ jmx_prometheus_javaagent_version }}.jar=7070:$JMX_EXPORTER_DIR/cassandra_exporter.yaml"
6 changes: 6 additions & 0 deletions roles/nats/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
- name: start nats
service: name=gnatsd state=started enabled=yes

- name: install prometheus-nats-exporter
copy:
src: "{{ dist_directory }}/ext/prometheus-nats-exporter"
dest: /usr/local/sbin/prometheus-nats-exporter
mode: 0711

- name: install prometheus-nats-exporter service
template: src=prometheus-nats-exporter.service.j2 dest=/etc/systemd/system/prometheus-nats-exporter.service

Expand Down
4 changes: 2 additions & 2 deletions roles/nats/templates/prometheus-nats-exporter.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ Description=Nats prometheus exporter

[Service]
Restart=always
# The binary is installed to /usr/local/sbin by the nats role's
# "install prometheus-nats-exporter" task (it was previously run from /var/tmp).
ExecStart=/usr/local/sbin/prometheus-nats-exporter -connz -routez -subz -varz -a {{ facter_ipaddress_eth1 }} http://{{ facter_ipaddress_eth1}}:8222
# No ExecStop= on purpose: systemd stops the main process with SIGTERM by
# default. The former "pkill <path>" used a non-absolute executable and a
# pattern that pkill only matches against the full command line with -f, so it
# never stopped the exporter itself.

[Install]
# multi-user.target replaces "local.target", which is not a standard systemd
# target, so enabling the unit would not start it at boot.
WantedBy=multi-user.target
9 changes: 9 additions & 0 deletions roles/prometheus/files/cassandra-status.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# NOTE(review): placeholder rule. The alert name and expression are still
# "xxx" stubs, and the annotations below talk about Elasticsearch rather than
# Cassandra (same wording as the EsStatusYellow rule in es-status.rule).
# TODO: fill in the real Cassandra expression and messages before merging.
ALERT xxxxxxxxxxxxx
IF xxxxxxxxxxxxxxxxxx
FOR 5m
LABELS { severity ="critical" }
ANNOTATIONS {
SUMMARY = "{{$labels.instance}}: ES cluster status yellow",
DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in yellow state for more than 5 minutes)"
}

10 changes: 10 additions & 0 deletions roles/prometheus/files/cpu-usage.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Per-instance CPU usage alert: takes the average idle-mode irate of node_cpu
# over a 5m range, scales it by 100 and subtracts it from 100; fires when the
# result has stayed above 75 for 2 minutes.
ALERT NodeCPUUsage
  IF (100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100)) > 75
  FOR 2m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High CPU usage detected",
    DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
  }
17 changes: 17 additions & 0 deletions roles/prometheus/files/es-status.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Elasticsearch cluster health alerts.
#
# An alert fires for every series its expression returns, whatever the sample
# value. elasticsearch_cluster_health_status is presumably exported once per
# color with value 1 for the current status and 0 otherwise (confirm against
# the deployed exporter), so the expressions compare against 1; without the
# comparison both alerts would fire whenever the series merely exists.

# Cluster has reported "yellow" for 5 minutes.
ALERT EsStatusYellow
  IF elasticsearch_cluster_health_status{color="yellow"} == 1
  FOR 5m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: ES cluster status yellow",
    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in yellow state for more than 5 minutes"
  }

# Cluster has reported "red" for 5 minutes.
ALERT EsStatusRed
  IF elasticsearch_cluster_health_status{color="red"} == 1
  FOR 5m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: ES cluster status red",
    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in red state for more than 5 minutes"
  }
8 changes: 8 additions & 0 deletions roles/prometheus/files/filesystem-usage.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Fires when the ratio node_filesystem_free / node_filesystem_size has been
# below 0.3 (less than 30% free) for 5 minutes.
ALERT FilesystemFull
  IF node_filesystem_free / node_filesystem_size < 0.3
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Filesystem {{ $labels.instance }} full",
    description = "{{ $labels.instance }} of job {{ $labels.job }} free space less than 30%.",
  }
1 change: 1 addition & 0 deletions roles/prometheus/files/http-responses-500.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Alert on nginx 5xx responses (requests with host 127.0.0.1 are excluded).
# The file previously held only the bare expression, which is not a valid rule
# statement, yet the file is picked up by the alerting-rules/*.rule glob.
# NOTE(review): the "> 0" threshold, the 5m hold time and the severity are
# placeholders -- tune them to the acceptable error rate.
ALERT HttpResponses5xx
  IF rate(nginx_http_requests_total{status=~"5[0-9][0-9]",host!="127.0.0.1"}[5m]) > 0
  FOR 5m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: HTTP 5xx responses detected",
    description = "{{ $labels.instance }}: nginx has been returning 5xx responses for more than 5 minutes (current rate: {{ $value }} req/s)"
  }
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,4 @@ ALERT InstanceDown
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} down",
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
}


ALERT FilesystemFull
IF node_filesystem_free / node_filesystem_size < 0.3
FOR 5m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Filesystem {{ $labels.instance }} full",
description = "{{ $labels.instance }} of job {{ $labels.job }} free space less than 30%.",
}
}
10 changes: 10 additions & 0 deletions roles/prometheus/files/load-average.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Load-average alert: divides node_load5 by the number of node_cpu series in
# "system" mode (counted with the cpu and mode labels dropped) and fires when
# that ratio has stayed above 1 for 2 minutes.
ALERT NodeLoadAverage
  IF ((node_load5 / count without (cpu, mode) (node_cpu{mode="system"})) > 1)
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High LA detected",
    DESCRIPTION = "{{$labels.instance}}: LA is high"
  }
17 changes: 17 additions & 0 deletions roles/prometheus/files/memory-usage.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Memory alert: (MemTotal - MemFree - Cached) as a percentage of MemTotal has
# been above 75 for 5 minutes.
ALERT NodeMemoryUsage
  IF (((node_memory_MemTotal-node_memory_MemFree-node_memory_Cached)/(node_memory_MemTotal)*100)) > 75
  FOR 5m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High memory usage detected",
    DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
  }

# Swap alert: (SwapTotal - SwapFree) as a percentage of SwapTotal has been
# above 75 for 5 minutes.
ALERT NodeSwapUsage
  IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
  FOR 5m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Swap usage detected",
    DESCRIPTION = "{{$labels.instance}}: Swap usage is above 75% (current value is: {{ $value }})"
  }
47 changes: 47 additions & 0 deletions roles/prometheus/files/redis-status.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Redis alerting rules.

# Keyspace misses make up more than half of all lookups (5m rates).
ALERT RedisHighMissRatio
  IF (((rate(redis_keyspace_misses_total[5m])) / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))) > 0.5)
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: Redis instance miss ratio high.",
    description = "{{ $labels.instance }}: Redis instance miss ratio is over 50%.",
  }

# Fragmentation ratio above 1.5.
ALERT MemoryFragmentationHigh
  IF (redis_memory_fragmentation_ratio > 1.5)
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: Redis memory fragmentation too high.",
    description = "{{ $labels.instance }}: Redis instance memory fragmentation ratio over 1.5.",
  }

# Fragmentation ratio below 0.9. The comparison is "<" to match the alert name
# and description; the previous "> 0.9" also covered every value already
# reported by MemoryFragmentationHigh.
ALERT MemoryFragmentationLow
  IF (redis_memory_fragmentation_ratio < 0.9)
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: Redis memory fragmentation too low.",
    description = "{{ $labels.instance }}: Redis instance memory fragmentation ratio under 0.9.",
  }

# At least one key evicted per second on average over the last 5 minutes.
ALERT KeyEvictions
  IF (rate(redis_evicted_keys_total[5m]) >= 1)
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: Redis instance evicting keys.",
    description = "{{ $labels.instance }}: Redis instance has been consistently evicting keys for 5 minutes.",
  }

# Redis memory on cache1 exceeds 80% of the host's total memory.
# The two selectors carry different instance labels (:9121 vs :9100), so a
# plain vector division finds no matching pair and the alert can never fire.
# scalar() turns the single host-memory sample into a number, which keeps the
# Redis series' labels on the result.
ALERT TotalMemoryUsed
  IF (redis_memory_used_bytes{instance="cache1.local:9121"} / scalar(node_memory_MemTotal{instance="cache1.local:9100"})) > 0.8
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: Redis is using too much memory.",
    description = "{{ $labels.instance }}: Redis instance is using more than 80% of its available memory.",
  }

# More than 80000 connected clients.
# NOTE(review): the summary assumes a 100000-client limit -- confirm this
# matches the configured Redis maxclients.
ALERT MaxClients
  IF redis_connected_clients > 80000
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "{{ $labels.instance }}: Redis is at 80% of client capacity.",
    description = "{{ $labels.instance }}: Redis is currently handling more than 80000 clients",
  }
11 changes: 10 additions & 1 deletion roles/prometheus/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,17 @@
path: /etc/prometheus
state: directory

# Directory that prometheus.yml's rule_files glob
# (/etc/prometheus/alerting-rules/*.rule) reads from.
- name: create alerting directory
  file:
    path: /etc/prometheus/alerting-rules
    state: directory

# Copy every *.rule file shipped with this role into the alerting directory
# (replaces the former single-file copy of rule1 to /etc/prometheus/rule1).
# - copy is a key of the named task, not a separate list item.
# - src/dest are real YAML mappings rather than "key=value" fragments.
# - the glob is quoted because a bare *.rule is parsed as a YAML alias.
- name: upload rule files
  copy:
    src: "{{ item }}"
    dest: /etc/prometheus/alerting-rules/
  with_fileglob:
    - "*.rule"

- name: configure prometheus
template: src=prometheus.yml.j2 dest=/etc/prometheus/prometheus.yml
Expand Down
2 changes: 1 addition & 1 deletion roles/prometheus/templates/prometheus.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,4 @@ scrape_configs:
- targets: [{% for host in groups['all'] %}'{{ host }}.local:9100',{% endfor %}]

rule_files:
- /etc/prometheus/rule*
- /etc/prometheus/alerting-rules/*.rule