Monitoring update: bump the JMX exporter to 0.3.0 and enable it for Cassandra,
install the NATS exporter under /usr/local/sbin, configure authenticated SMTP
for Alertmanager, and split the Prometheus alerting rules into one file per
topic under /etc/prometheus/alerting-rules/.

Review notes:
  * Blob hashes on the "index" lines were not recomputed for files whose added
    lines were corrected during review (nats tasks/service, redis-status.rule,
    http-responses-500.rule, prometheus tasks).
  * cassandra-status.rule still carries a placeholder alert name/expression and
    Elasticsearch wording; it needs a real Cassandra alert before it is useful.
  * NOTE(review): "!vault" is an Ansible YAML tag; confirm it is actually
    decrypted when alertmanager.yml.j2 is rendered through the template module.
  * NOTE(review): the alertmanager binary is now copied into a versioned
    directory; confirm that directory exists before the copy task runs.
  * NOTE(review): TotalMemoryUsed divides two series with different "instance"
    labels; confirm the vector matching yields a result.

diff --git a/external_version.yaml b/external_version.yaml
index 20f9baf..6d31d34 100644
--- a/external_version.yaml
+++ b/external_version.yaml
@@ -12,5 +12,5 @@ gnats_version: "1.0.2"
 
 # prometheus exporters
 redis_exporter_version: "0.12.2"
-jmx_prometheus_javaagent_version: "0.1.0"
+jmx_prometheus_javaagent_version: "0.3.0"
 node_exporter_version: "0.14.0"
diff --git a/hosts.template b/hosts.template
index e1fa806..d1788b5 100644
--- a/hosts.template
+++ b/hosts.template
@@ -23,7 +23,7 @@ gnats_version=1.0.2
 
 # prometheus exporters
 redis_exporter_version=0.12.2
-jmx_prometheus_javaagent_version=0.1.0
+jmx_prometheus_javaagent_version=0.3.0
 node_exporter_version=0.14.0
 
 [monitoring:children]
diff --git a/roles/alertmanager/tasks/main.yml b/roles/alertmanager/tasks/main.yml
index da09e2d..1e13679 100644
--- a/roles/alertmanager/tasks/main.yml
+++ b/roles/alertmanager/tasks/main.yml
@@ -1,7 +1,7 @@
 - name: install alertmanager
   copy:
     src: "{{ dist_directory }}/ext/alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager"
-    dest: /usr/local/sbin/alertmanager
+    dest: /usr/local/alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager
     mode: 0711
 
 - name: install service for alertmanager
diff --git a/roles/alertmanager/templates/alertmanager.yml.j2 b/roles/alertmanager/templates/alertmanager.yml.j2
index cc88d3b..0576bca 100644
--- a/roles/alertmanager/templates/alertmanager.yml.j2
+++ b/roles/alertmanager/templates/alertmanager.yml.j2
@@ -1,6 +1,15 @@
 global:
-  smtp_from: 'alert@caliopen.org'
-  smtp_smarthost: 'localhost:25'
+  smtp_from: 'ops@caliopen.org'
+  smtp_smarthost: 'mail.gandi.net:587'
+  smtp_auth_username: 'ops@caliopen.org'
+  smtp_auth_password: !vault |
+          $ANSIBLE_VAULT;1.1;AES256
+          39323731323662373133663263653334643766373562663238653661333963323362336238383034
+          3032613435376263643130636438353339323465613763640a303561633162356361333136386664
+          61316631333162386430343935373132393437656234636331613230663362373932356465323865
+          3636373534363331660a666165656362316335376464376565323239653031353739623831306537
+          6637
+  smtp_require_tls: true
 
 route:
   group_by: ['alertname', 'service']
@@ -16,4 +25,4 @@ route:
 receivers:
 - name: 'team-ops'
   email_configs:
-  - to: 'ops@caliopen.org'
+  - to: 'alert@caliopen.org'
diff --git a/roles/cassandra/templates/cassandra-env.sh b/roles/cassandra/templates/cassandra-env.sh
index a9d18e2..06e66e5 100644
--- a/roles/cassandra/templates/cassandra-env.sh
+++ b/roles/cassandra/templates/cassandra-env.sh
@@ -312,4 +312,4 @@ JVM_OPTS="$JVM_OPTS $JVM_EXTRA_OPTS"
 
 # Add JMX prometheus exporter
 JMX_EXPORTER_DIR="/etc/cassandra"
-# JVM_OPTS="$JVM_OPTS -javaagent:$JMX_EXPORTER_DIR/jmx_prometheus_javaagent-{{ jmx_prometheus_javaagent_version }}.jar=7070:$JMX_EXPORTER_DIR/cassandra_exporter.yaml"
+JVM_OPTS="$JVM_OPTS -javaagent:$JMX_EXPORTER_DIR/jmx_prometheus_javaagent-{{ jmx_prometheus_javaagent_version }}.jar=7070:$JMX_EXPORTER_DIR/cassandra_exporter.yaml"
diff --git a/roles/nats/tasks/main.yml b/roles/nats/tasks/main.yml
index 1edcebe..da8892a 100644
--- a/roles/nats/tasks/main.yml
+++ b/roles/nats/tasks/main.yml
@@ -10,6 +10,12 @@
 - name: start nats
   service: name=gnatsd state=started enabled=yes
 
+- name: install prometheus-nats-exporter
+  copy:
+    src: "{{ dist_directory }}/ext/prometheus-nats-exporter"
+    dest: /usr/local/sbin/prometheus-nats-exporter
+    mode: "0711"
+
 - name: install prometheus-nats-exporter service
   template: src=prometheus-nats-exporter.service.j2 dest=/etc/systemd/system/prometheus-nats-exporter.service
 
diff --git a/roles/nats/templates/prometheus-nats-exporter.service.j2 b/roles/nats/templates/prometheus-nats-exporter.service.j2
index d16c4ad..73ef6c6 100644
--- a/roles/nats/templates/prometheus-nats-exporter.service.j2
+++ b/roles/nats/templates/prometheus-nats-exporter.service.j2
@@ -3,8 +3,8 @@ Description=Nats prometheus exporter
 
 [Service]
 Restart=always
-ExecStart=/var/tmp/prometheus-nats-exporter -connz -routez -subz -a {{ facter_ipaddress_eth1 }} http://{{ facter_ipaddress_eth1}}:8222
-ExecStop=pkill prometheus-nats-exporter
+ExecStart=/usr/local/sbin/prometheus-nats-exporter -connz -routez -subz -varz -a {{ facter_ipaddress_eth1 }} http://{{ facter_ipaddress_eth1 }}:8222
+ExecStop=/bin/kill -TERM $MAINPID
 
 [Install]
 WantedBy=local.target
diff --git a/roles/prometheus/files/cassandra-status.rule b/roles/prometheus/files/cassandra-status.rule
new file mode 100644
index 0000000..c1a9212
--- /dev/null
+++ b/roles/prometheus/files/cassandra-status.rule
@@ -0,0 +1,9 @@
+ALERT xxxxxxxxxxxxx
+  IF xxxxxxxxxxxxxxxxxx
+  FOR 5m
+  LABELS { severity ="critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: ES cluster status yellow",
+    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in yellow state for more than 5 minutes)"
+}
+
diff --git a/roles/prometheus/files/cpu-usage.rule b/roles/prometheus/files/cpu-usage.rule
new file mode 100644
index 0000000..b7ec936
--- /dev/null
+++ b/roles/prometheus/files/cpu-usage.rule
@@ -0,0 +1,10 @@
+ALERT NodeCPUUsage
+  IF (100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100)) > 75
+  FOR 2m
+  LABELS {
+    severity="critical"
+  }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: High CPU usage detected",
+    DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/es-status.rule b/roles/prometheus/files/es-status.rule
new file mode 100644
index 0000000..5a1b6d6
--- /dev/null
+++ b/roles/prometheus/files/es-status.rule
@@ -0,0 +1,17 @@
+ALERT EsStatusYellow
+  IF elasticsearch_cluster_health_status{color="yellow"}
+  FOR 5m
+  LABELS { severity ="warning" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: ES cluster status yellow",
+    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in yellow state for more than 5 minutes)"
+}
+
+ALERT EsStatusRed
+  IF elasticsearch_cluster_health_status{color="red"}
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: ES cluster status red",
+    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in red state for more than 5 minutes)"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/filesystem-usage.rule b/roles/prometheus/files/filesystem-usage.rule
new file mode 100644
index 0000000..b184eac
--- /dev/null
+++ b/roles/prometheus/files/filesystem-usage.rule
@@ -0,0 +1,8 @@
+ALERT FilesystemFull
+  IF node_filesystem_free / node_filesystem_size < 0.3
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "Filesystem {{ $labels.instance }} full",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} free space less than 30%.",
+  }
\ No newline at end of file
diff --git a/roles/prometheus/files/http-responses-500.rule b/roles/prometheus/files/http-responses-500.rule
new file mode 100644
index 0000000..ed5538b
--- /dev/null
+++ b/roles/prometheus/files/http-responses-500.rule
@@ -0,0 +1 @@
+nginx_http_requests_5xx:rate5m = rate(nginx_http_requests_total{status=~"5[0-9][0-9]",host!="127.0.0.1"}[5m])
\ No newline at end of file
diff --git a/roles/prometheus/files/rule1 b/roles/prometheus/files/instance-down.rule
similarity index 51%
rename from roles/prometheus/files/rule1
rename to roles/prometheus/files/instance-down.rule
index c4ab792..99c30a9 100644
--- a/roles/prometheus/files/rule1
+++ b/roles/prometheus/files/instance-down.rule
@@ -6,14 +6,4 @@ ALERT InstanceDown
   ANNOTATIONS {
     summary = "Instance {{ $labels.instance }} down",
     description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
-  }
-
-
-ALERT FilesystemFull
-  IF node_filesystem_free / node_filesystem_size < 0.3
-  FOR 5m
-  LABELS { severity = "critical" }
-  ANNOTATIONS {
-    summary = "Filesystem {{ $labels.instance }} full",
-    description = "{{ $labels.instance }} of job {{ $labels.job }} free space less than 30%.",
-  }
+  }
\ No newline at end of file
diff --git a/roles/prometheus/files/load-average.rule b/roles/prometheus/files/load-average.rule
new file mode 100644
index 0000000..a710699
--- /dev/null
+++ b/roles/prometheus/files/load-average.rule
@@ -0,0 +1,10 @@
+ALERT NodeLoadAverage
+  IF ((node_load5 / count without (cpu, mode) (node_cpu{mode="system"})) > 1)
+  FOR 2m
+  LABELS {
+    severity="page"
+  }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: High LA detected",
+    DESCRIPTION = "{{$labels.instance}}: LA is high"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/memory-usage.rule b/roles/prometheus/files/memory-usage.rule
new file mode 100644
index 0000000..7cc88af
--- /dev/null
+++ b/roles/prometheus/files/memory-usage.rule
@@ -0,0 +1,17 @@
+ALERT NodeMemoryUsage
+  IF (((node_memory_MemTotal-node_memory_MemFree-node_memory_Cached)/(node_memory_MemTotal)*100)) > 75
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: High memory usage detected",
+    DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
+}
+
+ALERT NodeSwapUsage
+  IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: Swap usage detected",
+    DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/redis-status.rule b/roles/prometheus/files/redis-status.rule
new file mode 100644
index 0000000..4066766
--- /dev/null
+++ b/roles/prometheus/files/redis-status.rule
@@ -0,0 +1,47 @@
+ALERT RedisHighMissRatio
+  IF (((rate(redis_keyspace_misses_total[5m])) / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))) > 0.5)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis instance miss ratio high.",
+    description = "{{ $labels.instance }}: Redis instance miss ratio is over 50%.",
+  }
+
+ALERT MemoryFragmentationHigh
+  IF (redis_memory_fragmentation_ratio > 1.5)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis memory fragmentation too high.",
+    description = "{{ $labels.instance }}: Redis instance memory fragmentation ratio over 1.5.",
+  }
+
+ALERT MemoryFragmentationLow
+  IF (redis_memory_fragmentation_ratio < 0.9)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis memory fragmentation too low.",
+    description = "{{ $labels.instance }}: Redis instance memory fragmentation ratio under 0.9.",
+  }
+
+ALERT KeyEvictions
+  IF (rate(redis_evicted_keys_total[5m]) >= 1)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis instance evicting keys.",
+    description = "{{ $labels.instance }}: Redis instance has been consistently evicting keys for 5 minutes.",
+  }
+
+ALERT TotalMemoryUsed
+  IF (redis_memory_used_bytes{instance="cache1.local:9121"} / node_memory_MemTotal{instance="cache1.local:9100"}) > 0.8
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis is using too much memory.",
+    description = "{{ $labels.instance }}: Redis instance is using more than 80% of its available memory.",
+  }
+
+ALERT MaxClients
+  IF redis_connected_clients > 80000
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis is at 80% of client capacity.",
+    description = "{{ $labels.instance }}: Redis is currently handling more than 80000 clients",
+  }
\ No newline at end of file
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 25b4636..d6e9c4b 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -39,8 +39,17 @@
     path: /etc/prometheus
     state: directory
 
+- name: create alerting directory
+  file:
+    path: /etc/prometheus/alerting-rules
+    state: directory
+
 - name: upload rule files
-  copy: src=rule1 dest=/etc/prometheus/rule1
+  copy:
+    src: "{{ item }}"
+    dest: /etc/prometheus/alerting-rules/
+  with_fileglob:
+    - "*.rule"
 
 - name: configure prometheus
   template: src=prometheus.yml.j2 dest=/etc/prometheus/prometheus.yml
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 031d581..2384f2b 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -44,4 +44,4 @@ scrape_configs:
     - targets: [{% for host in groups['all'] %}'{{ host }}.local:9100',{% endfor %}]
 
 rule_files:
-  - /etc/prometheus/rule*
+  - /etc/prometheus/alerting-rules/*.rule