Merge pull request #2 from Logitech/feat-elk-prod-alarms-config

Elastalert Configs
Logitech · Jun 24, 2020 · 84444ad · 84444ad
2 parents 4080211 + 324f5b9
commit 84444ad
Show file tree

Hide file tree

Showing 6 changed files with 359 additions and 0 deletions.
diff --git a/config.yaml b/config.yaml
@@ -0,0 +1,115 @@
+# Folder that contains the rule YAML files.
+# Every .yaml file in it is loaded as a rule.
+rules_folder: infra-rules
+
+# How often ElastAlert queries Elasticsearch.
+# The unit can be anything from weeks to seconds.
+run_every:
+  minutes: 1
+
+# ElastAlert buffers results from the most recent period of time,
+# in case some log sources are not in real time.
+buffer_time:
+  minutes: 15
+
+# The Elasticsearch hostname for metadata writeback.
+# Note that every rule can have its own Elasticsearch host.
+es_host: elk.it.logitech.com
+
+# The Elasticsearch port.
+es_port: 9200
+
+# The AWS region to use. Set this when using AWS-managed Elasticsearch.
+#aws_region: us-east-1
+
+# The AWS profile to use. Use this if you are using an aws-cli profile.
+# See http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html
+# for details.
+#profile: test
+
+# Optional URL prefix for Elasticsearch.
+#es_url_prefix: elasticsearch
+
+# Connect to Elasticsearch with TLS.
+use_ssl: true
+
+# Verify TLS certificates.
+verify_certs: true
+
+# GET request with body is the default option for Elasticsearch.
+# If it fails for some reason, you can pass 'GET', 'POST' or 'source'.
+# See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport
+# for details.
+#es_send_get_body_as: GET
+
+# Optional basic-auth username and password for Elasticsearch.
+es_username: elastalert_master
+es_password: '<ea-password>'  # NOTE(review): looks like a placeholder filled in outside version control — confirm
+
+# Use SSL authentication with client certificates. client_cert must be
+# a PEM file containing both the cert and the key for the client.
+#verify_certs: true
+#ca_certs: /path/to/cacert.pem
+#client_cert: /path/to/client_cert.pem
+#client_key: /path/to/client_key.key
+
+# The index on es_host which is used for metadata storage.
+# This can be an unmapped index, but it is recommended that you run
+# elastalert-create-index to set a mapping.
+writeback_index: elastalert_status
+writeback_alias: elastalert_alerts
+
+# If an alert fails for some reason, ElastAlert will retry
+# sending the alert until this time period has elapsed.
+alert_time_limit:
+  days: 2
+
+# Custom logging configuration
+# If you want to setup your own logging configuration to log into
+# files as well or to Logstash and/or modify log levels, use
+# the configuration below and adjust to your needs.
+# Note: if you run ElastAlert with --verbose/--debug, the log level of
+# the "elastalert" logger is changed to INFO, if not already INFO/DEBUG.
+#logging:
+#  version: 1
+#  incremental: false
+#  disable_existing_loggers: false
+#  formatters:
+#    logline:
+#      format: '%(asctime)s %(levelname)+8s %(name)+20s %(message)s'
+#
+#  handlers:
+#    console:
+#      class: logging.StreamHandler
+#      formatter: logline
+#      level: DEBUG
+#      stream: ext://sys.stderr
+#
+#    file:
+#      class: logging.FileHandler
+#      formatter: logline
+#      level: DEBUG
+#      filename: elastalert.log
+#
+#  loggers:
+#    elastalert:
+#      level: WARN
+#      handlers: []
+#      propagate: true
+#
+#    elasticsearch:
+#      level: WARN
+#      handlers: []
+#      propagate: true
+#
+#    elasticsearch.trace:
+#      level: WARN
+#      handlers: []
+#      propagate: true
+#
+#    '':  # root logger
+#      level: WARN
+#      handlers:
+#        - console
+#        - file
+#      propagate: false
diff --git a/infra-rules/application_critical_uptime_monitor.yaml b/infra-rules/application_critical_uptime_monitor.yaml
@@ -0,0 +1,50 @@
+# Alert when the average of summary.down for a critical instance exceeds max_threshold.
+name: Heartbeat UpTime Monitoring Rule
+type: metric_aggregation
+
+#es_host: localhost
+#es_port: 9200
+
+index: heartbeat*
+
+buffer_time:
+  minutes: 5
+realert:
+  minutes: 5
+
+metric_agg_key: summary.down
+metric_agg_type: avg
+query_key: instance-tag  # metric is aggregated separately per instance-tag value
+#doc_type: metricsets
+
+bucket_interval:
+  minutes: 5
+
+sync_bucket_interval: true
+#allow_buffer_time_overlap: true
+#use_run_every_query_size: true
+
+#min_threshold: 0.1
+max_threshold: 0.9
+
+filter:
+- term:
+    critical-instance: true  # restrict this rule to documents flagged as critical
+
+match_enhancements:
+- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"
+
+# (Required)
+# The alerter(s) to run when a match is found
+alert:
+- slack
+
+alert_subject: "CSAD IT - Critical Instance Application Health Monitoring Alarm"
+alert_text: "The Application {0} is unhealthy and reporting Downtime during {1}"
+alert_text_type: alert_text_only
+alert_text_args: ["instance-tag", "@timestamp"]  # fill {0} and {1} in alert_text
+
+# Slack alerter options: these are top-level rule keys, not children of a `slack:` mapping.
+slack_webhook_url: "<slack-webhook>"
+slack_title: "CSAD IT - Critical Instance Application Health Monitoring Alarm"
+slack_title_link: "https://elk.it.logitech.com:5601/app/kibana#/dashboard/35aa4fd0-b0b3-11ea-833d-fde9206e58f3"
diff --git a/infra-rules/application_uptime_non_critical_monitor.yaml b/infra-rules/application_uptime_non_critical_monitor.yaml
@@ -0,0 +1,50 @@
+# Alert when the average of summary.down for a non-critical instance exceeds max_threshold.
+name: Non-Critical Instance App monitor rule
+type: metric_aggregation
+
+#es_host: localhost
+#es_port: 9200
+
+index: heartbeat*
+
+buffer_time:
+  minutes: 20
+realert:
+  minutes: 20
+
+metric_agg_key: summary.down
+metric_agg_type: avg
+query_key: instance-tag  # metric is aggregated separately per instance-tag value
+#doc_type: metricsets
+
+bucket_interval:
+  minutes: 20
+
+sync_bucket_interval: true
+#allow_buffer_time_overlap: true
+#use_run_every_query_size: true
+
+#min_threshold: 0.1
+max_threshold: 0.9
+
+match_enhancements:
+- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"
+
+filter:
+- term:
+    critical-instance: false  # restrict this rule to documents flagged as non-critical
+
+# (Required)
+# The alerter(s) to run when a match is found
+alert:
+- slack
+
+alert_subject: "CSAD IT - Non Critical Instance Application Health Monitoring Alarm"
+alert_text: "The Application {0} is unhealthy and reporting Downtime during {1}"
+alert_text_type: alert_text_only
+alert_text_args: ["instance-tag", "@timestamp"]  # fill {0} and {1} in alert_text
+
+# Slack alerter options: these are top-level rule keys, not children of a `slack:` mapping.
+slack_webhook_url: "<slack-webhook>"
+slack_title: "CSAD IT - Non Critical Instance Application Health Monitoring Alarm"
+slack_title_link: "https://elk.it.logitech.com:5601/app/kibana#/dashboard/35aa4fd0-b0b3-11ea-833d-fde9206e58f3"
diff --git a/infra-rules/instance_cpu_agg.yaml b/infra-rules/instance_cpu_agg.yaml
@@ -0,0 +1,47 @@
+# Alert when the per-instance average of system.cpu.user.pct exceeds max_threshold.
+name: Metricbeat CPU Spike Rule
+type: metric_aggregation
+
+#es_host: localhost
+#es_port: 9200
+
+index: metricbeat-*
+
+buffer_time:
+  minutes: 25
+realert:
+  minutes: 10
+
+metric_agg_key: system.cpu.user.pct  # NOTE(review): presumably not normalised by core count (can exceed 1.0) — confirm
+metric_agg_type: avg
+query_key: instance-tag  # metric is aggregated separately per instance-tag value
+#doc_type: metricsets
+
+bucket_interval:
+  minutes: 5
+
+sync_bucket_interval: true
+#allow_buffer_time_overlap: true
+#use_run_every_query_size: true
+
+#min_threshold: 0.1
+max_threshold: 0.8
+
+filter:
+- term:
+    metricset.name: cpu
+
+match_enhancements:
+- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"
+# (Required)
+# The alerter(s) to run when a match is found
+alert:
+- slack
+
+alert_subject: "CSAD IT Infra CPU Utilization Alarm"
+alert_text: "CPU Usage Exceeded 80% on the instance {0} during {1}."
+alert_text_type: alert_text_only
+alert_text_args: ["instance-tag", "@timestamp"]  # fill {0} and {1} in alert_text
+
+# Slack alerter options: these are top-level rule keys, not children of a `slack:` mapping.
+slack_webhook_url: "<slack-webhook>"
diff --git a/infra-rules/instance_disk_agg.yaml b/infra-rules/instance_disk_agg.yaml
@@ -0,0 +1,53 @@
+# Alert on an upward spike in the per-instance average of the scripted disk ratio defined below.
+name: Metricbeat Disk Spike Rule
+type: spike_aggregation
+
+#es_host: localhost
+#es_port: 9200
+
+index: metricbeat-*
+
+timeframe:
+  minutes: 25
+buffer_time:
+  minutes: 5
+
+# Fraction of free space. The (double) cast forces floating-point division: Painless truncates
+# long / long, which would collapse this ratio to 0 (assumes the fsstat fields are longs — TODO confirm).
+metric_agg_key: scriptedDisk
+metric_agg_script:
+  script: "(double) doc['system.fsstat.total_size.free'].value / doc['system.fsstat.total_size.total'].value"
+metric_agg_type: avg
+query_key: ["instance-tag"]
+#doc_type: metricsets
+
+threshold_cur: 0.8
+
+# (Required, spike specific)
+# The spike aggregation rule matches when the current window contains spike_height times higher aggregated value
+# than the reference window
+spike_height: 2
+
+# (Required, spike specific) The direction of the spike:
+# 'up' matches only spikes, 'down' matches only troughs, 'both' matches both spikes and troughs
+spike_type: "up"
+
+filter:
+- term:
+    metricset.name: fsstat
+
+match_enhancements:
+- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"
+
+alert:
+- slack
+
+alert_subject: "CSAD IT Infra Disk Alarm"
+# NOTE(review): the script above measures the free fraction, while this text reports usage —
+# confirm which quantity the 0.8 threshold is meant to apply to.
+alert_text: "Disk Usage Exceeded 80% on the instance {0} during {1}."
+alert_text_type: alert_text_only
+alert_text_args: ["instance-tag", "@timestamp"]
+
+# Slack alerter options: these are top-level rule keys, not children of a `slack:` mapping.
+slack_webhook_url: "<slack-webhook>"
diff --git a/infra-rules/instance_memory_agg.yaml b/infra-rules/instance_memory_agg.yaml
@@ -0,0 +1,44 @@
+# Alert when the per-instance average of system.memory.actual.used.pct exceeds max_threshold.
+name: Metricbeat Memory Spike Rule
+type: metric_aggregation
+
+#es_host: localhost
+#es_port: 9200
+
+index: metricbeat-*
+
+buffer_time:
+  minutes: 25
+realert:
+  minutes: 10
+
+metric_agg_key: system.memory.actual.used.pct
+metric_agg_type: avg
+query_key: ["instance-tag"]  # metric is aggregated separately per instance-tag value
+#doc_type: metricsets
+
+bucket_interval:
+  minutes: 5
+sync_bucket_interval: true
+#allow_buffer_time_overlap: true
+#use_run_every_query_size: true
+
+max_threshold: 0.95
+
+filter:
+- term:
+    metricset.name: memory
+
+match_enhancements:
+- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"
+
+alert:
+- slack
+
+alert_subject: "CSAD IT Infra Memory Alarm"
+alert_text: "Memory Usage Exceeded 95% on the instance {0} during {1}."
+alert_text_type: alert_text_only
+alert_text_args: ["instance-tag", "@timestamp"]  # fill {0} and {1} in alert_text
+
+# Slack alerter options: these are top-level rule keys, not children of a `slack:` mapping.
+slack_webhook_url: "<slack-webhook>"