Skip to content

Commit

Permalink
Merge pull request #2 from Logitech/feat-elk-prod-alarms-config
Browse files Browse the repository at this point in the history
Elastalert Configs
  • Loading branch information
h-s04 authored Jun 24, 2020
2 parents 4080211 + 324f5b9 commit 84444ad
Show file tree
Hide file tree
Showing 6 changed files with 359 additions and 0 deletions.
115 changes: 115 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Global ElastAlert configuration: where rules live, how often they run,
# how to reach Elasticsearch, and where ElastAlert stores its own metadata.

# This is the folder that contains the rule yaml files
# Any .yaml file will be loaded as a rule
rules_folder: infra-rules

# How often ElastAlert will query Elasticsearch
# The unit can be anything from weeks to seconds
run_every:
  minutes: 1

# ElastAlert will buffer results from the most recent
# period of time, in case some log sources are not in real time
# (individual rules may override this with their own buffer_time)
buffer_time:
  minutes: 15

# The Elasticsearch hostname for metadata writeback
# Note that every rule can have its own Elasticsearch host
es_host: elk.it.logitech.com

# The Elasticsearch port
es_port: 9200

# The AWS region to use. Set this when using AWS-managed elasticsearch
#aws_region: us-east-1

# The AWS profile to use. Use this if you are using an aws-cli profile.
# See http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html
# for details
#profile: test

# Optional URL prefix for Elasticsearch
#es_url_prefix: elasticsearch

# Connect with TLS to Elasticsearch
use_ssl: true

# Verify TLS certificates
verify_certs: true

# GET request with body is the default option for Elasticsearch.
# If it fails for some reason, you can pass 'GET', 'POST' or 'source'.
# See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport
# for details
#es_send_get_body_as: GET

# Optional basic-auth username and password for Elasticsearch.
# The password below is a placeholder that must be substituted at deploy
# time; never commit the real credential to version control.
# NOTE(review): ElastAlert is believed to honour the ES_USERNAME /
# ES_PASSWORD environment variables as well - confirm for the deployed
# version before relying on it.
es_username: elastalert_master
es_password: "<ea-password>"

# Use SSL authentication with client certificates. client_cert must be
# a pem file containing both cert and key for client
#verify_certs: true
#ca_certs: /path/to/cacert.pem
#client_cert: /path/to/client_cert.pem
#client_key: /path/to/client_key.key

# The index on es_host which is used for metadata storage
# This can be an unmapped index, but it is recommended that you run
# elastalert-create-index to set a mapping
writeback_index: elastalert_status
writeback_alias: elastalert_alerts

# If an alert fails for some reason, ElastAlert will retry
# sending the alert until this time period has elapsed
alert_time_limit:
  days: 2

# Custom logging configuration
# If you want to set up your own logging configuration to log into
# files as well or to Logstash and/or modify log levels, use
# the configuration below and adjust to your needs.
# Note: if you run ElastAlert with --verbose/--debug, the log level of
# the "elastalert" logger is changed to INFO, if not already INFO/DEBUG.
#logging:
#  version: 1
#  incremental: false
#  disable_existing_loggers: false
#  formatters:
#    logline:
#      format: '%(asctime)s %(levelname)+8s %(name)+20s %(message)s'
#
#  handlers:
#    console:
#      class: logging.StreamHandler
#      formatter: logline
#      level: DEBUG
#      stream: ext://sys.stderr
#
#    file:
#      class: logging.FileHandler
#      formatter: logline
#      level: DEBUG
#      filename: elastalert.log
#
#  loggers:
#    elastalert:
#      level: WARN
#      handlers: []
#      propagate: true
#
#    elasticsearch:
#      level: WARN
#      handlers: []
#      propagate: true
#
#    elasticsearch.trace:
#      level: WARN
#      handlers: []
#      propagate: true
#
#    '':  # root logger
#      level: WARN
#      handlers:
#        - console
#        - file
#      propagate: false
50 changes: 50 additions & 0 deletions infra-rules/application_critical_uptime_monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Uptime alarm for instances flagged critical-instance: true.
# Averages Heartbeat's summary.down per instance-tag over 5-minute buckets
# and alerts Slack when that average rises above max_threshold.
name: Heartbeat UpTime Monitoring Rule
type: metric_aggregation

# Per-rule Elasticsearch override (unused; the global config.yaml host applies)
#es_host: localhost
#es_port: 9200

index: heartbeat*

# Query window for each run
buffer_time:
  minutes: 5
# Minimum time between repeated alerts for the same query_key value
realert:
  minutes: 5

# Metric to aggregate, and how; one aggregation per distinct instance-tag
metric_agg_key: summary.down
metric_agg_type: avg
query_key: instance-tag
#doc_type: metricsets

# buffer_time is split into buckets of this size (here: exactly one bucket)
bucket_interval:
  minutes: 5

sync_bucket_interval: true
#allow_buffer_time_overlap: true
#use_run_every_query_size: true

# Match when the aggregated value is above this threshold
#min_threshold: 0.1
max_threshold: 0.9

# Only consider documents from critical instances
filter:
  - term:
      critical-instance: true

# Custom enhancement module applied to each match before alerting.
# NOTE(review): judging by its name it converts timestamps between time
# zones - confirm against elastalert_modules/tst_ist_tz_enhancement.py.
match_enhancements:
  - "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"

# (Required)
# The alert is used when a match is found
alert:
  - slack

# {0} and {1} in alert_text are filled, in order, from alert_text_args
alert_subject: "CSAD IT - Critical Instance Application Health Monitoring Alarm"
alert_text: "The Application {0} is unhealthy and reporting Downtime during {1}"
alert_text_type: alert_text_only
alert_text_args: ["instance-tag", "@timestamp"]

# Slack alerter options. These are top-level rule keys (the alerter does not
# read them from a nested "slack:" mapping). The webhook URL is a placeholder
# to be substituted at deploy time.
slack_webhook_url: "<slack-webhook>"
slack_title: "CSAD IT - Critical Instance Application Health Monitoring Alarm"
slack_title_link: "https://elk.it.logitech.com:5601/app/kibana#/dashboard/35aa4fd0-b0b3-11ea-833d-fde9206e58f3"
50 changes: 50 additions & 0 deletions infra-rules/application_uptime_non_critical_monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Uptime alarm for instances flagged critical-instance: false.
# Averages Heartbeat's summary.down per instance-tag over 20-minute buckets
# and alerts Slack when that average rises above max_threshold.
name: Non-Critical Instance App monitor rule
type: metric_aggregation

# Per-rule Elasticsearch override (unused; the global config.yaml host applies)
#es_host: localhost
#es_port: 9200

index: heartbeat*

# Query window for each run
buffer_time:
  minutes: 20
# Minimum time between repeated alerts for the same query_key value
realert:
  minutes: 20

# Metric to aggregate, and how; one aggregation per distinct instance-tag
metric_agg_key: summary.down
metric_agg_type: avg
query_key: instance-tag
#doc_type: metricsets

# buffer_time is split into buckets of this size (here: exactly one bucket)
bucket_interval:
  minutes: 20

sync_bucket_interval: true
#allow_buffer_time_overlap: true
#use_run_every_query_size: true

# Match when the aggregated value is above this threshold
#min_threshold: 0.1
max_threshold: 0.9

# Custom enhancement module applied to each match before alerting.
# NOTE(review): judging by its name it converts timestamps between time
# zones - confirm against elastalert_modules/tst_ist_tz_enhancement.py.
match_enhancements:
  - "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"

# Only consider documents from non-critical instances
filter:
  - term:
      critical-instance: false

# (Required)
# The alert is used when a match is found
alert:
  - slack

# {0} and {1} in alert_text are filled, in order, from alert_text_args
alert_subject: "CSAD IT - Non Critical Instance Application Health Monitoring Alarm"
alert_text: "The Application {0} is unhealthy and reporting Downtime during {1}"
alert_text_type: alert_text_only
alert_text_args: ["instance-tag", "@timestamp"]

# Slack alerter options. These are top-level rule keys (the alerter does not
# read them from a nested "slack:" mapping). The webhook URL is a placeholder
# to be substituted at deploy time.
slack_webhook_url: "<slack-webhook>"
slack_title: "CSAD IT - Non Critical Instance Application Health Monitoring Alarm"
slack_title_link: "https://elk.it.logitech.com:5601/app/kibana#/dashboard/35aa4fd0-b0b3-11ea-833d-fde9206e58f3"
47 changes: 47 additions & 0 deletions infra-rules/instance_cpu_agg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# CPU alarm: averages Metricbeat's system.cpu.user.pct per instance-tag in
# 5-minute buckets and alerts Slack when the average rises above 0.8.
name: Metricbeat CPU Spike Rule
type: metric_aggregation

# Per-rule Elasticsearch override (unused; the global config.yaml host applies)
#es_host: localhost
#es_port: 9200

index: metricbeat-*

# Query window for each run (five 5-minute buckets)
buffer_time:
  minutes: 25
# Minimum time between repeated alerts for the same query_key value
realert:
  minutes: 10

# NOTE(review): per the Metricbeat docs, system.cpu.user.pct is NOT
# normalised by core count and can exceed 1.0 on multi-core hosts, and it
# covers user-space time only. If "80% of the machine" is the intent,
# a normalised field such as system.cpu.total.norm.pct may be the better
# metric - confirm with the rule owner before changing.
metric_agg_key: system.cpu.user.pct
metric_agg_type: avg
query_key: instance-tag
#doc_type: metricsets

bucket_interval:
  minutes: 5

sync_bucket_interval: true
#allow_buffer_time_overlap: true
#use_run_every_query_size: true

# Match when the aggregated value is above this threshold
#min_threshold: 0.1
max_threshold: 0.8

# Only consider documents produced by the cpu metricset
filter:
  - term:
      metricset.name: cpu

# Custom enhancement module applied to each match before alerting.
# NOTE(review): judging by its name it converts timestamps between time
# zones - confirm against elastalert_modules/tst_ist_tz_enhancement.py.
match_enhancements:
  - "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"

# (Required)
# The alert is used when a match is found
alert:
  - slack

# {0} and {1} in alert_text are filled, in order, from alert_text_args
alert_subject: "CSAD IT Infra CPU Utilization Alarm"
alert_text: "CPU Usage Exceeded 80% on the instance {0} during {1}."
alert_text_type: alert_text_only
alert_text_args: ["instance-tag", "@timestamp"]

# Slack alerter option. This is a top-level rule key (the alerter does not
# read it from a nested "slack:" mapping). The webhook URL is a placeholder
# to be substituted at deploy time.
slack_webhook_url: "<slack-webhook>"
53 changes: 53 additions & 0 deletions infra-rules/instance_disk_agg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Disk alarm: spike detection on a scripted per-instance disk ratio computed
# from Metricbeat's fsstat metricset, alerting to Slack.
name: Metricbeat Disk Spike Rule
type: spike_aggregation

# Per-rule Elasticsearch override (unused; the global config.yaml host applies)
#es_host: localhost
#es_port: 9200

index: metricbeat-*

timeframe:
  minutes: 25

buffer_time:
  minutes: 5

# The aggregated value is the average of a scripted ratio; metric_agg_key is
# only a label here because the value comes from metric_agg_script.
# NOTE(review): the script computes FREE / TOTAL (the free fraction), while
# the alert text below talks about disk *usage* exceeding 80%. With
# spike_type "up" this rule fires when the free fraction jumps, not when the
# disk fills. If usage is intended, the numerator probably should be
# system.fsstat.total_size.used - confirm with the rule owner.
metric_agg_key: scriptedDisk
metric_agg_script:
  script: "(doc['system.fsstat.total_size.free'].value) / (doc['system.fsstat.total_size.total'].value)"
metric_agg_type: avg

query_key: ["instance-tag"]
#doc_type: metricsets

# Minimum aggregated value the current window must reach before a spike
# can match
threshold_cur: 0.8

# (Required, spike specific)
# The spike aggregation rule matches when the current window contains spike_height times higher aggregated value
# than the reference window
spike_height: 2

# (Required, spike specific)
# The direction of the spike
# 'up' matches only spikes, 'down' matches only troughs
# 'both' matches both spikes and troughs
spike_type: "up"

# Only consider documents produced by the fsstat metricset
filter:
  - term:
      metricset.name: fsstat

# Custom enhancement module applied to each match before alerting.
# NOTE(review): judging by its name it converts timestamps between time
# zones - confirm against elastalert_modules/tst_ist_tz_enhancement.py.
match_enhancements:
  - "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"

alert:
  - slack

# {0} and {1} in alert_text are filled, in order, from alert_text_args.
# The message previously said "Memory Usage" (copied from the memory rule).
alert_subject: "CSAD IT Infra Disk Alarm"
alert_text: "Disk Usage Exceeded 80% on the instance {0} during {1}."
alert_text_type: alert_text_only
alert_text_args: ["instance-tag", "@timestamp"]

# Slack alerter option. This is a top-level rule key (the alerter does not
# read it from a nested "slack:" mapping). The webhook URL is a placeholder
# to be substituted at deploy time.
slack_webhook_url: "<slack-webhook>"
44 changes: 44 additions & 0 deletions infra-rules/instance_memory_agg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Memory alarm: averages Metricbeat's system.memory.actual.used.pct per
# instance-tag in 5-minute buckets and alerts Slack when the average rises
# above 0.95.
name: Metricbeat Memory Spike Rule
type: metric_aggregation

# Per-rule Elasticsearch override (unused; the global config.yaml host applies)
#es_host: localhost
#es_port: 9200

index: metricbeat-*

# Query window for each run (five 5-minute buckets)
buffer_time:
  minutes: 25
# Minimum time between repeated alerts for the same query_key value
realert:
  minutes: 10

# Metric to aggregate, and how; one aggregation per distinct instance-tag
metric_agg_key: system.memory.actual.used.pct
metric_agg_type: avg
query_key: ["instance-tag"]
#doc_type: metricsets

bucket_interval:
  minutes: 5
sync_bucket_interval: true
#allow_buffer_time_overlap: true
#use_run_every_query_size: true

# Match when the aggregated value is above this threshold
max_threshold: 0.95

# Only consider documents produced by the memory metricset
filter:
  - term:
      metricset.name: memory

# Custom enhancement module applied to each match before alerting.
# NOTE(review): judging by its name it converts timestamps between time
# zones - confirm against elastalert_modules/tst_ist_tz_enhancement.py.
match_enhancements:
  - "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo"

alert:
  - slack

# {0} and {1} in alert_text are filled, in order, from alert_text_args
alert_subject: "CSAD IT Infra Memory Alarm"
alert_text: "Memory Usage Exceeded 95% on the instance {0} during {1}."
alert_text_type: alert_text_only
alert_text_args: ["instance-tag", "@timestamp"]

# Slack alerter option. This is a top-level rule key (the alerter does not
# read it from a nested "slack:" mapping). The webhook URL is a placeholder
# to be substituted at deploy time.
slack_webhook_url: "<slack-webhook>"

0 comments on commit 84444ad

Please sign in to comment.