forked from Yelp/elastalert
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Prod Elastalert configs and alarms
- Loading branch information
Naresh
committed
Jun 24, 2020
1 parent
4080211
commit 324f5b9
Showing
6 changed files
with
359 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# This is the folder that contains the rule yaml files | ||
# Any .yaml file will be loaded as a rule | ||
rules_folder: infra-rules | ||
|
||
# How often ElastAlert will query Elasticsearch | ||
# The unit can be anything from weeks to seconds | ||
run_every: | ||
minutes: 1 | ||
|
||
# ElastAlert will buffer results from the most recent | ||
# period of time, in case some log sources are not in real time | ||
buffer_time: | ||
minutes: 15 | ||
|
||
# The Elasticsearch hostname for metadata writeback | ||
# Note that every rule can have its own Elasticsearch host | ||
es_host: elk.it.logitech.com | ||
|
||
# The Elasticsearch port | ||
es_port: 9200 | ||
|
||
# The AWS region to use. Set this when using AWS-managed elasticsearch | ||
#aws_region: us-east-1 | ||
|
||
# The AWS profile to use. Use this if you are using an aws-cli profile. | ||
# See http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html | ||
# for details | ||
#profile: test | ||
|
||
# Optional URL prefix for Elasticsearch | ||
#es_url_prefix: elasticsearch | ||
|
||
# Connect with TLS to Elasticsearch | ||
use_ssl: True | ||
|
||
# Verify TLS certificates | ||
verify_certs: True | ||
|
||
# GET request with body is the default option for Elasticsearch. | ||
# If it fails for some reason, you can pass 'GET', 'POST' or 'source'. | ||
# See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport | ||
# for details | ||
#es_send_get_body_as: GET | ||
|
||
# Optional basic-auth username and password for Elasticsearch | ||
es_username: elastalert_master | ||
es_password: <ea-password> | ||
|
||
# Use SSL authentication with client certificates. client_cert must be | ||
# a pem file containing both the cert and the key for the client | ||
#verify_certs: True | ||
#ca_certs: /path/to/cacert.pem | ||
#client_cert: /path/to/client_cert.pem | ||
#client_key: /path/to/client_key.key | ||
|
||
# The index on es_host which is used for metadata storage | ||
# This can be an unmapped index, but it is recommended that you run | ||
# elastalert-create-index to set a mapping | ||
writeback_index: elastalert_status | ||
writeback_alias: elastalert_alerts | ||
|
||
# If an alert fails for some reason, ElastAlert will retry | ||
# sending the alert until this time period has elapsed | ||
alert_time_limit: | ||
days: 2 | ||
|
||
# Custom logging configuration | ||
# If you want to set up your own logging configuration to log into | ||
# files as well or to Logstash and/or modify log levels, use | ||
# the configuration below and adjust to your needs. | ||
# Note: if you run ElastAlert with --verbose/--debug, the log level of | ||
# the "elastalert" logger is changed to INFO, if not already INFO/DEBUG. | ||
#logging: | ||
# version: 1 | ||
# incremental: false | ||
# disable_existing_loggers: false | ||
# formatters: | ||
# logline: | ||
# format: '%(asctime)s %(levelname)+8s %(name)+20s %(message)s' | ||
# | ||
# handlers: | ||
# console: | ||
# class: logging.StreamHandler | ||
# formatter: logline | ||
# level: DEBUG | ||
# stream: ext://sys.stderr | ||
# | ||
# file: | ||
# class : logging.FileHandler | ||
# formatter: logline | ||
# level: DEBUG | ||
# filename: elastalert.log | ||
# | ||
# loggers: | ||
# elastalert: | ||
# level: WARN | ||
# handlers: [] | ||
# propagate: true | ||
# | ||
# elasticsearch: | ||
# level: WARN | ||
# handlers: [] | ||
# propagate: true | ||
# | ||
# elasticsearch.trace: | ||
# level: WARN | ||
# handlers: [] | ||
# propagate: true | ||
# | ||
# '': # root logger | ||
# level: WARN | ||
# handlers: | ||
# - console | ||
# - file | ||
# propagate: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
name: Heartbeat UpTime Monitoring Rule | ||
type: metric_aggregation | ||
|
||
#es_host: localhost | ||
#es_port: 9200 | ||
|
||
index: heartbeat* | ||
|
||
buffer_time: | ||
minutes: 5 | ||
realert: | ||
minutes: 5 | ||
|
||
metric_agg_key: summary.down | ||
metric_agg_type: avg | ||
query_key: instance-tag | ||
#doc_type: metricsets | ||
|
||
bucket_interval: | ||
minutes: 5 | ||
|
||
sync_bucket_interval: true | ||
#allow_buffer_time_overlap: true | ||
#use_run_every_query_size: true | ||
|
||
#min_threshold: 0.1 | ||
max_threshold: 0.9 | ||
|
||
filter: | ||
- term: | ||
critical-instance: true | ||
|
||
match_enhancements: | ||
- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo" | ||
|
||
# (Required) | ||
# The alert is used when a match is found | ||
alert: | ||
- slack | ||
|
||
alert_subject: "CSAD IT - Critical Instance Application Health Monitoring Alarm" | ||
alert_text: "The Application {0} is unhealthy and reporting Downtime during {1}" | ||
alert_text_type: alert_text_only | ||
alert_text_args: ["instance-tag", "@timestamp"] | ||
|
||
|
||
slack: | ||
slack_webhook_url: "<slack-webhook>" | ||
slack_title: "CSAD IT - Critical Instance Application Health Monitoring Alarm" | ||
slack_title_link: "https://elk.it.logitech.com:5601/app/kibana#/dashboard/35aa4fd0-b0b3-11ea-833d-fde9206e58f3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
name: Non-Critical Instance App monitor rule | ||
type: metric_aggregation | ||
|
||
#es_host: localhost | ||
#es_port: 9200 | ||
|
||
index: heartbeat* | ||
|
||
buffer_time: | ||
minutes: 20 | ||
realert: | ||
minutes: 20 | ||
|
||
metric_agg_key: summary.down | ||
metric_agg_type: avg | ||
query_key: instance-tag | ||
#doc_type: metricsets | ||
|
||
bucket_interval: | ||
minutes: 20 | ||
|
||
sync_bucket_interval: true | ||
#allow_buffer_time_overlap: true | ||
#use_run_every_query_size: true | ||
|
||
#min_threshold: 0.1 | ||
max_threshold: 0.9 | ||
|
||
match_enhancements: | ||
- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo" | ||
|
||
filter: | ||
- term: | ||
critical-instance: false | ||
|
||
# (Required) | ||
# The alert is used when a match is found | ||
alert: | ||
- slack | ||
|
||
alert_subject: "CSAD IT - Non Critical Instance Application Health Monitoring Alarm" | ||
alert_text: "The Application {0} is unhealthy and reporting Downtime during {1}" | ||
alert_text_type: alert_text_only | ||
alert_text_args: ["instance-tag", "@timestamp"] | ||
|
||
|
||
slack: | ||
slack_webhook_url: "<slack-webhook>" | ||
slack_title: "CSAD IT - Non Critical Instance Application Health Monitoring Alarm" | ||
slack_title_link: "https://elk.it.logitech.com:5601/app/kibana#/dashboard/35aa4fd0-b0b3-11ea-833d-fde9206e58f3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
name: Metricbeat CPU Spike Rule | ||
type: metric_aggregation | ||
|
||
#es_host: localhost | ||
#es_port: 9200 | ||
|
||
index: metricbeat-* | ||
|
||
buffer_time: | ||
minutes: 25 | ||
realert: | ||
minutes: 10 | ||
|
||
metric_agg_key: system.cpu.user.pct | ||
metric_agg_type: avg | ||
query_key: instance-tag | ||
#doc_type: metricsets | ||
|
||
bucket_interval: | ||
minutes: 5 | ||
|
||
sync_bucket_interval: true | ||
#allow_buffer_time_overlap: true | ||
#use_run_every_query_size: true | ||
|
||
#min_threshold: 0.1 | ||
max_threshold: 0.8 | ||
|
||
filter: | ||
- term: | ||
metricset.name: cpu | ||
|
||
match_enhancements: | ||
- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo" | ||
# (Required) | ||
# The alert is used when a match is found | ||
alert: | ||
- slack | ||
|
||
alert_subject: "CSAD IT Infra CPU Utilization Alarm" | ||
alert_text: "CPU Usage Exceeded 80% on the instance {0} during {1}." | ||
alert_text_type: alert_text_only | ||
alert_text_args: ["instance-tag", "@timestamp"] | ||
|
||
|
||
slack: | ||
slack_webhook_url: "<slack-webhook>" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
name: Metricbeat Disk Spike Rule | ||
type: spike_aggregation | ||
|
||
#es_host: localhost | ||
#es_port: 9200 | ||
|
||
index: metricbeat-* | ||
|
||
timeframe: | ||
minutes: 25 | ||
|
||
buffer_time: | ||
minutes: 5 | ||
|
||
metric_agg_key: scriptedDisk | ||
metric_agg_script: | ||
script: (doc['system.fsstat.total_size.free'].value) / (doc['system.fsstat.total_size.total'].value) | ||
metric_agg_type: avg | ||
|
||
query_key: ["instance-tag"] | ||
#doc_type: metricsets | ||
|
||
threshold_cur: 0.8 | ||
|
||
# (Required, spike specific) | ||
# The spike aggregation rule matches when the aggregated value in the current window | ||
# is spike_height times higher than the aggregated value in the reference window | ||
spike_height: 2 | ||
|
||
# (Required, spike specific) | ||
# The direction of the spike | ||
# 'up' matches only spikes, 'down' matches only troughs | ||
# 'both' matches both spikes and troughs | ||
spike_type: "up" | ||
|
||
filter: | ||
- term: | ||
metricset.name: fsstat | ||
|
||
match_enhancements: | ||
- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo" | ||
|
||
alert: | ||
- slack | ||
|
||
alert_subject: "CSAD IT Infra Disk Alarm" | ||
alert_text: "Disk Usage Exceeded 80% on the instance {0} during {1}." | ||
alert_text_type: alert_text_only | ||
alert_text_args: ["instance-tag", "@timestamp"] | ||
|
||
|
||
slack: | ||
slack_webhook_url: "<slack-webhook>" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
name: Metricbeat Memory Spike Rule | ||
type: metric_aggregation | ||
|
||
#es_host: localhost | ||
#es_port: 9200 | ||
|
||
index: metricbeat-* | ||
|
||
buffer_time: | ||
minutes: 25 | ||
realert: | ||
minutes: 10 | ||
|
||
metric_agg_key: system.memory.actual.used.pct | ||
metric_agg_type: avg | ||
query_key: ["instance-tag"] | ||
#doc_type: metricsets | ||
|
||
bucket_interval: | ||
minutes: 5 | ||
sync_bucket_interval: true | ||
#allow_buffer_time_overlap: true | ||
#use_run_every_query_size: true | ||
|
||
max_threshold: 0.95 | ||
|
||
filter: | ||
- term: | ||
metricset.name: memory | ||
|
||
match_enhancements: | ||
- "elastalert_modules.tst_ist_tz_enhancement.ConvertTzInfo" | ||
|
||
alert: | ||
- slack | ||
|
||
alert_subject: "CSAD IT Infra Memory Alarm" | ||
alert_text: "Memory Usage Exceeded 95% on the instance {0} during {1}." | ||
alert_text_type: alert_text_only | ||
alert_text_args: ["instance-tag", "@timestamp"] | ||
|
||
|
||
slack: | ||
slack_webhook_url: "<slack-webhook>" |