Alerting (tym-xqo#1)
* Add alert script
- YAML config file sets the check and threshold per metric
- status is updated automatically on failure, so a metric won't alert again until it clears
- added CLI args for invoking the scheduler and for quieting alerts
tym-xqo authored May 31, 2019
1 parent 6dd31d1 commit 8b1c152
Showing 29 changed files with 570 additions and 290 deletions.
7 changes: 6 additions & 1 deletion .env.example
@@ -1,2 +1,7 @@
-DATABASE_URL=postgres://localhost:5432/yardstick
-STORE_DB_URL=postgres://localhost:5432/metrics
+METRIC_ENV=development
+DATABASE_URL=postgres://some_user@localhost:5432/cool_facts
+STORE_DB_URL=postgres://dba_user@localhost:5432/metrics
+SLACK_TOKEN=xxxx-0000000
+CHANNEL=AB123456
+HOSTNAME=localhost
4 changes: 3 additions & 1 deletion .gitignore
@@ -112,4 +112,6 @@ venv.bak/
 dmypy.json
 
 # Pyre type checker
-.pyre/
+.pyre/
+
+query_bak/
12 changes: 7 additions & 5 deletions Pipfile
@@ -4,13 +4,15 @@ url = "https://pypi.org/simple"
 verify_ssl = true
 
 [dev-packages]
-psycopg2-binary = "*"
 
 [packages]
 apscheduler = "*"
 arrow = "*"
 backports-functools-lru-cache = "*"
 pathlib = "*"
+psycopg2-binary = "*"
 python-dotenv = "*"
-records = "*"
+newrelic = "*"
+pyyaml = ">=4.2b1"
+records = "*"
+slackclient = "*"
 
 [pipenv]
 allow_prereleases = true
342 changes: 252 additions & 90 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion README.md
@@ -1 +1,3 @@
-#pg_dba_metrics
+# pg_dba_metrics
+
+A simple Python app that runs arbitrary queries against a database, returns the results as timestamped JSON suitable for insertion into a time-series table, and checks those results against configurable thresholds, sending alerts via a Slack bot.
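To make the data shape concrete, one check produces a payload like the following before insertion (values are illustrative; the structure comes from `fetch_metric()` in dba_metrics/check.py below):

```python
# Hypothetical result of one metric query, ready for the time-series table
metric = {
    "name": "long-connection",       # query file name, minus the .sql suffix
    "stamp": "2019-05-31T12:00:00",  # UTC timestamp added at fetch time
    "data": [{"duration": 732}],     # one dict per result row
}
```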
12 changes: 12 additions & 0 deletions config.yaml
@@ -0,0 +1,12 @@
- check: test_data_point
  name: test_metric
  status: clear
  threshold: 0
- check: foo
  name: test
  status: clear
  threshold: 5
- check: duration
  name: long-connection
  status: clear
  threshold: 600
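Each entry pairs a metric name with the result column to test (`check`), the alert threshold, and the current status. As a sketch, a result like the following would trip the `test` entry above (the payload is invented for illustration):

```python
# Hypothetical metric payload evaluated by alert.py below: data[0]["foo"]
# is 7, which is >= the configured threshold of 5 while status is "clear",
# so a Slack alert fires and config.yaml is rewritten with status: failure.
metric = {"name": "test", "data": [{"foo": 7}]}
```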
96 changes: 96 additions & 0 deletions dba_metrics/alert.py
@@ -0,0 +1,96 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from itertools import cycle

import yaml
from dotenv import find_dotenv, load_dotenv
from slack_post import slack_post

override = False
if os.getenv("METRIC_ENV", "development") == "development":
    override = True

load_dotenv(find_dotenv(), override=override)

HOSTNAME = os.getenv("HOSTNAME", "localhost")

with open("config.yaml", "r") as config_file:
    CONFIG = yaml.safe_load(config_file)


def send_alert(metric, value):
    # TODO: include raw metric JSON as attachment
    name = metric["name"]
    check = metric["check"]
    threshold = metric["threshold"]
    status = metric["status"]
    full_metric = yaml.safe_dump(metric)

    title = f"{HOSTNAME} {status}"

    message = (
        f"Metric *{name}* {check} is {value}\nThreshold is {threshold}\n"
        f"```{full_metric}```"
    )

    color = "good"
    if status == "failure":
        color = "danger"

    alert = slack_post(title=title, message=message, color=color)
    return alert


def update_config(metric):
    """Persist the metric's new status to config.yaml"""
    global CONFIG
    name = metric["name"]
    config_match = list(filter(lambda m: m["name"] != name, CONFIG))
    metric_config = {
        key: metric[key] for key in ("name", "check", "threshold", "status")
    }
    new_config = [*config_match, metric_config]
    # Keep the in-memory config in sync as well, so a long-running
    # scheduler process doesn't re-alert before the next restart
    CONFIG = new_config
    with open("config.yaml", "w") as config_file:
        config_file.write(yaml.safe_dump(new_config))


def swap_status(status):
    # Toggle between "clear" and "failure"
    opts = cycle(["clear", "failure"])
    new_status = next(opts)
    if status == new_status:
        new_status = next(opts)
    return new_status


def check_metric(metric):
    # TODO: Support failure modes other than `> threshold`
    data = metric["data"]
    status = metric["status"]
    check = metric["check"]
    threshold = metric["threshold"]
    alert = None

    for row in data:
        value = row[check]
        # While clear, alert on reaching the threshold; once failed, alert
        # again only when the value drops back below it (i.e. on recovery)
        test = value >= threshold
        if status == "failure":
            test = value < threshold
        if test:
            metric["status"] = swap_status(status)
            alert = send_alert(metric, value)
            update_config(metric)
    return alert


def alert_check(metric):
    name = metric["name"]
    if any(m["name"] == name for m in CONFIG):
        config_match = list(filter(lambda m: m["name"] == name, CONFIG))[0]
        metric = {**config_match, **metric}
        alert = check_metric(metric)
        return alert


if __name__ == "__main__":
    metric = {"data": [{"test_data_point": -1}], "name": "test_metric"}
    alert_check(metric)
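The status round trip is what keeps alerts from repeating. A minimal sketch, assuming the `test_metric` entry from config.yaml above (threshold 0) and working Slack credentials:

```python
from alert import alert_check

# Value >= threshold while status is "clear": flips to "failure",
# posts a Slack alert, and rewrites config.yaml with the new status.
alert_check({"name": "test_metric", "data": [{"test_data_point": 3}]})

# Still failing: status is already "failure" and the inverted test
# (value < threshold) is False, so no duplicate alert is sent.
alert_check({"name": "test_metric", "data": [{"test_data_point": 4}]})

# Recovery: value < threshold while failed flips back to "clear"
# and posts an all-clear alert.
alert_check({"name": "test_metric", "data": [{"test_data_point": -1}]})
```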
90 changes: 29 additions & 61 deletions dba_metrics/query_checks.py → dba_metrics/check.py
@@ -16,29 +16,34 @@
 - Invoke from other python script with `query_checks.schedule()` or
   `query_checks.get_metrics(as_json=True)`
 """
-import argparse
-import glob
 import json
 import os
+from datetime import datetime
 
-import arrow
 import records
-from apscheduler.schedulers.blocking import BlockingScheduler
+from alert import alert_check
 from dotenv import find_dotenv, load_dotenv
 from sqlalchemy.exc import ProgrammingError
 
-load_dotenv(find_dotenv())
+override = False
+if os.getenv("METRIC_ENV", "development") == "development":
+    override = True
 
-FETCH_DB_URL = os.getenv('DATABASE_URL',
-                         'postgres://postgres@localhost/yardstick')
-STORE_DB_URL = os.getenv('STORE_DB_URL', FETCH_DB_URL)
+load_dotenv(find_dotenv(), override=override)
+
+FETCH_DB_URL = os.getenv("DATABASE_URL", "postgres://postgres@localhost/yardstick")
+STORE_DB_URL = os.getenv("STORE_DB_URL", FETCH_DB_URL)
 
+# database to measure
+fetch_db = records.Database(FETCH_DB_URL)
 # database to store metrics in
 store_db = records.Database(STORE_DB_URL)
 
 
 def get_sql(name):
     """Read sql from file matching name and return as SQL string
     """
-    query_file = os.path.join('query_files', name)
+    query_file = os.path.join("query_files", name)
     with open(query_file) as f:
         query_sql = f.read()
     return query_sql
@@ -48,77 +53,40 @@ def fetch_metric(name):
"""Submit query to the database, return results in a dict with timestamp
and query name nodes appended
"""
fetch_db = records.Database(FETCH_DB_URL)

sql = get_sql(name)
try:
metric_result = fetch_db.query(sql)
metric = metric_result.as_dict()
except ProgrammingError as e:
metric = dict(error=repr(e))
j = {}
j['data'] = metric
j['stamp'] = arrow.utcnow().isoformat()
j['name'] = name.replace('.sql', '')
j["data"] = metric
j["stamp"] = datetime.utcnow().isoformat()
j["name"] = name.replace(".sql", "")
return j


def store_metric(name, as_json=False):
def store_metric(name, as_json=False, quiet=False):
"""Insert metric query result in time series table in target database
"""
metric = fetch_metric(name)

if not quiet:
alert_check(metric)

if as_json:
print(json.dumps(metric, default=str))
return
sql = ('insert into dba_metrics (stamp, payload, name)'
'values (:stamp, :payload, :name)'
)
sql = (
"insert into perf_metric (stamp, payload, name)"
"values (:stamp, :payload, :name)"
)
# most metric queries return a single row, but loop here so we can store
# more than one result
# NOTE: Also means we don't insert anything if results are empty (that's
# good)
for i in metric['data']:
stamp = metric['stamp']
name = metric['name']
for i in metric["data"]:
stamp = metric["stamp"]
name = metric["name"]
payload = json.dumps(i, default=str)
store_db.query(sql, stamp=stamp, payload=payload, name=name)


def get_metrics(as_json=False):
queries = [name for name in glob.glob('query_files/*')]
metrics = [os.path.basename(name) for name in queries]
for name in metrics:
store_metric(name, as_json)


def create_table():
"""Create table for storing metrics in target database if missing
"""
sql = ('create table if not exists dba_metrics( '
'stamp timestamp with time zone, '
'payload jsonb, '
'name text)')
store_db.query(sql)


def schedule():
scheduler = BlockingScheduler(timezone='UTC')
scheduler.add_job(get_metrics, 'interval', seconds=10)
print('Press Ctrl+C to exit')

# Execution will block here until Ctrl+C is pressed.
try:
scheduler.start()
except (KeyboardInterrupt, SystemExit):
pass


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-j", "--json", action="store_true", default=False)
args = parser.parse_args()
if args.json:
metrics = get_metrics(as_json=True)
else:
create_table()
schedule()
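To trace the flow end to end, a hedged sketch (the query file name, SQL, and result value are invented for illustration):

```python
# query_files/connection_count.sql is hypothetical and might contain:
#     select count(*) as connection_count from pg_stat_activity;
from check import fetch_metric, store_metric

metric = fetch_metric("connection_count.sql")
# -> {"data": [{"connection_count": 12}],
#     "stamp": "2019-05-31T12:00:00",
#     "name": "connection_count"}

# Print the metric as JSON instead of storing it, and skip alerting:
store_metric("connection_count.sql", as_json=True, quiet=True)
```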
60 changes: 60 additions & 0 deletions dba_metrics/schedule.py
@@ -0,0 +1,60 @@
import argparse
import glob
import os

from apscheduler.schedulers.blocking import BlockingScheduler
from check import store_db, store_metric
from dotenv import find_dotenv, load_dotenv

override = False
if os.getenv("METRIC_ENV", "development") == "development":
    override = True
load_dotenv(find_dotenv(), override=override)

# Cast to int: environment variables always come back as strings
INTERVAL = int(os.getenv("INTERVAL", 60))


def get_metrics(as_json=False, quiet=False):
    queries = [name for name in glob.glob("query_files/*")]
    metrics = [os.path.basename(name) for name in queries]
    for name in metrics:
        store_metric(name, as_json, quiet)


def create_table():
    """Create table for storing metrics in target database if not present
    """
    sql = (
        "create table if not exists perf_metric( "
        "metric_id bigserial primary key, "
        "stamp timestamp with time zone, "
        "payload jsonb, "
        "name text)"
    )
    store_db.query(sql)


def schedule(as_json=False, quiet=False):
    scheduler = BlockingScheduler(timezone="UTC")
    scheduler.add_job(get_metrics, "interval", [as_json, quiet], seconds=INTERVAL)
    print("Press Ctrl+C to exit")

    # Execution will block here until Ctrl+C is pressed.
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-j", "--json", action="store_true", default=False)
    parser.add_argument("-q", "--no-alerts", action="store_true", default=False)
    parser.add_argument("-s", "--schedule", action="store_true", default=False)
    args = parser.parse_args()
    if not args.json:
        create_table()
    if args.schedule:
        schedule(as_json=args.json, quiet=args.no_alerts)
    else:
        get_metrics(as_json=args.json, quiet=args.no_alerts)
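The same entry points compose from another script; a minimal sketch, assuming this module is importable:

```python
from schedule import create_table, get_metrics, schedule

create_table()  # ensure the perf_metric table exists in the store database
get_metrics()   # one pass over query_files/, storing results and alerting
schedule()      # or: block and re-run get_metrics every INTERVAL seconds
```

From the shell, `-j` prints JSON instead of storing (and skips table creation), `-q` suppresses alerts, and `-s` starts the blocking scheduler.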
28 changes: 28 additions & 0 deletions dba_metrics/slack_post.py
@@ -0,0 +1,28 @@
import os

import slack
from dotenv import find_dotenv, load_dotenv

override = False
if os.getenv("METRIC_ENV", "development") == "development":
    override = True

load_dotenv(find_dotenv(), override=override)

TOKEN = os.getenv("SLACK_TOKEN")
CHANNEL = os.getenv("CHANNEL")
HOSTNAME = os.getenv("HOSTNAME")

client = slack.WebClient(TOKEN)


def slack_post(title="Test", message="Hello world!", color="#999999"):
    attach = dict(fallback=message, title=title, text=message, color=color)
    r = client.chat_postMessage(
        channel=CHANNEL, attachments=[attach], username=f"{HOSTNAME} DBA alert"
    )
    return r


if __name__ == "__main__":
    slack_post(message="Something blah blah")
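For instance, a failure alert from `send_alert()` in alert.py above effectively amounts to (values are illustrative):

```python
slack_post(
    title="localhost failure",  # HOSTNAME plus the metric's new status
    message="Metric *long-connection* duration is 732\nThreshold is 600",
    color="danger",             # red attachment bar; "good" (green) on recovery
)
```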
13 changes: 0 additions & 13 deletions query_files/autovac_freeze.sql

This file was deleted.
