Alerting (tym-xqo#1)
* Add alert script
- YAML config file sets the check and threshold per metric
- status is updated automatically on failure, so a metric won't alert again until it clears
- added CLI args for invoking the scheduler and for quieting alerts
tym-xqo authored May 31, 2019
1 parent 6dd31d1 commit 8b1c152
Showing 29 changed files with 570 additions and 290 deletions.
7 changes: 6 additions & 1 deletion .env.example
@@ -1,2 +1,7 @@
-DATABASE_URL=postgres://localhost:5432/yardstick
-STORE_DB_URL=postgres://localhost:5432/metrics
+METRIC_ENV=development
+DATABASE_URL=postgres://some_user@localhost:5432/cool_facts
+STORE_DB_URL=postgres://dba_user@localhost:5432/metrics
+SLACK_TOKEN=xxxx-0000000
+CHANNEL=AB123456
+HOSTNAME=localhost
4 changes: 3 additions & 1 deletion .gitignore
@@ -112,4 +112,6 @@ venv.bak/
 dmypy.json
 
 # Pyre type checker
-.pyre/
+.pyre/
+
+query_bak/
12 changes: 7 additions & 5 deletions Pipfile
@@ -4,13 +4,15 @@ url = "https://pypi.org/simple"
 verify_ssl = true
 
 [dev-packages]
-psycopg2-binary = "*"
 
 [packages]
 apscheduler = "*"
 arrow = "*"
 backports-functools-lru-cache = "*"
 pathlib = "*"
+psycopg2-binary = "*"
 python-dotenv = "*"
-records = "*"
+newrelic = "*"
+pyyaml = ">=4.2b1"
+records = "*"
+slackclient = "*"
 
 [pipenv]
 allow_prereleases = true
342 changes: 252 additions & 90 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion README.md
@@ -1 +1,3 @@
-#pg_dba_metrics
+# pg_dba_metrics
+
+A simple Python app that runs arbitrary queries against a database, returns the results as timestamped JSON suitable for insertion into a time-series table, and checks those results against configurable thresholds, sending alerts via a Slack bot.
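To make the data shape concrete, one check produces a payload like the following before insertion (values are illustrative; the structure comes from `fetch_metric()` in dba_metrics/check.py below):

```python
# Hypothetical result of one metric query, ready for the time-series table
metric = {
    "name": "long-connection",       # query file name, minus the .sql suffix
    "stamp": "2019-05-31T12:00:00",  # UTC timestamp added at fetch time
    "data": [{"duration": 732}],     # one dict per result row
}
```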
12 changes: 12 additions & 0 deletions config.yaml
@@ -0,0 +1,12 @@
- check: test_data_point
  name: test_metric
  status: clear
  threshold: 0
- check: foo
  name: test
  status: clear
  threshold: 5
- check: duration
  name: long-connection
  status: clear
  threshold: 600
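Each entry pairs a metric name with the result column to test (`check`), the alert threshold, and the current status. As a sketch, a result like the following would trip the `test` entry above (the payload is invented for illustration):

```python
# Hypothetical metric payload evaluated by alert.py below: data[0]["foo"]
# is 7, which is >= the configured threshold of 5 while status is "clear",
# so a Slack alert fires and config.yaml is rewritten with status: failure.
metric = {"name": "test", "data": [{"foo": 7}]}
```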
96 changes: 96 additions & 0 deletions dba_metrics/alert.py
@@ -0,0 +1,96 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from itertools import cycle

import yaml
from dotenv import find_dotenv, load_dotenv
from slack_post import slack_post

override = False
if os.getenv("METRIC_ENV", "development") == "development":
    override = True

load_dotenv(find_dotenv(), override=override)

HOSTNAME = os.getenv("HOSTNAME", "localhost")

with open("config.yaml", "r") as config_file:
    CONFIG = yaml.safe_load(config_file)


def send_alert(metric, value):
    # TODO: include raw metric JSON as attachment
    name = metric["name"]
    check = metric["check"]
    threshold = metric["threshold"]
    status = metric["status"]
    full_metric = yaml.safe_dump(metric)

    title = f"{HOSTNAME} {status}"

    message = (
        f"Metric *{name}* {check} is {value}\nThreshold is {threshold}\n"
        f"```{full_metric}```"
    )

    color = "good"
    if status == "failure":
        color = "danger"

    alert = slack_post(title=title, message=message, color=color)
    return alert


def update_config(metric):
    """Persist the metric's new status to config.yaml"""
    global CONFIG
    name = metric["name"]
    config_match = list(filter(lambda m: m["name"] != name, CONFIG))
    metric_config = {
        key: metric[key] for key in ("name", "check", "threshold", "status")
    }
    new_config = [*config_match, metric_config]
    # Keep the in-memory config in sync as well, so a long-running
    # scheduler process doesn't re-alert before the next restart
    CONFIG = new_config
    with open("config.yaml", "w") as config_file:
        config_file.write(yaml.safe_dump(new_config))


def swap_status(status):
    # Toggle between "clear" and "failure"
    opts = cycle(["clear", "failure"])
    new_status = next(opts)
    if status == new_status:
        new_status = next(opts)
    return new_status


def check_metric(metric):
    # TODO: Support failure modes other than `> threshold`
    data = metric["data"]
    status = metric["status"]
    check = metric["check"]
    threshold = metric["threshold"]
    alert = None

    for row in data:
        value = row[check]
        # While clear, alert on reaching the threshold; once failed, alert
        # again only when the value drops back below it (i.e. on recovery)
        test = value >= threshold
        if status == "failure":
            test = value < threshold
        if test:
            metric["status"] = swap_status(status)
            alert = send_alert(metric, value)
            update_config(metric)
    return alert


def alert_check(metric):
    name = metric["name"]
    if any(m["name"] == name for m in CONFIG):
        config_match = list(filter(lambda m: m["name"] == name, CONFIG))[0]
        metric = {**config_match, **metric}
        alert = check_metric(metric)
        return alert


if __name__ == "__main__":
    metric = {"data": [{"test_data_point": -1}], "name": "test_metric"}
    alert_check(metric)
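The status round trip is what keeps alerts from repeating. A minimal sketch, assuming the `test_metric` entry from config.yaml above (threshold 0) and working Slack credentials:

```python
from alert import alert_check

# Value >= threshold while status is "clear": flips to "failure",
# posts a Slack alert, and rewrites config.yaml with the new status.
alert_check({"name": "test_metric", "data": [{"test_data_point": 3}]})

# Still failing: status is already "failure" and the inverted test
# (value < threshold) is False, so no duplicate alert is sent.
alert_check({"name": "test_metric", "data": [{"test_data_point": 4}]})

# Recovery: value < threshold while failed flips back to "clear"
# and posts an all-clear alert.
alert_check({"name": "test_metric", "data": [{"test_data_point": -1}]})
```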
90 changes: 29 additions & 61 deletions dba_metrics/query_checks.py → dba_metrics/check.py
@@ -16,29 +16,34 @@
 - Invoke from other python script with `query_checks.schedule()` or
   `query_checks.get_metrics(as_json=True)`
 """
-import argparse
-import glob
 import json
 import os
+from datetime import datetime
 
-import arrow
 import records
-from apscheduler.schedulers.blocking import BlockingScheduler
+from alert import alert_check
 from dotenv import find_dotenv, load_dotenv
 from sqlalchemy.exc import ProgrammingError
 
-load_dotenv(find_dotenv())
+override = False
+if os.getenv("METRIC_ENV", "development") == "development":
+    override = True
 
-FETCH_DB_URL = os.getenv('DATABASE_URL',
-                         'postgres://postgres@localhost/yardstick')
-STORE_DB_URL = os.getenv('STORE_DB_URL', FETCH_DB_URL)
+load_dotenv(find_dotenv(), override=override)
+
+FETCH_DB_URL = os.getenv("DATABASE_URL", "postgres://postgres@localhost/yardstick")
+STORE_DB_URL = os.getenv("STORE_DB_URL", FETCH_DB_URL)
 
+# database to measure
+fetch_db = records.Database(FETCH_DB_URL)
 # database to store metrics in
 store_db = records.Database(STORE_DB_URL)
 
 
 def get_sql(name):
     """Read sql from file matching name and return as SQL string
     """
-    query_file = os.path.join('query_files', name)
+    query_file = os.path.join("query_files", name)
     with open(query_file) as f:
         query_sql = f.read()
     return query_sql
@@ -48,77 +53,40 @@ def fetch_metric(name):
"""Submit query to the database, return results in a dict with timestamp
and query name nodes appended
"""
fetch_db = records.Database(FETCH_DB_URL)

sql = get_sql(name)
try:
metric_result = fetch_db.query(sql)
metric = metric_result.as_dict()
except ProgrammingError as e:
metric = dict(error=repr(e))
j = {}
j['data'] = metric
j['stamp'] = arrow.utcnow().isoformat()
j['name'] = name.replace('.sql', '')
j["data"] = metric
j["stamp"] = datetime.utcnow().isoformat()
j["name"] = name.replace(".sql", "")
return j


def store_metric(name, as_json=False):
def store_metric(name, as_json=False, quiet=False):
"""Insert metric query result in time series table in target database
"""
metric = fetch_metric(name)

if not quiet:
alert_check(metric)

if as_json:
print(json.dumps(metric, default=str))
return
sql = ('insert into dba_metrics (stamp, payload, name)'
'values (:stamp, :payload, :name)'
)
sql = (
"insert into perf_metric (stamp, payload, name)"
"values (:stamp, :payload, :name)"
)
# most metric queries return a single row, but loop here so we can store
# more than one result
# NOTE: Also means we don't insert anything if results are empty (that's
# good)
for i in metric['data']:
stamp = metric['stamp']
name = metric['name']
for i in metric["data"]:
stamp = metric["stamp"]
name = metric["name"]
payload = json.dumps(i, default=str)
store_db.query(sql, stamp=stamp, payload=payload, name=name)


def get_metrics(as_json=False):
queries = [name for name in glob.glob('query_files/*')]
metrics = [os.path.basename(name) for name in queries]
for name in metrics:
store_metric(name, as_json)


def create_table():
"""Create table for storing metrics in target database if missing
"""
sql = ('create table if not exists dba_metrics( '
'stamp timestamp with time zone, '
'payload jsonb, '
'name text)')
store_db.query(sql)


def schedule():
scheduler = BlockingScheduler(timezone='UTC')
scheduler.add_job(get_metrics, 'interval', seconds=10)
print('Press Ctrl+C to exit')

# Execution will block here until Ctrl+C is pressed.
try:
scheduler.start()
except (KeyboardInterrupt, SystemExit):
pass


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-j", "--json", action="store_true", default=False)
args = parser.parse_args()
if args.json:
metrics = get_metrics(as_json=True)
else:
create_table()
schedule()
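To trace the flow end to end, a hedged sketch (the query file name, SQL, and result value are invented for illustration):

```python
# query_files/connection_count.sql is hypothetical and might contain:
#     select count(*) as connection_count from pg_stat_activity;
from check import fetch_metric, store_metric

metric = fetch_metric("connection_count.sql")
# -> {"data": [{"connection_count": 12}],
#     "stamp": "2019-05-31T12:00:00",
#     "name": "connection_count"}

# Print the metric as JSON instead of storing it, and skip alerting:
store_metric("connection_count.sql", as_json=True, quiet=True)
```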
60 changes: 60 additions & 0 deletions dba_metrics/schedule.py
@@ -0,0 +1,60 @@
import argparse
import glob
import os

from apscheduler.schedulers.blocking import BlockingScheduler
from check import store_db, store_metric
from dotenv import find_dotenv, load_dotenv

override = False
if os.getenv("METRIC_ENV", "development") == "development":
    override = True
load_dotenv(find_dotenv(), override=override)

# Cast to int: environment variables always come back as strings
INTERVAL = int(os.getenv("INTERVAL", 60))


def get_metrics(as_json=False, quiet=False):
    queries = [name for name in glob.glob("query_files/*")]
    metrics = [os.path.basename(name) for name in queries]
    for name in metrics:
        store_metric(name, as_json, quiet)


def create_table():
    """Create table for storing metrics in target database if not present
    """
    sql = (
        "create table if not exists perf_metric( "
        "metric_id bigserial primary key, "
        "stamp timestamp with time zone, "
        "payload jsonb, "
        "name text)"
    )
    store_db.query(sql)


def schedule(as_json=False, quiet=False):
    scheduler = BlockingScheduler(timezone="UTC")
    scheduler.add_job(get_metrics, "interval", [as_json, quiet], seconds=INTERVAL)
    print("Press Ctrl+C to exit")

    # Execution will block here until Ctrl+C is pressed.
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-j", "--json", action="store_true", default=False)
    parser.add_argument("-q", "--no-alerts", action="store_true", default=False)
    parser.add_argument("-s", "--schedule", action="store_true", default=False)
    args = parser.parse_args()
    if not args.json:
        create_table()
    if args.schedule:
        schedule(as_json=args.json, quiet=args.no_alerts)
    else:
        get_metrics(as_json=args.json, quiet=args.no_alerts)
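The same entry points compose from another script; a minimal sketch, assuming this module is importable:

```python
from schedule import create_table, get_metrics, schedule

create_table()  # ensure the perf_metric table exists in the store database
get_metrics()   # one pass over query_files/, storing results and alerting
schedule()      # or: block and re-run get_metrics every INTERVAL seconds
```

From the shell, `-j` prints JSON instead of storing (and skips table creation), `-q` suppresses alerts, and `-s` starts the blocking scheduler.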
28 changes: 28 additions & 0 deletions dba_metrics/slack_post.py
@@ -0,0 +1,28 @@
import os

import slack
from dotenv import find_dotenv, load_dotenv

override = False
if os.getenv("METRIC_ENV", "development") == "development":
    override = True

load_dotenv(find_dotenv(), override=override)

TOKEN = os.getenv("SLACK_TOKEN")
CHANNEL = os.getenv("CHANNEL")
HOSTNAME = os.getenv("HOSTNAME")

client = slack.WebClient(TOKEN)


def slack_post(title="Test", message="Hello world!", color="#999999"):
    attach = dict(fallback=message, title=title, text=message, color=color)
    r = client.chat_postMessage(
        channel=CHANNEL, attachments=[attach], username=f"{HOSTNAME} DBA alert"
    )
    return r


if __name__ == "__main__":
    slack_post(message="Something blah blah")
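For instance, a failure alert from `send_alert()` in alert.py above effectively amounts to (values are illustrative):

```python
slack_post(
    title="localhost failure",  # HOSTNAME plus the metric's new status
    message="Metric *long-connection* duration is 732\nThreshold is 600",
    color="danger",             # red attachment bar; "good" (green) on recovery
)
```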
13 changes: 0 additions & 13 deletions query_files/autovac_freeze.sql

This file was deleted.
