From e832f87e8e5edecb725355d36eedcc90a441d644 Mon Sep 17 00:00:00 2001 From: Leo McArdle Date: Wed, 24 Jan 2024 23:10:08 +0000 Subject: [PATCH 1/3] feat: api v2 for mdn observatory --- httpobs/database/__init__.py | 2 + httpobs/database/database.py | 34 ++++++--- httpobs/scanner/scanner.py | 2 +- httpobs/website/api.py | 6 ++ httpobs/website/api_v2.py | 134 +++++++++++++++++++++++++++++++++++ httpobs/website/main.py | 34 +++++---- 6 files changed, 184 insertions(+), 28 deletions(-) create mode 100644 httpobs/website/api_v2.py diff --git a/httpobs/database/__init__.py b/httpobs/database/__init__.py index cffca49..9c0f8b8 100644 --- a/httpobs/database/__init__.py +++ b/httpobs/database/__init__.py @@ -5,6 +5,7 @@ periodic_maintenance, refresh_materialized_views, select_scan_host_history, + select_scan_most_recent_scan, select_scan_recent_finished_scans, select_scan_recent_scan, select_scan_scanner_statistics, @@ -23,6 +24,7 @@ 'select_scan_host_history', 'select_scan_recent_finished_scans', 'select_scan_recent_scan', + 'select_scan_most_recent_scan', 'select_scan_scanner_statistics', 'select_site_headers', 'select_site_id', diff --git a/httpobs/database/database.py b/httpobs/database/database.py index d9f0a65..2651bb8 100644 --- a/httpobs/database/database.py +++ b/httpobs/database/database.py @@ -109,6 +109,7 @@ def insert_scan(site_id: int, hidden: bool = False) -> dict: def insert_test_results(site_id: int, scan_id: int, data: dict) -> dict: with get_cursor() as cur: for name, test in data["tests"].items(): + test = test.copy() # don't mutate argument expectation = test.pop('expectation') passed = test.pop('pass') result = test.pop('result') @@ -327,6 +328,20 @@ def select_scan_recent_scan(site_id: int, recent_in_seconds=API_CACHED_RESULT_TI return {} +def select_scan_most_recent_scan(site_id: int) -> dict | None: + with get_cursor() as cur: + cur.execute( + """SELECT * FROM scans + WHERE site_id = %s + ORDER BY start_time DESC + LIMIT 1""", + (site_id,), + ) + + if cur.rowcount > 0: + return dict(cur.fetchone()) + + def select_site_headers(hostname: str) -> dict: # Return the site's headers with get_cursor() as cur: @@ -351,7 +366,7 @@ def select_site_headers(hostname: str) -> dict: return {} -def select_site_id(hostname: str) -> int: +def select_site_id(hostname: str, create=True) -> int | None: # See if the site exists already with get_cursor() as cur: cur.execute( @@ -366,15 +381,16 @@ def select_site_id(hostname: str) -> int: return cur.fetchone()['id'] # If not, let's create the site - with get_cursor() as cur: - cur.execute( - """INSERT INTO sites (domain, creation_time) - VALUES (%s, NOW()) - RETURNING id""", - (hostname,), - ) + if create: + with get_cursor() as cur: + cur.execute( + """INSERT INTO sites (domain, creation_time) + VALUES (%s, NOW()) + RETURNING id""", + (hostname,), + ) - return cur.fetchone()['id'] + return cur.fetchone()['id'] def select_test_results(scan_id: int) -> dict: diff --git a/httpobs/scanner/scanner.py b/httpobs/scanner/scanner.py index ba0f498..ca1cf21 100644 --- a/httpobs/scanner/scanner.py +++ b/httpobs/scanner/scanner.py @@ -8,7 +8,7 @@ from httpobs.scanner.utils import sanitize_headers # Current algorithm version -ALGORITHM_VERSION = 2 +ALGORITHM_VERSION = 3 def scan(hostname: str, **kwargs): diff --git a/httpobs/website/api.py b/httpobs/website/api.py index 9ef3b8f..c062863 100644 --- a/httpobs/website/api.py +++ b/httpobs/website/api.py @@ -19,6 +19,12 @@ # TODO: Implement API to write public and private headers to the database +@api.route('/') +@add_response_headers() +def main() -> str: + return 'Welcome to the HTTP Observatory!' + + @api.route('/api/v1/analyze', methods=['GET', 'OPTIONS', 'POST']) @add_response_headers(cors=True) @sanitized_api_response diff --git a/httpobs/website/api_v2.py b/httpobs/website/api_v2.py new file mode 100644 index 0000000..616c4c0 --- /dev/null +++ b/httpobs/website/api_v2.py @@ -0,0 +1,134 @@ +import sys +from datetime import datetime, timedelta + +from flask import Blueprint, jsonify, request + +import httpobs.database as database +import httpobs.scanner as scanner +from httpobs import STATE_FAILED +from httpobs.conf import API_COOLDOWN, DEVELOPMENT_MODE +from httpobs.scanner.grader import get_score_description +from httpobs.website import add_response_headers +from httpobs.website.utils import valid_hostname + +api_v2 = Blueprint("api_v2", __name__) + + +@api_v2.route("/analyze", methods=["GET", "OPTIONS", "POST"]) +@add_response_headers(cors=True) +def api_post_scan_hostname(): + status_code = 200 + scan = {} + tests = {} + + host = request.args.get("host", "").lower().strip() + try: + site_id = database.select_site_id(host, create=False) + except IOError: + return { + "error": "database-down", + "text": "Unable to connect to database", + }, 500 + + if site_id is not None: + hostname = host + else: + ip = True if valid_hostname(host) is None else False + if ip: + return { + "error": "invalid-hostname-ip", + "text": "Cannot scan IP addresses", + }, 400 + + hostname = valid_hostname(host) or ( + valid_hostname("www." + host) if host else False + ) # prepend www. if necessary + if not hostname: + return { + "error": "invalid-hostname", + "text": f"{host} is an invalid hostname", + }, 400 + + site_id: int = database.select_site_id(host, create=True) + scan = database.select_scan_most_recent_scan(site_id) + + if scan and request.method == "POST": + time_since_scan = datetime.now() - scan["end_time"] + if time_since_scan < timedelta(seconds=API_COOLDOWN): + status_code = 429 + else: + scan = None + + if scan: + scan_id = scan["id"] + + tests = database.select_test_results(scan_id) + for name, test in tests.items(): + del test["id"] + del test["scan_id"] + del test["site_id"] + del test["name"] + test["score_description"] = get_score_description(test["result"]) + tests[name] = {**test.pop("output"), **test} + + else: + hidden = request.form.get("hidden", "false") == "true" + + scan = database.insert_scan(site_id, hidden=hidden) + scan_id = scan["id"] + + # Get the site's cookies and headers + # TODO: add API to insert these into the db + # headers = database.select_site_headers(hostname) + + try: + result = scanner.scan(hostname) + scan = result["scan"] + tests = result["tests"] + + if "error" in result: + scan = database.update_scan_state(scan_id, STATE_FAILED, error=result["error"]) + else: + scan = database.insert_test_results( + site_id, + scan_id, + result, + ) + except: + # If we are unsuccessful, close out the scan in the database + scan = database.update_scan_state(scan_id, STATE_FAILED) + + # Print the exception to stderr if we're in dev + if DEVELOPMENT_MODE: + import traceback + + print("Error detected in scan for: " + hostname) + traceback.print_exc(file=sys.stderr) + + scan["start_time"] = scan["start_time"].isoformat() + scan["end_time"] = scan["end_time"].isoformat() + + history = database.select_scan_host_history(site_id) + + # Prune for when the score doesn't change; thanks to chuck for the elegant list comprehension + history = [ + { + "end_time": v["end_time"].isoformat(), + "grade": v["grade"], + "id": v["scan_id"], + "score": v["score"], + } + for k, v in enumerate(history) + if history[k].get('score') is not history[k - 1].get('score') or k == 0 + ] + + return ( + jsonify( + { + "scan": scan, + "tests": tests, + "history": history, + } + ), + status_code, + ) diff --git a/httpobs/website/main.py b/httpobs/website/main.py index b7e6ad7..89c5677 100644 --- a/httpobs/website/main.py +++ b/httpobs/website/main.py @@ -1,34 +1,32 @@ -import sys - from flask import Flask from httpobs.conf import API_PORT, API_PROPAGATE_EXCEPTIONS, DEVELOPMENT_MODE -from httpobs.website import add_response_headers -from httpobs.website.api import api -from httpobs.website.monitoring import monitoring_api - -def __exit_with(msg: str) -> None: - print(msg) - sys.exit(1) +def create_app(): + # Register the application with flask + app = Flask('http-observatory') + app.config['PROPAGATE_EXCEPTIONS'] = API_PROPAGATE_EXCEPTIONS -# Register the application with flask -app = Flask('http-observatory') -app.config['PROPAGATE_EXCEPTIONS'] = API_PROPAGATE_EXCEPTIONS -app.register_blueprint(api) -app.register_blueprint(monitoring_api) + from httpobs.website.api import api + from httpobs.website.api_v2 import api_v2 + from httpobs.website.monitoring import monitoring_api + app.register_blueprint(api) + app.register_blueprint(api_v2, url_prefix="/api/v2") + app.register_blueprint(monitoring_api) -@app.route('/') -@add_response_headers() -def main() -> str: - return 'Welcome to the HTTP Observatory!' + return app def run(): + app = create_app() app.run(debug=DEVELOPMENT_MODE, port=API_PORT) if __name__ == '__main__': run() + +# make backwards compatible with uwsgi setup +# TODO: move into wsgi.py +app = create_app() From 60ae5e5bbd93fcadcad0ac7053bfbbc07cda9dc7 Mon Sep 17 00:00:00 2001 From: Leo McArdle Date: Thu, 25 Jan 2024 11:17:06 +0000 Subject: [PATCH 2/3] review updates --- httpobs/website/api_v2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/httpobs/website/api_v2.py b/httpobs/website/api_v2.py index 616c4c0..6ac8762 100644 --- a/httpobs/website/api_v2.py +++ b/httpobs/website/api_v2.py @@ -28,7 +28,7 @@ def api_post_scan_hostname(): return { "error": "database-down", "text": "Unable to connect to database", - }, 500 + }, 503 if site_id is not None: hostname = host @@ -55,9 +55,9 @@ def api_post_scan_hostname(): if scan and request.method == "POST": time_since_scan = datetime.now() - scan["end_time"] if time_since_scan < timedelta(seconds=API_COOLDOWN): - status_code = 429 + status_code = 429 # keep going, we'll respond with the most recent scan else: - scan = None + scan = None # clear the scan, and we'll do another if scan: scan_id = scan["id"] @@ -72,6 +72,10 @@ def api_post_scan_hostname(): tests[name] = {**test.pop("output"), **test} else: + # no scan means we're a POST which hasn't been rate limited + # or we're a GET for a host which has no scans in the db + # either way, we need to perform a scan + hidden = request.form.get("hidden", "false") == "true" scan = database.insert_scan(site_id, hidden=hidden) From bc22ae0b3ab45253b742e5bac41b043de19eadb5 Mon Sep 17 00:00:00 2001 From: Leo McArdle Date: Wed, 7 Feb 2024 15:45:54 +0000 Subject: [PATCH 3/3] small fixes around failed scans ensure failed scans get an end_time and we don't fetch recent scans without one --- httpobs/database/database.py | 14 +++++++++++++- httpobs/website/api_v2.py | 3 +-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/httpobs/database/database.py b/httpobs/database/database.py index 2651bb8..3f7f03c 100644 --- a/httpobs/database/database.py +++ b/httpobs/database/database.py @@ -332,7 +332,7 @@ def select_scan_most_recent_scan(site_id: int) -> dict | None: with get_cursor() as cur: cur.execute( """SELECT * FROM scans - WHERE site_id = %s + WHERE site_id = %s AND end_time IS NOT NULL ORDER BY start_time DESC LIMIT 1""", (site_id,), @@ -420,6 +420,18 @@ def update_scan_state(scan_id, state: str, error=None) -> dict: row = dict(cur.fetchone()) + elif state == STATE_FAILED: + with get_cursor() as cur: + cur.execute( + """UPDATE scans + SET (state, end_time) = (%s, NOW()) + WHERE id = %s + RETURNING *""", + (state, scan_id), + ) + + row = dict(cur.fetchone()) + else: with get_cursor() as cur: cur.execute( diff --git a/httpobs/website/api_v2.py b/httpobs/website/api_v2.py index 6ac8762..4b80b6f 100644 --- a/httpobs/website/api_v2.py +++ b/httpobs/website/api_v2.py @@ -87,8 +87,6 @@ def api_post_scan_hostname(): try: result = scanner.scan(hostname) - scan = result["scan"] - tests = result["tests"] if "error" in result: scan = database.update_scan_state(scan_id, STATE_FAILED, error=result["error"]) @@ -98,6 +96,7 @@ def api_post_scan_hostname(): scan_id, result, ) + tests = result["tests"] except: # If we are unsuccessful, close out the scan in the database scan = database.update_scan_state(scan_id, STATE_FAILED)