From 4ceda68d24888a8f6475bc6a4cec5421973ebb7d Mon Sep 17 00:00:00 2001 From: Clive Unger Date: Fri, 15 Nov 2024 14:42:53 -0800 Subject: [PATCH] Delete the unused aws/lambda/github-webhook-rds-sync (#5202) Addresses issue https://github.com/pytorch/test-infra/issues/5178 --------- Co-authored-by: Catherine Lee --- .gitattributes | 1 - .gitignore | 2 - aws/lambda/github-webhook-rds-sync/Makefile | 10 - .../existing_schema.py | 681 ------------------ .../generate_schema.py | 59 -- .../lambda_function.py | 129 ---- .../github-webhook-rds-sync/requirements.txt | 3 - .../github-webhook-rds-sync/test_lambda.py | 48 -- aws/lambda/github-webhook-rds-sync/utils.py | 485 ------------- 9 files changed, 1418 deletions(-) delete mode 100644 aws/lambda/github-webhook-rds-sync/Makefile delete mode 100644 aws/lambda/github-webhook-rds-sync/existing_schema.py delete mode 100644 aws/lambda/github-webhook-rds-sync/generate_schema.py delete mode 100644 aws/lambda/github-webhook-rds-sync/lambda_function.py delete mode 100644 aws/lambda/github-webhook-rds-sync/requirements.txt delete mode 100644 aws/lambda/github-webhook-rds-sync/test_lambda.py delete mode 100644 aws/lambda/github-webhook-rds-sync/utils.py diff --git a/.gitattributes b/.gitattributes index bf0c9eaf9d..d4ead7c3d3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,3 @@ **/.terraform.lock.hcl linguist-generated=true .github/workflows/generated-*.yml linguist-generated=true .github/actions/setup-ssh/index.js linguist-generated=true -aws/lambda/github-webhook-rds-sync/existing_schema.py linguist-generated=true diff --git a/.gitignore b/.gitignore index 526455f9ef..b3c266bad1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,8 +24,6 @@ terraform.rc aws/websites/metrics.pytorch.org/vars.yml -aws/lambda/github-webhook-rds-sync/python/ -aws/lambda/github-webhook-rds-sync/hooks/ aws/lambda/rds-proxy/python/ .vercel diff --git a/aws/lambda/github-webhook-rds-sync/Makefile b/aws/lambda/github-webhook-rds-sync/Makefile deleted file mode 100644 index c5d313e9fa..0000000000 --- a/aws/lambda/github-webhook-rds-sync/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -deployment: - pip install --target ./python -r requirements.txt - cd python && zip -r ../deployment.zip . 
- zip -g deployment.zip lambda_function.py utils.py existing_schema.py - -manual_update: - aws lambda update-function-code --function-name github-webhook-rds-sync-app --zip-file fileb://deployment.zip - -clean: - rm -rf deployment.zip python diff --git a/aws/lambda/github-webhook-rds-sync/existing_schema.py b/aws/lambda/github-webhook-rds-sync/existing_schema.py deleted file mode 100644 index 1fd1aceaf4..0000000000 --- a/aws/lambda/github-webhook-rds-sync/existing_schema.py +++ /dev/null @@ -1,681 +0,0 @@ -""" -Generate this by running this command for every table and saving the results in out.txt - - select 'XXXXXXX' as ""; - select 'app' as ""; - describe `app`; - -Then run that through this script - -import json - -content = open("out.txt").read() -schema = {} -for part in content.split("XXXXXXX"): - part = part.strip() - if part == "": - continue - lines = part.split("\n") - table_name = lines[0].strip() - schema[table_name] = [] - for line in lines[1:]: - schema[table_name].append(line.split()[0]) - -print(json.dumps(schema, indent=2)) -""" - -existing_schema = { - "app": [ - "id", - "slug", - "owner_node_id", - "name", - "created_at", - "updated_at", - "events", - "sync_last_update_at", - "description", - "node_id", - ], - "check_run": [ - "id", - "head_sha", - "external_id", - "status", - "started_at", - "completed_at", - "output_annotations_count", - "check_suite_node_id", - "app_node_id", - "pull_requests", - "sync_last_update_at", - "name", - "conclusion", - "output_title", - "output_summary", - "output_text", - "node_id", - ], - "check_run_event": [ - "action", - "check_run_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "check_suite": [ - "id", - "head_sha", - "status", - "pull_requests", - "app_node_id", - "created_at", - "updated_at", - "sync_last_update_at", - "conclusion", - "latest_check_runs_count", - "before", - "after", - "head_branch", - "head_commit_message", - "head_commit_id", - "head_commit_tree_id", - "head_commit_timestamp", - "head_commit_author_name", - "head_commit_author_email", - "head_commit_committer_name", - "head_commit_committer_email", - "node_id", - ], - "check_suite_event": [ - "action", - "check_suite_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "comment": [ - "id", - "user_node_id", - "created_at", - "updated_at", - "author_association", - "performed_via_github_app_node_id_node_id", - "sync_last_update_at", - "performed_via_github_app_node_id", - "body", - "original_start_line", - "start_side", - "start_line", - "node_id", - "pull_request_review_id", - "diff_hunk", - "path", - "original_position", - "position", - "commit_id", - "original_commit_id", - "line", - "original_line", - "side", - "original_side", - "in_reply_to_id", - ], - "commit": [ - "sha", - "commit_author_name", - "commit_author_email", - "commit_author_date", - "commit_committer_name", - "commit_committer_email", - "commit_committer_date", - "commit_message", - "commit_tree_sha", - "commit_comment_count", - "commit_verification_verified", - "commit_verification_reason", - "parents", - "sync_last_update_at", - "commit_verification_signature", - "commit_verification_payload", - "author_node_id", - "committer_node_id", - "node_id", - ], - "create_event": [ - "ref", - "ref_type", - "master_branch", - "pusher_type", - "repository_node_id", 
- "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "description", - "pk_id", - ], - "delete_event": [ - "ref", - "ref_type", - "pusher_type", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "enterprise": [ - "id", - "slug", - "name", - "created_at", - "updated_at", - "sync_last_update_at", - "description", - "website_url", - "node_id", - ], - "installation": ["id", "sync_last_update_at", "node_id"], - "installation_event": [ - "action", - "installation_id", - "installation_account_node_id", - "installation_repository_selection", - "installation_app_id", - "installation_app_slug", - "installation_target_id", - "installation_target_type", - "installation_events", - "installation_created_at", - "installation_updated_at", - "installation_has_multiple_single_files", - "installation_single_file_paths", - "installation_suspended_at", - "repositories", - "sender_node_id", - "sync_last_update_at", - "installation_single_file_name", - "installation_suspended_by_node_id", - "requester_node_id", - "pk_id", - ], - "issue": [ - "id", - "number", - "title", - "user_node_id", - "labels", - "state", - "locked", - "assignee_node_id_node_id", - "assignees", - "milestone_node_id_node_id", - "comments", - "created_at", - "updated_at", - "author_association", - "body", - "sync_last_update_at", - "assignee_node_id", - "milestone_node_id", - "closed_at", - "active_lock_reason", - "performed_via_github_app", - "node_id", - ], - "issue_comment_event": [ - "action", - "issue_node_id", - "comment_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "changes_body_from", - "pk_id", - ], - "issues_event": [ - "action", - "issue_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "assignee_node_id", - "sync_last_update_at", - "changes_title_from", - "changes_body_from", - "label_node_id", - "pk_id", - "milestone_node_id", - ], - "label": [ - "id", - "name", - "color", - "default", - "sync_last_update_at", - "description", - "node_id", - ], - "label_event": [ - "action", - "label_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "license": ["key", "name", "spdx_id", "sync_last_update_at", "url", "node_id"], - "meta_event": [ - "action", - "hook_id", - "hook_type", - "hook_name", - "hook_active", - "hook_events", - "hook_config_content_type", - "hook_config_insecure_ssl", - "hook_config_secret", - "hook_updated_at", - "hook_created_at", - "hook_app_id", - "sender_node_id", - "sync_last_update_at", - "pk_id", - ], - "milestone": [ - "id", - "number", - "title", - "description", - "creator_node_id", - "open_issues", - "closed_issues", - "state", - "created_at", - "updated_at", - "closed_at", - "sync_last_update_at", - "due_on", - "node_id", - ], - "organization": ["login", "id", "description", "sync_last_update_at", "node_id"], - "parent": [ - "name", - "id", - "slug", - "description", - "privacy", - "permission", - "sync_last_update_at", - "node_id", - ], - "ping_event": [ - "zen", - "hook_id", - "hook_type", - "hook_name", - "hook_active", - "hook_events", - "hook_config_content_type", - "hook_config_insecure_ssl", - "hook_config_secret", 
- "hook_updated_at", - "hook_created_at", - "hook_app_id", - "sync_last_update_at", - "pk_id", - ], - "pull_request": [ - "id", - "number", - "state", - "locked", - "title", - "user_node_id", - "created_at", - "updated_at", - "closed_at", - "merged_at", - "assignee_node_id_node_id", - "assignees", - "requested_reviewers", - "requested_teams", - "labels", - "milestone_node_id_node_id", - "draft", - "head_label", - "head_ref", - "head_sha", - "head_user_node_id", - "head_repo_node_id", - "base_label", - "base_ref", - "base_sha", - "base_user_node_id", - "base_repo_node_id", - "author_association", - "sync_last_update_at", - "body", - "milestone_node_id", - "head_repo_description", - "head_repo_homepage", - "head_repo_mirror_url", - "head_repo_license", - "base_repo_description", - "base_repo_homepage", - "base_repo_mirror_url", - "base_repo_license", - "auto_merge", - "active_lock_reason", - "mergeable", - "rebaseable", - "merged_by", - "merge_commit_sha", - "assignee_node_id", - "node_id", - "merged", - "mergeable_state", - "comments", - "review_comments", - "maintainer_can_modify", - "commits", - "additions", - "deletions", - "changed_files", - ], - "pull_request_event": [ - "action", - "number", - "pull_request_node_id", - "requested_reviewer_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "before", - "after", - "assignee_node_id", - "requested_team_node_id", - "label_node_id", - "sync_last_update_at", - "changes_title_from", - "changes_body_from", - "pk_id", - ], - "pull_request_review_comment_event": [ - "action", - "comment_node_id", - "pull_request_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "changes_body_from", - "sync_last_update_at", - "pk_id", - ], - "pull_request_review_event": [ - "action", - "review_node_id", - "pull_request_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "changes_body_from", - "sync_last_update_at", - "pk_id", - ], - "pull_request_review_thread_event": [ - "action", - "pull_request_node_id", - "thread_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "push_event": [ - "ref", - "before", - "after", - "repository_node_id", - "pusher_name", - "pusher_email", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "created", - "deleted", - "forced", - "compare", - "commits", - "head_commit_id", - "head_commit_tree_id", - "head_commit_distinct", - "head_commit_timestamp", - "head_commit_author_name", - "head_commit_author_email", - "head_commit_author_username", - "head_commit_committer_name", - "head_commit_committer_email", - "head_commit_committer_username", - "head_commit_added", - "head_commit_removed", - "head_commit_modified", - "sync_last_update_at", - "base_ref", - "head_commit_message", - "pk_id", - ], - "repository": [ - "id", - "name", - "full_name", - "private", - "owner_node_id", - "fork", - "created_at", - "updated_at", - "pushed_at", - "size", - "stargazers_count", - "watchers_count", - "language", - "has_issues", - "has_projects", - "has_downloads", - "has_wiki", - "has_pages", - "forks_count", - "archived", - "disabled", - "open_issues_count", - "forks", - "open_issues", - "watchers", - "default_branch", - "sync_last_update_at", - 
"description", - "homepage", - "license_node_id", - "mirror_url", - "master_branch", - "stargazers", - "organization", - "allow_squash_merge", - "allow_merge_commit", - "allow_rebase_merge", - "allow_auto_merge", - "delete_branch_on_merge", - "node_id", - "allow_forking", - ], - "requested_team": [ - "name", - "id", - "slug", - "description", - "privacy", - "permission", - "parent_node_id", - "sync_last_update_at", - "node_id", - ], - "review": [ - "id", - "user_node_id", - "body", - "commit_id", - "submitted_at", - "state", - "author_association", - "sync_last_update_at", - "node_id", - ], - "star_event": [ - "action", - "starred_at", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "status_event": [ - "id", - "sha", - "name", - "context", - "description", - "state", - "commit_node_id", - "branches", - "created_at", - "updated_at", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - "target_url", - ], - "thread": ["comments", "sync_last_update_at", "node_id"], - "user": [ - "login", - "id", - "gravatar_id", - "type", - "site_admin", - "sync_last_update_at", - "name", - "email", - "node_id", - ], - "workflow": [ - "id", - "name", - "path", - "state", - "created_at", - "updated_at", - "sync_last_update_at", - "node_id", - ], - "workflow_job": [ - "id", - "run_id", - "head_sha", - "status", - "started_at", - "completed_at", - "name", - "steps", - "labels", - "sync_last_update_at", - "conclusion", - "node_id", - ], - "workflow_job_event": [ - "action", - "workflow_job_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], - "workflow_run": [ - "id", - "name", - "head_branch", - "head_sha", - "run_number", - "event", - "status", - "conclusion", - "workflow_id", - "check_suite_id", - "check_suite_node_id", - "pull_requests", - "created_at", - "updated_at", - "head_commit_id", - "head_commit_tree_id", - "head_commit_message", - "head_commit_timestamp", - "head_commit_author_name", - "head_commit_author_email", - "head_commit_committer_name", - "head_commit_committer_email", - "repository_node_id", - "head_repository_node_id", - "sync_last_update_at", - "node_id", - ], - "workflow_run_event": [ - "action", - "workflow_run_node_id", - "workflow_node_id", - "repository_node_id", - "organization_node_id", - "enterprise_node_id", - "sender_node_id", - "installation_node_id", - "sync_last_update_at", - "pk_id", - ], -} diff --git a/aws/lambda/github-webhook-rds-sync/generate_schema.py b/aws/lambda/github-webhook-rds-sync/generate_schema.py deleted file mode 100644 index d75e95bcb3..0000000000 --- a/aws/lambda/github-webhook-rds-sync/generate_schema.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -This script creates and stores the schema needed to hold the data for a set of -webhooks. The webhooks should be in JSON files in a hooks/ directory (see -save_to_s3 in lambda_function.py for the mechanism to gather the hooks). Once -thats done set up the DB connection with the (db_host, db_user, db_password) -env variables and run this script to create classes in SQLAlchemy's ORM and -insert them into the database. - -This is intended to be run manually on DB migrations or for testing / restoring -the database. 
-""" -import asyncio -import json -from collections import defaultdict -from pathlib import Path -from typing import Any, Dict - -from sqlalchemy.orm import declarative_base - -from utils import connection_string, extract_github_objects, generate_orm, get_engine - - -async def update_schema_for(payload: Dict[str, Any], webhook: str): - Base = declarative_base() - - # Marshal JSON into SQL-able data - objects = extract_github_objects(payload, webhook) - - # NB: This has to be before create_all since it passively registers the tables - [generate_orm(name, obj, Base) for name, obj in objects] - - # # Set up link to DB - # session, engine = get_session() - Base.metadata.create_all(get_engine(connection_string())) - - -if __name__ == "__main__": - samples_path = Path(__file__).resolve().parent / "hooks" - - webhooks = defaultdict(dict) - - # Go over and combine all webhooks of the same type - n = len(list(samples_path.glob("*/*.json"))) - for i, name in enumerate(samples_path.glob("*/*.json")): - if i % 1000 == 0: - print(f"{i} / {n}") - - webhook = name.name.replace(".json", "").split("-")[0] - name = samples_path / name - - with open(name) as f: - data = json.load(f) - - for k, v in data.items(): - webhooks[webhook][k] = v - - # Write all the schemas to the DB - for webhook, combined_data in webhooks.items(): - r = asyncio.run(update_schema_for(combined_data, webhook=webhook)) diff --git a/aws/lambda/github-webhook-rds-sync/lambda_function.py b/aws/lambda/github-webhook-rds-sync/lambda_function.py deleted file mode 100644 index f948d4d91b..0000000000 --- a/aws/lambda/github-webhook-rds-sync/lambda_function.py +++ /dev/null @@ -1,129 +0,0 @@ -import asyncio -import datetime -import hashlib -import hmac -import json -import os -from typing import Any, Dict - -import boto3 -from existing_schema import existing_schema -from sqlalchemy import column, table -from sqlalchemy.dialects.mysql import insert - -from utils import ( - connection_string, - extract_github_objects, - get_engine, - transform_data, - WEBHOOK_SECRET, -) - - -def upsert(engine, model, insert_dict): - """ - Insert or update to an engine backed by MySQL - """ - inserted = insert(model).values(**insert_dict) - upserted = inserted.on_duplicate_key_update( - **{k: inserted.inserted[k] for k, v in insert_dict.items()} - ) - res = engine.execute(upserted) - return res.lastrowid - - -async def handle_webhook(payload: Dict[str, Any], type: str): - engine = get_engine(connection_string()) - - # Marshal JSON into SQL-able data - objects = extract_github_objects(payload, type) - - print("Writing", ", ".join([n for n, o in objects])) - - with engine.connect() as conn: - for tablename, obj in objects: - # Some of the data is not already in the right form (e.g. 
dates and - # lists, so fix that up here) - obj = transform_data(obj) - - model_data = [tablename] + [column(k) for k in obj.keys()] - model = table(*model_data) - - if tablename not in existing_schema: - print( - f"Skipping write of {tablename} since it doesn't exist in hardcoded schema" - ) - continue - - # Remove non-existent fields - newdata = {} - for key, value in obj.items(): - if key in existing_schema[tablename]: - newdata[key] = value - else: - print( - f"Dropping key '{key}' with value '{value}' since it doesn't exist in table {tablename}" - ) - obj = newdata - upsert(conn, model, obj) - - return {"statusCode": 200, "body": "ok"} - - -def check_hash(payload, expected): - signature = hmac.new( - WEBHOOK_SECRET.encode("utf-8"), payload, hashlib.sha256 - ).hexdigest() - return hmac.compare_digest(signature, expected) - - -def save_to_s3(event_type, payload): - """ - Save a webhook payload to S3 in gha-artifacts/webhooks (used - in generate_schema.py) - """ - session = boto3.Session( - aws_access_key_id=os.environ["aws_key_id"], - aws_secret_access_key=os.environ["aws_access_key"], - ) - s3 = session.resource("s3") - - now = datetime.datetime.now() - millis = int(now.timestamp() * 1000) - day = now.strftime("%Y-%m-%d") - name = f"webhooks/{day}/{event_type}-{millis}.json" - bucket = s3.Bucket("gha-artifacts") - bucket.put_object(Key=name, Body=json.dumps(payload).encode("utf-8")) - - -def lambda_handler(event, context): - expected = event["headers"].get("X-Hub-Signature-256", "").split("=")[1] - payload = event["body"].encode("utf-8") - - # Check that the signature matches the secret on GitHub - if check_hash(payload, expected): - body = event["body"] - if body.startswith("payload="): - body = body[len("payload=") :] # noqa: E203 - try: - payload = json.loads(body) - except Exception as e: - raise RuntimeError(f"Failed to decode JSON:\n{str(e)}\n\n{body}\n\n{event}") - - # Pull out the webhook type (e.g. pull_request, issues, check_run, etc) - event_type = event["headers"]["X-GitHub-Event"] - - # If we want to, save webhooks to S3 for later processing (this is used - # to generate the DB schema with generate_schema.py - if os.getenv("save_to_s3", False) == "1": - save_to_s3(event_type, payload) - - if os.getenv("write_to_db", "1") == "1": - result = asyncio.run(handle_webhook(payload, event_type)) - else: - result = {"statusCode": 200, "body": "didn't write"} - else: - result = {"statusCode": 403, "body": "Forbidden"} - - print("Result:", result) - return result diff --git a/aws/lambda/github-webhook-rds-sync/requirements.txt b/aws/lambda/github-webhook-rds-sync/requirements.txt deleted file mode 100644 index ad53517bf3..0000000000 --- a/aws/lambda/github-webhook-rds-sync/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -boto3==1.16.52 -sqlalchemy==1.4.22 -pymysql==1.1.1 \ No newline at end of file diff --git a/aws/lambda/github-webhook-rds-sync/test_lambda.py b/aws/lambda/github-webhook-rds-sync/test_lambda.py deleted file mode 100644 index 72b8620295..0000000000 --- a/aws/lambda/github-webhook-rds-sync/test_lambda.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -NB: This file requires a running MySQL database. On an Ubuntu machine, this can -be done with these steps: https://www.digitalocean.com/community/tutorials/how-to-install-mariadb-on-ubuntu-20-04 - -Once done, set the db_host, db_password and db_user env variables accordingly. 
-""" -import asyncio -import json -import os -import tempfile -import unittest -from pathlib import Path - -import lambda_function - -os.environ["gh_secret"] = "test" - - -class TestWebhook(unittest.TestCase): - def test_real_webhooks(self): - samples_path = Path(__file__).resolve().parent / "hooks" - with tempfile.NamedTemporaryFile() as f: # noqa: F841 - - def load_hook(name: str): - name = samples_path / name - with open(name) as f: - data = json.load(f) - - type_name = name.name.replace(".json", "").split("-")[0] - return type_name, data - - glob_path = os.getenv("TEST", "*.json") - - n = len(list(samples_path.glob(glob_path))) - for i, name in enumerate(samples_path.glob(glob_path)): - type, data = load_hook(name) - type = type.split("-")[0] - if i % 20 == 0: - print(f"{i} / {n}") - try: - asyncio.run(lambda_function.handle_webhook(data, type=type)) - except Exception as e: - print(f"Failed on {name.name}") - raise e - - -if __name__ == "__main__": - unittest.main() diff --git a/aws/lambda/github-webhook-rds-sync/utils.py b/aws/lambda/github-webhook-rds-sync/utils.py deleted file mode 100644 index 81efd2f63e..0000000000 --- a/aws/lambda/github-webhook-rds-sync/utils.py +++ /dev/null @@ -1,485 +0,0 @@ -import datetime -import json -import os -from contextlib import suppress -from typing import Any, Dict, List, Tuple, Union - -from sqlalchemy import ( - Boolean, - Column, - create_engine, - DateTime, - Integer, - JSON, - String, - Text, -) - -# We're just going to dump the webhook straight into mysql by flattening out the -# JSON object and inlining arrays into JSON strings -FlatDict = Dict[str, Union[str, int]] -NamedDict = Tuple[str, FlatDict] - - -WEBHOOK_SECRET = os.environ["gh_secret"] - -# This marks that a key would store an object (something with a node_id) -OBJECT_PLACEHOLDER = object() - -NAME_MAP = { - "id": (str, lambda: Column(String(20))), -} - -TYPE_MAP = { - "repository": { - "description": lambda: Column(Text), - "homepage": lambda: Column(String(300)), - "license": lambda: OBJECT_PLACEHOLDER, - "mirror_url": lambda: Column(String(300)), - "master_branch": lambda: Column(String(300)), - "stargazers": lambda: Column(Integer), - "organization": lambda: Column(String(300)), - "allow_squash_merge": lambda: Column(Boolean), - "allow_merge_commit": lambda: Column(Boolean), - "allow_rebase_merge": lambda: Column(Boolean), - "allow_auto_merge": lambda: Column(Boolean), - "delete_branch_on_merge": lambda: Column(Boolean), - }, - "app": { - "description": lambda: Column(Text), - }, - "issues_event": { - "changes_title_from": lambda: Column(String(300)), - "changes_body_from": lambda: Column(Text), - "label": lambda: OBJECT_PLACEHOLDER, - "milestone": lambda: OBJECT_PLACEHOLDER, - }, - "push_event": { - "base_ref": lambda: Column(String(300)), - "head_commit_message": lambda: Column(Text), - }, - "license": { - "url": lambda: Column(String(300)), - }, - "issue": { - "assignee": lambda: OBJECT_PLACEHOLDER, - "milestone": lambda: OBJECT_PLACEHOLDER, - "closed_at": lambda: Column(DateTime), - "body": lambda: Column(Text), - "active_lock_reason": lambda: Column(String(100)), - "performed_via_github_app": lambda: Column(Boolean), - }, - "user": { - "name": lambda: Column(String(100)), - "email": lambda: Column(String(100)), - }, - "enterprise": { - "description": lambda: Column(Text), - "website_url": lambda: Column(String(300)), - }, - "check_run": { - "name": lambda: Column(String(300)), - "conclusion": lambda: Column(String(100)), - "output_title": lambda: Column(String(300)), - 
"output_summary": lambda: Column(Text), - "output_text": lambda: Column(Text), - }, - "workflow_job": { - "conclusion": lambda: Column(String(100)), - }, - "create_event": { - "description": lambda: Column(String(100)), - }, - "label": {"description": lambda: Column(Text)}, - "review": { - "body": lambda: Column(Text), - }, - "pull_request_review_event": { - "changes_body_from": lambda: Column(Text), - }, - "pull_request_review_comment_event": { - "changes_body_from": lambda: Column(Text), - }, - "comment": { - "performed_via_github_app": lambda: OBJECT_PLACEHOLDER, - "body": lambda: Column(Text), - "side": lambda: Column(String(30)), - "start_side": lambda: Column(String(30)), - "diff_hunk": lambda: Column(Text), - "pull_request_review_id": lambda: Column(String(20)), - "original_start_line": lambda: Column(Integer), - "path": lambda: Column(String(300)), - "start_line": lambda: Column(Integer), - "position": lambda: Column(Integer), - "original_position": lambda: Column(Integer), - "line": lambda: Column(Integer), - "original_line": lambda: Column(Integer), - "commit_id": lambda: Column(String(300)), - "original_commit_id": lambda: Column(String(300)), - "in_reply_to_id": lambda: Column(String(30)), - }, - "issue_comment_event": {"changes_body_from": lambda: Column(Text)}, - "check_suite": { - "conclusion": lambda: Column(String(100)), - "latest_check_runs_count": lambda: Column(Integer), - "before": lambda: Column(String(300)), - "after": lambda: Column(String(300)), - "head_branch": lambda: Column(String(300)), - "head_commit_message": lambda: Column(Text), - "head_commit_id": lambda: Column(String(300)), - "head_commit_tree_id": lambda: Column(String(300)), - "head_commit_timestamp": lambda: Column(DateTime), - "head_commit_author_name": lambda: Column(String(300)), - "head_commit_author_email": lambda: Column(String(300)), - "head_commit_committer_name": lambda: Column(String(300)), - "head_commit_committer_email": lambda: Column(String(300)), - }, - "commit": { - "commit_verification_signature": lambda: Column(Text), - "commit_verification_payload": lambda: Column(Text), - "author": lambda: OBJECT_PLACEHOLDER, - "committer": lambda: OBJECT_PLACEHOLDER, - "commit_message": lambda: Column(Text), - }, - "milestone": { - "due_on": lambda: Column(DateTime), - }, - "installation_event": { - "installation_single_file_name": lambda: Column(Text), - "installation_suspended_by": lambda: OBJECT_PLACEHOLDER, - "requester": lambda: OBJECT_PLACEHOLDER, - }, - "pull_request": { - "body": lambda: Column(Text), - "comments": lambda: Column(Integer), - "commits": lambda: Column(Integer), - "deletions": lambda: Column(Integer), - "changed_files": lambda: Column(Integer), - "additions": lambda: Column(Integer), - "review_comments": lambda: Column(Integer), - "milestone": lambda: OBJECT_PLACEHOLDER, - "head_repo_description": lambda: Column(Text), - "head_repo_homepage": lambda: Column(String(100)), - "head_repo_mirror_url": lambda: Column(String(100)), - "head_repo_license": lambda: Column(String(100)), - "base_repo_description": lambda: Column(Text), - "base_repo_homepage": lambda: Column(String(100)), - "base_repo_mirror_url": lambda: Column(String(100)), - "base_repo_license": lambda: Column(String(100)), - "auto_merge": lambda: Column(String(100)), - "active_lock_reason": lambda: Column(String(100)), - "merged": lambda: Column(Boolean), - "mergeable": lambda: Column(Boolean), - "maintainer_can_modify": lambda: Column(Boolean), - "mergeable_state": lambda: Column(String(100)), - "rebaseable": 
lambda: Column(Boolean), - "merged_by": lambda: Column(String(100)), - "merge_commit_sha": lambda: Column(String(100)), - "assignee": lambda: OBJECT_PLACEHOLDER, - }, - "pull_request_event": { - "changes_title_from": lambda: Column(String(300)), - "changes_body_from": lambda: Column(Text), - }, - "workflow_run": { - "id": lambda: Column(String(20)), - "check_suite_id": lambda: Column(String(20)), - "workflow_id": lambda: Column(String(20)), - "head_commit_message": lambda: Column(Text), - }, -} - -TABLE_NAME_REMAP = { - "head_repository": "repository", - "repo": "repository", - "committer": "user", - "assignee": "user", - "author": "user", - "requested_reviewer": "user", - "owner": "user", - "requester": "user", - "installation_suspended_by": "user", - "sender": "user", - "account": "user", - "creator": "user", -} - - -def flatten_object(obj: Dict[str, Any]) -> FlatDict: - """ - Take an object and inline all the fields so it doesn't have any nesting - """ - result = {} - - def helper(curr: Dict[str, Any], name: List[str]): - for key, value in curr.items(): - full_name = "_".join(name + [key]) - if value is None or isinstance(value, (str, int, bool, list)): - result[full_name] = value - elif isinstance(value, dict): - helper(value, name + [key]) - else: - raise RuntimeError(f"Unknown type on {full_name}: {value}") - - helper(obj, []) - return result - - -def extract_github_objects(obj: Dict[str, Any], obj_name: str) -> List[NamedDict]: - """ - GitHub's real 'objects' (i.e. things accessible in the API) all have a - unique "node_id" string. This descends into an object and pulls out anything - with a node_id and removes it from the parent. It also flattens the objects - from a Dict[str, Any] to a Dict[str, str] (with an exception for lists so we - still know later on that they're lists and not ordinary strings) - """ - objects = [] - - def drop_key(key: str) -> bool: - if key == "target_url": - return False - - return ( - key.endswith("_url") - or key == "_links" - or key == "url" - or key == "permissions" - ) - - def visit_dict(curr: Dict[str, Any], full_name: List[str]) -> Tuple[bool, FlatDict]: - result = {} - - for key, value in list(curr.items()): - # Objects are not always named consistently (e.g. repository vs - # repo, owner vs. 
user, so fix that up here) - remapped_key = TABLE_NAME_REMAP.get(key, None) - - if drop_key(key): - # Ignore URLs - continue - - if isinstance(value, dict): - if remapped_key is not None: - is_gh_object, data = visit_dict(value, full_name + [remapped_key]) - else: - is_gh_object, data = visit_dict(value, full_name + [key]) - - if not is_gh_object: - # Not a separate object so inline all of its fields - for flat_key, flat_value in flatten_object(data).items(): - result[f"{key}_{flat_key}"] = flat_value - else: - # It will go into its own table so just put a link to it - # here - result[f"{key}_node_id"] = data["node_id"] - elif ( - value is None - and TYPE_MAP.get(full_name[-1], {}).get(key, lambda: None)() - == OBJECT_PLACEHOLDER - ): - # We might have a null object, in which case we still need to - # add it as a _node_id - result[f"{key}_node_id"] = None - else: - result[key] = value - - if "node_id" in curr: - # It's a github object so stash it for returning later - objects.append((full_name[-1], result)) - return True, curr - else: - return False, result - - _, newobj = visit_dict(obj, [obj_name]) - - # Add an entry for the top level object - objects.append((f"{obj_name}_event", flatten_object(newobj))) - - # Add the time of creation for each object - for _, object in objects: - object["sync_last_update_at"] = datetime.datetime.now() - - return objects - - -def get_column(key: str, value: Any, type_name: str) -> Column: - """ - If the key is present for the webhook type 'type_name' in the hardcoded - TYPE_MAP, use it. Otherwise, guess the type based on the value's runtime - type. - """ - if isinstance(value, dict): - raise RuntimeError(f"Value cannot be a dict: {key}: {value}") - - if key in TYPE_MAP.get(type_name, {}): - return TYPE_MAP[type_name][key]() - - if key in NAME_MAP: - return NAME_MAP[key][1]() - - if is_date(key, value): - return Column(DateTime) - if isinstance(value, str): - return Column(String(max(30, len(value) * 10))) - if isinstance(value, int): - return Column(Integer) - if isinstance(value, bool): - return Column(Boolean) - if isinstance(value, list): - return Column(JSON) - else: - # Don't error out immediately, but bubble this up so we can report all - # errors at once later - # breakpoint() - if key.endswith("_node_id"): - return get_column(key[: -len("_node_id")], value, type_name) - # raise RuntimeError() - return None - - -rprint_buffer = "" - - -def rprint(s): - global rprint_buffer - rprint_buffer += "\n" + str(s) - - -def is_date(key: str, value: Any) -> bool: - return key.endswith("_at") or key.endswith("timestamp") - - -def get_primary_key(name: str, obj: FlatDict) -> Tuple[str, Column]: - if "node_id" in obj: - # if name == "status_event": - # return "node_id", Column(String(100), primary_key=True) - return "node_id", Column(String(100), primary_key=True) - - return "pk_id", Column(Integer, primary_key=True) - - -def transform_data(obj: Dict[str, Any]) -> Dict[str, Any]: - """ - Run in-place transformations on obj to translate fields into the appropriate - type for storage: - * lists -> JSON encoded data - * dates -> Python datetimes - """ - for key, value in obj.items(): - if value is None: - # Don't bother writing nulls, they can mess with object fields - continue - if isinstance(value, list): - obj[key] = json.dumps(value) - elif is_date(key, value) and value is not None: - if isinstance(value, int): - # convert from timestamp - obj[key] = datetime.datetime.fromtimestamp(value) - elif isinstance(value, datetime.datetime): - obj[key] = value - elif 
isinstance(value, str): - formats = [ - "%Y-%m-%dT%H:%M:%SZ", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - ] - date = None - - for format in formats: - with suppress(ValueError): - date = datetime.datetime.strptime(value, format) - - if date is None: - raise RuntimeError(value) - obj[key] = date - else: - raise RuntimeError(f"Unknown date type {key}: {value}") - elif isinstance(value, str): - # TODO: Use utf8mb4 on the DB instead of this which deletes all - # unicode chars - obj[key] = value.encode("ascii", "ignore").decode() - - if len(obj[key]) >= 65530: - obj[key] = obj[key][:65530] - - for key, item in NAME_MAP.items(): - caster, _ = item - if key in obj: - obj[key] = caster(obj[key]) - - return obj - - -def generate_orm(name: str, obj: FlatDict, sql_base: Any) -> Any: - """ - Create an instance of a SQLAlchemy ORM class from a dictionary. - """ - columns = {"__tablename__": name, "__table_args__": {"extend_existing": True}} - errors = [] - for key, value in obj.items(): - col = get_column(key, value, type_name=name) - if col is OBJECT_PLACEHOLDER: - # There is a null object (with a node_id) missing, so create it as - # we would if something was there but leave the value blank - columns[f"{key}_node_id"] = Column(String(100)) - elif col is None: - # Unable to find a type for this value. An entry is missing in the - # TYPE_MAP for this name.key pair - errors.append(f"{name} -> {key}: {value}") - else: - # Got a column successfully, so set it on the table - columns[key] = col - - if len(errors) > 0: - # Couldn't get a column type for some of the data, so error out - catted_errors = "\n ".join([f"typeerr: {e}" for e in errors]) - raise RuntimeError(f"Unknown types:\n{catted_errors}") - - # Change data into the right types for storage - obj = transform_data(obj) - - # Fill in any inconsistent / missing columns from the GitHub API - # The loop above only looks at the data actually received on the webhook. - # Some things may be missing (inconsistencies in GitHub's API or just - # doesn't exist), so fill in their types here: - for key, column_creator in TYPE_MAP.get(name, {}).items(): - value = column_creator() - if value is OBJECT_PLACEHOLDER: - columns[f"{key}_node_id"] = Column(String(50)) - if key in obj: - if obj[key] is not None: - raise RuntimeError(f"not doing it {name}.{key}") - else: - del obj[key] - obj[f"{key}_node_id"] = None - else: - columns[key] = value - - # Set the primary key (some webhooks don't have a node_id at the top level - # so set up an auto-incrementing int ID for them) - pk_name, pk_type = get_primary_key(name, obj) - columns[pk_name] = pk_type - - # Create SQLAlchemy ORM class (which registers it to be created in sql_base) - the_class = type(name, (sql_base,), columns) - return the_class(**obj) - - -def connection_string(): - host = os.environ["db_host"] - password = os.environ["db_password"] - user = os.environ["db_user"] - - return f"mysql+pymysql://{user}:{password}@{host}?charset=utf8mb4" - - -engine = None - - -def get_engine(connection_string: str): - global engine - if engine is None: - engine = create_engine(connection_string, echo=bool(os.getenv("ECHO", False))) - - return engine
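
For readers tracing what this patch removes: the deleted lambda_function.py gated every database write behind GitHub's standard webhook signature check, comparing an HMAC-SHA256 of the raw request body against the X-Hub-Signature-256 header before parsing the payload. The sketch below reproduces that verification scheme in isolation as a minimal reference; the secret string and sample payload are placeholders, not values from this repository.

    import hashlib
    import hmac


    def verify_signature(secret: str, raw_body: bytes, signature_header: str) -> bool:
        # GitHub sends the header as "sha256=<hex digest>"; keep only the digest part.
        expected = signature_header.split("=", 1)[-1]
        # Recompute the HMAC-SHA256 of the raw body with the shared webhook secret.
        digest = hmac.new(secret.encode("utf-8"), raw_body, hashlib.sha256).hexdigest()
        # Constant-time comparison, as in the deleted check_hash().
        return hmac.compare_digest(digest, expected)


    if __name__ == "__main__":
        secret = "placeholder-secret"   # hypothetical value, not from this repo
        body = b'{"action": "opened"}'  # hypothetical sample payload
        header = "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest()
        print(verify_signature(secret, body, header))  # prints: True

As in the removed check_hash helper, hmac.compare_digest is used rather than == so the comparison runs in constant time and does not leak timing information about the expected signature.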