diff --git a/.gitignore b/.gitignore
index b3c266bad1..29cfb6cda8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,8 +24,6 @@ terraform.rc
 
 aws/websites/metrics.pytorch.org/vars.yml
 
-aws/lambda/rds-proxy/python/
-
 .vercel
 torchci/api/log/classify/target/
 torchci/api/log/classify/Cargo.lock
diff --git a/aws/lambda/checks-cron/Makefile b/aws/lambda/checks-cron/Makefile
deleted file mode 100644
index 138445a342..0000000000
--- a/aws/lambda/checks-cron/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-deployment.zip:
-	pip install --target ./python -r requirements.txt
-	cd python && zip -r ../deployment.zip .
-	zip -g deployment.zip lambda_function.py
-
-manual_update: deployment.zip
-	aws lambda update-function-code --function-name github-checks-status-updater --zip-file fileb://deployment.zip
-
-clean:
-	rm -rf deployment.zip python
\ No newline at end of file
diff --git a/aws/lambda/checks-cron/lambda_function.py b/aws/lambda/checks-cron/lambda_function.py
deleted file mode 100644
index 3cffeaaebd..0000000000
--- a/aws/lambda/checks-cron/lambda_function.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""
-This pings repos/pytorch/pytorch/actions/runs and gathers the most recent jobs
-until it sees that everything is complete. It then stores the current count of
-all types of jobs ('in_progress' and 'queued' are the relevant parts).
-"""
-import asyncio
-import collections
-import datetime
-import json
-import os
-
-import aiobotocore
-import aiohttp
-
-
-config = {"quiet": False, "github_oauth": os.environ["gh_pat"]}
-
-
-async def github(method, path, payload=None, note="", **kwargs):
-    if payload is None:
-        payload = {}
-    headers = {
-        "Content-Type": "application/json",
-        "Host": "api.github.com",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
-        "Accept-Encoding": "gzip, deflate, br",
-        "Accept-Language": "en-US,en;q=0.9",
-    }
-    if config["github_oauth"] is not None:
-        headers["Authorization"] = "token " + config["github_oauth"]
-    url = "https://api.github.com/" + path
-    print(method, url, f"({note})")
-    async with aiohttp.ClientSession() as session:
-        r = await session.get(url, headers=headers, **kwargs)
-        return await r.json()
-
-
-async def fetch_workflow_page(num):
-    return await github(
-        "get",
-        "repos/pytorch/pytorch/actions/runs",
-        params={"per_page": 100, "page": num},
-    )
-
-
-def page_in_progress(new_statuses):
-    return "queued" in new_statuses or "in_progress" in new_statuses
-
-
-async def get_page_batch(start: int, batch_size: int):
-    coros = []
-    for i in range(start, start + batch_size):
-        coros.append(
-            github(
-                "get",
-                "repos/pytorch/pytorch/actions/runs",
-                params={"per_page": 100, "page": i},
-                note=f"fetching page {i}",
-            )
-        )
-
-    return await asyncio.gather(*coros)
-
-
-def should_check_github(stats):
-    if len(stats) == 0:
-        return True
-
-    delta = datetime.datetime.now() - datetime.datetime.fromtimestamp(
-        stats[0]["last_updated"]
-    )
-    return delta > datetime.timedelta(minutes=5)
-
-
-async def get_gha_statuses(max_pages=30, batch_size=10):
-    all_statuses = collections.defaultdict(lambda: 0)
-
-    # There's no way to get all the unfinished jobs from the GitHub Actions API,
-    # here this assumes that it returns the most recent data first (and it's all
-    # paginated via the API), the heuristic here is that once the lambda has
-    # seen a certain number of completed jobs with no pending/queued/in progress
-    # jobs in between, it's probably safe to assume we've seen all the
-    # unfinished jobs. The stuff here is just making sure it checks a certain
-    # number of completed jobs before moving on (page size * max_pages_past)
-    max_pages_past = 10
-    pages_past = max_pages_past
-
-    i = 1
-    should_quit = False
-    while not should_quit and i < max_pages:
-        batch = await get_page_batch(i, batch_size)
-        i += batch_size
-        for page in batch:
-            new_statuses = collections.defaultdict(lambda: 0)
-            for run in page["workflow_runs"]:
-                all_statuses[run["status"]] += 1
-                new_statuses[run["status"]] += 1
-
-            if not page_in_progress(new_statuses):
-                pages_past -= 1
-            else:
-                pages_past = max_pages_past
-
-            if pages_past == 0:
-                should_quit = True
-            print(new_statuses)
-
-    return {"last_updated": datetime.datetime.now().timestamp(), **all_statuses}
-
-
-MAX_LEN = 1000
-
-
-async def main():
-    bucket_name = "ossci-checks-status"
-    session = aiobotocore.get_session()
-    async with session.create_client(
-        "s3",
-        region_name="us-east-1",
-        aws_secret_access_key=os.environ["aws_secret"],
-        aws_access_key_id=os.environ["aws_key"],
-    ) as client:
-        content = await client.get_object(Bucket=bucket_name, Key="status.json")
-        content = await content["Body"].read()
-        all_stats = json.loads(content.decode())
-
-        if not should_check_github(all_stats):
-            print("Ran too early, not doing anything")
-            return
-
-        # Chop off old data
-        all_stats = all_stats[:MAX_LEN]
-
-        all_stats.insert(0, await get_gha_statuses())
-        print("writing", all_stats)
-        await client.put_object(
-            Bucket=bucket_name, Key="status.json", Body=json.dumps(all_stats)
-        )
-
-
-def lambda_handler(event, context):
-    print("handling lambda")
-    asyncio.run(main())
-    return {"statusCode": 200, "body": "update processed"}
diff --git a/aws/lambda/checks-cron/requirements.txt b/aws/lambda/checks-cron/requirements.txt
deleted file mode 100644
index 12d02fb8a4..0000000000
--- a/aws/lambda/checks-cron/requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# To install the lambda layers:
-#
-# pip install --target ./python -r requirements.txt
-# zip -r layer.zip ./python
-#
-# Then upload the layer via aws console or aws cli
-# Update the lambda's layer reference to the newly uploaded layer
-
-aioboto3
-aiohttp
\ No newline at end of file
diff --git a/aws/lambda/rds-proxy/Makefile b/aws/lambda/rds-proxy/Makefile
deleted file mode 100644
index ba65efabfe..0000000000
--- a/aws/lambda/rds-proxy/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-deployment:
-	pip install --target ./python -r requirements.txt
-	cd python && zip -r ../deployment.zip .
-	zip -g deployment.zip lambda_function.py utils.py
-
-manual_update:
-	aws lambda update-function-code --function-name rds-proxy --zip-file fileb://deployment.zip
-
-clean:
-	rm -rf deployment.zip python
diff --git a/aws/lambda/rds-proxy/lambda_function.py b/aws/lambda/rds-proxy/lambda_function.py
deleted file mode 100644
index fdf72325db..0000000000
--- a/aws/lambda/rds-proxy/lambda_function.py
+++ /dev/null
@@ -1,302 +0,0 @@
-"""
-This adds a lambda to handle reading / writing to our RDS MySQL instance that
-we can access from the EC2 runners in CI. Similar to scribe-proxy, this means
-we can write on pull requests without a secret and also run queries to plan
-tests / etc.
-"""
-import datetime
-import json
-import os
-import re
-from contextlib import closing
-from typing import Any, Dict, List, Union
-
-import mysql.connector
-
-
-_connections = {
-    "reader": {
-        "connection": None,
-        "user": "db_user",
-        "password": "db_password",
-        "database": "pytorch",
-    },
-    "inserter": {
-        "connection": None,
-        "user": "db_user_inserter",
-        "password": "db_password_inserter",
-        "database": "metrics",
-    },
-    "creator": {
-        "connection": None,
-        "user": "db_user_creator",
-        "password": "db_password_creator",
-        "database": "metrics",
-    },
-}
-
-
-def get_connection(name: str):
-    field = _connections[name]
-
-    if field["connection"] is None:
-        field["connection"] = mysql.connector.connect(
-            host=os.environ["db_host"],
-            port=3306,
-            user=os.environ[field["user"]],
-            password=os.environ[field["password"]],
-            database=field["database"],
-        )
-
-    return field["connection"]
-
-
-SAVED_QUERIES = {"sample": "select name from workflow_run limit 10"}
-
-TYPE_MAP = {
-    "int": "INTEGER",
-    "string": "VARCHAR(300)",
-    "float": "FLOAT",
-    "date": "DATETIME",
-}
-
-
-NAME_REGEX = re.compile("^[a-z_]+$")
-
-
-def validate_schema_name(s: str):
-    if NAME_REGEX.match(s) is not None:
-        return s
-    else:
-        raise RuntimeError(f"Invalid name: {s}")
-
-
-def safe_join(s: Union[str, List[str]], join_str: str = ", ") -> str:
-    if isinstance(s, str):
-        s = [s]
-
-    return join_str.join([validate_schema_name(x) for x in s])
-
-
-def build_query(body):
-    # If the request is a simple query we can just build it manually rather
-    # than having to hard code it in the list above
-    params = []
-    table_name = validate_schema_name(body["table_name"])
-
-    query = f"SELECT {safe_join(body['fields'])} FROM {safe_join(table_name)}"
-
-    where = body.get("where", None)
-    if where is not None:
-        if not isinstance(where, list):
-            where = [where]
-        for item in where:
-            item["field"] = validate_schema_name(item["field"])
-        query += " WHERE"
-        items = [f"{n['field']} {'like' if n['like'] else '='} %s" for n in where]
-        query += f" {' and '.join(items)}"
-        params += [n["value"] for n in where]
-
-    group_by = body.get("group_by", None)
-    if group_by is not None:
-        query += f" GROUP BY {safe_join(group_by)}"
-
-    order_by = body.get("order_by", None)
-    if order_by is not None:
-        query += f" ORDER BY {safe_join(order_by)}"
-
-    limit = body.get("limit", None)
-    if limit is not None:
-        query += " LIMIT %s"
-        params.append(int(limit))
-
-    return query, params
-
-
-def run_query(query: str, params: List[str], connection: Any) -> List[Dict[str, str]]:
-    print(f"Executing '{query}' with params: {params}")
-    if "--" in query:
-        raise RuntimeError("No -- allowed")
-
-    with closing(connection.cursor(dictionary=True)) as cursor:
-        cursor.execute(query, params)
-        return list(cursor)
-
-
-def run_write(write):
-    # Insert a record into a table
-    name = validate_schema_name(write["table_name"])
-    fields = {
-        validate_schema_name(field): value for field, value in write["values"].items()
-    }
-    fields["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    names = ", ".join(list(fields.keys()))
-
-    # Here we can actually use parameterized queries, so don't put the actual
-    # values
-    placeholders = ", ".join(["%s" for _ in range(len(fields))])
-    query = f"INSERT INTO {name}({names}) VALUES ({placeholders})"
-    params = list(fields.values())
-
-    conn = get_connection("inserter")
-    res = run_query(query, params, conn)
-    conn.commit()
-    return res
-
-
-def run_create_table(create_table):
-    name = validate_schema_name(create_table["table_name"])
-
-    def mquery(sql):
-        return run_query(sql, [], get_connection("creator"))
-
-    # Validate all the field names since we have to insert them directly
-    fields = {
-        validate_schema_name(field): TYPE_MAP[validate_schema_name(type)]
-        for field, type in create_table["fields"].items()
-    }
-    # Add a marker of when this data was inserted
-    fields["updated_at"] = "DATETIME"
-
-    if "id" in fields:
-        raise RuntimeError(f"Cannot use name 'id': {fields}")
-
-    # SQL returns schema types a little different from how they're specified, so
-    # fix that up here
-    def clean_type(s):
-        if s == "int(11)":
-            return "integer".upper()
-        return s.upper()
-
-    try:
-        # Check if the table exists
-        schema = mquery(f"DESCRIBE {name}")
-        existing_fields = {x["Field"]: x["Type"] for x in schema}
-        existing_fields = {
-            field: clean_type(type) for field, type in existing_fields.items()
-        }
-
-        # Make sure every requested field in the DB is there and the type
-        # matches, and fix it if not
-        for field, type in fields.items():
-            if field not in existing_fields:
-                print(f"Adding new field {field}")
-                mquery(f"ALTER TABLE {name} ADD COLUMN {field} {type}")
-            elif existing_fields[field] != type:
-                print(f"Modifying {field}")
-                mquery(f"ALTER TABLE {name} MODIFY {field} {type}")
-
-    except mysql.connector.errors.ProgrammingError as e:
-        if not str(e).endswith(f"Table 'metrics.{name}' doesn't exist"):
-            raise e
-
-        # The table isn't there at all and we need to create it from scratch
-        field_queries = [f"{field} {type}" for field, type in fields.items()]
-        create_table_query = f"""
-        CREATE TABLE {name} (
-            id INTEGER AUTO_INCREMENT,
-            {', '.join(field_queries)},
-            PRIMARY KEY (id)
-        );
-        """.strip()
-        print(create_table_query)
-        mquery(create_table_query)
-
-
-def run_read(read):
-    # If the query is in the list of hardcoded queries, just use that
-    print(f"Executing read {read}")
-    saved_query_name = read.get("saved_query_name", None)
-    params = read.get("params", [])
-    saved_query = SAVED_QUERIES.get(saved_query_name, None)
-    if saved_query is not None:
-        results = run_query(saved_query, params, get_connection("reader"))
-    else:
-        # Build a SQL query ad-hoc and run it
-        query, params = build_query(read)
-        results = run_query(query, params, get_connection("reader"))
-
-    print("Fetched", results)
-    return json.dumps(results, default=str)
-
-
-def handle_event(event):
-    print("Handling event", event)
-    create_table = event.get("create_table", None)
-    if create_table is not None:
-        # Create the table if requests, gated behind a killswitch since we
-        # shouldn't need this to be on all the time
-        if os.environ.get("create_enabled", False) == "1":
-            return run_create_table(create_table)
-        else:
-            return "create is disabled"
-
-    write = event.get("write", None)
-    if write is not None:
-        return run_write(write)
-
-    read = event.get("read", None)
-    if read is not None:
-        return run_read(read)
-
-
-def lambda_handler(events, context):
-    """
-    Takes in a list of "events", which are actions for the lambda to do on MySQL
-
-    Create: make a table or alter an existing table
-    {
-        "create_table": {
-            "table_name": "my_table",
-            "fields": {
-                "something": "int",
-                "something_else": "string",
-            },
-        }
-    }
-
-    Write: insert a record into a metrics table
-    {
-        "create_table": {
-            "table_name": "my_table",
-            "fields": {
-                "something": "int",
-                "something_else": "string",
-            },
-        }
-    }
-
-    Read: query the pytorch database (everything after "fields" is optional)
-    {
-        "read": {
-            "table_name": "my_table",
-            "fields": ["something", "something_else"],
-            "where": [
-                {
-                    "field": "something",
-                    "value": 10
-                }
-            ],
-            "group_by": ["something"],
-            "order_by": ["something"],
-            "limit": 5,
-        }
-    }
-
-    or use a hardcoded query
-
-    {
-        "read": {
-            "saved_query_name": "sample",
-        }
-    }
-    """
-    print("Handling", events)
-
-    # Run over all the requests and collate the results
-    results = []
-    for event in events:
-        results.append(handle_event(event))
-
-    print(results)
-    return results
diff --git a/aws/lambda/rds-proxy/requirements.txt b/aws/lambda/rds-proxy/requirements.txt
deleted file mode 100644
index b2ebf79450..0000000000
--- a/aws/lambda/rds-proxy/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-mysql-connector-python==9.1.0
\ No newline at end of file
diff --git a/stats/windows_smoke_tests.csv b/stats/windows_smoke_tests.csv
deleted file mode 100644
index a6c25a4645..0000000000
--- a/stats/windows_smoke_tests.csv
+++ /dev/null
@@ -1,31 +0,0 @@
-test_filename,test_suite_name,test_case_name,test_total_time,windows_only_failure_sha_count,total_sha_count,windows_failure_count,linux_failure_count,windows_total_count,linux_total_count
-test_cuda,TestCuda,test_cudnn_multiple_threads_same_device,8061.3993181568,43,3717,48,0,2090,6743
-test_ops,TestGradientsCUDA,test_forward_mode_AD_acos_cuda_complex128,95.775390625,11,2448,26,1,1370,4590
-test_ops,TestGradientsCUDA,test_forward_mode_AD_acosh_cuda_complex128,86.469563802084,17,2448,26,1,1370,4590
-test_cuda,TestCuda,test_external_streams,31.505263157895,10,34,21,0,27,66
-test_profiler,TestProfiler,test_profiler_tracing,233.76255174092,9,4488,21,3,2678,8825
-test_profiler,TestProfiler,test_source,60.529337293665,9,4507,21,3,2701,8861
-test_profiler,TestProfiler,test_kineto_profiler_api,539.97791790427,9,4507,21,3,2701,8861
-test_profiler,TestProfiler,test_memory_profiler,67.081949651847,9,4507,21,3,2701,8861
-test_tensor_creation_ops,TestRandomTensorCreationCUDA,test_randperm_device_compatibility_cuda,111.81035440507,7,3706,23,1,2078,6687
-test_multiprocessing,TestMultiprocessing,test_fd_sharing,10613.203172502,8,4579,5,0,2783,8793
-test_utils,TestStandaloneCPPJIT,test_load_standalone,8235.3274431394,5,4496,21,0,2663,8775
-test_multiprocessing,TestMultiprocessing,test_fs_sharing,10596.191085614,11,4579,5,0,2783,8793
-test_cuda,TestCuda,test_graph_cudnn_dropout,134.22467934542,5,3702,6,0,2069,6715
-test_dataloader,TestDataLoaderPersistentWorkers,test_segfault,2524.6477660715,3,4675,3,0,2232,9045
-test_dataloader,TestDataLoader,test_segfault,2524.6477660715,2,4675,2,0,2232,9045
-test_utils,TestHub,test_load_commit_from_forked_repo,10661.148300102,2,4496,2,0,2663,8768
-test_ops,TestOpInfoCUDA,test_unsupported_backward_einsum_cuda_bfloat16,39.5625,1,10,2,0,10,22
-test_cpp_extensions_jit,TestCppExtensionJIT,test_jit_cuda_archflags,25528.358531106,1,4697,1,0,2087,9068
-test_ops,TestOpInfoCUDA,test_unsupported_backward___rmatmul___cuda_bfloat16,38.068965517241,1,7,2,0,10,19
-test_dataloader,TestDataLoaderPersistentWorkers,test_timeout,2666.6782265146,1,4675,1,0,2232,9045
-test_ops,TestCommonCUDA,test_variant_consistency_jit_cross_cuda_float32,256.21441441441,1,265,1,0,109,428
-test_ops,TestGradientsCUDA,test_inplace_forward_mode_AD_acos_cuda_complex128,68.82411310151,25,3605,47,0,1128,10182
-test_ops,TestCommonCUDA,test_variant_consistency_eager_nn_functional_conv_transpose2d_cuda_float32,61.214152700187,18,1491,23,0,439,4359
-test_quantization,TestZeroRedundancyOptimizerSingleRank,test_state_dict,517.12081259249,4,4682,4,0,4,13588
-test_jit_fuser_legacy,test_jit_fuser.TestFuser,test_abs_cpu_unicode_temp_dir,322.4254011608,4,4154,7,0,467,1228
-test_ao_sparsity,TestZeroRedundancyOptimizerSingleRank,test_constructor,347.0455703783,4,4682,4,0,4,13588
-test_nn,TestZeroRedundancyOptimizerSingleRank,test_zero_grad,493.56455394828,4,4682,4,0,4,13588
-test_ops,TestJitCUDA,test_variant_consistency_jit_square_cuda_float32,216.0424774561,1,4086,1,7,1231,11328
-test_ops,TestJitCUDA,test_variant_consistency_jit_acos_cuda_complex64,472.17433950324,1,4086,1,6,1231,11328
-test_ops,TestJitCUDA,test_variant_consistency_jit_put_cuda_complex64,9459.8308020884,1,4086,1,5,1231,11328
diff --git a/tools/alerts/validate_alerts.py b/tools/alerts/validate_alerts.py
deleted file mode 100644
index 55b2e247db..0000000000
--- a/tools/alerts/validate_alerts.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import argparse
-import copy
-import json
-from collections import defaultdict
-
-import jsonschema
-
-BASE_ALERT_SCHEMA = {
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "AlertType": {"type": "string"},
-            "AlertObject": {"type": "string"},
-            "OncallTeams": {"type": "array", "items": {"type": "string"}},
-            "OncallIndividuals": {"type": "array", "items": {"type": "string"}},
-            "Flags": {"type": "array", "items": {"type": "string"}},
-            "branch": {"type": "string"},
-        },
-        "additionalProperties": True,
-        "required": [
-            "AlertType",
-            "AlertObject",
-            "OncallTeams",
-            "OncallIndividuals",
-            "Flags",
-            "branch",
-        ],
-    },
-}
-
-EXTRA_ALERT_PROPERTIES = {
-    "RecurentlyFailingJobAlert": {
-        "sha": {"type": "string"},
-    }
-}
-
-
-def validate_json(json_string):
-    try:
-        json_object = json.loads(json_string)
-        print("The input string is a valid JSON.")
-    except ValueError as e:
-        raise ValueError(f"The input string is not a valid JSON: Error: {e}")
-
-
-def validate_schema(json_string):
-    all_alerts_schemas = defaultdict(lambda: copy.deepcopy(BASE_ALERT_SCHEMA))
-    for alert_type, alert_schema_add_on in EXTRA_ALERT_PROPERTIES.items():
-        all_alerts_schemas[alert_type]["items"]["properties"].update(
-            alert_schema_add_on
-        )
-        all_alerts_schemas[alert_type]["items"]["additionalProperties"] = False
-        for property in alert_schema_add_on.keys():
-            all_alerts_schemas[alert_type]["items"]["required"].append(property)
-    json_object = json.loads(json_string)
-    for alert in json_object:
-        jsonschema.validate(
-            instance=[alert], schema=all_alerts_schemas[alert["AlertType"]]
-        )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Validate json string containing alerts"
-    )
-    parser.add_argument(
-        "--alerts", type=str, required=True, help="JSON string to validate."
-    )
-    args = parser.parse_args()
-    validate_json(args.alerts)
-    validate_schema(args.alerts)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/tests/test_validate_alerts.py b/tools/tests/test_validate_alerts.py
deleted file mode 100644
index c833dd6b3c..0000000000
--- a/tools/tests/test_validate_alerts.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import json
-from unittest import main, TestCase
-
-import jsonschema
-import tools.alerts.validate_alerts as validate_alerts
-
-# valid json data
-valid_json = json.dumps(
-    [
-        {
-            "AlertType": "Foo",
-            "AlertObject": "Bar",
-            "OncallTeams": ["Team1", "Team2"],
-            "OncallIndividuals": ["Individual1", "Individual2"],
-            "branch": "main",
-            "Flags": ["Flag1", "Flag2"],
-        },
-        {
-            "AlertType": "FooBar",
-            "AlertObject": "BarFoo",
-            "OncallTeams": ["Team1", "Team2"],
-            "OncallIndividuals": ["Individual1", "Individual2"],
-            "branch": "main",
-            "Flags": ["Flag1", "Flag2"],
-        },
-    ]
-)
-
-# invalid json data
-invalid_json = '{"invalid_json"}'
-
-# valid json but invalid schema
-valid_json_invalid_schema = json.dumps(
-    [
-        {
-            "AlertType": "Foo",
-            "AlertObject": "Bar",
-            "OncallTeams": "Team1",  # should be list
-            "OncallIndividuals": ["Individual1", "Individual2"],
-            "Flags": ["Flag1", "Flag2"],
-        }
-    ]
-)
-
-valid_json_valid_recurrently_failing_job_alert_schema = json.dumps(
-    [
-        {
-            "AlertType": "RecurentlyFailingJobAlert",
-            "AlertObject": "Bar",
-            "OncallTeams": ["Team1", "Team2"],
-            "OncallIndividuals": ["Individual1", "Individual2"],
-            "Flags": ["Flag1", "Flag2"],
-            "branch": "main",
-            "sha": "1234567890123456789012345678901234567890",
-        },
-        {
-            "AlertType": "FooBar",
-            "AlertObject": "BarFoo",
-            "OncallTeams": ["Team1", "Team2"],
-            "OncallIndividuals": ["Individual1", "Individual2"],
-            "branch": "main",
-            "Flags": ["Flag1", "Flag2"],
-        },
-    ]
-)
-
-valid_json_invalid_recurrently_failing_job_alert_schema = json.dumps(
-    [
-        {
-            "AlertType": "RecurentlyFailingJobAlert",
-            "AlertObject": "Bar",
-            "OncallTeams": ["Team1", "Team2"],
-            "OncallIndividuals": ["Individual1", "Individual2"],
-            "Flags": ["Flag1", "Flag2"],
-            "branch": "main",
-        }
-    ]
-)
-
-
-class AlertValidationTest(TestCase):
-    def test_validate_json(self):
-        # Test whether valid json is correctly validated
-        assert validate_alerts.validate_json(valid_json) is None
-
-        # Test whether invalid json raises an error
-        with self.assertRaises(ValueError):
-            validate_alerts.validate_json(invalid_json)
-
-    def test_validate_schema(self):
-        # Test whether valid json that conforms to the schema is correctly validated
-        assert validate_alerts.validate_schema(valid_json) is None
-
-        # Test whether valid json that does not conform to the schema raises an error
-        with self.assertRaises(jsonschema.exceptions.ValidationError):
-            validate_alerts.validate_schema(valid_json_invalid_schema)
-
-    def test_validate_recurrently_failing_job_alert_schema(self):
-        # Test whether valid json that conforms to the schema is correctly validated
-        assert (
-            validate_alerts.validate_schema(
-                valid_json_valid_recurrently_failing_job_alert_schema
-            )
-            is None
-        )
-
-        # Test whether valid json that does not conform to the schema raises an error
-        with self.assertRaises(jsonschema.exceptions.ValidationError):
-            validate_alerts.validate_schema(
-                valid_json_invalid_recurrently_failing_job_alert_schema
-            )
-
-
-if __name__ == "__main__":
-    main()