From 027584656edb4a4bc6fa0836b59c22d1ebccc2f8 Mon Sep 17 00:00:00 2001 From: Eman Elsabban Date: Thu, 16 Nov 2023 13:00:02 -0800 Subject: [PATCH] A quick script to check if a job took longer than x time to run and update python in precommit --- .pre-commit-config.yaml | 7 ++- requirements-dev.txt | 6 +- tron/bin/get_jobs_exceeding_runtime.py | 86 ++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) create mode 100644 tron/bin/get_jobs_exceeding_runtime.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66128a600..fe9c40164 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ --- default_language_version: - python: python3.6 + python: python3.8 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.5.0 @@ -21,6 +21,7 @@ repos: hooks: - id: flake8 exclude: ^docs/source/conf.py$ + language_version: python3.8 - repo: https://github.com/asottile/reorder_python_imports rev: v1.9.0 hooks: @@ -41,8 +42,8 @@ repos: language: script files: ^tests/.*\.py$ - repo: http://github.com/psf/black - rev: 19.10b0 + rev: 23.3.0 hooks: - id: black - language_version: python3.6 + language_version: python3.8 args: [--target-version, py36] diff --git a/requirements-dev.txt b/requirements-dev.txt index c87f4b21c..a9a007c25 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ asynctest==0.12.0 cfgv==2.0.1 entrypoints==0.3 flake8==3.7.9 -identify==1.4.9 +identify==2.4.4 importlib-resources==1.0.2 iniconfig==1.1.1 isort==4.3.18 @@ -15,7 +15,7 @@ mypy-extensions==0.4.3 nodeenv==1.3.3 packaging==19.2 pluggy==0.13.0 -pre-commit==1.21.0 +pre-commit==2.9.2 py==1.10.0 pycodestyle==2.5.0 pyflakes==2.1.1 @@ -26,4 +26,4 @@ pytest-asyncio==0.14.0 requirements-tools==1.2.1 toml==0.10.2 typed-ast==1.4.0 -virtualenv==16.7.5 +virtualenv==20.0.8 diff --git a/tron/bin/get_jobs_exceeding_runtime.py b/tron/bin/get_jobs_exceeding_runtime.py new file mode 100644 index 000000000..786f7d3c2 --- 
#!/usr/bin/env python3.8
"""Report job runs whose duration exceeded a time limit.

For a single job (``--job``) or for every job returned by the client, fetch
the job's runs and print the ids of the runs whose reported ``duration`` is
longer than ``--time`` seconds.
"""
import argparse
import logging
import sys
from typing import Any
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional

import pytimeparse

from tron.commands import cmd_utils
from tron.commands.client import Client


log = logging.getLogger("check_exceeding_time")

# NOTE: despite the name, these are the run states that are *excluded* from the
# duration check (see check_if_time_exceeded). The name is kept so that any
# existing importer of this constant keeps working.
STATES_TO_CHECK = {"queued", "scheduled", "cancelled", "skipped"}


def parse_args() -> argparse.Namespace:
    """Parse command line arguments.

    The parser comes from ``cmd_utils.build_option_parser()`` and is extended
    with ``--job`` (optional job name) and ``--time`` (limit in seconds, stored
    as ``time_limit``).
    """
    parser = cmd_utils.build_option_parser()
    parser.add_argument(
        "--job",
        default=None,
        help="Check if a particular job exceeded a time to run. If unset checks all jobs",
    )
    parser.add_argument(
        "--time",
        help="Time limit in seconds; runs whose duration exceeds it are reported. Defaults to 18000 (5 hours)",
        type=int,
        dest="time_limit",
        default=18000,
    )
    return parser.parse_args()


def check_if_time_exceeded(job_runs: Iterable[Dict[str, Any]], job_expected_runtime: Optional[int]) -> List[Any]:
    """Return the ids of the runs in ``job_runs`` that ran longer than the limit.

    Runs whose ``state`` is listed in ``STATES_TO_CHECK`` are skipped; a run
    without a ``state`` key is treated as ``"unknown"`` and therefore checked.
    """
    return [
        job_run["id"]
        for job_run in job_runs
        if job_run.get("state", "unknown") not in STATES_TO_CHECK
        and is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime)
    ]


def is_job_run_exceeding_expected_runtime(job_run: Dict[str, Any], job_expected_runtime: Optional[int]) -> bool:
    """Tell whether ``job_run`` lasted longer than ``job_expected_runtime`` seconds.

    Returns ``False`` when no limit is given or when the run's ``duration``
    is missing, empty or cannot be parsed.
    """
    if job_expected_runtime is None:
        return False
    # The key may be absent or present with a null value; `or ""` covers both
    # so that pytimeparse is always handed a string.
    duration_seconds = pytimeparse.parse(job_run.get("duration") or "")
    # Compare explicitly against None so the function always returns a real
    # bool rather than leaking None / 0 out of an `and` expression.
    return duration_seconds is not None and duration_seconds > job_expected_runtime


def check_job_time(job: Dict[str, Any], time_limit: Optional[int]) -> List[Any]:
    """Return the ids of the runs of ``job`` (its ``runs`` entry) that exceeded ``time_limit``."""
    return check_if_time_exceeded(job.get("runs", []), time_limit)


def main() -> Optional[int]:
    """Entry point: print the runs that exceeded the limit, or a success note.

    Always returns ``None`` (exit status 0), whether or not any run exceeded
    the limit.
    """
    args = parse_args()
    cmd_utils.setup_logging(args)
    cmd_utils.load_config(args)
    client = Client(args.server, args.cluster_name)

    if args.job is None:
        # NOTE(review): each job is fetched again below via client.job_runs(),
        # so include_job_runs=True may be unnecessary here — confirm before
        # changing the request.
        job_names = [job["name"] for job in client.jobs(include_job_runs=True)]
    else:
        job_names = [args.job]

    results: List[Any] = []
    for job_name in job_names:
        job = client.job_runs(client.get_url(job_name))
        results.extend(check_job_time(job=job, time_limit=args.time_limit))

    if not results:
        print("All jobs ran within the time limit")
    else:
        print(f"These are the runs that took longer than {args.time_limit} seconds to run: {sorted(results)}")
    return None


if __name__ == "__main__":
    sys.exit(main())