From 7d35b19326fd4172a587f7f386f5e360928a231e Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 Jan 2025 03:34:19 +0200 Subject: [PATCH] unified runner code --- .github/workflows/runner.py | 23 +-- .../cogs/github_cog.py | 17 +- src/discord-cluster-manager/cogs/modal_cog.py | 53 ++---- src/discord-cluster-manager/consts.py | 8 +- src/discord-cluster-manager/modal_runner.py | 53 +----- .../modal_runner_archs.py | 165 +----------------- src/discord-cluster-manager/run_eval.py | 26 ++- src/discord-cluster-manager/utils.py | 47 +++++ 8 files changed, 119 insertions(+), 273 deletions(-) diff --git a/.github/workflows/runner.py b/.github/workflows/runner.py index 02b48d59..873540f4 100644 --- a/.github/workflows/runner.py +++ b/.github/workflows/runner.py @@ -5,28 +5,11 @@ sys.path.append("src/discord-cluster-manager") -from leaderboard_eval import cu_eval, py_eval -from run_eval import run_cuda_script, run_pytorch_script +from run_eval import run_config -config = json.loads(Path("payload.json").read_text()) # type: dict +config = json.loads(Path("payload.json").read_text()) Path("payload.json").unlink() -if config["lang"] == "cu": - comp, run = run_cuda_script( - {"eval.cu": cu_eval}, - {key: config[key] for key in ["reference.cuh", "submission.cuh"] if key in config}, - arch=None, - ) - result = {"compile": asdict(comp), "run": asdict(run)} -else: - run = run_pytorch_script( - { - "eval.py": py_eval, - **{key: config[key] for key in ["reference.py", "submission.py"] if key in config}, - }, - main="eval.py", - arch=None, - ) - result = {"run": asdict(run)} +result = asdict(run_config(config)) Path("result.json").write_text(json.dumps(result)) diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py index 422123e4..a2ae3944 100644 --- a/src/discord-cluster-manager/cogs/github_cog.py +++ b/src/discord-cluster-manager/cogs/github_cog.py @@ -15,7 +15,7 @@ from leaderboard_eval import amd_requirements, nvidia_requirements from report import generate_report from run_eval import CompileResult, FullResult, RunResult -from utils import get_github_branch_name, send_discord_message, setup_logging +from utils import build_task_config, get_github_branch_name, send_discord_message, setup_logging logger = setup_logging() @@ -113,15 +113,14 @@ async def trigger_github_run( # TODO implement HIP raise ValueError("Cannot use CUDA runs with AMD GPUs") - eval_name = {"py": "eval.py", "cu": "eval.cu"}[lang] - ref_name = {"py": "reference.py", "cu": "reference.cuh"}[lang] - sub_name = {"py": "submission.py", "cu": "submission.cuh"}[lang] lang_name = {"py": "Python", "cu": "CUDA"}[lang] - if reference_content is None: - config = {eval_name: script_content, "lang": lang} - else: - config = {ref_name: reference_content, sub_name: script_content, "lang": lang} + config = build_task_config( + lang=lang, + reference_content=reference_content, + submission_content=script_content, + arch=None, + ) logger.info(f"Attempting to trigger GitHub action for {lang_name} on {gpu_type.name}") gh = Github(GITHUB_TOKEN) @@ -208,7 +207,7 @@ async def download_results(self, run_id) -> FullResult: data = await self.download_artifact(run_id, name="run-result") logs = data["result.json"].decode("utf-8") data = json.loads(logs) - if "compile" in data: + if "compile" in data and data["compile"] is not None: comp = CompileResult(**data["compile"]) else: comp = None diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py index b74067d0..82281ada 100644 --- a/src/discord-cluster-manager/cogs/modal_cog.py +++ b/src/discord-cluster-manager/cogs/modal_cog.py @@ -3,13 +3,12 @@ import discord import modal -from consts import ModalGPU +from consts import GPU_TO_SM, ModalGPU from discord import app_commands from discord.ext import commands -from leaderboard_eval import cu_eval, py_eval from report import generate_report from run_eval import FullResult -from utils import send_discord_message, setup_logging +from utils import build_task_config, send_discord_message, setup_logging logger = setup_logging() @@ -99,41 +98,27 @@ async def handle_modal_execution( try: loop = asyncio.get_event_loop() func_type = "pytorch" if filename.endswith(".py") else "cuda" + lang = "py" if filename.endswith(".py") else "cu" func_name = f"run_{func_type}_script_{gpu_type.lower()}" - if reference_content is not None: - result = await loop.run_in_executor( - None, - lambda: modal.Function.lookup("discord-bot-runner", func_name).remote( - py_eval if filename.endswith(".py") else cu_eval, - reference_content=reference_content, - submission_content=script_content, - ), - ) - - # Send results - await thread.send(f"\n**Script size:** {len(script_content)} bytes") - await generate_report(thread, result) - return result - - else: - # Currently broken? - result = await loop.run_in_executor( - None, - lambda: modal.Function.lookup("discord-bot-runner", func_name).remote( - script_content, - ), - ) - await send_discord_message( - interaction, f"Modal job completed in thread {thread.jump_url}", ephemeral=True - ) + config = build_task_config( + lang=lang, + reference_content=reference_content, + submission_content=script_content, + arch=GPU_TO_SM[gpu_type.upper()], + ) - # Send results - await thread.send(f"\n**Script size:** {len(script_content)} bytes") - await thread.send(f"**Execution time:** {result.run.duration:.3f} s\n") + result = await loop.run_in_executor( + None, + lambda: modal.Function.lookup("discord-bot-runner", func_name).remote( + config=config + ), + ) - await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!") - return result + # Send results + await thread.send(f"\n**Script size:** {len(script_content)} bytes") + await generate_report(thread, result) + return result except Exception as e: logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 05cb1713..d74195f1 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -56,10 +56,10 @@ def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum: GPU_TO_SM = { - "T4": 75, - "L4": 80, - "A100": 80, - "H100": 90, + "T4": "75", + "L4": "80", + "A100": "80", + "H100": "90a", } diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py index 5afbff5b..e9ab200e 100644 --- a/src/discord-cluster-manager/modal_runner.py +++ b/src/discord-cluster-manager/modal_runner.py @@ -1,10 +1,10 @@ import signal +import traceback from contextlib import contextmanager -from typing import Optional -from consts import MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH +from consts import MODAL_PATH from modal import App, Image, Mount -from run_eval import FullResult, run_cuda_script, run_pytorch_script +from run_eval import FullResult, run_config # Create a stub for the Modal app # IMPORTANT: This has to stay in separate file or modal breaks @@ -74,55 +74,18 @@ def timeout_handler(signum, frame): signal.signal(signal.SIGALRM, original_handler) -def modal_run_pytorch_script( # noqa: C901 - script_content: str, - reference_content: Optional[str] = None, - submission_content: Optional[str] = None, +def modal_run_config( # noqa: C901 + config: dict, timeout_seconds: int = 300, - arch: int = None, ) -> FullResult: """Modal version of run_pytorch_script, handling timeouts""" try: with timeout(timeout_seconds): - run_result = run_pytorch_script( - { - "eval.py": script_content, - "reference.py": reference_content, - "submission.py": submission_content, - }, - "eval.py", - ) - return FullResult(success=True, error="", compile=None, run=run_result) - # TODO fixup error handling! + return run_config(config) except TimeoutException as e: return FullResult(success=False, error=f"Timeout Error: {str(e)}", compile=None, run=None) except Exception as e: + exception = "".join(traceback.format_exception(e)) return FullResult( - success=False, error=f"Error executing script: {str(e)}", compile=None, run=None - ) - - -def modal_run_cuda_script( # # noqa: C901 - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, - arch: int = None, -) -> FullResult: - """Modal version of run_cuda_script, handling timeouts""" - try: - with timeout(timeout_seconds): - comp, run = run_cuda_script( - {"eval.cu": script_content}, - {"reference.cuh": reference_content, "submission.cuh": submission_content}, - arch=arch, - include_dirs=MODAL_CUDA_INCLUDE_DIRS, - ) - return FullResult(success=True, error="", compile=comp, run=run) - # TODO fixup error handling! - except TimeoutException as e: - return FullResult(success=False, error=f"Timeout Error: {str(e)}", compile=None, run=None) - except Exception as e: - return FullResult( - success=False, error=f"Error executing script: {str(e)}", compile=None, run=None + success=False, error=f"Error executing script:\n{exception}", compile=None, run=None ) diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py index d1ada74b..615a4ee6 100644 --- a/src/discord-cluster-manager/modal_runner_archs.py +++ b/src/discord-cluster-manager/modal_runner_archs.py @@ -2,162 +2,13 @@ # Modal apps on specific devices. We will fix this later. -from consts import GPU_TO_SM -from modal_runner import app, cuda_image, modal_run_cuda_script, modal_run_pytorch_script -from run_eval import FullResult +from modal_runner import app, cuda_image, modal_run_config, python_image - -# T4: sm_70 (CUDA 7.x, Maxwell Architecture) -@app.function( - gpu="T4", - image=cuda_image, -) -def run_cuda_script_t4( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_cuda_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["T4"], - ) - - -@app.function( - gpu="T4", - image=cuda_image, -) -def run_pytorch_script_t4( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_pytorch_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["T4"], - ) - - -# L4: sm_80 (L4 Tensor Core architecture) -@app.function( - gpu="L4", - image=cuda_image, -) -def run_cuda_script_l4( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_cuda_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["L4"], - ) - - -@app.function( - gpu="L4", - image=cuda_image, -) -def run_pytorch_script_l4( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_pytorch_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["L4"], - ) - - -# A100: sm_80 (Ampere architecture) -@app.function( - gpu="A100", - image=cuda_image, -) -def run_cuda_script_a100( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_cuda_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["A100"], - ) - - -@app.function( - gpu="A100", - image=cuda_image, -) -def run_pytorch_script_a100( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_pytorch_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["A100"], - ) - - -# H100: sm_90 (Hopper architecture) -@app.function( - gpu="H100", - image=cuda_image, -) -def run_cuda_script_h100( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_cuda_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["H100"], - ) - - -@app.function( - gpu="H100", - image=cuda_image, -) -def run_pytorch_script_h100( - script_content: str, - reference_content: str = None, - submission_content: str = None, - timeout_seconds: int = 600, -) -> FullResult: - return modal_run_pytorch_script( - script_content, - reference_content, - submission_content, - timeout_seconds, - arch=GPU_TO_SM["H100"], +gpus = ["T4", "L4", "A100", "H100"] +for gpu in gpus: + app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu.lower()}", serialized=True)( + modal_run_config ) + app.function( + gpu=gpu, image=python_image, name=f"run_pytorch_script_{gpu.lower()}", serialized=True + )(modal_run_config) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 7a3eef96..a29b98e4 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -38,10 +38,10 @@ class RunResult: @dataclasses.dataclass class FullResult: # fmt: off - success: bool # did the runner (github/modal) execute successfully - error: str # if not success, an error message - compile: CompileResult # results of compilation - run: RunResult # results of running + success: bool # did the runner (github/modal) execute successfully + error: str # if not success, an error message + compile: CompileResult | None # results of compilation + run: RunResult | None # results of running # fmt: on @@ -263,3 +263,21 @@ def run_pytorch_script( # noqa: C901 for f in sources.keys(): if os.path.exists(f): os.remove(f) + + +def run_config(config: dict): + if config["lang"] == "py": + run_result = run_pytorch_script( + sources=config["sources"], main=config["main"], arch=config.get("arch", None) + ) + return FullResult(success=True, error="", compile=None, run=run_result) + elif config["lang"] == "cu": + comp, run = run_cuda_script( + sources=config["sources"], + headers=config.get("headers", {}), + arch=config.get("arch", None), + include_dirs=config.get("include_dirs", []), + ) + return FullResult(success=True, error="", compile=comp, run=run) + else: + raise ValueError(f"Invalid language {config['lang']}") diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index df44f450..32ae8700 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -5,6 +5,8 @@ from typing import Any, List, NotRequired, TypedDict import discord +from consts import MODAL_CUDA_INCLUDE_DIRS +from leaderboard_eval import cu_eval, py_eval def setup_logging(): @@ -182,3 +184,48 @@ class SubmissionItem(TypedDict): gpu_type: str stdout: NotRequired[str] profiler_output: NotRequired[str] + + +def build_task_config( + lang: str, reference_content: str = None, submission_content: str = None, arch: str = None +) -> dict: + eval_name = {"py": "eval.py", "cu": "eval.cu"}[lang] + + config = { + "lang": lang, + "arch": arch, + } + + if lang == "py": + config["main"] = "eval.py" + else: + config["include_dirs"] = MODAL_CUDA_INCLUDE_DIRS + + if reference_content is None: + return { + **config, + "sources": { + eval_name: submission_content, + }, + } + else: + if lang == "py": + return { + **config, + "sources": { + "eval.py": py_eval, + "reference.py": reference_content, + "submission.py": submission_content, + }, + } + else: + return { + **config, + "sources": { + "eval.cu": cu_eval, + }, + "headers": { + "reference.cuh": reference_content, + "submission.cuh": submission_content, + }, + }