wip github runner

gpu-mode · Jan 13, 2025 · a37c356 · a37c356
1 parent d7df3c9
commit a37c356
Show file tree

Hide file tree

Showing 4 changed files with 203 additions and 57 deletions.
diff --git a/.github/workflows/nvidia_cuda_workflow.yml b/.github/workflows/nvidia_cuda_workflow.yml
@@ -0,0 +1,64 @@
+name: NVIDIA CUDA Job
+on:
+  push:
+    branches:
+      - '**'
+
+  workflow_dispatch:
+    inputs:
+      script_content:
+        description: 'Content of CUDA script'
+        required: true
+        type: string
+      reference_content:
+        description: 'Content of the reference code script'
+        required: true
+        type: string
+
+jobs:
+  run:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Create input files
+        shell: bash
+        run: |
+          cat > "train.cuh" <<EOL
+          ${{ github.event.inputs.script_content }}
+          EOL
+          
+          cat "train.cuh"  # Debug: show file contents
+          
+          echo "Creating reference script..."
+          cat > "reference.cuh" <<EOL
+          ${{ github.event.inputs.reference_content }}
+          EOL
+          
+          cat "reference.cuh"  # Debug: Show file contents
+
+      - name: Run script
+        shell: bash
+        run: |
+          python .github/workflows/run-cuda.py
+          cat result.json  # Debug: show output
+
+      - name: Upload training artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: run-result
+          path: |
+            result.json
+            train.cuh
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
diff --git a/.github/workflows/run-cuda.py b/.github/workflows/run-cuda.py
@@ -0,0 +1,22 @@
+import json
+import sys
+from pathlib import Path
+from dataclasses import asdict
+
+sys.path.append("src/discord-cluster-manager")
+
+from leaderboard_eval import cu_eval
+from run_eval import run_cuda_script
+
+ref = Path("reference.cuh")
+sub = Path("train.cuh")
+
+comp, run = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)
+
+
+result = {
+    "compile": asdict(comp),
+    "run": asdict(run)
+}
+
+Path("result.json").write_text(json.dumps(result))
diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py
@@ -1,6 +1,9 @@
 import asyncio
+import json
 import os
 import zipfile
+import pprint
+import tempfile
 from datetime import datetime, timedelta, timezone
 
 import discord
@@ -49,6 +52,7 @@ async def run_github(
 
         thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
         await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
+        expect_result = False
 
         try:
             script_content = (await script.read()).decode("utf-8")
@@ -61,30 +65,37 @@ async def run_github(
                     if reference_code is not None
                     else (await reference_script.read()).decode("utf-8")
                 )
-                eval_code = py_eval if script.filename.endswith(".py") else cu_eval
-
-                run_id = await self.trigger_github_action(
-                    script_content,
-                    filename,
-                    selected_gpu,
-                    reference_content,
-                    eval_code,
-                )
+                if not script.filename.endswith(".py") and gpu_type.value != "amd":
+                    run_id = await self.trigger_github_cuda(
+                        script_content,
+                        reference_content,
+                    )
+                    expect_result = True
+                else:
+                    eval_code = py_eval if script.filename.endswith(".py") else cu_eval
+
+                    run_id = await self.trigger_github_action(
+                        script_content,
+                        filename,
+                        selected_gpu,
+                        reference_content,
+                        eval_code,
+                    )
             else:
                 run_id = await self.trigger_github_action(script_content, filename, selected_gpu)
 
             if run_id:
                 await thread.send(
                     f"GitHub Action triggered! Run ID: {run_id}\nMonitoring progress..."
                 )
-                status, logs, url = await self.check_workflow_status(run_id, thread)
+                status, logs, url = await self.check_workflow_status(run_id, thread, expect_result)
 
                 await thread.send(f"Training completed with status: {status}")
 
                 if len(logs) > 1900:
                     await self.bot.send_chunked_message(thread, logs, code_block=True)
                 else:
-                    await thread.send(f"```\nLogs:\n{logs}\n```")
+                    await thread.send(f"```\n!!Logs!!:\n{logs}\n```")
 
                 if url:
                     await thread.send(f"View the full run at: <{url}>")
@@ -101,6 +112,40 @@ async def run_github(
                 await thread.send(f"Error processing request: {str(e)}")
             raise
 
+    async def trigger_github_cuda(
+        self,
+        script_content,
+        reference_content,
+    ):
+        logger.info(f"Attempting to trigger GitHub action for CUDA")
+        gh = Github(GITHUB_TOKEN)
+        repo = gh.get_repo(GITHUB_REPO)
+
+        try:
+            trigger_time = datetime.now(timezone.utc)
+            workflow_file = "nvidia_cuda_workflow.yml"
+            workflow = repo.get_workflow(workflow_file)
+
+            success = workflow.create_dispatch(
+                get_github_branch_name(),
+                {
+                    "script_content": script_content,
+                    "reference_content": reference_content,
+                },
+            )
+            if success:
+                await asyncio.sleep(2)
+                runs = list(workflow.get_runs())
+
+                for run in runs:
+                    if run.created_at.replace(tzinfo=timezone.utc) > trigger_time:
+                        return run.id
+            return None
+
+        except Exception as e:
+            logger.error(f"Error in trigger_github_action: {str(e)}", exc_info=True)
+            return None
+
     async def trigger_github_action(
         self,
         script_content,
@@ -152,7 +197,7 @@ async def trigger_github_action(
             logger.error(f"Error in trigger_github_action: {str(e)}", exc_info=True)
             return None
 
-    async def check_workflow_status(self, run_id, thread):
+    async def check_workflow_status(self, run_id, thread, expect_result: bool=False):
         logger.info(f"Starting to monitor workflow status for run {run_id}")
         gh = Github(GITHUB_TOKEN)
         repo = gh.get_repo(GITHUB_REPO)
@@ -187,7 +232,11 @@ async def check_workflow_status(self, run_id, thread):
                     )
 
                 if run.status == "completed":
-                    logs = await self.download_artifact(run_id)
+                    if expect_result:
+                        result = await self.download_results(run_id)
+                        logs = self.make_logs(result)
+                    else:
+                        logs = await self.handle_training_log(run_id)
                     return run.conclusion, logs, run.html_url
 
                 await thread.send(
@@ -199,41 +248,51 @@ async def check_workflow_status(self, run_id, thread):
             except Exception as e:
                 return "error", str(e), None
 
-    async def download_artifact(self, run_id):
-        logger.info(f"Attempting to download artifacts for run {run_id}")
-        gh = Github(GITHUB_TOKEN)
-        repo = gh.get_repo(GITHUB_REPO)
+    def make_logs(self, result: dict):
+        return pprint.pformat(result)
 
+    async def download_results(self, run_id):
         try:
-            run = repo.get_workflow_run(run_id)
-            artifacts = run.get_artifacts()
-
-            for artifact in artifacts:
-                if artifact.name == "training-artifacts":
-                    url = artifact.archive_download_url
-                    headers = {"Authorization": f"token {GITHUB_TOKEN}"}
-                    response = requests.get(url, headers=headers)
-
-                    if response.status_code == 200:
-                        with open("training.log.zip", "wb") as f:
-                            f.write(response.content)
-
-                        with zipfile.ZipFile("training.log.zip") as z:
-                            log_file = next(
-                                (f for f in z.namelist() if f.endswith("training.log")),
-                                None,
-                            )
-                            if log_file:
-                                with z.open(log_file) as f:
-                                    logs = f.read().decode("utf-8")
-                            else:
-                                logs = "training.log file not found in artifact"
-
-                        os.remove("training.log.zip")
-                        return logs
-                    else:
-                        return f"Failed to download artifact. Status code: {response.status_code}"
+            data = await self.download_artifact(run_id, name="run-result")
+            logs = data['result.json'].decode("utf-8")
+            return {"success": True, **json.loads(logs)}
+        except Exception as e:
+            return {"success": False, "error": f"Error downloading artifacts: {str(e)}"}
 
-            return "No training artifacts found"
+    async def handle_training_log(self, run_id):
+        try:
+            data = await self.download_artifact(run_id, name="training-artifacts")
+            logs = data['training.log'].decode("utf-8")
+            return logs
         except Exception as e:
             return f"Error downloading artifacts: {str(e)}"
+
+    async def download_artifact(self, run_id, name: str):
+        logger.info(f"Attempting to download artifact {name} for run {run_id}")
+        gh = Github(GITHUB_TOKEN)
+        repo = gh.get_repo(GITHUB_REPO)
+
+        run = repo.get_workflow_run(run_id)
+        artifacts = run.get_artifacts()
+
+        for artifact in artifacts:
+            if artifact.name == name:
+                url = artifact.archive_download_url
+                headers = {"Authorization": f"token {GITHUB_TOKEN}"}
+                response = requests.get(url, headers=headers)
+
+                if response.status_code == 200:
+                    with tempfile.NamedTemporaryFile("w+b") as temp:
+                        temp.write(response.content)
+                        temp.flush()
+
+                        with zipfile.ZipFile(temp.name) as z:
+                            artifact_dict = {}
+                            for file in z.namelist():
+                                with z.open(file) as f:
+                                    artifact_dict[file] = f.read()
+
+                    return artifact_dict
+                else:
+                    raise RuntimeError(f"Failed to download artifact. Status code: {response.status_code}")
+        return RuntimeError(f"Could not find artifact {name}")
diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py
@@ -1,5 +1,6 @@
 import asyncio
 import re
+from pathlib import Path
 from unittest.mock import AsyncMock
 
 import discord
@@ -12,17 +13,17 @@
 logger = setup_logging()
 
 
-def create_mock_attachment():
+def create_mock_attachment(file_name: str, content: str):
     "Create an AsyncMock to simulate discord.Attachment"
 
     mock_attachment = AsyncMock(spec=discord.Attachment)
-    mock_attachment.filename = "test_script.py"
+    mock_attachment.filename = file_name
     mock_attachment.content_type = "text/plain"
-    mock_attachment.read = AsyncMock(return_value="print('Hello, world!')".encode("utf-8"))
+    mock_attachment.read = AsyncMock(return_value=content.encode("utf-8"))
     return mock_attachment
 
 
-script_file = create_mock_attachment()
+script_file = create_mock_attachment("test_script.py", "print('Hello, world!')")
 
 
 class VerifyRunCog(commands.Cog):
@@ -45,7 +46,10 @@ async def verify_github_run(
         interaction: discord.Interaction,
     ) -> bool:
         github_command = github_cog.run_github
-        github_thread = await github_command.callback(github_cog, interaction, script_file, choice)
+        cuda_file = create_mock_attachment("test.cu", Path("examples/identity_cuda/submission.cuh").read_text())
+        reference_code = Path("examples/identity_cuda/reference.cuh").read_text()
+        github_thread = await github_command.callback(github_cog, interaction, cuda_file, choice,
+                                                      reference_code=reference_code)
 
         message_contents = [msg.content async for msg in github_thread.history(limit=None)]
 
@@ -90,10 +94,7 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter
 
         message_contents = [msg.content async for msg in modal_thread.history(limit=None)]
 
-        required_patterns = [
-            "Running on Modal...",
-            "Job completed!"
-        ]
+        required_patterns = ["Running on Modal...", "Job completed!"]
 
         all_patterns_found = all(
             any(re.search(pattern, content, re.DOTALL) is not None for content in message_contents)
@@ -139,8 +140,8 @@ async def verify_runs(self, interaction: discord.Interaction):
 
             results = await asyncio.gather(
                 self.verify_github_run(github_cog, nvidia, interaction),
-                self.verify_github_run(github_cog, amd, interaction),
-                self.verify_modal_run(modal_cog, interaction),
+                #self.verify_github_run(github_cog, amd, interaction),
+                #self.verify_modal_run(modal_cog, interaction),
             )
 
             if all(results):