Skip to content

Commit

Permalink
wip github runner
Browse files Browse the repository at this point in the history
  • Loading branch information
ngc92 committed Jan 13, 2025
1 parent d7df3c9 commit a37c356
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 57 deletions.
64 changes: 64 additions & 0 deletions .github/workflows/nvidia_cuda_workflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
name: NVIDIA CUDA Job
on:
push:
branches:
- '**'

workflow_dispatch:
inputs:
script_content:
description: 'Content of CUDA script'
required: true
type: string
reference_content:
description: 'Content of the reference code script'
required: true
type: string

jobs:
run:
runs-on: [gpumode-nvidia-arc]
timeout-minutes: 10
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04
steps:
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Create input files
shell: bash
run: |
cat > "train.cuh" <<EOL
${{ github.event.inputs.script_content }}
EOL
cat "train.cuh" # Debug: show file contents
echo "Creating reference script..."
cat > "reference.cuh" <<EOL
${{ github.event.inputs.reference_content }}
EOL
cat "reference.cuh" # Debug: Show file contents
- name: Run script
shell: bash
run: |
python .github/workflows/run-cuda.py
cat result.json # Debug: show output
- name: Upload training artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: run-result
path: |
result.json
train.cuh
env:
CUDA_VISIBLE_DEVICES: 0
22 changes: 22 additions & 0 deletions .github/workflows/run-cuda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import json
import sys
from pathlib import Path
from dataclasses import asdict

sys.path.append("src/discord-cluster-manager")

from leaderboard_eval import cu_eval
from run_eval import run_cuda_script

ref = Path("reference.cuh")
sub = Path("train.cuh")

comp, run = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)


result = {
"compile": asdict(comp),
"run": asdict(run)
}

Path("result.json").write_text(json.dumps(result))
151 changes: 105 additions & 46 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import asyncio
import json
import os
import zipfile
import pprint
import tempfile
from datetime import datetime, timedelta, timezone

import discord
Expand Down Expand Up @@ -49,6 +52,7 @@ async def run_github(

thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
expect_result = False

try:
script_content = (await script.read()).decode("utf-8")
Expand All @@ -61,30 +65,37 @@ async def run_github(
if reference_code is not None
else (await reference_script.read()).decode("utf-8")
)
eval_code = py_eval if script.filename.endswith(".py") else cu_eval

run_id = await self.trigger_github_action(
script_content,
filename,
selected_gpu,
reference_content,
eval_code,
)
if not script.filename.endswith(".py") and gpu_type.value != "amd":
run_id = await self.trigger_github_cuda(
script_content,
reference_content,
)
expect_result = True
else:
eval_code = py_eval if script.filename.endswith(".py") else cu_eval

run_id = await self.trigger_github_action(
script_content,
filename,
selected_gpu,
reference_content,
eval_code,
)
else:
run_id = await self.trigger_github_action(script_content, filename, selected_gpu)

if run_id:
await thread.send(
f"GitHub Action triggered! Run ID: {run_id}\nMonitoring progress..."
)
status, logs, url = await self.check_workflow_status(run_id, thread)
status, logs, url = await self.check_workflow_status(run_id, thread, expect_result)

await thread.send(f"Training completed with status: {status}")

if len(logs) > 1900:
await self.bot.send_chunked_message(thread, logs, code_block=True)
else:
await thread.send(f"```\nLogs:\n{logs}\n```")
await thread.send(f"```\n!!Logs!!:\n{logs}\n```")

if url:
await thread.send(f"View the full run at: <{url}>")
Expand All @@ -101,6 +112,40 @@ async def run_github(
await thread.send(f"Error processing request: {str(e)}")
raise

async def trigger_github_cuda(
self,
script_content,
reference_content,
):
logger.info(f"Attempting to trigger GitHub action for CUDA")
gh = Github(GITHUB_TOKEN)
repo = gh.get_repo(GITHUB_REPO)

try:
trigger_time = datetime.now(timezone.utc)
workflow_file = "nvidia_cuda_workflow.yml"
workflow = repo.get_workflow(workflow_file)

success = workflow.create_dispatch(
get_github_branch_name(),
{
"script_content": script_content,
"reference_content": reference_content,
},
)
if success:
await asyncio.sleep(2)
runs = list(workflow.get_runs())

for run in runs:
if run.created_at.replace(tzinfo=timezone.utc) > trigger_time:
return run.id
return None

except Exception as e:
logger.error(f"Error in trigger_github_action: {str(e)}", exc_info=True)
return None

async def trigger_github_action(
self,
script_content,
Expand Down Expand Up @@ -152,7 +197,7 @@ async def trigger_github_action(
logger.error(f"Error in trigger_github_action: {str(e)}", exc_info=True)
return None

async def check_workflow_status(self, run_id, thread):
async def check_workflow_status(self, run_id, thread, expect_result: bool=False):
logger.info(f"Starting to monitor workflow status for run {run_id}")
gh = Github(GITHUB_TOKEN)
repo = gh.get_repo(GITHUB_REPO)
Expand Down Expand Up @@ -187,7 +232,11 @@ async def check_workflow_status(self, run_id, thread):
)

if run.status == "completed":
logs = await self.download_artifact(run_id)
if expect_result:
result = await self.download_results(run_id)
logs = self.make_logs(result)
else:
logs = await self.handle_training_log(run_id)
return run.conclusion, logs, run.html_url

await thread.send(
Expand All @@ -199,41 +248,51 @@ async def check_workflow_status(self, run_id, thread):
except Exception as e:
return "error", str(e), None

async def download_artifact(self, run_id):
logger.info(f"Attempting to download artifacts for run {run_id}")
gh = Github(GITHUB_TOKEN)
repo = gh.get_repo(GITHUB_REPO)
def make_logs(self, result: dict):
return pprint.pformat(result)

async def download_results(self, run_id):
try:
run = repo.get_workflow_run(run_id)
artifacts = run.get_artifacts()

for artifact in artifacts:
if artifact.name == "training-artifacts":
url = artifact.archive_download_url
headers = {"Authorization": f"token {GITHUB_TOKEN}"}
response = requests.get(url, headers=headers)

if response.status_code == 200:
with open("training.log.zip", "wb") as f:
f.write(response.content)

with zipfile.ZipFile("training.log.zip") as z:
log_file = next(
(f for f in z.namelist() if f.endswith("training.log")),
None,
)
if log_file:
with z.open(log_file) as f:
logs = f.read().decode("utf-8")
else:
logs = "training.log file not found in artifact"

os.remove("training.log.zip")
return logs
else:
return f"Failed to download artifact. Status code: {response.status_code}"
data = await self.download_artifact(run_id, name="run-result")
logs = data['result.json'].decode("utf-8")
return {"success": True, **json.loads(logs)}
except Exception as e:
return {"success": False, "error": f"Error downloading artifacts: {str(e)}"}

return "No training artifacts found"
async def handle_training_log(self, run_id):
try:
data = await self.download_artifact(run_id, name="training-artifacts")
logs = data['training.log'].decode("utf-8")
return logs
except Exception as e:
return f"Error downloading artifacts: {str(e)}"

async def download_artifact(self, run_id, name: str):
logger.info(f"Attempting to download artifact {name} for run {run_id}")
gh = Github(GITHUB_TOKEN)
repo = gh.get_repo(GITHUB_REPO)

run = repo.get_workflow_run(run_id)
artifacts = run.get_artifacts()

for artifact in artifacts:
if artifact.name == name:
url = artifact.archive_download_url
headers = {"Authorization": f"token {GITHUB_TOKEN}"}
response = requests.get(url, headers=headers)

if response.status_code == 200:
with tempfile.NamedTemporaryFile("w+b") as temp:
temp.write(response.content)
temp.flush()

with zipfile.ZipFile(temp.name) as z:
artifact_dict = {}
for file in z.namelist():
with z.open(file) as f:
artifact_dict[file] = f.read()

return artifact_dict
else:
raise RuntimeError(f"Failed to download artifact. Status code: {response.status_code}")
return RuntimeError(f"Could not find artifact {name}")
23 changes: 12 additions & 11 deletions src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import re
from pathlib import Path
from unittest.mock import AsyncMock

import discord
Expand All @@ -12,17 +13,17 @@
logger = setup_logging()


def create_mock_attachment():
def create_mock_attachment(file_name: str, content: str):
"Create an AsyncMock to simulate discord.Attachment"

mock_attachment = AsyncMock(spec=discord.Attachment)
mock_attachment.filename = "test_script.py"
mock_attachment.filename = file_name
mock_attachment.content_type = "text/plain"
mock_attachment.read = AsyncMock(return_value="print('Hello, world!')".encode("utf-8"))
mock_attachment.read = AsyncMock(return_value=content.encode("utf-8"))
return mock_attachment


script_file = create_mock_attachment()
script_file = create_mock_attachment("test_script.py", "print('Hello, world!')")


class VerifyRunCog(commands.Cog):
Expand All @@ -45,7 +46,10 @@ async def verify_github_run(
interaction: discord.Interaction,
) -> bool:
github_command = github_cog.run_github
github_thread = await github_command.callback(github_cog, interaction, script_file, choice)
cuda_file = create_mock_attachment("test.cu", Path("examples/identity_cuda/submission.cuh").read_text())
reference_code = Path("examples/identity_cuda/reference.cuh").read_text()
github_thread = await github_command.callback(github_cog, interaction, cuda_file, choice,
reference_code=reference_code)

message_contents = [msg.content async for msg in github_thread.history(limit=None)]

Expand Down Expand Up @@ -90,10 +94,7 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter

message_contents = [msg.content async for msg in modal_thread.history(limit=None)]

required_patterns = [
"Running on Modal...",
"Job completed!"
]
required_patterns = ["Running on Modal...", "Job completed!"]

all_patterns_found = all(
any(re.search(pattern, content, re.DOTALL) is not None for content in message_contents)
Expand Down Expand Up @@ -139,8 +140,8 @@ async def verify_runs(self, interaction: discord.Interaction):

results = await asyncio.gather(
self.verify_github_run(github_cog, nvidia, interaction),
self.verify_github_run(github_cog, amd, interaction),
self.verify_modal_run(modal_cog, interaction),
#self.verify_github_run(github_cog, amd, interaction),
#self.verify_modal_run(modal_cog, interaction),
)

if all(results):
Expand Down

0 comments on commit a37c356

Please sign in to comment.