Skip to content

Commit

Permalink
towards unified report code
Browse files Browse the repository at this point in the history
  • Loading branch information
ngc92 committed Jan 13, 2025
1 parent a37c356 commit cfe0eb0
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 9 deletions.
24 changes: 16 additions & 8 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
from env import GITHUB_REPO, GITHUB_TOKEN
from github import Github
from leaderboard_eval import cu_eval, py_eval
from run_eval import RunResult, CompileResult
from utils import get_github_branch_name, send_discord_message, setup_logging
from report import generate_report

logger = setup_logging()

Expand Down Expand Up @@ -92,10 +94,20 @@ async def run_github(

await thread.send(f"Training completed with status: {status}")

if len(logs) > 1900:
await self.bot.send_chunked_message(thread, logs, code_block=True)
if expect_result:
# {"success": True, **json.loads(logs)}
if logs['success']:
generate_report(thread,
CompileResult(**logs['compile']),
RunResult(**logs['run']))
else:
await thread.send(logs['error'])

else:
await thread.send(f"```\n!!Logs!!:\n{logs}\n```")
if len(logs) > 1900:
await self.bot.send_chunked_message(thread, logs, code_block=True)
else:
await thread.send(f"```\nLogs:\n{logs}\n```")

if url:
await thread.send(f"View the full run at: <{url}>")
Expand Down Expand Up @@ -233,8 +245,7 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)

if run.status == "completed":
if expect_result:
result = await self.download_results(run_id)
logs = self.make_logs(result)
logs = await self.download_results(run_id)
else:
logs = await self.handle_training_log(run_id)
return run.conclusion, logs, run.html_url
Expand All @@ -248,9 +259,6 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
except Exception as e:
return "error", str(e), None

def make_logs(self, result: dict):
return pprint.pformat(result)

async def download_results(self, run_id):
try:
data = await self.download_artifact(run_id, name="run-result")
Expand Down
117 changes: 117 additions & 0 deletions src/discord-cluster-manager/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import pprint
from run_eval import CompileResult, RunResult
import discord


def _limit_length(text: str, maxlen: int):
if len(text) > maxlen:
return text[:maxlen-6] + " [...]"
else:
return text


def _send_split_log(thread: discord.Thread, partial_message: str, header: str, log: str):
if len(partial_message) + len(log) + len(header) < 1900:
partial_message += f"\n\n## {header}:\n"
partial_message += f"```\n{log}```"
return partial_message
else:
# send previous chunk
thread.send(partial_message)
lines = log.splitlines()
chunks = []
partial_message = ""
for line in lines:
if len(partial_message) + len(line) < 1900:
partial_message += line + "\n"
else:
chunks.append(partial_message)
partial_message = line

# now, format the chunks
for i, chunk in enumerate(chunks):
partial_message += f"\n\n## {header} ({i}/{len(chunks)}):\n"
partial_message += f"```\n{_limit_length(log, 1900)}```"
thread.send(partial_message)

return ""


def generate_report(thread: discord.Thread, comp: CompileResult, run: RunResult):
    """Post a human-readable report about a compile-and-run attempt to *thread*.

    The report starts with an overall success/failure heading and then
    describes the first stage that went wrong:

    * compilation failed because nvcc was not found,
    * compilation failed (command, exit code, compiler output),
    * the program exited with an error (command, exit code, duration, output),
    * the program ran but did not pass all tests,

    or, if nothing went wrong, the parsed result followed by any program
    output. Long logs are split over several messages by ``_send_split_log``.

    Args:
        thread: Destination; only its ``send`` method is used.
        comp: Outcome of the compilation step.
        run: Outcome of running the compiled program.
    """
    # NOTE(review): the value returned by thread.send() is not awaited in this
    # function. Other code in this project awaits thread.send(), which
    # suggests it is a coroutine function; if so, this function has to become
    # `async` (and be awaited by its callers) before messages are delivered.
    if comp.success and run.success:
        message = "# Run was successful\n"
    else:
        message = "# Run was not successful\n"

    if not comp.success:
        if not comp.nvcc_found:
            message += "**Compilation failed**\nNVCC could not be found.\n"
            message += "This indicates a bug in the runner configuration, _not in your code_.\n"
            message += "Please notify the server admins of this problem"
            thread.send(message)
            return

        # ok, we found nvcc
        message += "**Compilation failed**\n"
        message += "Command "
        message += f"```bash\n>{_limit_length(comp.command, 1000)}```\n"
        message += f"exited with code **{comp.exit_code}**."

        # _send_split_log returns the *complete* pending message (or "" once
        # it has been sent), so its result replaces `message`; appending it
        # would duplicate or re-send text.
        message = _send_split_log(thread, message, "Compiler stderr", comp.stderr.strip())
        message = _append_stream(thread, message, "Compiler stdout", comp.stdout)

    elif not run.success:
        message += "**Running failed**\n"
        message += "Command "
        message += f"```bash\n>{_limit_length(run.command, 1000)}```\n"
        message += f"exited with error code **{run.exit_code}** after {run.duration} seconds."

        message = _append_stream(thread, message, "Program stderr", run.stderr)
        message = _append_stream(thread, message, "Program stdout", run.stdout)

    elif not run.passed:
        message += "**Testing failed**\n"
        message += "Command "
        message += f"```bash\n>{_limit_length(run.command, 1000)}```\n"
        message += f"ran successfully in {run.duration} seconds, but did not pass all tests.\n"

        message = _append_stream(thread, message, "Program stderr", run.stderr)
        message = _append_stream(thread, message, "Program stdout", run.stdout)
        # TODO dedicated "error" entry in our results dict that gets populated by check_implementation

    else:
        # OK, we were successful
        message += "**Success!**\n"
        message = _send_split_log(thread, message, "Result", pprint.pformat(run.result))

        message = _append_stream(thread, message, "Program stderr", run.stderr)
        message = _append_stream(thread, message, "Program stdout", run.stdout)

    # Whatever is still pending has not been sent yet.
    if message:
        thread.send(message)


def _append_stream(thread: discord.Thread, message: str, header: str, text: str):
    """Add *text* as a log section titled *header*, unless it is blank.

    Returns the pending message after the section has been added (or sent).
    """
    stripped = text.strip()
    if not stripped:
        return message
    return _send_split_log(thread, message, header, stripped)
5 changes: 4 additions & 1 deletion src/discord-cluster-manager/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class CompileResult:
class RunResult:
# fmt: off
success: bool # did the compiled executable run successfully
passed: bool # did it pass all tests
command: str # the command that was run to compile the code
stdout: str # standard output produced by the compiler
stderr: str # standard error produced by the compiler
Expand Down Expand Up @@ -142,7 +143,9 @@ def run_cuda_program(args: list[str]) -> RunResult:
result_dict[key.strip()] = value.strip()

return RunResult(
success=run_process.returncode == 0,
# TODO should we return 0 also on test failure?
success=(run_process.returncode == 0 or run_process.returncode == 1),
passed=result_dict.get("check", None) == "pass",
command=_make_cmd(run_process.args),
stdout=run_process.stdout,
stderr=run_process.stderr,
Expand Down

0 comments on commit cfe0eb0

Please sign in to comment.