Skip to content

Commit cfe0eb0

Browse files
committed
towards unified report code
1 parent a37c356 commit cfe0eb0

File tree

3 files changed

+137
-9
lines changed

3 files changed

+137
-9
lines changed

src/discord-cluster-manager/cogs/github_cog.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
from env import GITHUB_REPO, GITHUB_TOKEN
1515
from github import Github
1616
from leaderboard_eval import cu_eval, py_eval
17+
from run_eval import RunResult, CompileResult
1718
from utils import get_github_branch_name, send_discord_message, setup_logging
19+
from report import generate_report
1820

1921
logger = setup_logging()
2022

@@ -92,10 +94,20 @@ async def run_github(
9294

9395
await thread.send(f"Training completed with status: {status}")
9496

95-
if len(logs) > 1900:
96-
await self.bot.send_chunked_message(thread, logs, code_block=True)
97+
if expect_result:
98+
# {"success": True, **json.loads(logs)}
99+
if logs['success']:
100+
generate_report(thread,
101+
CompileResult(**logs['compile']),
102+
RunResult(**logs['run']))
103+
else:
104+
await thread.send(logs['error'])
105+
97106
else:
98-
await thread.send(f"```\n!!Logs!!:\n{logs}\n```")
107+
if len(logs) > 1900:
108+
await self.bot.send_chunked_message(thread, logs, code_block=True)
109+
else:
110+
await thread.send(f"```\nLogs:\n{logs}\n```")
99111

100112
if url:
101113
await thread.send(f"View the full run at: <{url}>")
@@ -233,8 +245,7 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
233245

234246
if run.status == "completed":
235247
if expect_result:
236-
result = await self.download_results(run_id)
237-
logs = self.make_logs(result)
248+
logs = await self.download_results(run_id)
238249
else:
239250
logs = await self.handle_training_log(run_id)
240251
return run.conclusion, logs, run.html_url
@@ -248,9 +259,6 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
248259
except Exception as e:
249260
return "error", str(e), None
250261

251-
def make_logs(self, result: dict):
252-
return pprint.pformat(result)
253-
254262
async def download_results(self, run_id):
255263
try:
256264
data = await self.download_artifact(run_id, name="run-result")

src/discord-cluster-manager/report.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import pprint
2+
from run_eval import CompileResult, RunResult
3+
import discord
4+
5+
6+
def _limit_length(text: str, maxlen: int):
    """Return *text* shortened to at most ``maxlen`` characters.

    Text that already fits is returned unchanged.  Longer text is cut and
    terminated with the marker ``" [...]"`` so the reader can see that it
    was shortened; the marker counts towards ``maxlen``.

    Args:
        text: The string to shorten.
        maxlen: Maximum length of the returned string.

    Returns:
        A string of length ``<= maxlen`` (the empty string when ``maxlen``
        is zero or negative).
    """
    marker = " [...]"
    if len(text) <= maxlen:
        return text
    if maxlen < len(marker):
        # There is no room for the marker.  Slicing with a negative end
        # would keep almost the whole text and overshoot the limit, so
        # fall back to a plain hard cut.
        return text[:max(maxlen, 0)]
    return text[:maxlen - len(marker)] + marker
11+
12+
13+
def _send_split_log(thread: "discord.Thread", partial_message: str, header: str, log: str):
    """Add a titled log section to a pending message, splitting when too long.

    If the pending message plus the new section is short enough, the section
    (a ``## header`` line followed by the log in a code block) is appended
    and the extended message is returned *without* sending anything.

    Otherwise the pending message is sent as it is, the log is cut on line
    boundaries into numbered chunks (``header (1/3)``, ``header (2/3)``, ...)
    which are each sent as their own message, and ``""`` is returned because
    nothing is left pending.

    Args:
        thread: Destination; only its ``send`` method is used.
        partial_message: Text accumulated so far that has not been sent yet.
        header: Title of the log section.
        log: The log text to report.

    Returns:
        The text that is still pending after this call; callers should
        replace their pending message with it.
    """
    # Presumably chosen to stay below Discord's 2000-character message
    # limit while leaving room for the header and code fences.
    limit = 1900

    if len(partial_message) + len(log) + len(header) < limit:
        partial_message += f"\n\n## {header}:\n"
        partial_message += f"```\n{log}```"
        return partial_message

    # NOTE(review): in discord.py `send` is a coroutine, and this function
    # is synchronous, so the calls below are not awaited.  Delivering the
    # messages requires making this function and its callers `async`.
    if partial_message:
        # Flush what was accumulated before the oversized log.
        thread.send(partial_message)

    chunks = []
    current = ""
    for line in log.splitlines():
        if len(line) >= limit:
            # A single over-long line can never fit; shorten it so that
            # every chunk respects the limit.
            line = _limit_length(line, limit - 1)
        if current and len(current) + len(line) >= limit:
            chunks.append(current)
            current = ""
        current += line + "\n"
    if current:
        # Keep the tail of the log that did not fill a whole chunk.
        chunks.append(current)

    for number, chunk in enumerate(chunks, start=1):
        thread.send(f"## {header} ({number}/{len(chunks)}):\n```\n{chunk}```")

    return ""
38+
39+
40+
def _append_stream(thread, message, header, text, always=False):
    """Add the stripped *text* as a log section and return the pending message.

    Empty text is skipped unless ``always`` is set.  The return value of
    ``_send_split_log`` already contains the previous pending text (or is
    ``""`` after a flush), so it replaces ``message`` rather than being
    appended to it.
    """
    text = text.strip()
    if text or always:
        return _send_split_log(thread, message, header, text)
    return message


def generate_report(thread: discord.Thread, comp: CompileResult, run: RunResult):
    """Post a human-readable report about a compile-and-run job to *thread*.

    Exactly one of four outcomes is reported, checked in this order:

    1. compilation failed (with a dedicated message when ``nvcc`` was not
       found at all),
    2. the program exited with an error,
    3. the program ran but did not pass all tests,
    4. success, including the pretty-printed ``run.result``.

    For every outcome the relevant command line and any non-empty
    stderr/stdout are included; long logs are split over several messages
    by ``_send_split_log``.

    Args:
        thread: Destination; only its ``send`` method is used.
        comp: Outcome of the compilation step.
        run: Outcome of running the compiled program.
    """
    # NOTE(review): in discord.py `send` is a coroutine, and this function
    # is synchronous, so `thread.send(...)` below is not awaited.
    # Delivering the messages requires making this function `async` and
    # awaiting it at the call site.
    if not comp.success or not run.success:
        message = "# Run was not successful\n"
    else:
        message = "# Run was successful\n"

    if not comp.success:
        if not comp.nvcc_found:
            message += "**Compilation failed**\nNVCC could not be found.\n"
            message += "This indicates a bug in the runner configuration, _not in your code_.\n"
            message += "Please notify the server admins of this problem"
            thread.send(message)
            return

        # ok, we found nvcc
        message += "**Compilation failed**\n"
        message += "Command "
        message += f"```bash\n>{_limit_length(comp.command, 1000)}```\n"
        message += f"exited with code **{comp.exit_code}**."

        # The compiler's stderr is always shown, even when it is empty.
        message = _append_stream(thread, message, "Compiler stderr", comp.stderr, always=True)
        message = _append_stream(thread, message, "Compiler stdout", comp.stdout)

    elif not run.success:
        message += "**Running failed**\n"
        message += "Command "
        message += f"```bash\n>{_limit_length(run.command, 1000)}```\n"
        message += f"exited with error code **{run.exit_code}** after {run.duration} seconds."

        message = _append_stream(thread, message, "Program stderr", run.stderr)
        message = _append_stream(thread, message, "Program stdout", run.stdout)

    elif not run.passed:
        message += "**Testing failed**\n"
        message += "Command "
        message += f"```bash\n>{_limit_length(run.command, 1000)}```\n"
        message += f"ran successfully in {run.duration} seconds, but did not pass all tests.\n"

        message = _append_stream(thread, message, "Program stderr", run.stderr)
        message = _append_stream(thread, message, "Program stdout", run.stdout)
        # TODO dedicated "error" entry in our results dict that gets populated by check_implementation

    else:
        # OK, we were successful
        message += "**Success!**\n"
        message = _send_split_log(thread, message, "Result", pprint.pformat(run.result))

        message = _append_stream(thread, message, "Program stderr", run.stderr)
        message = _append_stream(thread, message, "Program stdout", run.stdout)

    # Send whatever is still pending; it is empty when the last section had
    # to be split and was therefore already flushed.
    if message:
        thread.send(message)

src/discord-cluster-manager/run_eval.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class CompileResult:
2525
class RunResult:
2626
# fmt: off
2727
success: bool # did the compiled executable run successfully
28+
passed: bool # did it pass all tests
2829
command: str # the command that was run to compile the code
2930
stdout: str # standard output produced by the compiler
3031
stderr: str # standard error produced by the compiler
@@ -142,7 +143,9 @@ def run_cuda_program(args: list[str]) -> RunResult:
142143
result_dict[key.strip()] = value.strip()
143144

144145
return RunResult(
145-
success=run_process.returncode == 0,
146+
# TODO should we return 0 also on test failure?
147+
success=(run_process.returncode == 0 or run_process.returncode == 1),
148+
passed=result_dict.get("check", None) == "pass",
146149
command=_make_cmd(run_process.args),
147150
stdout=run_process.stdout,
148151
stderr=run_process.stderr,

0 commit comments

Comments
 (0)