html formatting

google · Oct 30, 2024 · 8c25286 · 8c25286
1 parent 01f2e1a
commit 8c25286
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 8 deletions.
diff --git a/agent/base_agent.py b/agent/base_agent.py
@@ -44,8 +44,8 @@ def get_tool(self, tool_name: str) -> Optional[BaseTool]:
 
   def chat_llm(self, cur_round: int, client: Any, prompt: Prompt) -> str:
     """Chat with LLM."""
-    logger.info('<CHAT PROMPT:ROUND %02d>%s</CHAT PROMPT:ROUND %2d>', cur_round,
-                prompt.get(), cur_round)
+    logger.info('<CHAT PROMPT:ROUND %02d>%s</CHAT PROMPT:ROUND %02d>',
+                cur_round, prompt.get(), cur_round)
     response = self.llm.chat_llm(client=client, prompt=prompt)
     logger.info('<CHAT RESPONSE:ROUND %02d>%s</CHAT RESPONSE:ROUND %02d>',
                 cur_round, response, cur_round)

diff --git a/report/common.py b/report/common.py
@@ -129,6 +129,13 @@ class Triage:
   triager_prompt: str
 
 
+@dataclasses.dataclass
+class LogPart:  # One segment of a log, as built by _parse_log_parts().
+  chat_prompt: bool = False  # Segment is the body of a CHAT PROMPT block.
+  chat_response: bool = False  # Segment is the body of a CHAT RESPONSE block.
+  content: str = ''  # Segment text; chat start/end markers are not included.
+
+
 class FileSystem:
   """
   FileSystem provides a wrapper over standard library and GCS client and
@@ -267,6 +274,9 @@ def match_benchmark(self, benchmark_id: str, results: list[evaluator.Result],
   def get_final_target_code(self, benchmark: str, sample: str) -> str:
     """Gets the targets of benchmark |benchmark| with sample ID |sample|."""
     targets_dir = os.path.join(self._results_dir, benchmark, 'fixed_targets')
+    # TODO(donggeliu): Make this consistent with agent output.
+    if not os.path.exists(targets_dir):
+      return ''
 
     for name in sorted(FileSystem(targets_dir).listdir()):
       path = os.path.join(targets_dir, name)
@@ -277,14 +287,14 @@ def get_final_target_code(self, benchmark: str, sample: str) -> str:
         return code
     return ''
 
-  def get_logs(self, benchmark: str, sample: str) -> str:
+  def get_logs(self, benchmark: str, sample: str) -> list[LogPart]:
     status_dir = os.path.join(self._results_dir, benchmark, 'status')
     results_path = os.path.join(status_dir, sample, 'log.txt')
     if not FileSystem(results_path).exists():
-      return ''
+      return []
 
     with FileSystem(results_path).open() as f:
-      return f.read()
+      return _parse_log_parts(f.read())
 
   def get_run_logs(self, benchmark: str, sample: str) -> str:
     """Returns the content of the last run log."""
@@ -353,6 +363,10 @@ def _get_targets(self, benchmark: str, sample: str) -> list[Target]:
     """Gets the targets of benchmark |benchmark| with sample ID |sample| from
     the OFG version 1 (single prompt)."""
     targets_dir = os.path.join(self._results_dir, benchmark, 'fixed_targets')
+    # TODO(donggeliu): Make this consistent with agent output.
+    if not os.path.exists(targets_dir):
+      return []
+
     targets = []
 
     for name in sorted(FileSystem(targets_dir).listdir()):
@@ -533,7 +547,11 @@ def _is_valid_benchmark_dir(self, cur_dir: str) -> bool:
       return True
 
     # Check sub-directories.
-    expected_dirs = ['raw_targets', 'status', 'fixed_targets']
+    # TODO(donggeliu): Make this consistent with agent output.
+    # We used to expect 'fixed_targets' and 'raw_targets' here, but the agent
+    # workflow doesn't populate them. As a result, these directories don't get
+    # uploaded to GCS.
+    expected_dirs = ['status']
     return all(
         FileSystem(os.path.join(self._results_dir, cur_dir,
                                 expected_dir)).isdir()
@@ -545,6 +563,10 @@ def _get_generated_targets(self, benchmark: str) -> list[str]:
     prompt)."""
     targets = []
     raw_targets_dir = os.path.join(self._results_dir, benchmark, 'raw_targets')
+    # TODO(donggeliu): Make this consistent with agent output.
+    if not os.path.exists(raw_targets_dir):
+      return []
+
     for filename in sorted(FileSystem(raw_targets_dir).listdir()):
       if os.path.splitext(filename)[1] in TARGET_EXTS:
         targets.append(os.path.join(raw_targets_dir, filename))
@@ -623,3 +645,53 @@ def _find_benchmark_signature(self, project: str,
           matched_prefix_signature = function_signature
 
     return matched_prefix_signature
+
+
+def _parse_log_parts(log: str) -> list[LogPart]:
+  """Parse log into parts."""
+  _CHAT_PROMPT_START_MARKER = re.compile(r'<CHAT PROMPT:ROUND\s+\d+>')
+  _CHAT_PROMPT_END_MARKER = re.compile(r'</CHAT PROMPT:ROUND\s+\d+>')
+  _CHAT_RESPONSE_START_MARKER = re.compile(r'<CHAT RESPONSE:ROUND\s+\d+>')
+  _CHAT_RESPONSE_END_MARKER = re.compile(r'</CHAT RESPONSE:ROUND\s+\d+>')
+  parts = []
+  idx = 0
+  next_marker = _CHAT_PROMPT_START_MARKER
+
+  while idx < len(log):
+    match = next_marker.search(log, idx)
+    if not match:
+      parts.append(LogPart(content=log[idx:]))
+      break
+
+    if match.start() > idx:
+      # Log content in between chat logs.
+      parts.append(LogPart(content=log[idx:match.start()]))
+
+    # Read up to the start of the corresponding end marker.
+    end_idx = len(log)
+
+    chat_prompt = False
+    chat_response = False
+    if next_marker == _CHAT_PROMPT_START_MARKER:
+      end = _CHAT_PROMPT_END_MARKER.search(log, match.end())
+      chat_prompt = True
+      next_marker = _CHAT_RESPONSE_START_MARKER
+    else:
+      assert next_marker == _CHAT_RESPONSE_START_MARKER
+      end = _CHAT_RESPONSE_END_MARKER.search(log, match.end())
+      chat_response = True
+      next_marker = _CHAT_PROMPT_START_MARKER
+
+    if end:
+      end_idx = end.start()
+      # Skip past the end tag.
+      idx = end.end()
+    else:
+      # No corresponding end tag, just read till the end of the log.
+      end_idx = len(log)
+      idx = end_idx
+
+    parts.append(LogPart(chat_prompt=chat_prompt, chat_response=chat_response, content=log[match.end():end_idx]))
+
+
+  return parts
diff --git a/report/templates/base.html b/report/templates/base.html
@@ -53,6 +53,19 @@
 tbody tr:nth-child(odd) {
     background-color: #f4f5ff;
 }
+
+.chat_prompt {  /* Applied by sample.html to log parts flagged chat_prompt. */
+    background-color: #fff7f2;
+    max-width: 50%;  /* Box takes at most half of the container width. */
+    overflow: scroll;  /* Content that does not fit is scrollable. */
+}
+
+.chat_response {  /* Applied by sample.html to parts flagged chat_response. */
+    background-color: #fcfff2;
+    max-width: 50%;  /* Box takes at most half of the container width. */
+    overflow: scroll;  /* Content that does not fit is scrollable. */
+    margin-left: auto;  /* Pushes the box to the right side of the container. */
+}
 </style>
 <body>
     LLM: {{ model }}

diff --git a/report/templates/sample.html b/report/templates/sample.html
@@ -58,9 +58,11 @@ <h3>Code #{{ loop.index - 1}}</h3>
 {% endfor %}
 
 <h2>Logs</h2>
-<pre>
-{{ logs }}
+{% for part in logs %}
+<pre {% if part.chat_prompt %}class="chat_prompt"{% elif part.chat_response %}class="chat_response"{% endif %}>
+{{ part.content }}
 </pre>
+{% endfor %}
 
 <h2>Run logs</h2>
 <pre>