diff --git a/evaluation/benchmarks/the_agent_company/browsing.py b/evaluation/benchmarks/the_agent_company/browsing.py
index 5ce97129777a..e8747c2dede9 100644
--- a/evaluation/benchmarks/the_agent_company/browsing.py
+++ b/evaluation/benchmarks/the_agent_company/browsing.py
@@ -267,7 +267,9 @@ def pre_login(
             obs: BrowserOutputObservation = runtime.run_action(browser_action)
             logger.debug(obs, extra={'msg_type': 'OBSERVATION'})
             if save_screenshots:
-                image_data = base64.b64decode(obs.screenshot)
+                image_data = base64.b64decode(
+                    obs.screenshot.replace('data:image/png;base64,', '')
+                )  # the data-URL prefix (if any) is stripped before decoding
                 with open(os.path.join(directory, f'{image_id}.png'), 'wb') as file:
                     file.write(image_data)
                     image_id += 1
diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py
index cbfbb386fdde..84fb057ec791 100644
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -36,7 +36,7 @@ def get_config(
     task_short_name: str,
     mount_path_on_host: str,
     llm_config: LLMConfig,
-    agent_config: AgentConfig,
+    agent_config: AgentConfig | None,
 ) -> AppConfig:
     config = AppConfig(
         run_as_openhands=False,
@@ -159,11 +159,21 @@ def run_solver(
         os.makedirs(screenshots_dir, exist_ok=True)
         for image_id, obs in enumerate(state.history):
             if isinstance(obs, BrowserOutputObservation):
-                image_data = base64.b64decode(obs.screenshot)
+                image_data = base64.b64decode(
+                    obs.screenshot.replace('data:image/png;base64,', '')
+                )  # the data-URL prefix (if any) is stripped before decoding
                 with open(
                     os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
                 ) as file:
                     file.write(image_data)
+                if obs.set_of_marks:  # also save the set-of-marks image, when present
+                    som_image_data = base64.b64decode(
+                        obs.set_of_marks.replace('data:image/png;base64,', '')
+                    )
+                    with open(
+                        os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb'
+                    ) as file:
+                        file.write(som_image_data)
 
     if save_final_state:
         os.makedirs(state_dir, exist_ok=True)
diff --git a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
index 3366c9826005..e266e5990b1a 100755
--- a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
+++ b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
@@ -129,8 +129,6 @@ temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
 sed -n "${start_line},${end_line}p" tasks.md > "$temp_file"
 
 while IFS= read -r task_image; do
-    docker pull $task_image
-
     # Remove prefix using ## to remove longest matching pattern from start
     task_name=${task_image##ghcr.io/theagentcompany/}
 
@@ -144,6 +142,8 @@ while IFS= read -r task_image; do
         continue
     fi
 
+    docker pull "$task_image"  # quoted to prevent word splitting and globbing
+
     # Build the Python command
     COMMAND="poetry run python run_infer.py \
             --agent-llm-config \"$AGENT_LLM_CONFIG\" \
diff --git a/openhands/core/logger.py b/openhands/core/logger.py
index cf66263b0398..f319fd93e830 100644
--- a/openhands/core/logger.py
+++ b/openhands/core/logger.py
@@ -165,11 +165,13 @@ class RollingLogger:
     max_lines: int
     char_limit: int
     log_lines: list[str]
+    all_lines: str
 
     def __init__(self, max_lines=10, char_limit=80):
         self.max_lines = max_lines
         self.char_limit = char_limit
         self.log_lines = [''] * self.max_lines
+        self.all_lines = ''  # untruncated log: each add_line() input plus '\n'
 
     def is_enabled(self):
         return DEBUG and sys.stdout.isatty()
@@ -184,6 +186,7 @@ def add_line(self, line):
         self.log_lines.pop(0)
         self.log_lines.append(line[: self.char_limit])
         self.print_lines()
+        self.all_lines += line + '\n'  # keep the full line (log_lines stores a truncated copy)
 
     def write_immediately(self, line):
         self._write(line)
diff --git a/openhands/runtime/builder/docker.py b/openhands/runtime/builder/docker.py
index ddd9c85334b9..f7391100eaec 100644
--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@@ -163,8 +163,10 @@ def build(
                 )
 
         except subprocess.CalledProcessError as e:
-            logger.error(f'Image build failed:\n{e}')
+            logger.error(f'Image build failed:\n{e}')  # TODO: {e} is reported to render empty here; confirm
             logger.error(f'Command output:\n{e.output}')
+            if self.rolling_logger.is_enabled():
+                logger.error("Docker build output:\n" + self.rolling_logger.all_lines)  # log the accumulated build output so the failure is visible
             raise
 
         except subprocess.TimeoutExpired:
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 73af80be6ac2..3872c5a43add 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -11,6 +11,7 @@
     finalize_config,
     get_agent_config_arg,
     get_llm_config_arg,
+    load_app_config,
     load_from_env,
     load_from_toml,
 )
@@ -809,3 +810,29 @@ def test_get_agent_config_arg(temp_toml_file):
     assert not agent_config2.memory_enabled
     assert agent_config2.enable_prompt_extensions
     assert agent_config2.memory_max_threads == 10
+
+
+def test_agent_config_custom_group_name(temp_toml_file):
+    """Agent configs under arbitrary `[agent.<group>]` names are loadable."""
+    toml_text = """
+[core]
+max_iterations = 99
+
+[agent.group1]
+memory_enabled = true
+
+[agent.group2]
+memory_enabled = false
+"""
+    with open(temp_toml_file, 'w') as toml_file:
+        toml_file.write(toml_text)
+
+    # sanity check: loading the app config must not fail on this file
+    loaded_config = load_app_config(config_file=temp_toml_file)
+    assert loaded_config.max_iterations == 99
+    # evaluation's run_infer can call `get_agent_config_arg` with any group
+    # name, not only an agent's name, to load a custom agent config
+    group1_config = get_agent_config_arg('group1', temp_toml_file)
+    assert group1_config.memory_enabled
+    group2_config = get_agent_config_arg('group2', temp_toml_file)
+    assert not group2_config.memory_enabled