diff --git a/evaluation/benchmarks/the_agent_company/browsing.py b/evaluation/benchmarks/the_agent_company/browsing.py index 5ce97129777a..e8747c2dede9 100644 --- a/evaluation/benchmarks/the_agent_company/browsing.py +++ b/evaluation/benchmarks/the_agent_company/browsing.py @@ -267,7 +267,9 @@ def pre_login( obs: BrowserOutputObservation = runtime.run_action(browser_action) logger.debug(obs, extra={'msg_type': 'OBSERVATION'}) if save_screenshots: - image_data = base64.b64decode(obs.screenshot) + image_data = base64.b64decode( + obs.screenshot.replace('data:image/png;base64,', '') + ) with open(os.path.join(directory, f'{image_id}.png'), 'wb') as file: file.write(image_data) image_id += 1 diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index cbfbb386fdde..84fb057ec791 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -36,7 +36,7 @@ def get_config( task_short_name: str, mount_path_on_host: str, llm_config: LLMConfig, - agent_config: AgentConfig, + agent_config: AgentConfig | None, ) -> AppConfig: config = AppConfig( run_as_openhands=False, @@ -159,11 +159,21 @@ def run_solver( os.makedirs(screenshots_dir, exist_ok=True) for image_id, obs in enumerate(state.history): if isinstance(obs, BrowserOutputObservation): - image_data = base64.b64decode(obs.screenshot) + image_data = base64.b64decode( + obs.screenshot.replace('data:image/png;base64,', '') + ) with open( os.path.join(screenshots_dir, f'{image_id}.png'), 'wb' ) as file: file.write(image_data) + if obs.set_of_marks: + som_image_data = base64.b64decode( + obs.set_of_marks.replace('data:image/png;base64,', '') + ) + with open( + os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb' + ) as file: + file.write(som_image_data) if save_final_state: os.makedirs(state_dir, exist_ok=True) diff --git a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh index 3366c9826005..e266e5990b1a 100755 --- a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh +++ b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh @@ -129,8 +129,6 @@ temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md" sed -n "${start_line},${end_line}p" tasks.md > "$temp_file" while IFS= read -r task_image; do - docker pull $task_image - # Remove prefix using ## to remove longest matching pattern from start task_name=${task_image##ghcr.io/theagentcompany/} @@ -144,6 +142,8 @@ while IFS= read -r task_image; do continue fi + docker pull $task_image + # Build the Python command COMMAND="poetry run python run_infer.py \ --agent-llm-config \"$AGENT_LLM_CONFIG\" \ diff --git a/openhands/core/logger.py b/openhands/core/logger.py index cf66263b0398..f319fd93e830 100644 --- a/openhands/core/logger.py +++ b/openhands/core/logger.py @@ -165,11 +165,13 @@ class RollingLogger: max_lines: int char_limit: int log_lines: list[str] + all_lines: str def __init__(self, max_lines=10, char_limit=80): self.max_lines = max_lines self.char_limit = char_limit self.log_lines = [''] * self.max_lines + self.all_lines = '' def is_enabled(self): return DEBUG and sys.stdout.isatty() @@ -184,6 +186,7 @@ def add_line(self, line): self.log_lines.pop(0) self.log_lines.append(line[: self.char_limit]) self.print_lines() + self.all_lines += line + '\n' def write_immediately(self, line): self._write(line) diff --git a/openhands/runtime/builder/docker.py b/openhands/runtime/builder/docker.py index ddd9c85334b9..f7391100eaec 100644 --- a/openhands/runtime/builder/docker.py +++ b/openhands/runtime/builder/docker.py @@ -163,8 +163,10 @@ def build( ) except subprocess.CalledProcessError as e: - logger.error(f'Image build failed:\n{e}') + logger.error(f'Image build failed:\n{e}') # TODO: {e} is empty logger.error(f'Command output:\n{e.output}') + if self.rolling_logger.is_enabled(): + logger.error("Docker build output:\n" + self.rolling_logger.all_lines) # Show the error raise except subprocess.TimeoutExpired: diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 73af80be6ac2..3872c5a43add 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -11,6 +11,7 @@ finalize_config, get_agent_config_arg, get_llm_config_arg, + load_app_config, load_from_env, load_from_toml, ) @@ -809,3 +810,29 @@ def test_get_agent_config_arg(temp_toml_file): assert not agent_config2.memory_enabled assert agent_config2.enable_prompt_extensions assert agent_config2.memory_max_threads == 10 + + +def test_agent_config_custom_group_name(temp_toml_file): + temp_toml = """ +[core] +max_iterations = 99 + +[agent.group1] +memory_enabled = true + +[agent.group2] +memory_enabled = false +""" + with open(temp_toml_file, 'w') as f: + f.write(temp_toml) + + # just a sanity check that load app config wouldn't fail + app_config = load_app_config(config_file=temp_toml_file) + assert app_config.max_iterations == 99 + + # run_infer in evaluation can use `get_agent_config_arg` to load custom + # agent configs with any group name (not just agent name) + agent_config1 = get_agent_config_arg('group1', temp_toml_file) + assert agent_config1.memory_enabled + agent_config2 = get_agent_config_arg('group2', temp_toml_file) + assert not agent_config2.memory_enabled