diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py index 2d65e1943849..f216a86ff8ca 100644 --- a/evaluation/benchmarks/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -9,6 +9,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -17,7 +18,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -60,17 +60,14 @@ def codeact_user_response_eda(state: State) -> str: def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=False, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index 8c1f08b37798..a78e40239548 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -17,6 +17,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -25,7 +26,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -40,21 +40,15 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-slim' + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-slim', - enable_auto_lint=True, - use_host_network=False, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_runtime_alive=False, - remote_runtime_init_timeout=3600, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 8045f948d3f9..ae5faadc098b 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -16,6 +16,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -24,7 +25,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, load_from_toml, parse_arguments, @@ -47,22 +47,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.11-bookworm' config = AppConfig( default_agent=metadata.agent_class, 
run_as_openhands=False, runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.11-bookworm', - enable_auto_lint=True, - use_host_network=False, - timeout=100, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_runtime_alive=False, - remote_runtime_init_timeout=1800, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index 20f3dc4870a1..f1c98ed06672 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -14,6 +14,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -22,7 +23,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -57,18 +57,15 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0' + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE, - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index 02d92aa3ee3e..1c56deb9670c 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -17,6 +17,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -25,7 +26,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -71,17 +71,15 @@ def codeact_user_response(state: State) -> str: def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py index 164e117e26c0..5f3ee99d7437 100644 --- a/evaluation/benchmarks/browsing_delegation/run_infer.py +++ b/evaluation/benchmarks/browsing_delegation/run_infer.py @@ -10,6 +10,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, 
prepare_dataset, reset_logger_for_multiprocessing, @@ -18,7 +19,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -36,17 +36,14 @@ def get_config( assert ( metadata.max_iterations == 1 ), 'max_iterations must be 1 for browsing delegation evaluation.' + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=False, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, workspace_base=None, workspace_mount_path=None, ) diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index 2e0fc528f7c3..63d394a029d1 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -15,6 +15,7 @@ EvalOutput, assert_and_raise, codeact_user_response, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -25,7 +26,6 @@ from openhands.core.config import ( AgentConfig, AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -105,9 +105,7 @@ def get_config( instance: pd.Series, metadata: EvalMetadata, ) -> AppConfig: - # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/' assert USE_INSTANCE_IMAGE - # We use a different instance image for the each instance of commit0 eval repo_name = instance['repo'].split('/')[1] base_container_image = get_instance_docker_image(repo_name) logger.info( @@ -115,28 +113,16 @@ def get_config( f'Please make sure this image exists. ' f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.' 
) - # else: - # raise - # base_container_image = SWE_BENCH_CONTAINER_IMAGE - # logger.info(f'Using swe-bench container image: {base_container_image}') + + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = base_container_image config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, max_iterations=metadata.max_iterations, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - enable_auto_lint=True, - use_host_network=False, - # large enough timeout, since some testcases take very long to run - timeout=300, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_runtime_alive=False, - remote_runtime_init_timeout=3600, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py index fc5d74b13554..d91d01194d83 100644 --- a/evaluation/benchmarks/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -16,6 +16,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -25,7 +26,6 @@ from openhands.core.config import ( AgentConfig, AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -62,17 +62,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index 2fdab0b2927a..e63026e813e4 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -13,6 +13,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -21,7 +22,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -48,17 +48,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py index d107151fc537..e856fa267c03 100644 --- a/evaluation/benchmarks/gorilla/run_infer.py +++ 
b/evaluation/benchmarks/gorilla/run_infer.py @@ -11,6 +11,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -19,7 +20,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -40,17 +40,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py index b92a30b8590f..e297e3fb9ed5 100644 --- a/evaluation/benchmarks/gpqa/run_infer.py +++ b/evaluation/benchmarks/gpqa/run_infer.py @@ -29,6 +29,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -37,7 +38,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -61,17 +61,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py index c2cccf90c732..fbf88859b6af 100644 --- a/evaluation/benchmarks/humanevalfix/run_infer.py +++ b/evaluation/benchmarks/humanevalfix/run_infer.py @@ -22,6 +22,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -30,7 +31,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -82,17 +82,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py index 
e37c5b4ab053..fac82f29f510 100644 --- a/evaluation/benchmarks/logic_reasoning/run_infer.py +++ b/evaluation/benchmarks/logic_reasoning/run_infer.py @@ -9,6 +9,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -17,7 +18,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -45,18 +45,18 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'xingyaoww/od-eval-logic-reasoning:v1.0' + sandbox_config.runtime_extra_deps = ( + '$OH_INTERPRETER_PATH -m pip install scitools-pyke' + ) + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0', - enable_auto_lint=True, - use_host_network=False, - runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke', - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py index 023cbe9cab8b..55e510818a80 100644 --- a/evaluation/benchmarks/miniwob/run_infer.py +++ b/evaluation/benchmarks/miniwob/run_infer.py @@ -12,6 +12,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -21,7 +22,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -55,23 +55,14 @@ def get_config( metadata: EvalMetadata, env_id: str, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='xingyaoww/od-eval-miniwob:v1.0', - enable_auto_lint=True, - use_host_network=False, - browsergym_eval_env=env_id, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - remote_runtime_init_timeout=1800, - keep_runtime_alive=False, - timeout=120, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py index 4c356f26d944..bd1a394332c9 100644 --- a/evaluation/benchmarks/mint/run_infer.py +++ b/evaluation/benchmarks/mint/run_infer.py @@ -14,6 +14,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -22,7 +23,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -103,18 +103,18 @@ def load_incontext_example(task_name: str, with_tool: bool = True): def get_config( metadata: EvalMetadata, ) -> AppConfig: + 
sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0' + sandbox_config.runtime_extra_deps = ( + f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}' + ) + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='xingyaoww/od-eval-mint:v1.0', - enable_auto_lint=True, - use_host_network=False, - runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}', - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py index c2fcc1ae3e26..5eff173b4600 100644 --- a/evaluation/benchmarks/ml_bench/run_infer.py +++ b/evaluation/benchmarks/ml_bench/run_infer.py @@ -25,6 +25,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -33,7 +34,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, load_app_config, @@ -77,16 +77,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='public.ecr.aws/i5g0m1f6/ml-bench', - enable_auto_lint=True, - use_host_network=False, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py index 09619fb718a6..fe0cd7ef3a00 100644 --- a/evaluation/benchmarks/scienceagentbench/run_infer.py +++ b/evaluation/benchmarks/scienceagentbench/run_infer.py @@ -11,6 +11,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -20,7 +21,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -59,22 +59,17 @@ def get_config( metadata: EvalMetadata, instance_id: str, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = ( + 'docker.io/xingyaoww/openhands-eval-scienceagentbench' + ) config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime=os.environ.get('RUNTIME', 'docker'), max_budget_per_task=4, max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench', - enable_auto_lint=True, - use_host_network=False, - timeout=300, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_runtime_alive=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git 
a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index be18a36e9ab6..d5d2a81857c6 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -1,5 +1,6 @@ import json import os +import subprocess import tempfile import time from functools import partial @@ -21,13 +22,14 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + get_default_sandbox_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, ) from openhands.core.config import ( AppConfig, - SandboxConfig, + LLMConfig, get_parser, ) from openhands.core.logger import openhands_logger as logger @@ -79,22 +81,16 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig: f'Please make sure this image exists. ' f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.' ) + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = base_container_image + sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor( + dataset_name=metadata.dataset, + instance_id=instance['instance_id'], + ) config = AppConfig( run_as_openhands=False, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - use_host_network=False, - # large enough timeout, since some testcases take very long to run - timeout=600, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - remote_runtime_init_timeout=3600, - remote_runtime_resource_factor=get_instance_resource_factor( - dataset_name=metadata.dataset, - instance_id=instance['instance_id'], - ), - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, @@ -415,13 +411,17 @@ def process_instance( else: # Initialize with a dummy metadata when file doesn't exist metadata = EvalMetadata( - agent_class="dummy_agent", # Placeholder agent class - llm_config=LLMConfig(model="dummy_model"), # Minimal LLM config + agent_class='dummy_agent', # Placeholder agent class + llm_config=LLMConfig(model='dummy_model'), # Minimal LLM config max_iterations=1, # Minimal iterations - eval_output_dir=os.path.dirname(args.input_file), # Use input file dir as output dir + eval_output_dir=os.path.dirname( + args.input_file + ), # Use input file dir as output dir start_time=time.strftime('%Y-%m-%d %H:%M:%S'), # Current time - git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip(), # Current commit - dataset=args.dataset # Dataset name from args + git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']) + .decode('utf-8') + .strip(), # Current commit + dataset=args.dataset, # Dataset name from args ) # The evaluation harness constrains the signature of `process_instance_func` but we need to diff --git a/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json b/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json deleted file mode 100644 index 161ab736da08..000000000000 --- a/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json +++ /dev/null @@ -1 +0,0 @@ -{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, 
"psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, "scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, 
"sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2} diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 5e3f0e6a5bd7..89fe618a6c34 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -18,6 +18,7 @@ EvalOutput, assert_and_raise, codeact_user_response, + get_default_sandbox_config_for_eval, get_metrics, is_fatal_evaluation_error, make_metadata, @@ -30,7 +31,6 @@ from openhands.core.config import ( AgentConfig, AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -122,30 +122,23 @@ def get_config( base_container_image = SWE_BENCH_CONTAINER_IMAGE logger.info(f'Using swe-bench container image: {base_container_image}') + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = base_container_image + sandbox_config.enable_auto_lint = True + sandbox_config.use_host_network = False + # Add platform to the sandbox config to solve issue 4401 + sandbox_config.platform = 'linux/amd64' + sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor( + dataset_name=metadata.dataset, + instance_id=instance['instance_id'], + ) + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, max_iterations=metadata.max_iterations, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - enable_auto_lint=True, - use_host_network=False, - # large enough timeout, since some testcases take very long to run - timeout=300, - # Add platform to the sandbox config to solve issue 4401 - platform='linux/amd64', - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_runtime_alive=False, - remote_runtime_init_timeout=3600, - remote_runtime_api_timeout=120, - remote_runtime_resource_factor=get_instance_resource_factor( - dataset_name=metadata.dataset, - instance_id=instance['instance_id'], - ), - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, @@ -331,6 +324,22 @@ def complete_runtime( logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + if obs.exit_code == -1: + # The previous command is still running + # We need to kill previous command + logger.info('The previous command is still running, 
trying to kill it...') + action = CmdRunAction(command='C-c') + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + # Then run the command again + action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}') + action.set_hard_timeout(600) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}', diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index 84fb057ec791..43047fc17f8a 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -13,11 +13,11 @@ import yaml from browsing import pre_login +from evaluation.utils.shared import get_default_sandbox_config_for_eval from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, LLMConfig, - SandboxConfig, get_agent_config_arg, get_llm_config_arg, get_parser, @@ -38,6 +38,8 @@ def get_config( llm_config: LLMConfig, agent_config: AgentConfig | None, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = base_container_image config = AppConfig( run_as_openhands=False, max_budget_per_task=4, @@ -45,16 +47,7 @@ def get_config( save_trajectory_path=os.path.join( mount_path_on_host, f'traj_{task_short_name}.json' ), - sandbox=SandboxConfig( - base_container_image=base_container_image, - enable_auto_lint=True, - # using host network to access the host machine from the container - use_host_network=True, - # large enough timeout, since some testcases take very long to run - timeout=300, - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # we mount trajectories path so that trajectories, generated by OpenHands # controller, can be accessible to the evaluator file in the runtime container workspace_mount_path=mount_path_on_host, diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py index 45b9febed27b..2fc670e568c6 100644 --- a/evaluation/benchmarks/toolqa/run_infer.py +++ b/evaluation/benchmarks/toolqa/run_infer.py @@ -10,6 +10,7 @@ EvalOutput, codeact_user_response, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -18,7 +19,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, get_parser, ) @@ -41,17 +41,14 @@ def get_config( metadata: EvalMetadata, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py b/evaluation/benchmarks/visualwebarena/run_infer.py index 8986d3ab8fff..69656e610b02 100644 --- 
a/evaluation/benchmarks/visualwebarena/run_infer.py +++ b/evaluation/benchmarks/visualwebarena/run_infer.py @@ -11,6 +11,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -20,7 +21,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -55,32 +55,29 @@ def get_config( assert base_url is not None, 'VISUALWEBARENA_BASE_URL must be set' assert openai_api_key is not None, 'OPENAI_API_KEY must be set' assert openai_base_url is not None, 'OPENAI_BASE_URL must be set' + + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' + sandbox_config.browsergym_eval_env = env_id + sandbox_config.runtime_startup_env_vars = { + 'BASE_URL': base_url, + 'OPENAI_API_KEY': openai_api_key, + 'OPENAI_BASE_URL': openai_base_url, + 'VWA_CLASSIFIEDS': f'{base_url}:9980', + 'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c', + 'VWA_SHOPPING': f'{base_url}:7770', + 'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin', + 'VWA_REDDIT': f'{base_url}:9999', + 'VWA_GITLAB': f'{base_url}:8023', + 'VWA_WIKIPEDIA': f'{base_url}:8888', + 'VWA_HOMEPAGE': f'{base_url}:4399', + } config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - browsergym_eval_env=env_id, - runtime_startup_env_vars={ - 'BASE_URL': base_url, - 'OPENAI_API_KEY': openai_api_key, - 'OPENAI_BASE_URL': openai_base_url, - 'VWA_CLASSIFIEDS': f'{base_url}:9980', - 'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c', - 'VWA_SHOPPING': f'{base_url}:7770', - 'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin', - 'VWA_REDDIT': f'{base_url}:9999', - 'VWA_GITLAB': f'{base_url}:8023', - 'VWA_WIKIPEDIA': f'{base_url}:8888', - 'VWA_HOMEPAGE': f'{base_url}:4399', - }, - timeout=300, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py index ad846190d8fe..a9b251b90ae5 100644 --- a/evaluation/benchmarks/webarena/run_infer.py +++ b/evaluation/benchmarks/webarena/run_infer.py @@ -11,6 +11,7 @@ EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -19,7 +20,6 @@ from openhands.controller.state.state import State from openhands.core.config import ( AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -50,29 +50,26 @@ def get_config( assert base_url is not None, 'WEBARENA_BASE_URL must be set' assert openai_api_key is not None, 'OPENAI_API_KEY must be set' + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.12-bookworm' + sandbox_config.browsergym_eval_env = env_id + sandbox_config.runtime_startup_env_vars = { + 'BASE_URL': base_url, + 'OPENAI_API_KEY': openai_api_key, + 'SHOPPING': f'{base_url}:7770/', + 'SHOPPING_ADMIN': f'{base_url}:7780/admin', + 'REDDIT': f'{base_url}:9999', + 'GITLAB': f'{base_url}:8023', + 'WIKIPEDIA': 
f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing', + 'MAP': f'{base_url}:3000', + 'HOMEPAGE': f'{base_url}:4399', + } config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', - enable_auto_lint=True, - use_host_network=False, - browsergym_eval_env=env_id, - runtime_startup_env_vars={ - 'BASE_URL': base_url, - 'OPENAI_API_KEY': openai_api_key, - 'SHOPPING': f'{base_url}:7770/', - 'SHOPPING_ADMIN': f'{base_url}:7780/admin', - 'REDDIT': f'{base_url}:9999', - 'GITLAB': f'{base_url}:8023', - 'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing', - 'MAP': f'{base_url}:3000', - 'HOMEPAGE': f'{base_url}:4399', - }, - remote_runtime_enable_retries=True, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index f240d2e2333d..d215b0599bf0 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -8,6 +8,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -21,7 +22,6 @@ from openhands.core.config import ( AgentConfig, AppConfig, - SandboxConfig, get_llm_config_arg, parse_arguments, ) @@ -43,23 +43,14 @@ def get_config( metadata: EvalMetadata, instance_id: str, ) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.platform = 'linux/amd64' config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - # use default base_container_image - enable_auto_lint=True, - use_host_network=False, - timeout=300, - # Add platform to the sandbox config to solve issue 4401 - platform='linux/amd64', - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_runtime_alive=False, - remote_runtime_init_timeout=3600, - ), + sandbox=sandbox_config, # do not mount workspace workspace_base=None, workspace_mount_path=None, diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 7035d56e41ef..566fbbd71bb3 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -16,7 +16,7 @@ from tqdm import tqdm from openhands.controller.state.state import State -from openhands.core.config import LLMConfig +from openhands.core.config import LLMConfig, SandboxConfig from openhands.core.config.agent_config import AgentConfig from openhands.core.config.condenser_config import ( CondenserConfig, @@ -555,3 +555,18 @@ def get_metrics(state: State) -> dict[str, Any]: metrics = state.metrics.get() if state.metrics else {} metrics['condenser'] = get_condensation_metadata(state) return metrics + + +def get_default_sandbox_config_for_eval() -> SandboxConfig: + return SandboxConfig( + use_host_network=False, + # large enough timeout, since some testcases take very long to run + timeout=300, + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=3600, + remote_runtime_api_timeout=120, + 
remote_runtime_enable_retries=True, + remote_runtime_class='sysbox', + ) diff --git a/openhands/core/config/sandbox_config.py b/openhands/core/config/sandbox_config.py index 12edbbd4d99a..7c1b1adaba64 100644 --- a/openhands/core/config/sandbox_config.py +++ b/openhands/core/config/sandbox_config.py @@ -52,6 +52,9 @@ class SandboxConfig(BaseModel): remote_runtime_init_timeout: int = Field(default=180) remote_runtime_api_timeout: int = Field(default=10) remote_runtime_enable_retries: bool = Field(default=False) + remote_runtime_class: str | None = Field( + default='sysbox' + ) # can be "None" (default to gvisor) or "sysbox" (support docker inside runtime + more stable) enable_auto_lint: bool = Field( default=False # once enabled, OpenHands would lint files after editing ) diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index 536a6a6ed82d..d9ee74c4293a 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -57,6 +57,7 @@ from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin from openhands.runtime.utils.bash import BashSession from openhands.runtime.utils.files import insert_lines, read_lines +from openhands.runtime.utils.memory_monitor import MemoryMonitor from openhands.runtime.utils.runtime_init import init_user_and_working_directory from openhands.runtime.utils.system_stats import get_system_stats from openhands.utils.async_utils import call_sync_from_async, wait_all @@ -171,12 +172,19 @@ def __init__( else: logger.info('No max memory limit set, using all available system memory') + self.memory_monitor = MemoryMonitor( + enable=os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower() + in ['true', '1', 'yes'] + ) + self.memory_monitor.start_monitoring() + @property def initial_cwd(self): return self._initial_cwd async def ainit(self): # bash needs to be initialized first + logger.debug('Initializing bash session') self.bash_session = BashSession( work_dir=self._initial_cwd, username=self.username, @@ -186,15 +194,18 @@ async def ainit(self): max_memory_mb=self.max_memory_gb * 1024 if self.max_memory_gb else None, ) self.bash_session.initialize() + logger.debug('Bash session initialized') await wait_all( (self._init_plugin(plugin) for plugin in self.plugins_to_load), timeout=30, ) + logger.debug('All plugins initialized') # This is a temporary workaround # TODO: refactor AgentSkills to be part of JupyterPlugin # AFTER ServerRuntime is deprecated + logger.debug('Initializing AgentSkills') if 'agent_skills' in self.plugins and 'jupyter' in self.plugins: obs = await self.run_ipython( IPythonRunCellAction( @@ -203,6 +214,7 @@ async def ainit(self): ) logger.debug(f'AgentSkills initialized: {obs}') + logger.debug('Initializing bash commands') await self._init_bash_commands() logger.debug('Runtime client initialized.') self._initialized = True @@ -447,6 +459,7 @@ async def browse_interactive(self, action: BrowseInteractiveAction) -> Observati return await browse(action, self.browser) def close(self): + self.memory_monitor.stop_monitoring() if self.bash_session is not None: self.bash_session.close() self.browser.close() diff --git a/openhands/runtime/impl/docker/docker_runtime.py b/openhands/runtime/impl/docker/docker_runtime.py index b2c5e980226e..b1a9caa8cf7f 100644 --- a/openhands/runtime/impl/docker/docker_runtime.py +++ b/openhands/runtime/impl/docker/docker_runtime.py @@ -255,7 +255,6 @@ def _init_container(self): 
server_port=self._container_port, plugins=self.plugins, app_config=self.config, - use_nice_for_root=False, ) try: diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 0340f7d0a088..70e4217e4718 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -75,6 +75,8 @@ def __init__( 'remote_runtime_api_url is required in the remote runtime.' ) + assert self.config.sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor') + self.runtime_builder = RemoteRuntimeBuilder( self.config.sandbox.remote_runtime_api_url, self.config.sandbox.api_key, @@ -225,6 +227,9 @@ def _start_runtime(self): 'session_id': self.sid, 'resource_factor': self.config.sandbox.remote_runtime_resource_factor, } + if self.config.sandbox.remote_runtime_class == 'sysbox': + start_request['runtime_class'] = 'sysbox-runc' + # We ignore other runtime classes for now, because both None and 'gvisor' map to 'gvisor' # Start the sandbox using the /start endpoint try: diff --git a/openhands/runtime/utils/command.py b/openhands/runtime/utils/command.py index 17458c1f3ee0..871ae22b55fa 100644 --- a/openhands/runtime/utils/command.py +++ b/openhands/runtime/utils/command.py @@ -16,7 +16,6 @@ def get_action_execution_server_startup_command( plugins: list[PluginRequirement], app_config: AppConfig, python_prefix: list[str] = DEFAULT_PYTHON_PREFIX, - use_nice_for_root: bool = True, override_user_id: int | None = None, override_username: str | None = None, ): @@ -40,7 +39,6 @@ def get_action_execution_server_startup_command( user_id = override_user_id or ( sandbox_config.user_id if app_config.run_as_openhands else 0 ) - is_root = bool(username == 'root') base_cmd = [ *python_prefix, @@ -59,17 +57,4 @@ def get_action_execution_server_startup_command( *browsergym_args, ] - if is_root and use_nice_for_root: - # If running as root, set highest priority and lowest OOM score - cmd_str = ' '.join(base_cmd) - return [ - 'nice', - '-n', - '-20', # Highest priority - 'sh', - '-c', - f'echo -1000 > /proc/self/oom_score_adj && exec {cmd_str}', - ] - else: - # If not root OR not using nice for root, run with normal priority - return base_cmd + return base_cmd diff --git a/openhands/runtime/utils/memory_monitor.py b/openhands/runtime/utils/memory_monitor.py new file mode 100644 index 000000000000..b7cf0492042f --- /dev/null +++ b/openhands/runtime/utils/memory_monitor.py @@ -0,0 +1,66 @@ +"""Memory monitoring utilities for the runtime.""" + +import threading + +from memory_profiler import memory_usage + +from openhands.core.logger import openhands_logger as logger + + +class LogStream: + """Stream-like object that redirects writes to a logger.""" + + def write(self, message): + if message and not message.isspace(): + logger.info(f'[Memory usage] {message.strip()}') + + def flush(self): + pass + + +class MemoryMonitor: + def __init__(self, enable: bool = False): + """Memory monitor for the runtime.""" + self._monitoring_thread: threading.Thread | None = None + self._stop_monitoring = threading.Event() + self.log_stream = LogStream() + self.enable = enable + + def start_monitoring(self): + """Start monitoring memory usage.""" + if not self.enable: + return + + if self._monitoring_thread is not None: + return + + def monitor_process(): + try: + # Use memory_usage's built-in monitoring loop + mem_usage = memory_usage( + -1, # Monitor current process + interval=0.1, # Sample every 0.1 seconds + timeout=3600, # Monitor for up to one hour + max_usage=False, 
# Get continuous readings + include_children=True, # Include child processes + multiprocess=True, # Monitor all processes + stream=self.log_stream, # Redirect output to logger + backend='psutil_pss', + ) + logger.info(f'Memory usage across time: {mem_usage}') + except Exception as e: + logger.error(f'Memory monitoring failed: {e}') + + self._monitoring_thread = threading.Thread(target=monitor_process, daemon=True) + self._monitoring_thread.start() + logger.info('Memory monitoring started') + + def stop_monitoring(self): + """Stop monitoring memory usage.""" + if not self.enable: + return + + if self._monitoring_thread is not None: + self._stop_monitoring.set() + self._monitoring_thread = None + logger.info('Memory monitoring stopped') diff --git a/poetry.lock b/poetry.lock index 4ee064f2c01d..f4136a7da0d6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -4909,6 +4909,21 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "memory-profiler" +version = "0.61.0" +description = "A module for monitoring memory usage of a python program" +optional = false +python-versions = ">=3.5" +groups = ["main"] +files = [ + {file = "memory_profiler-0.61.0-py3-none-any.whl", hash = "sha256:400348e61031e3942ad4d4109d18753b2fb08c2f6fb8290671c5513a34182d84"}, + {file = "memory_profiler-0.61.0.tar.gz", hash = "sha256:4e5b73d7864a1d1292fb76a03e82a3e78ef934d06828a698d9dada76da2067b0"}, +] + +[package.dependencies] +psutil = "*" + [[package]] name = "minio" version = "7.2.15" @@ -10787,4 +10802,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "63c0a6d2f0c382f9e8010ab167df76d3275945acf4fba3da7611d68be8241429" +content-hash = "a663ed31b71b4307c9f9665a8af4d5fbb8e1a4f0a5a562055df5ec981e5bdc16" diff --git a/pyproject.toml b/pyproject.toml index f5afa9d9982a..3257bebd3a13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,9 +71,11 @@ openhands-aci = "^0.2.3" python-socketio = "^5.11.4" redis = "^5.2.0" sse-starlette = "^2.1.3" +psutil = "*" stripe = "^11.5.0" ipywidgets = "^8.1.5" qtconsole = "^5.6.1" +memory-profiler = "^0.61.0" [tool.poetry.group.llama-index.dependencies] llama-index = "*" diff --git a/tests/runtime/test_stress_remote_runtime.py b/tests/runtime/test_stress_remote_runtime.py index 5c201af8b726..a8412479c402 100644 --- a/tests/runtime/test_stress_remote_runtime.py +++ b/tests/runtime/test_stress_remote_runtime.py @@ -1,8 +1,21 @@ -"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" +"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox. 
+ +Example usage: + +```bash +export ALLHANDS_API_KEY="YOUR_API_KEY" +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.staging.all-hands.dev" +poetry run pytest -vvxss tests/runtime/test_stress_remote_runtime.py +``` + +""" import asyncio import os import tempfile +import time +from datetime import datetime from unittest.mock import MagicMock import pandas as pd @@ -30,7 +43,12 @@ ) from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller -from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.action import ( + CmdRunAction, + FileEditAction, + FileWriteAction, + MessageAction, +) from openhands.events.observation import CmdOutputObservation from openhands.events.serialization.event import event_to_dict from openhands.llm import LLM @@ -42,20 +60,10 @@ } -def get_config( - metadata: EvalMetadata, -) -> AppConfig: - assert ( - os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL') is not None - ), 'SANDBOX_REMOTE_RUNTIME_API_URL must be set.' - assert ( - os.environ.get('ALLHANDS_API_KEY') is not None - ), 'ALLHANDS_API_KEY must be set.' +def get_config() -> AppConfig: config = AppConfig( - default_agent=metadata.agent_class, run_as_openhands=False, - max_iterations=metadata.max_iterations, - runtime='remote', + runtime=os.environ.get('RUNTIME', 'remote'), sandbox=SandboxConfig( base_container_image='python:3.11-bookworm', enable_auto_lint=True, @@ -63,8 +71,11 @@ def get_config( # large enough timeout, since some testcases take very long to run timeout=300, api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + remote_runtime_api_url=os.environ.get( + 'SANDBOX_REMOTE_RUNTIME_API_URL', None + ), keep_runtime_alive=False, + remote_runtime_resource_factor=1, ), # do not mount workspace workspace_base=None, @@ -79,131 +90,129 @@ def get_config( return config -def initialize_runtime( - runtime: Runtime, -): - """Initialize the runtime for the agent. - - This function is called before the runtime is used to run the agent. 
- """ - logger.info('-' * 30) - logger.info('BEGIN Runtime Initialization Fn') - logger.info('-' * 30) - obs: CmdOutputObservation - - action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """) - action.set_hard_timeout(600) - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}') - - action = CmdRunAction(command='mkdir -p /dummy_dir') - action.set_hard_timeout(600) - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - obs.exit_code == 0, - f'Failed to create /dummy_dir: {str(obs)}', - ) - - with tempfile.TemporaryDirectory() as temp_dir: - # Construct the full path for the desired file name within the temporary directory - temp_file_path = os.path.join(temp_dir, 'dummy_file') - # Write to the file with the desired name within the temporary directory - with open(temp_file_path, 'w') as f: - f.write('dummy content') - - # Copy the file to the desired location - runtime.copy_to(temp_file_path, '/dummy_dir/') - - logger.info('-' * 30) - logger.info('END Runtime Initialization Fn') - logger.info('-' * 30) - - -def process_instance( - instance: pd.Series, - metadata: EvalMetadata, - reset_logger: bool = True, -) -> EvalOutput: - config = get_config(metadata) - - # Setup the logger properly, so you can run multi-processing to parallelize the evaluation - if reset_logger: - log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') - reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir) - else: - logger.info(f'Starting evaluation for instance {instance.instance_id}.') - - runtime = create_runtime(config, headless_mode=False) - call_async_from_sync(runtime.connect) - - try: - initialize_runtime(runtime) +@pytest.mark.skipif( + TEST_IN_CI, + reason='This test should only be run locally, not in CI.', +) +def test_stress_remote_runtime_eval(n_eval_workers: int = 64): + """Mimic evaluation setting to test remote runtime in a multi-processing setting.""" - instruction = 'dummy instruction' - agent = Agent.get_cls(metadata.agent_class)( - llm=LLM(config=metadata.llm_config), - config=config.get_agent_config(metadata.agent_class), + def _initialize_runtime( + runtime: Runtime, + ): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. 
+ """ + logger.info('-' * 30) + logger.info('BEGIN Runtime Initialization Fn') + logger.info('-' * 30) + obs: CmdOutputObservation + + action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """) + action.set_hard_timeout(600) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}') + + action = CmdRunAction(command='mkdir -p /dummy_dir') + action.set_hard_timeout(600) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + obs.exit_code == 0, + f'Failed to create /dummy_dir: {str(obs)}', ) - def next_command(*args, **kwargs): - return CmdRunAction(command='ls -lah') - - agent.step = MagicMock(side_effect=next_command) - - # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = asyncio.run( - run_controller( - config=config, - initial_user_action=MessageAction(content=instruction), - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ - metadata.agent_class - ], - agent=agent, + with tempfile.TemporaryDirectory() as temp_dir: + # Construct the full path for the desired file name within the temporary directory + temp_file_path = os.path.join(temp_dir, 'dummy_file') + # Write to the file with the desired name within the temporary directory + with open(temp_file_path, 'w') as f: + f.write('dummy content') + + # Copy the file to the desired location + runtime.copy_to(temp_file_path, '/dummy_dir/') + + logger.info('-' * 30) + logger.info('END Runtime Initialization Fn') + logger.info('-' * 30) + + def _process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, + ) -> EvalOutput: + config = get_config() + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir) + else: + logger.info(f'Starting evaluation for instance {instance.instance_id}.') + + runtime = create_runtime(config, headless_mode=True) + call_async_from_sync(runtime.connect) + + try: + _initialize_runtime(runtime) + + instruction = 'dummy instruction' + agent = Agent.get_cls(metadata.agent_class)( + llm=LLM(config=metadata.llm_config), + config=config.get_agent_config(metadata.agent_class), ) - ) - - # if fatal error, throw EvalError to trigger re-run - if ( - state.last_error - and 'fatal error during agent execution' in state.last_error - and 'stuck in a loop' not in state.last_error - ): - raise EvalException('Fatal error detected: ' + state.last_error) - - finally: - runtime.close() - - test_result = {} - if state is None: - raise ValueError('State should not be None.') - histories = [event_to_dict(event) for event in state.history] - metrics = state.metrics.get() if state.metrics else None - - # Save the output - output = EvalOutput( - instance_id=instance.instance_id, - instruction=instruction, - instance=instance.to_dict(), # SWE Bench specific - test_result=test_result, - metadata=metadata, - history=histories, - metrics=metrics, - error=state.last_error if state and state.last_error else None, - ) - return output + def next_command(*args, **kwargs): + return CmdRunAction(command='ls -lah') + + agent.step = 
MagicMock(side_effect=next_command) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + agent=agent, + ) + ) -@pytest.mark.skipif( - TEST_IN_CI, - reason='This test should only be run locally, not in CI.', -) -def test_stress_remote_runtime(n_eval_workers: int = 64): - """Mimic evaluation setting to test remote runtime in a multi-processing setting.""" + # if fatal error, throw EvalError to trigger re-run + if ( + state.last_error + and 'fatal error during agent execution' in state.last_error + and 'stuck in a loop' not in state.last_error + ): + raise EvalException('Fatal error detected: ' + state.last_error) + + finally: + runtime.close() + + test_result = {} + if state is None: + raise ValueError('State should not be None.') + histories = [event_to_dict(event) for event in state.history] + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + instance=instance.to_dict(), # SWE Bench specific + test_result=test_result, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + ) + return output llm_config = LLMConfig() metadata = make_metadata( @@ -228,4 +237,247 @@ def test_stress_remote_runtime(n_eval_workers: int = 64): dummy_instance, output_file, eval_n_limit=len(dummy_instance) ) - run_evaluation(instances, metadata, output_file, n_eval_workers, process_instance) + run_evaluation(instances, metadata, output_file, n_eval_workers, _process_instance) + + +@pytest.mark.skipif( + TEST_IN_CI, + reason='This test should only be run locally, not in CI.', +) +def test_stress_remote_runtime_long_output_with_soft_and_hard_timeout(): + """Stress test for the remote runtime.""" + config = get_config() + + try: + runtime = create_runtime(config, headless_mode=True) + call_async_from_sync(runtime.connect) + _time_for_test = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + + # Run a command that generates long output multiple times + for i in range(10): + start_time = time.time() + iteration_stats = { + 'iteration': i, + 'timestamp': time.time(), + } + + # Check overall system memory usage + mem_action = CmdRunAction( + 'free -k | grep "Mem:" | awk \'{printf "Total: %8.1f MB, Used: %8.1f MB, Free: %8.1f MB, Available: %8.1f MB\\n", $2/1024, $3/1024, $4/1024, $7/1024}\'' + ) + mem_obs = runtime.run_action(mem_action) + assert mem_obs.exit_code == 0 + logger.info( + f'System memory usage (iteration {i}): {mem_obs.content.strip()}' + ) + # Parse memory values from output + mem_parts = mem_obs.content.strip().split(',') + for part in mem_parts: + key, value = part.strip().split(':') + iteration_stats[f'memory_{key.lower()}'] = float( + value.replace('MB', '').strip() + ) + + # Check top memory-consuming processes + mem_action = CmdRunAction( + 'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | head -n 5' + ) + mem_obs = runtime.run_action(mem_action) + assert mem_obs.exit_code == 0 + _top_processes = [i.strip() for i in mem_obs.content.strip().split('\n')] + logger.info( + f'Top 5 memory-consuming processes (iteration {i}):\n{"- " + "\n- ".join(_top_processes)}' + ) + iteration_stats['top_processes'] = _top_processes + + # 
Check tmux memory usage (in MB)
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} MB'
+            )
+            try:
+                iteration_stats['tmux_memory_mb'] = float(mem_obs.content.strip())
+            except (ValueError, AttributeError):
+                iteration_stats['tmux_memory_mb'] = None
+
+            # Check action_execution_server mem
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} MB'
+            )
+            try:
+                iteration_stats['action_server_memory_mb'] = float(
+                    mem_obs.content.strip()
+                )
+            except (ValueError, AttributeError):
+                iteration_stats['action_server_memory_mb'] = None
+
+            # Test soft timeout
+            action = CmdRunAction(
+                'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi'
+            )
+            obs = runtime.run_action(action)
+            assert 'Do you want to continue?' in obs.content
+            assert obs.exit_code == -1  # Command is still running, waiting for input
+
+            # Send the confirmation
+            action = CmdRunAction('Y', is_input=True)
+            obs = runtime.run_action(action)
+            assert 'Proceeding with operation...' in obs.content
+            assert 'Operation completed successfully!' in obs.content
+            assert obs.exit_code == 0
+            assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+            # Test hard timeout w/ long output
+            # Generate long output with 1000 asterisks per line
+            action = CmdRunAction(
+                f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done'
+            )
+            action.set_hard_timeout(2)
+            obs = runtime.run_action(action)
+
+            # Verify the output
+            assert obs.exit_code == -1
+            assert f'Line 1 - Iteration {i}' in obs.content
+
+            # Because the hard timeout is triggered, the terminal will be in a weird state
+            # where it will not accept any new commands.
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == -1
+            assert 'The previous command is still running' in obs.metadata.suffix
+
+            # We need to send a Ctrl+C to reset the terminal.
+ obs = runtime.run_action(CmdRunAction('C-c', is_input=True)) + assert obs.exit_code == 130 + + # Now make sure the terminal is in a good state + obs = runtime.run_action(CmdRunAction('ls')) + assert obs.exit_code == 0 + + duration = time.time() - start_time + iteration_stats['duration'] = duration + logger.info(f'Completed iteration {i} in {duration:.2f} seconds') + + finally: + runtime.close() + + +@pytest.mark.skipif( + TEST_IN_CI, + reason='This test should only be run locally, not in CI.', +) +def test_stress_runtime_memory_limits(): + """Test runtime behavior under resource constraints.""" + config = get_config() + + # For Docker runtime, add resource constraints + if config.runtime == 'docker': + config.sandbox.docker_runtime_kwargs = { + 'cpu_period': 100000, # 100ms + 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) + 'mem_limit': '4G', # 4 GB of memory + 'memswap_limit': '0', # No swap + 'mem_swappiness': 0, # Disable swapping + 'oom_kill_disable': False, # Enable OOM killer + } + config.sandbox.runtime_startup_env_vars = { + 'RUNTIME_MAX_MEMORY_GB': '3', + 'RUNTIME_MEMORY_MONITOR': 'true', + } + + try: + runtime = create_runtime(config, headless_mode=True) + call_async_from_sync(runtime.connect) + + # Install stress-ng + action = CmdRunAction( + command='sudo apt-get update && sudo apt-get install -y stress-ng' + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + action = CmdRunAction( + command='stress-ng --vm 1 --vm-bytes 6G --timeout 1m --metrics' + ) + action.set_hard_timeout(120) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'aborted early, out of system resources' in obs.content + assert obs.exit_code == 3 # OOM killed! 
+ + finally: + runtime.close() + + +@pytest.mark.skipif( + TEST_IN_CI, + reason='This test should only be run locally, not in CI.', +) +def test_stress_runtime_memory_limits_with_repeated_file_edit(): + """Test runtime behavior under resource constraints with repeated file edits.""" + config = get_config() + + # For Docker runtime, add resource constraints + if config.runtime == 'docker': + config.sandbox.docker_runtime_kwargs = { + 'cpu_period': 100000, # 100ms + 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) + 'mem_limit': '4G', # 4 GB of memory + 'memswap_limit': '0', # No swap + 'mem_swappiness': 0, # Disable swapping + 'oom_kill_disable': False, # Enable OOM killer + } + config.sandbox.runtime_startup_env_vars = { + 'RUNTIME_MAX_MEMORY_GB': '3', + 'RUNTIME_MEMORY_MONITOR': 'true', + } + + try: + runtime = create_runtime(config, headless_mode=True) + call_async_from_sync(runtime.connect) + + # Create initial test file with base content + test_file = '/tmp/test_file.txt' + # base_content = 'content_1\n' * 1000 # Create a reasonably sized file + base_content = '' + for i in range(1000): + base_content += f'content_{i:03d}\n' + + # Use FileWriteAction to create initial file + write_action = FileWriteAction(path=test_file, content=base_content) + obs = runtime.run_action(write_action) + + # Perform repeated file edits + for i in range(1000): + # Use FileEditAction with str_replace instead of IPythonRunCellAction + edit_action = FileEditAction( + command='str_replace', + path=test_file, + old_str=f'content_{i:03d}', + new_str=f'-content_{i:03d}', + ) + obs = runtime.run_action(edit_action) + assert ( + f'The file {test_file} has been edited' in obs.content + ), f'Edit failed at iteration {i}' + logger.info(f'finished iteration {i}') + + # Verify final file state using FileEditAction view command + action = FileEditAction(command='view', path=test_file) + obs = runtime.run_action(action) + assert '-content_999' in obs.content, 'Final content verification failed' + logger.info('Final file content verified successfully') + + finally: + runtime.close()
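For reference, a minimal sketch of how one of the new stress tests could be invoked on its own, assuming pytest's standard node-id selection and a local Docker daemon; the remote-runtime variables shown in the module docstring should only be needed when `RUNTIME=remote`:

```bash
# Hypothetical local invocation (not part of this diff): run a single stress test
# against the Docker runtime. ALLHANDS_API_KEY and SANDBOX_REMOTE_RUNTIME_API_URL
# are only required when RUNTIME=remote.
export RUNTIME=docker
poetry run pytest -vvxss \
  "tests/runtime/test_stress_remote_runtime.py::test_stress_runtime_memory_limits_with_repeated_file_edit"
```

Because `get_config()` falls back to `remote` when `RUNTIME` is unset, exporting `RUNTIME=docker` is what routes the memory-limit tests through the `docker_runtime_kwargs` branch that applies the CPU and memory constraints.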