diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py index c866b5090bdd..e8cee3df3e20 100644 --- a/evaluation/benchmarks/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -63,7 +63,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index f008c9dc8a8a..a64c66f22cdc 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -43,7 +43,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-slim', diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index e059a6b46f6a..bc850dbc6261 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -50,7 +50,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.11-bookworm', diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index 2da7b09f0fcf..c33c75e5a221 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -61,7 +61,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE, diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index d35084fdbc82..14946ebacb2f 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -74,7 +74,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py index 38fb6cae25ce..016b6c3f582e 100644 --- a/evaluation/benchmarks/browsing_delegation/run_infer.py +++ b/evaluation/benchmarks/browsing_delegation/run_infer.py @@ -39,7 +39,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index 1ef347931feb..d8f1f64b1a6b 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -124,7 +124,7 @@ def get_config( default_agent=metadata.agent_class, run_as_openhands=False, max_iterations=metadata.max_iterations, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), sandbox=SandboxConfig( base_container_image=base_container_image, enable_auto_lint=True, diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py index 55e958d9fd9c..0d5b47410c2d 100644 --- a/evaluation/benchmarks/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -65,7 +65,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index 99c29b211dc4..8aaa479e92be 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -50,7 +50,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py index 64263242d751..e453b1f570ba 100644 --- a/evaluation/benchmarks/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -43,7 +43,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py index d9e1caec7768..08e66827924e 100644 --- a/evaluation/benchmarks/gpqa/run_infer.py +++ b/evaluation/benchmarks/gpqa/run_infer.py @@ -64,7 +64,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py index 3b5a5bca2ff8..b2fb6d677a9c 100644 --- a/evaluation/benchmarks/humanevalfix/run_infer.py +++ b/evaluation/benchmarks/humanevalfix/run_infer.py @@ -85,7 +85,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py index 0a1447f06171..d84c5f8ca8cb 100644 --- a/evaluation/benchmarks/logic_reasoning/run_infer.py +++ b/evaluation/benchmarks/logic_reasoning/run_infer.py @@ -48,7 +48,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0', diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py index dd93fbaf0a75..acc1431c81f1 100644 --- a/evaluation/benchmarks/miniwob/run_infer.py +++ b/evaluation/benchmarks/miniwob/run_infer.py @@ -58,7 +58,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='xingyaoww/od-eval-miniwob:v1.0', diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py index 7106f4a59d86..a98fa8d91805 100644 --- a/evaluation/benchmarks/mint/run_infer.py +++ b/evaluation/benchmarks/mint/run_infer.py @@ -106,7 +106,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='xingyaoww/od-eval-mint:v1.0', diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py index ab94b925ab14..1c084fc14916 100644 --- a/evaluation/benchmarks/ml_bench/run_infer.py +++ b/evaluation/benchmarks/ml_bench/run_infer.py @@ -80,7 +80,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='public.ecr.aws/i5g0m1f6/ml-bench', diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py index db4abf0f4828..ebe1b783cfed 100644 --- a/evaluation/benchmarks/scienceagentbench/run_infer.py +++ b/evaluation/benchmarks/scienceagentbench/run_infer.py @@ -62,7 +62,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), max_budget_per_task=4, max_iterations=metadata.max_iterations, sandbox=SandboxConfig( diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index 95f65245f22f..c5d479dd50d5 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -76,7 +76,7 @@ def get_config(instance: pd.Series) -> AppConfig: ) config = AppConfig( run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), sandbox=SandboxConfig( base_container_image=base_container_image, use_host_network=False, diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index be4761da13e7..61c045037bbb 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -121,7 +121,7 @@ def get_config( default_agent=metadata.agent_class, run_as_openhands=False, max_iterations=metadata.max_iterations, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), sandbox=SandboxConfig( base_container_image=base_container_image, enable_auto_lint=True, diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py index f88163a048f5..6f6f1a0e2048 100644 --- a/evaluation/benchmarks/toolqa/run_infer.py +++ b/evaluation/benchmarks/toolqa/run_infer.py @@ -44,7 +44,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py index d18918cf969f..ac51a201a712 100644 --- a/evaluation/benchmarks/webarena/run_infer.py +++ b/evaluation/benchmarks/webarena/run_infer.py @@ -53,7 +53,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='python:3.12-bookworm', diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 2da68b9b82b9..fe85d23bf585 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -42,7 +42,7 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), + runtime=os.environ.get('RUNTIME', 'docker'), max_iterations=metadata.max_iterations, sandbox=SandboxConfig( # use default base_container_image