From 4136c5349a969d68dd16af790eb38ecad8dfe953 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 23 Nov 2024 21:44:21 +0000 Subject: [PATCH] Fix pr #5223: Fix issue #5222: [Refactor]: Refactor the evaluation directory --- CREDITS.md | 55 +++++++++++++++---- evaluation/benchmarks/EDA/README.md | 4 +- .../benchmarks/EDA/scripts/run_infer.sh | 4 +- evaluation/benchmarks/agent_bench/README.md | 6 +- .../agent_bench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/aider_bench/README.md | 14 ++--- .../aider_bench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/biocoder/README.md | 4 +- .../benchmarks/biocoder/scripts/run_infer.sh | 4 +- evaluation/benchmarks/bird/README.md | 4 +- .../benchmarks/bird/scripts/run_infer.sh | 4 +- .../benchmarks/browsing_delegation/README.md | 4 +- .../browsing_delegation/scripts/run_infer.sh | 4 +- evaluation/benchmarks/commit0_bench/README.md | 12 ++-- .../benchmarks/commit0_bench/run_infer.py | 2 +- .../commit0_bench/scripts/run_infer.sh | 4 +- .../benchmarks/discoverybench/README.md | 4 +- .../discoverybench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/gaia/README.md | 10 ++-- .../benchmarks/gaia/scripts/run_infer.sh | 4 +- evaluation/benchmarks/gorilla/README.md | 4 +- evaluation/benchmarks/gorilla/ast_eval_hf.py | 2 +- evaluation/benchmarks/gorilla/ast_eval_tf.py | 2 +- evaluation/benchmarks/gorilla/ast_eval_th.py | 2 +- .../benchmarks/gorilla/scripts/run_infer.sh | 4 +- evaluation/benchmarks/gpqa/README.md | 2 +- .../benchmarks/gpqa/scripts/run_infer.sh | 4 +- evaluation/benchmarks/humanevalfix/README.md | 4 +- .../humanevalfix/scripts/run_infer.sh | 4 +- .../benchmarks/logic_reasoning/README.md | 2 +- .../logic_reasoning/scripts/run_infer.sh | 4 +- evaluation/benchmarks/miniwob/README.md | 10 ++-- .../benchmarks/miniwob/scripts/run_infer.sh | 4 +- evaluation/benchmarks/mint/README.md | 4 +- .../benchmarks/mint/scripts/run_infer.sh | 4 +- evaluation/benchmarks/ml_bench/README.md | 16 +++--- .../benchmarks/ml_bench/run_analysis.py | 2 +- .../ml_bench/scripts/run_analysis.sh | 2 +- .../benchmarks/ml_bench/scripts/run_infer.sh | 4 +- .../benchmarks/scienceagentbench/Dockerfile | 2 +- .../benchmarks/scienceagentbench/README.md | 8 +-- .../scienceagentbench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/swe_bench/README.md | 36 ++++++------ evaluation/benchmarks/swe_bench/run_infer.py | 2 +- .../docker/push_docker_instance_images.py | 2 +- ...onvert_oh_folder_to_swebench_submission.sh | 2 +- .../swe_bench/scripts/eval_infer.sh | 6 +- .../swe_bench/scripts/eval_infer_remote.sh | 4 +- .../benchmarks/swe_bench/scripts/run_infer.sh | 4 +- .../scripts/setup/prepare_swe_utils.sh | 2 +- evaluation/benchmarks/toolqa/README.md | 4 +- .../benchmarks/toolqa/scripts/run_infer.sh | 4 +- evaluation/benchmarks/webarena/README.md | 6 +- .../benchmarks/webarena/scripts/run_infer.sh | 6 +- 54 files changed, 180 insertions(+), 147 deletions(-) diff --git a/CREDITS.md b/CREDITS.md index 873742b7e011..38b3bdec9584 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -24,33 +24,66 @@ OpenHands includes and adapts the following open source projects. 
We are gratefu ### Reference Implementations for Evaluation Benchmarks OpenHands integrates code of the reference implementations for the following agent evaluation benchmarks: -#### [HumanEval](https://github.com/openai/human-eval) - - License: MIT License - -#### [DSP](https://github.com/microsoft/DataScienceProblems) - - License: MIT License +#### [EDA](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/EDA) + - Description: Exploratory Data Analysis benchmark -#### [HumanEvalPack](https://github.com/bigcode-project/bigcode-evaluation-harness) +#### [AgentBench](https://github.com/THUDM/AgentBench) - License: Apache License 2.0 -#### [AgentBench](https://github.com/THUDM/AgentBench) +#### [Aider Bench](https://github.com/paul-gauthier/aider) - License: Apache License 2.0 -#### [SWE-Bench](https://github.com/princeton-nlp/SWE-bench) - - License: MIT License +#### [BioCoder](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/biocoder) + - Description: Benchmark for biological code generation tasks #### [BIRD](https://bird-bench.github.io/) - License: MIT License - Dataset: CC-BY-SA 4.0 +#### [Browsing Delegation](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/browsing_delegation) + - Description: Web browsing delegation benchmark + +#### [Commit0 Bench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/commit0_bench) + - Description: Git commit analysis benchmark + +#### [DiscoveryBench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/discoverybench) + - Description: Benchmark for discovery tasks + +#### [GAIA](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/gaia) + - Description: General AI Assistant benchmark + #### [Gorilla APIBench](https://github.com/ShishirPatil/gorilla) - License: Apache License 2.0 #### [GPQA](https://github.com/idavidrein/gpqa) - License: MIT License -#### [ProntoQA](https://github.com/asaparov/prontoqa) - - License: Apache License 2.0 +#### [HumanEvalFix](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/humanevalfix) + - Description: Code fixing benchmark based on HumanEval + +#### [Logic Reasoning](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/logic_reasoning) + - Description: Benchmark for logical reasoning tasks + +#### [MiniWoB](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/miniwob) + - Description: Mini World of Bits benchmark + +#### [MINT](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/mint) + - Description: Machine learning INTerpretation benchmark + +#### [ML Bench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/ml_bench) + - Description: Machine Learning benchmark + +#### [ScienceAgentBench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/scienceagentbench) + - Description: Benchmark for scientific tasks + +#### [SWE-Bench](https://github.com/princeton-nlp/SWE-bench) + - License: MIT License + +#### [ToolQA](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/toolqa) + - Description: Tool-based Question Answering benchmark + +#### [WebArena](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/webarena) + - Description: Web interaction benchmark ## Open Source licenses diff --git a/evaluation/benchmarks/EDA/README.md b/evaluation/benchmarks/EDA/README.md index fee875c5dd51..05f56dbd3eb4 100644 --- 
a/evaluation/benchmarks/EDA/README.md +++ b/evaluation/benchmarks/EDA/README.md @@ -12,7 +12,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ```bash export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation) -./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] +./evaluation/benchmarks/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. @@ -33,7 +33,7 @@ to `CodeActAgent`. For example, ```bash -./evaluation/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things +./evaluation/benchmarks/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things ``` ## Reference diff --git a/evaluation/benchmarks/EDA/scripts/run_infer.sh b/evaluation/benchmarks/EDA/scripts/run_infer.sh index a803073f73c6..8251744a453a 100755 --- a/evaluation/benchmarks/EDA/scripts/run_infer.sh +++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -43,7 +43,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/EDA/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/benchmarks/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md index e8a1e3dc955e..bb4dfd7fd29b 100644 --- a/evaluation/benchmarks/agent_bench/README.md +++ b/evaluation/benchmarks/agent_bench/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Start the evaluation ```bash -./evaluation/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your @@ -25,7 +25,7 @@ in order to use `eval_limit`, you must also set `agent`. Following is the basic command to start the evaluation. -You can update the arguments in the script `evaluation/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. +You can update the arguments in the script `evaluation/benchmarks/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. - `--agent-cls`, the agent to use. For example, `CodeActAgent`. - `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. @@ -34,5 +34,5 @@ You can update the arguments in the script `evaluation/benchmarks/agent_bench/sc - `--eval-n-limit`: the number of examples to evaluate. For example, `100`. 
```bash -./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 +./evaluation/benchmarks/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 ``` diff --git a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh index 16e98b074b74..8033c343873e 100755 --- a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/benchmarks/agent_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md index 965fc06d7ecc..2672c81269fd 100644 --- a/evaluation/benchmarks/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -42,7 +42,7 @@ export SKIP_NUM=12 # skip the first 12 instances from the dataset Following is the basic command to start the evaluation. You can update the arguments in the script -`evaluation/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, +`evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on: - `--agent-cls`, the agent to use. For example, `CodeActAgent`. @@ -53,7 +53,7 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -61,25 +61,25 @@ You can update the arguments in the script This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] # Example - This runs evaluation on CodeActAgent for 133 instances on aider_bench test set, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 ``` ## Summarize Results ```bash -poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] +poetry run python ./evaluation/benchmarks/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] ``` Full example: ```bash -poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl +poetry run python ./evaluation/benchmarks/benchmarks/aider_bench/scripts/summarize_results.py evaluation/benchmarks/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl ``` This will list the instances that passed and the instances that failed. For each diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 0b3824ceae6e..72fd6e6c23fe 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -39,7 +39,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then EVAL_NOTE=$EVAL_NOTE-w-test fi -COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/benchmarks/aider_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/biocoder/README.md b/evaluation/benchmarks/biocoder/README.md index 035f2d20bf12..1549ed4a974c 100644 --- a/evaluation/benchmarks/biocoder/README.md +++ b/evaluation/benchmarks/biocoder/README.md @@ -21,7 +21,7 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode ```bash -./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. 
@@ -43,7 +43,7 @@ with current OpenHands version, then your command would be: ## Examples ```bash -./evaluation/benchmarks/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 +./evaluation/benchmarks/benchmarks/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 ``` ## Reference diff --git a/evaluation/benchmarks/biocoder/scripts/run_infer.sh b/evaluation/benchmarks/biocoder/scripts/run_infer.sh index 61fddb621176..98d6ad264b58 100755 --- a/evaluation/benchmarks/biocoder/scripts/run_infer.sh +++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -28,7 +28,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/biocoder/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/bird/README.md b/evaluation/benchmarks/bird/README.md index 90e3fa300cbd..dcce84f04290 100644 --- a/evaluation/benchmarks/bird/README.md +++ b/evaluation/benchmarks/bird/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on Bird ```bash -./evaluation/benchmarks/bird/scripts/run_infer.sh [model_config] [git-version] +./evaluation/benchmarks/benchmarks/bird/scripts/run_infer.sh [model_config] [git-version] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your @@ -31,7 +31,7 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "agent_class": "CodeActAgent", "model_name": "gpt-4-1106-preview", "max_iterations": 5, - "eval_output_dir": "evaluation/evaluation_outputs/outputs/bird/CodeActAgent/gpt-4-1106-preview_maxiter_5_N_v1.5", + "eval_output_dir": "evaluation/benchmarks/evaluation_outputs/outputs/bird/CodeActAgent/gpt-4-1106-preview_maxiter_5_N_v1.5", "start_time": "2024-05-29 02:00:22", "git_commit": "ae105c2fafc64ad3eeb7a8bea09119fcb5865bc4" }, diff --git a/evaluation/benchmarks/bird/scripts/run_infer.sh b/evaluation/benchmarks/bird/scripts/run_infer.sh index bf69d9d50bd7..85f49bf36192 100755 --- a/evaluation/benchmarks/bird/scripts/run_infer.sh +++ b/evaluation/benchmarks/bird/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/bird/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ diff --git a/evaluation/benchmarks/browsing_delegation/README.md b/evaluation/benchmarks/browsing_delegation/README.md index a06170f8b9e0..3ac767516fe8 100644 --- a/evaluation/benchmarks/browsing_delegation/README.md +++ b/evaluation/benchmarks/browsing_delegation/README.md @@ -12,8 +12,8 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference ```bash -./evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] -# e.g., 
./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300 +./evaluation/benchmarks/benchmarks/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +# e.g., ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300 ``` where `model_config` is mandatory, while `agent` and `eval_limit` are optional. diff --git a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh index 30607ca3336b..52cae31dda45 100755 --- a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh +++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -28,7 +28,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="$AGENT_VERSION" -COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/browsing_delegation/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 1 \ diff --git a/evaluation/benchmarks/commit0_bench/README.md b/evaluation/benchmarks/commit0_bench/README.md index 78b58b02137f..6b209cf92e25 100644 --- a/evaluation/benchmarks/commit0_bench/README.md +++ b/evaluation/benchmarks/commit0_bench/README.md @@ -24,10 +24,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the `lite` split in Commit0. For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-build docker image `wentingzhao/minitorch` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. ```bash -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test ``` where `model_config` is mandatory, and the rest are optional. @@ -56,7 +56,7 @@ Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgen then your command would be: ```bash -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -64,17 +64,17 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 10 instances on "wentingzhao/commit0_combined"'s test set, with max 30 iteration per instances, with 1 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \ -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index ef2df020310c..84faa9da307a 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -58,7 +58,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): test_cmd = instance['test']['test_cmd'] test_dir = instance['test']['test_dir'] # Instruction based on Anthropic's official trajectory - # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs + # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/benchmarks/verified/20241022_tools_claude-3-5-sonnet-updated/trajs instruction = ( '\n' f'/workspace/{workspace_dir_name}\n' diff --git a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh index 227a5ff05ea9..78843b310ea5 100755 --- a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" REPO_SPLIT=$1 MODEL_CONFIG=$2 @@ -91,7 +91,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/benchmarks/commit0_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/benchmarks/discoverybench/README.md b/evaluation/benchmarks/discoverybench/README.md index daf5cc34bbb4..e48dad52039c 100644 --- a/evaluation/benchmarks/discoverybench/README.md +++ b/evaluation/benchmarks/discoverybench/README.md @@ -16,7 +16,7 @@ 2. 
Execute the bash script to start DiscoveryBench Evaluation ``` -./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +./evaluation/benchmarks/benchmarks/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] ``` Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` @@ -27,7 +27,7 @@ When the `run_infer.sh` script is started, it will automatically pull the latest ``` -./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +./evaluation/benchmarks/benchmarks/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] ``` - `MODEL_CONFIG`: Name of the model you want to evaluate with diff --git a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh index e12b9c139891..06f776b0a5d6 100755 --- a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh +++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -29,7 +29,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/discoverybench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/gaia/README.md b/evaluation/benchmarks/gaia/README.md index f592e5f7118d..7dfb92a5c777 100644 --- a/evaluation/benchmarks/gaia/README.md +++ b/evaluation/benchmarks/gaia/README.md @@ -10,11 +10,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA). Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation. -Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/benchmarks/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. +Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. ```bash -./evaluation/benchmarks/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] -# e.g., ./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 +./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] +# e.g., ./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 ``` where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` and `gaia_subset` are optional. @@ -35,13 +35,13 @@ to `CodeActAgent`. 
For example, ```bash -./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 +./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 ``` ## Get score Then you can get stats by running the following command: ```bash -python ./evaluation/benchmarks/gaia/get_score.py \ +python ./evaluation/benchmarks/benchmarks/gaia/get_score.py \ --file ``` diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh index aedfe01a0c60..02abd0e0eb3e 100755 --- a/evaluation/benchmarks/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "LEVELS: $LEVELS" -COMMAND="poetry run python ./evaluation/gaia/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/gorilla/README.md b/evaluation/benchmarks/gorilla/README.md index c6f1cde55b40..03fe7217adfa 100644 --- a/evaluation/benchmarks/gorilla/README.md +++ b/evaluation/benchmarks/gorilla/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -./evaluation/benchmarks/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] +./evaluation/benchmarks/benchmarks/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] ``` where `model_config` is mandatory, while all other arguments are optional. 
@@ -35,5 +35,5 @@ Note: in order to use `eval_limit`, you must also set `agent`; in order to use ` For example, ```bash -./evaluation/benchmarks/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th +./evaluation/benchmarks/benchmarks/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th ``` diff --git a/evaluation/benchmarks/gorilla/ast_eval_hf.py b/evaluation/benchmarks/gorilla/ast_eval_hf.py index 25229aee7407..d1d019894c14 100644 --- a/evaluation/benchmarks/gorilla/ast_eval_hf.py +++ b/evaluation/benchmarks/gorilla/ast_eval_hf.py @@ -40,7 +40,7 @@ def get_all_sub_trees(root_node): # Parse the program into AST trees def ast_parse(candidate, lang='python'): - LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang) + LANGUAGE = Language('evaluation/benchmarks/gorilla/my-languages.so', lang) parser = Parser() parser.set_language(LANGUAGE) diff --git a/evaluation/benchmarks/gorilla/ast_eval_tf.py b/evaluation/benchmarks/gorilla/ast_eval_tf.py index 22067010c140..6f9e7729c246 100644 --- a/evaluation/benchmarks/gorilla/ast_eval_tf.py +++ b/evaluation/benchmarks/gorilla/ast_eval_tf.py @@ -40,7 +40,7 @@ def get_all_sub_trees(root_node): # Parse the program into AST trees def ast_parse(candidate, lang='python'): - LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang) + LANGUAGE = Language('evaluation/benchmarks/gorilla/my-languages.so', lang) parser = Parser() parser.set_language(LANGUAGE) diff --git a/evaluation/benchmarks/gorilla/ast_eval_th.py b/evaluation/benchmarks/gorilla/ast_eval_th.py index f55f70ed7c5e..cc8137c3d9a6 100644 --- a/evaluation/benchmarks/gorilla/ast_eval_th.py +++ b/evaluation/benchmarks/gorilla/ast_eval_th.py @@ -40,7 +40,7 @@ def get_all_sub_trees(root_node): # Parse the program into AST trees def ast_parse(candidate, lang='python'): - LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang) + LANGUAGE = Language('evaluation/benchmarks/gorilla/my-languages.so', lang) parser = Parser() parser.set_language(LANGUAGE) diff --git a/evaluation/benchmarks/gorilla/scripts/run_infer.sh b/evaluation/benchmarks/gorilla/scripts/run_infer.sh index 45424444431c..c735289e5b4f 100755 --- a/evaluation/benchmarks/gorilla/scripts/run_infer.sh +++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -33,7 +33,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "HUBS: $HUBS" -COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/gorilla/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/gpqa/README.md b/evaluation/benchmarks/gpqa/README.md index 235b9ab9b281..b3305f757f63 100644 --- a/evaluation/benchmarks/gpqa/README.md +++ b/evaluation/benchmarks/gpqa/README.md @@ -23,7 +23,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options From the root of the OpenHands repo, run the following command: ```bash -./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] +./evaluation/benchmarks/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] ``` You can replace 
`model_config_name` with any model you set up in `config.toml`. diff --git a/evaluation/benchmarks/gpqa/scripts/run_infer.sh b/evaluation/benchmarks/gpqa/scripts/run_infer.sh index ec5a61dbbbc0..9687c4e25e62 100755 --- a/evaluation/benchmarks/gpqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -33,7 +33,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/gpqa/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md index 5f3ae58ee29d..a486ec4c25aa 100644 --- a/evaluation/benchmarks/humanevalfix/README.md +++ b/evaluation/benchmarks/humanevalfix/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on HumanEvalFix ```bash -./evaluation/benchmarks/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview +./evaluation/benchmarks/benchmarks/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview ``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. @@ -28,7 +28,7 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "agent_class": "CodeActAgent", "model_name": "gpt-4", "max_iterations": 10, - "eval_output_dir": "evaluation/evaluation_outputs/outputs/humanevalfix/CodeActAgent/gpt-4_maxiter_10_N_v1.4", + "eval_output_dir": "evaluation/benchmarks/evaluation_outputs/outputs/humanevalfix/CodeActAgent/gpt-4_maxiter_10_N_v1.4", "start_time": "2024-05-22 20:54:15", "git_commit": "4d3253696f5a9d9de02ab86969fe9796fa40331f" }, diff --git a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh index b0b30628eb5e..a54a2ff6cd9b 100755 --- a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh +++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -64,7 +64,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/humanevalfix/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/logic_reasoning/README.md b/evaluation/benchmarks/logic_reasoning/README.md index d4e4d3e9a554..fa1574a23303 100644 --- a/evaluation/benchmarks/logic_reasoning/README.md +++ b/evaluation/benchmarks/logic_reasoning/README.md @@ -10,5 +10,5 @@ Please follow instruction [here](../README.md#setup) to setup your local develop The following code will run inference on the first example of the ProofWriter dataset, ```bash -./evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter +./evaluation/benchmarks/benchmarks/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter ``` diff --git a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh 
b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh index 40c244d18b2a..e766851b1238 100755 --- a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh +++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 DATASET=$2 @@ -34,7 +34,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/logic_reasoning/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/benchmarks/miniwob/README.md b/evaluation/benchmarks/miniwob/README.md index 5535e45a7dc0..4f771e3922e9 100644 --- a/evaluation/benchmarks/miniwob/README.md +++ b/evaluation/benchmarks/miniwob/README.md @@ -13,7 +13,7 @@ Access with browser the above MiniWoB URLs and see if they load correctly. ## Run Evaluation ```sh -./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval +./evaluation/benchmarks/benchmarks/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -21,21 +21,21 @@ Access with browser the above MiniWoB URLs and see if they load correctly. This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/benchmarks/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] +./evaluation/benchmarks/benchmarks/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] # Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 +./evaluation/benchmarks/benchmarks/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 ``` -Results will be in `evaluation/evaluation_outputs/outputs/miniwob/` +Results will be in `evaluation/benchmarks/evaluation_outputs/outputs/miniwob/` To calculate the average reward, run: ```sh -poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl +poetry run python evaluation/benchmarks/benchmarks/miniwob/get_success_rate.py evaluation/benchmarks/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl ``` ## Submit your evaluation results diff --git a/evaluation/benchmarks/miniwob/scripts/run_infer.sh b/evaluation/benchmarks/miniwob/scripts/run_infer.sh index 8f997e29c308..da192fdaee03 100755 --- a/evaluation/benchmarks/miniwob/scripts/run_infer.sh +++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" # configure browsing agent export USE_NAV="false" @@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="${AGENT_VERSION}_${NOTE}" -COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/benchmarks/miniwob:\$PYTHONPATH && 
poetry run python evaluation/benchmarks/benchmarks/miniwob/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/mint/README.md b/evaluation/benchmarks/mint/README.md index f9ab43327199..1cb445596978 100644 --- a/evaluation/benchmarks/mint/README.md +++ b/evaluation/benchmarks/mint/README.md @@ -15,7 +15,7 @@ We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/da Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`. ```bash -./evaluation/benchmarks/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] +./evaluation/benchmarks/benchmarks/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] ``` where `model_config` is mandatory, while others are optional. @@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`. For example, ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 ``` ## Reference diff --git a/evaluation/benchmarks/mint/scripts/run_infer.sh b/evaluation/benchmarks/mint/scripts/run_infer.sh index b9ec6d7a7a85..4bf36cccbd0c 100755 --- a/evaluation/benchmarks/mint/scripts/run_infer.sh +++ b/evaluation/benchmarks/mint/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -25,7 +25,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" export PYTHONPATH=$(pwd) -COMMAND="poetry run python ./evaluation/mint/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/mint/run_infer.py \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ --max-propose-solution 2 \ diff --git a/evaluation/benchmarks/ml_bench/README.md b/evaluation/benchmarks/ml_bench/README.md index 528edddc148a..5644735bee7b 100644 --- a/evaluation/benchmarks/ml_bench/README.md +++ b/evaluation/benchmarks/ml_bench/README.md @@ -19,8 +19,8 @@ Please follow instruction [here](../README.md#setup) to setup your local develop To run the evaluation on the ML-Bench dataset, use the following command: ```bash -./evaluation/benchmarks/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] -# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 +./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] +# e.g., ./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 ``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. 
@@ -30,8 +30,8 @@ You can replace `eval_gpt4_1106_preview` with any model you set up in `config.to To score the evaluation output, use the following command: ```bash -./evaluation/benchmarks/ml_bench/scripts/summarise_results.py [eval_output_dir] -# e.g., ./evaluation/benchmarks/ml_bench/scripts/summarise_results.py evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 +./evaluation/benchmarks/benchmarks/ml_bench/scripts/summarise_results.py [eval_output_dir] +# e.g., ./evaluation/benchmarks/benchmarks/ml_bench/scripts/summarise_results.py evaluation/benchmarks/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 ``` ## Run Error Analysis on ML-Bench @@ -39,8 +39,8 @@ To score the evaluation output, use the following command: To run error analysis on the ML-Bench dataset, use the following command: ```bash -./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] -# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview +./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] +# e.g., ./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_analysis.sh evaluation/benchmarks/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview ``` This command generates a report on the evaluation output and provides insights into the agent's performance. @@ -60,7 +60,7 @@ Here's an example of the evaluation output for a single task instance: "agent_class": "CodeActAgent", "model_name": "gpt-4-1106-preview", "max_iterations": 10, - "eval_output_dir": "evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5", + "eval_output_dir": "evaluation/benchmarks/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5", "start_time": "2024-05-26 17:39:59", "git_commit": "dd8ee9044a94a213dc2e31d2085dbf2924ee80a1" }, @@ -105,7 +105,7 @@ The `metrics` field contains the parsed evaluation metrics from the `eval_output ## Customization -You can customize the evaluation script by modifying the `evaluation/benchmarks/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. +You can customize the evaluation script by modifying the `evaluation/benchmarks/benchmarks/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. Feel free to adjust the configuration, logging, and output formatting to suit your needs. 
diff --git a/evaluation/benchmarks/ml_bench/run_analysis.py b/evaluation/benchmarks/ml_bench/run_analysis.py index eda8fd4bdd45..665b1080302e 100644 --- a/evaluation/benchmarks/ml_bench/run_analysis.py +++ b/evaluation/benchmarks/ml_bench/run_analysis.py @@ -120,7 +120,7 @@ def classify_error(llm: LLM, failed_case: dict) -> str: ) args, _ = parser.parse_known_args() - # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/swe_bench/README.md#configure-openhands-and-your-llm + # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks/swe_bench/README.md#configure-openhands-and-your-llm # for details of how to set `llm_config` if args.llm_config: specified_llm_config = get_llm_config_arg(args.llm_config) diff --git a/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh index d5fe6365ca86..6dea03a52417 100644 --- a/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh @@ -17,7 +17,7 @@ fi echo "MODEL_CONFIG: $MODEL_CONFIG" echo "RESULT_FILE: $RESULT_FILE" -COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/ml_bench/run_analysis.py \ --llm-config $MODEL_CONFIG \ --json_file_path $RESULT_FILE" diff --git a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh index 97ff0003fc5c..0e7d2957ce4e 100755 --- a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/ml_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/scienceagentbench/Dockerfile b/evaluation/benchmarks/scienceagentbench/Dockerfile index 70ed92cc4dc8..6a22f6797084 100644 --- a/evaluation/benchmarks/scienceagentbench/Dockerfile +++ b/evaluation/benchmarks/scienceagentbench/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.11-bookworm # For OpenHands agents to explore the dataset directories, please download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG) and unzip it with password `scienceagentbench`. # **Please DO NOT redistribute the unzipped data files online.** # It will download a benchmark.zip file to the current directory. 
-# unzip it and put the benchmark folder under evaluation/scienceagentbench/ +# unzip it and put the benchmark folder under evaluation/benchmarks/scienceagentbench/ RUN mkdir -p /benchmark COPY benchmark /benchmark diff --git a/evaluation/benchmarks/scienceagentbench/README.md b/evaluation/benchmarks/scienceagentbench/README.md index 4d979177215b..84ca6393dc0e 100644 --- a/evaluation/benchmarks/scienceagentbench/README.md +++ b/evaluation/benchmarks/scienceagentbench/README.md @@ -13,10 +13,10 @@ To prevent benchmark data contamination, we only provide the annotation sheet on ## Run Inference on ScienceAgentBench ```bash -./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 +./evaluation/benchmarks/benchmarks/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 ``` where `model_config` is mandatory, and the rest are optional. @@ -45,9 +45,9 @@ After the inference is completed, you may use the following command to extract n ```bash python post_proc.py [log_fname] ``` -- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent. +- `log_fname`, e.g. `evaluation/benchmarks/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent. -Output will be write to e.g. `evaluation/.../output.converted.jsonl` +Output will be write to e.g. `evaluation/benchmarks/.../output.converted.jsonl` ### Run evaluation diff --git a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh index 970f10ed2fef..3b162265365f 100755 --- a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh +++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/scienceagentbench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --use_knowledge $USE_KNOWLEDGE \ diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md index b69a7389555c..9c4ab8374a38 100644 --- a/evaluation/benchmarks/swe_bench/README.md +++ b/evaluation/benchmarks/swe_bench/README.md @@ -27,10 +27,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. 
```bash -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test ``` where `model_config` is mandatory, and the rest are optional. @@ -62,7 +62,7 @@ Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and then your command would be: ```bash -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -70,23 +70,23 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer If you would like to specify a list of tasks you'd like to benchmark on, you could -create a `config.toml` under `./evaluation/benchmarks/swe_bench/` folder, and put a list +create a `config.toml` under `./evaluation/benchmarks/benchmarks/swe_bench/` folder, and put a list attribute named `selected_ids`, e.g. 
```toml @@ -105,19 +105,19 @@ After running the inference, you will obtain a `output.jsonl` (by default it wil **(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running: ```bash -evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance +evaluation/benchmarks/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance ``` If you want to save disk space a bit (e.g., with ~50GB free disk space), while speeding up the image pre-build process, you can pull the environment-level docker images: ```bash -evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env +evaluation/benchmarks/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env ``` If you want to evaluate on the full SWE-Bench test set: ```bash -evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full +evaluation/benchmarks/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full ``` ### Run evaluation @@ -136,10 +136,10 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag Then you can run the following: ```bash -./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] +./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] # Example -./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl +./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/benchmarks/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl ``` The script now accepts optional arguments: @@ -150,12 +150,12 @@ The script now accepts optional arguments: For example, to evaluate a specific instance with a custom dataset and split: ```bash -./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test +./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test ``` -> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. +> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. -The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory: +The final results will be saved to `evaluation/benchmarks/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory: - `README.md`: a report showing what are the instances that passed, failed, etc. 
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
@@ -166,17 +166,17 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
-./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
+./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]

 # Example - This evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
 ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
-evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 ```

 To clean-up all existing runtimes that you've already started, run:

 ```bash
-ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
 ```

diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 3ffc08d29bfb..e98428877050 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -70,7 +70,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name)
     else:
         # Instruction based on Anthropic's official trajectory
         # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
         instruction = (
             '\n'
             f'/workspace/{workspace_dir_name}\n'
diff --git a/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
index 52e2ea4cb141..79a5c23eb2d4 100644
--- a/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
+++ b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
@@ -19,7 +19,7 @@
 To push the docker images for "princeton-nlp/SWE-bench_Lite" test set to the docker hub
 (e.g., under `docker.io/xingyaoww/`), run:

 ```bash
-EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
+EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
 ```
 """
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
index 044f9972f4eb..009a0a211d1a 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
@@ -5,7 +5,7 @@ NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
 mkdir -p $NEW_FOLDER_PATH

 # Build all_preds.jsonl
-poetry run python evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
+poetry run python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
 mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl

 # Build trajs/
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
index 13ef271671a5..977807de084b 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
@@ -58,7 +58,7 @@ else
     # ==== Convert OH format to SWE-bench format ====
     echo "Merged output file with fine-grained report will be saved to $FILE_DIR"

-    poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
+    poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
     # replace .jsonl with .swebench.jsonl in filename
     SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
     echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
@@ -106,7 +106,7 @@ if [ -z "$INSTANCE_ID" ]; then
         rm -rf $RESULT_OUTPUT_DIR/eval_outputs
     fi

     mv logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
     mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs

     echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt
@@ -125,7 +125,7 @@ if [ -z "$INSTANCE_ID" ]; then
         mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
     fi

-    poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
+    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
 else
     echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
index 68280978368e..fc3718d1b5bd 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
@@ -28,7 +28,7 @@ fi

 echo "... Evaluating on $INPUT_FILE ..."
-COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/swe_bench/eval_infer.py \ --eval-num-workers $NUM_WORKERS \ --input-file $INPUT_FILE \ --dataset $DATASET \ @@ -43,4 +43,4 @@ fi eval $COMMAND # update the output with evaluation results -poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE +poetry run python evaluation/benchmarks/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index a27bd7cdbb14..a4917f80691a 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -84,7 +84,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/benchmarks/swe_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh index 7091b6f586b7..8bb160ff98b8 100755 --- a/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh +++ b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -EVAL_WORKSPACE="evaluation/benchmarks/swe_bench/eval_workspace" +EVAL_WORKSPACE="evaluation/benchmarks/benchmarks/swe_bench/eval_workspace" mkdir -p $EVAL_WORKSPACE # 1. Prepare REPO diff --git a/evaluation/benchmarks/toolqa/README.md b/evaluation/benchmarks/toolqa/README.md index eda478f4489f..c9bee9d13283 100644 --- a/evaluation/benchmarks/toolqa/README.md +++ b/evaluation/benchmarks/toolqa/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -bash evaluation/benchmarks/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] +bash evaluation/benchmarks/benchmarks/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] ``` where `model_config` is mandatory, while all other arguments are optional. 
@@ -40,5 +40,5 @@ Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `coffee
 then your command would be:

 ```bash
-bash evaluation/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy
+bash evaluation/benchmarks/toolqa/scripts/run_infer.sh llm HEAD CodeActAgent 10 coffee easy
 ```
diff --git a/evaluation/benchmarks/toolqa/scripts/run_infer.sh b/evaluation/benchmarks/toolqa/scripts/run_infer.sh
index bfe3471f4f6a..8ecc88c19ac3 100755
--- a/evaluation/benchmarks/toolqa/scripts/run_infer.sh
+++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -eo pipefail

-source "evaluation/utils/version_control.sh"
+source "evaluation/benchmarks/utils/version_control.sh"

 MODEL_CONFIG=$1
 COMMIT_HASH=$2
@@ -47,7 +47,7 @@ echo "DATASET: $DATASET"
 echo "HARDNESS: $HARDNESS"
 echo "WOLFRAM_APPID: $WOLFRAM_APPID"

-COMMAND="poetry run python evaluation/toolqa/run_infer.py \
+COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
     --agent-cls $AGENT \
     --llm-config $MODEL_CONFIG \
     --max-iterations 30 \
diff --git a/evaluation/benchmarks/webarena/README.md b/evaluation/benchmarks/webarena/README.md
index 3e403d5a7f46..7b60cac9caff 100644
--- a/evaluation/benchmarks/webarena/README.md
+++ b/evaluation/benchmarks/webarena/README.md
@@ -24,15 +24,15 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie
 ```bash
 export WEBARENA_BASE_URL=
 export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
-bash evaluation/webarena/scripts/run_infer.sh
+bash evaluation/benchmarks/webarena/scripts/run_infer.sh
 ```

 Results will be in `evaluation/evaluation_outputs/outputs/webarena/`

 To calculate the success rate, run:

 ```sh
-poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
+poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
 ```

 ## Submit your evaluation results
diff --git a/evaluation/benchmarks/webarena/scripts/run_infer.sh b/evaluation/benchmarks/webarena/scripts/run_infer.sh
index 22372b82d781..aa245344646b 100755
--- a/evaluation/benchmarks/webarena/scripts/run_infer.sh
+++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 set -eo pipefail

-source "evaluation/utils/version_control.sh"
+source "evaluation/benchmarks/utils/version_control.sh"

 # configure webarena websites and environment
-source evaluation/webarena/scripts/webarena_env.sh
+source evaluation/benchmarks/webarena/scripts/webarena_env.sh

 # configure browsing agent
 export USE_NAV="false"
@@ -35,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

 EVAL_NOTE="$AGENT_VERSION"

-COMMAND="poetry run python evaluation/webarena/run_infer.py \
+COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
     --agent-cls $AGENT \
     --llm-config $MODEL_CONFIG \
     --max-iterations 15 \