From 4136c5349a969d68dd16af790eb38ecad8dfe953 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 23 Nov 2024 21:44:21 +0000 Subject: [PATCH] Fix pr #5223: Fix issue #5222: [Refactor]: Refactor the evaluation directory --- CREDITS.md | 55 +++++++++++++++---- evaluation/benchmarks/EDA/README.md | 4 +- .../benchmarks/EDA/scripts/run_infer.sh | 4 +- evaluation/benchmarks/agent_bench/README.md | 6 +- .../agent_bench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/aider_bench/README.md | 14 ++--- .../aider_bench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/biocoder/README.md | 4 +- .../benchmarks/biocoder/scripts/run_infer.sh | 4 +- evaluation/benchmarks/bird/README.md | 4 +- .../benchmarks/bird/scripts/run_infer.sh | 4 +- .../benchmarks/browsing_delegation/README.md | 4 +- .../browsing_delegation/scripts/run_infer.sh | 4 +- evaluation/benchmarks/commit0_bench/README.md | 12 ++-- .../benchmarks/commit0_bench/run_infer.py | 2 +- .../commit0_bench/scripts/run_infer.sh | 4 +- .../benchmarks/discoverybench/README.md | 4 +- .../discoverybench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/gaia/README.md | 10 ++-- .../benchmarks/gaia/scripts/run_infer.sh | 4 +- evaluation/benchmarks/gorilla/README.md | 4 +- evaluation/benchmarks/gorilla/ast_eval_hf.py | 2 +- evaluation/benchmarks/gorilla/ast_eval_tf.py | 2 +- evaluation/benchmarks/gorilla/ast_eval_th.py | 2 +- .../benchmarks/gorilla/scripts/run_infer.sh | 4 +- evaluation/benchmarks/gpqa/README.md | 2 +- .../benchmarks/gpqa/scripts/run_infer.sh | 4 +- evaluation/benchmarks/humanevalfix/README.md | 4 +- .../humanevalfix/scripts/run_infer.sh | 4 +- .../benchmarks/logic_reasoning/README.md | 2 +- .../logic_reasoning/scripts/run_infer.sh | 4 +- evaluation/benchmarks/miniwob/README.md | 10 ++-- .../benchmarks/miniwob/scripts/run_infer.sh | 4 +- evaluation/benchmarks/mint/README.md | 4 +- .../benchmarks/mint/scripts/run_infer.sh | 4 +- evaluation/benchmarks/ml_bench/README.md | 16 +++--- .../benchmarks/ml_bench/run_analysis.py | 2 +- .../ml_bench/scripts/run_analysis.sh | 2 +- .../benchmarks/ml_bench/scripts/run_infer.sh | 4 +- .../benchmarks/scienceagentbench/Dockerfile | 2 +- .../benchmarks/scienceagentbench/README.md | 8 +-- .../scienceagentbench/scripts/run_infer.sh | 4 +- evaluation/benchmarks/swe_bench/README.md | 36 ++++++------ evaluation/benchmarks/swe_bench/run_infer.py | 2 +- .../docker/push_docker_instance_images.py | 2 +- ...onvert_oh_folder_to_swebench_submission.sh | 2 +- .../swe_bench/scripts/eval_infer.sh | 6 +- .../swe_bench/scripts/eval_infer_remote.sh | 4 +- .../benchmarks/swe_bench/scripts/run_infer.sh | 4 +- .../scripts/setup/prepare_swe_utils.sh | 2 +- evaluation/benchmarks/toolqa/README.md | 4 +- .../benchmarks/toolqa/scripts/run_infer.sh | 4 +- evaluation/benchmarks/webarena/README.md | 6 +- .../benchmarks/webarena/scripts/run_infer.sh | 6 +- 54 files changed, 180 insertions(+), 147 deletions(-) diff --git a/CREDITS.md b/CREDITS.md index 873742b7e011..38b3bdec9584 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -24,33 +24,66 @@ OpenHands includes and adapts the following open source projects. 
We are gratefu ### Reference Implementations for Evaluation Benchmarks OpenHands integrates code of the reference implementations for the following agent evaluation benchmarks: -#### [HumanEval](https://github.com/openai/human-eval) - - License: MIT License - -#### [DSP](https://github.com/microsoft/DataScienceProblems) - - License: MIT License +#### [EDA](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/EDA) + - Description: Exploratory Data Analysis benchmark -#### [HumanEvalPack](https://github.com/bigcode-project/bigcode-evaluation-harness) +#### [AgentBench](https://github.com/THUDM/AgentBench) - License: Apache License 2.0 -#### [AgentBench](https://github.com/THUDM/AgentBench) +#### [Aider Bench](https://github.com/paul-gauthier/aider) - License: Apache License 2.0 -#### [SWE-Bench](https://github.com/princeton-nlp/SWE-bench) - - License: MIT License +#### [BioCoder](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/biocoder) + - Description: Benchmark for biological code generation tasks #### [BIRD](https://bird-bench.github.io/) - License: MIT License - Dataset: CC-BY-SA 4.0 +#### [Browsing Delegation](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/browsing_delegation) + - Description: Web browsing delegation benchmark + +#### [Commit0 Bench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/commit0_bench) + - Description: Git commit analysis benchmark + +#### [DiscoveryBench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/discoverybench) + - Description: Benchmark for discovery tasks + +#### [GAIA](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/gaia) + - Description: General AI Assistant benchmark + #### [Gorilla APIBench](https://github.com/ShishirPatil/gorilla) - License: Apache License 2.0 #### [GPQA](https://github.com/idavidrein/gpqa) - License: MIT License -#### [ProntoQA](https://github.com/asaparov/prontoqa) - - License: Apache License 2.0 +#### [HumanEvalFix](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/humanevalfix) + - Description: Code fixing benchmark based on HumanEval + +#### [Logic Reasoning](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/logic_reasoning) + - Description: Benchmark for logical reasoning tasks + +#### [MiniWoB](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/miniwob) + - Description: Mini World of Bits benchmark + +#### [MINT](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/mint) + - Description: Machine learning INTerpretation benchmark + +#### [ML Bench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/ml_bench) + - Description: Machine Learning benchmark + +#### [ScienceAgentBench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/scienceagentbench) + - Description: Benchmark for scientific tasks + +#### [SWE-Bench](https://github.com/princeton-nlp/SWE-bench) + - License: MIT License + +#### [ToolQA](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/toolqa) + - Description: Tool-based Question Answering benchmark + +#### [WebArena](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/benchmarks/webarena) + - Description: Web interaction benchmark ## Open Source licenses diff --git a/evaluation/benchmarks/EDA/README.md b/evaluation/benchmarks/EDA/README.md index fee875c5dd51..05f56dbd3eb4 100644 --- 
a/evaluation/benchmarks/EDA/README.md +++ b/evaluation/benchmarks/EDA/README.md @@ -12,7 +12,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ```bash export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation) -./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] +./evaluation/benchmarks/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. @@ -33,7 +33,7 @@ to `CodeActAgent`. For example, ```bash -./evaluation/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things +./evaluation/benchmarks/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things ``` ## Reference diff --git a/evaluation/benchmarks/EDA/scripts/run_infer.sh b/evaluation/benchmarks/EDA/scripts/run_infer.sh index a803073f73c6..8251744a453a 100755 --- a/evaluation/benchmarks/EDA/scripts/run_infer.sh +++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -43,7 +43,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/EDA/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/benchmarks/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md index e8a1e3dc955e..bb4dfd7fd29b 100644 --- a/evaluation/benchmarks/agent_bench/README.md +++ b/evaluation/benchmarks/agent_bench/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Start the evaluation ```bash -./evaluation/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your @@ -25,7 +25,7 @@ in order to use `eval_limit`, you must also set `agent`. Following is the basic command to start the evaluation. -You can update the arguments in the script `evaluation/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. +You can update the arguments in the script `evaluation/benchmarks/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. - `--agent-cls`, the agent to use. For example, `CodeActAgent`. - `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. @@ -34,5 +34,5 @@ You can update the arguments in the script `evaluation/benchmarks/agent_bench/sc - `--eval-n-limit`: the number of examples to evaluate. For example, `100`. 
```bash -./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 +./evaluation/benchmarks/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 ``` diff --git a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh index 16e98b074b74..8033c343873e 100755 --- a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/benchmarks/agent_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md index 965fc06d7ecc..2672c81269fd 100644 --- a/evaluation/benchmarks/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -42,7 +42,7 @@ export SKIP_NUM=12 # skip the first 12 instances from the dataset Following is the basic command to start the evaluation. You can update the arguments in the script -`evaluation/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, +`evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on: - `--agent-cls`, the agent to use. For example, `CodeActAgent`. @@ -53,7 +53,7 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -61,25 +61,25 @@ You can update the arguments in the script This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] # Example - This runs evaluation on CodeActAgent for 133 instances on aider_bench test set, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 +./evaluation/benchmarks/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 ``` ## Summarize Results ```bash -poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] +poetry run python ./evaluation/benchmarks/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] ``` Full example: ```bash -poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl +poetry run python ./evaluation/benchmarks/benchmarks/aider_bench/scripts/summarize_results.py evaluation/benchmarks/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl ``` This will list the instances that passed and the instances that failed. For each diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 0b3824ceae6e..72fd6e6c23fe 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -39,7 +39,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then EVAL_NOTE=$EVAL_NOTE-w-test fi -COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/benchmarks/aider_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/biocoder/README.md b/evaluation/benchmarks/biocoder/README.md index 035f2d20bf12..1549ed4a974c 100644 --- a/evaluation/benchmarks/biocoder/README.md +++ b/evaluation/benchmarks/biocoder/README.md @@ -21,7 +21,7 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode ```bash -./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. 
@@ -43,7 +43,7 @@ with current OpenHands version, then your command would be: ## Examples ```bash -./evaluation/benchmarks/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 +./evaluation/benchmarks/benchmarks/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 ``` ## Reference diff --git a/evaluation/benchmarks/biocoder/scripts/run_infer.sh b/evaluation/benchmarks/biocoder/scripts/run_infer.sh index 61fddb621176..98d6ad264b58 100755 --- a/evaluation/benchmarks/biocoder/scripts/run_infer.sh +++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -28,7 +28,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/biocoder/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/bird/README.md b/evaluation/benchmarks/bird/README.md index 90e3fa300cbd..dcce84f04290 100644 --- a/evaluation/benchmarks/bird/README.md +++ b/evaluation/benchmarks/bird/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on Bird ```bash -./evaluation/benchmarks/bird/scripts/run_infer.sh [model_config] [git-version] +./evaluation/benchmarks/benchmarks/bird/scripts/run_infer.sh [model_config] [git-version] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your @@ -31,7 +31,7 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "agent_class": "CodeActAgent", "model_name": "gpt-4-1106-preview", "max_iterations": 5, - "eval_output_dir": "evaluation/evaluation_outputs/outputs/bird/CodeActAgent/gpt-4-1106-preview_maxiter_5_N_v1.5", + "eval_output_dir": "evaluation/benchmarks/evaluation_outputs/outputs/bird/CodeActAgent/gpt-4-1106-preview_maxiter_5_N_v1.5", "start_time": "2024-05-29 02:00:22", "git_commit": "ae105c2fafc64ad3eeb7a8bea09119fcb5865bc4" }, diff --git a/evaluation/benchmarks/bird/scripts/run_infer.sh b/evaluation/benchmarks/bird/scripts/run_infer.sh index bf69d9d50bd7..85f49bf36192 100755 --- a/evaluation/benchmarks/bird/scripts/run_infer.sh +++ b/evaluation/benchmarks/bird/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/bird/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ diff --git a/evaluation/benchmarks/browsing_delegation/README.md b/evaluation/benchmarks/browsing_delegation/README.md index a06170f8b9e0..3ac767516fe8 100644 --- a/evaluation/benchmarks/browsing_delegation/README.md +++ b/evaluation/benchmarks/browsing_delegation/README.md @@ -12,8 +12,8 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference ```bash -./evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] -# e.g., 
./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300 +./evaluation/benchmarks/benchmarks/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +# e.g., ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300 ``` where `model_config` is mandatory, while `agent` and `eval_limit` are optional. diff --git a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh index 30607ca3336b..52cae31dda45 100755 --- a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh +++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -28,7 +28,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="$AGENT_VERSION" -COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/browsing_delegation/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 1 \ diff --git a/evaluation/benchmarks/commit0_bench/README.md b/evaluation/benchmarks/commit0_bench/README.md index 78b58b02137f..6b209cf92e25 100644 --- a/evaluation/benchmarks/commit0_bench/README.md +++ b/evaluation/benchmarks/commit0_bench/README.md @@ -24,10 +24,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the `lite` split in Commit0. For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-build docker image `wentingzhao/minitorch` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. ```bash -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test ``` where `model_config` is mandatory, and the rest are optional. @@ -56,7 +56,7 @@ Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgen then your command would be: ```bash -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -64,17 +64,17 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 10 instances on "wentingzhao/commit0_combined"'s test set, with max 30 iteration per instances, with 1 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \ -./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index ef2df020310c..84faa9da307a 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -58,7 +58,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): test_cmd = instance['test']['test_cmd'] test_dir = instance['test']['test_dir'] # Instruction based on Anthropic's official trajectory - # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs + # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/benchmarks/verified/20241022_tools_claude-3-5-sonnet-updated/trajs instruction = ( '\n' f'/workspace/{workspace_dir_name}\n' diff --git a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh index 227a5ff05ea9..78843b310ea5 100755 --- a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" REPO_SPLIT=$1 MODEL_CONFIG=$2 @@ -91,7 +91,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/benchmarks/commit0_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/benchmarks/discoverybench/README.md b/evaluation/benchmarks/discoverybench/README.md index daf5cc34bbb4..e48dad52039c 100644 --- a/evaluation/benchmarks/discoverybench/README.md +++ b/evaluation/benchmarks/discoverybench/README.md @@ -16,7 +16,7 @@ 2. 
Execute the bash script to start DiscoveryBench Evaluation ``` -./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +./evaluation/benchmarks/benchmarks/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] ``` Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` @@ -27,7 +27,7 @@ When the `run_infer.sh` script is started, it will automatically pull the latest ``` -./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +./evaluation/benchmarks/benchmarks/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] ``` - `MODEL_CONFIG`: Name of the model you want to evaluate with diff --git a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh index e12b9c139891..06f776b0a5d6 100755 --- a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh +++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -29,7 +29,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/discoverybench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/gaia/README.md b/evaluation/benchmarks/gaia/README.md index f592e5f7118d..7dfb92a5c777 100644 --- a/evaluation/benchmarks/gaia/README.md +++ b/evaluation/benchmarks/gaia/README.md @@ -10,11 +10,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA). Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation. -Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/benchmarks/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. +Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. ```bash -./evaluation/benchmarks/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] -# e.g., ./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 +./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] +# e.g., ./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 ``` where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` and `gaia_subset` are optional. @@ -35,13 +35,13 @@ to `CodeActAgent`. 
For example, ```bash -./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 +./evaluation/benchmarks/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 ``` ## Get score Then you can get stats by running the following command: ```bash -python ./evaluation/benchmarks/gaia/get_score.py \ +python ./evaluation/benchmarks/benchmarks/gaia/get_score.py \ --file ``` diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh index aedfe01a0c60..02abd0e0eb3e 100755 --- a/evaluation/benchmarks/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "LEVELS: $LEVELS" -COMMAND="poetry run python ./evaluation/gaia/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/gorilla/README.md b/evaluation/benchmarks/gorilla/README.md index c6f1cde55b40..03fe7217adfa 100644 --- a/evaluation/benchmarks/gorilla/README.md +++ b/evaluation/benchmarks/gorilla/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -./evaluation/benchmarks/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] +./evaluation/benchmarks/benchmarks/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] ``` where `model_config` is mandatory, while all other arguments are optional. 
@@ -35,5 +35,5 @@ Note: in order to use `eval_limit`, you must also set `agent`; in order to use ` For example, ```bash -./evaluation/benchmarks/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th +./evaluation/benchmarks/benchmarks/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th ``` diff --git a/evaluation/benchmarks/gorilla/ast_eval_hf.py b/evaluation/benchmarks/gorilla/ast_eval_hf.py index 25229aee7407..d1d019894c14 100644 --- a/evaluation/benchmarks/gorilla/ast_eval_hf.py +++ b/evaluation/benchmarks/gorilla/ast_eval_hf.py @@ -40,7 +40,7 @@ def get_all_sub_trees(root_node): # Parse the program into AST trees def ast_parse(candidate, lang='python'): - LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang) + LANGUAGE = Language('evaluation/benchmarks/gorilla/my-languages.so', lang) parser = Parser() parser.set_language(LANGUAGE) diff --git a/evaluation/benchmarks/gorilla/ast_eval_tf.py b/evaluation/benchmarks/gorilla/ast_eval_tf.py index 22067010c140..6f9e7729c246 100644 --- a/evaluation/benchmarks/gorilla/ast_eval_tf.py +++ b/evaluation/benchmarks/gorilla/ast_eval_tf.py @@ -40,7 +40,7 @@ def get_all_sub_trees(root_node): # Parse the program into AST trees def ast_parse(candidate, lang='python'): - LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang) + LANGUAGE = Language('evaluation/benchmarks/gorilla/my-languages.so', lang) parser = Parser() parser.set_language(LANGUAGE) diff --git a/evaluation/benchmarks/gorilla/ast_eval_th.py b/evaluation/benchmarks/gorilla/ast_eval_th.py index f55f70ed7c5e..cc8137c3d9a6 100644 --- a/evaluation/benchmarks/gorilla/ast_eval_th.py +++ b/evaluation/benchmarks/gorilla/ast_eval_th.py @@ -40,7 +40,7 @@ def get_all_sub_trees(root_node): # Parse the program into AST trees def ast_parse(candidate, lang='python'): - LANGUAGE = Language('evaluation/gorilla/my-languages.so', lang) + LANGUAGE = Language('evaluation/benchmarks/gorilla/my-languages.so', lang) parser = Parser() parser.set_language(LANGUAGE) diff --git a/evaluation/benchmarks/gorilla/scripts/run_infer.sh b/evaluation/benchmarks/gorilla/scripts/run_infer.sh index 45424444431c..c735289e5b4f 100755 --- a/evaluation/benchmarks/gorilla/scripts/run_infer.sh +++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -33,7 +33,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "HUBS: $HUBS" -COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/gorilla/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/gpqa/README.md b/evaluation/benchmarks/gpqa/README.md index 235b9ab9b281..b3305f757f63 100644 --- a/evaluation/benchmarks/gpqa/README.md +++ b/evaluation/benchmarks/gpqa/README.md @@ -23,7 +23,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options From the root of the OpenHands repo, run the following command: ```bash -./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] +./evaluation/benchmarks/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] ``` You can replace 
`model_config_name` with any model you set up in `config.toml`. diff --git a/evaluation/benchmarks/gpqa/scripts/run_infer.sh b/evaluation/benchmarks/gpqa/scripts/run_infer.sh index ec5a61dbbbc0..9687c4e25e62 100755 --- a/evaluation/benchmarks/gpqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -33,7 +33,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/gpqa/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md index 5f3ae58ee29d..a486ec4c25aa 100644 --- a/evaluation/benchmarks/humanevalfix/README.md +++ b/evaluation/benchmarks/humanevalfix/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on HumanEvalFix ```bash -./evaluation/benchmarks/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview +./evaluation/benchmarks/benchmarks/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview ``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. @@ -28,7 +28,7 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "agent_class": "CodeActAgent", "model_name": "gpt-4", "max_iterations": 10, - "eval_output_dir": "evaluation/evaluation_outputs/outputs/humanevalfix/CodeActAgent/gpt-4_maxiter_10_N_v1.4", + "eval_output_dir": "evaluation/benchmarks/evaluation_outputs/outputs/humanevalfix/CodeActAgent/gpt-4_maxiter_10_N_v1.4", "start_time": "2024-05-22 20:54:15", "git_commit": "4d3253696f5a9d9de02ab86969fe9796fa40331f" }, diff --git a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh index b0b30628eb5e..a54a2ff6cd9b 100755 --- a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh +++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -64,7 +64,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/humanevalfix/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/logic_reasoning/README.md b/evaluation/benchmarks/logic_reasoning/README.md index d4e4d3e9a554..fa1574a23303 100644 --- a/evaluation/benchmarks/logic_reasoning/README.md +++ b/evaluation/benchmarks/logic_reasoning/README.md @@ -10,5 +10,5 @@ Please follow instruction [here](../README.md#setup) to setup your local develop The following code will run inference on the first example of the ProofWriter dataset, ```bash -./evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter +./evaluation/benchmarks/benchmarks/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter ``` diff --git a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh 
b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh index 40c244d18b2a..e766851b1238 100755 --- a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh +++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 DATASET=$2 @@ -34,7 +34,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/logic_reasoning/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/benchmarks/miniwob/README.md b/evaluation/benchmarks/miniwob/README.md index 5535e45a7dc0..4f771e3922e9 100644 --- a/evaluation/benchmarks/miniwob/README.md +++ b/evaluation/benchmarks/miniwob/README.md @@ -13,7 +13,7 @@ Access with browser the above MiniWoB URLs and see if they load correctly. ## Run Evaluation ```sh -./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval +./evaluation/benchmarks/benchmarks/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -21,21 +21,21 @@ Access with browser the above MiniWoB URLs and see if they load correctly. This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/benchmarks/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] +./evaluation/benchmarks/benchmarks/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] # Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 +./evaluation/benchmarks/benchmarks/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 ``` -Results will be in `evaluation/evaluation_outputs/outputs/miniwob/` +Results will be in `evaluation/benchmarks/evaluation_outputs/outputs/miniwob/` To calculate the average reward, run: ```sh -poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl +poetry run python evaluation/benchmarks/benchmarks/miniwob/get_success_rate.py evaluation/benchmarks/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl ``` ## Submit your evaluation results diff --git a/evaluation/benchmarks/miniwob/scripts/run_infer.sh b/evaluation/benchmarks/miniwob/scripts/run_infer.sh index 8f997e29c308..da192fdaee03 100755 --- a/evaluation/benchmarks/miniwob/scripts/run_infer.sh +++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" # configure browsing agent export USE_NAV="false" @@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="${AGENT_VERSION}_${NOTE}" -COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/benchmarks/miniwob:\$PYTHONPATH && 
poetry run python evaluation/benchmarks/benchmarks/miniwob/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/mint/README.md b/evaluation/benchmarks/mint/README.md index f9ab43327199..1cb445596978 100644 --- a/evaluation/benchmarks/mint/README.md +++ b/evaluation/benchmarks/mint/README.md @@ -15,7 +15,7 @@ We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/da Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`. ```bash -./evaluation/benchmarks/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] +./evaluation/benchmarks/benchmarks/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] ``` where `model_config` is mandatory, while others are optional. @@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`. For example, ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 ``` ## Reference diff --git a/evaluation/benchmarks/mint/scripts/run_infer.sh b/evaluation/benchmarks/mint/scripts/run_infer.sh index b9ec6d7a7a85..4bf36cccbd0c 100755 --- a/evaluation/benchmarks/mint/scripts/run_infer.sh +++ b/evaluation/benchmarks/mint/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -25,7 +25,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" export PYTHONPATH=$(pwd) -COMMAND="poetry run python ./evaluation/mint/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/mint/run_infer.py \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ --max-propose-solution 2 \ diff --git a/evaluation/benchmarks/ml_bench/README.md b/evaluation/benchmarks/ml_bench/README.md index 528edddc148a..5644735bee7b 100644 --- a/evaluation/benchmarks/ml_bench/README.md +++ b/evaluation/benchmarks/ml_bench/README.md @@ -19,8 +19,8 @@ Please follow instruction [here](../README.md#setup) to setup your local develop To run the evaluation on the ML-Bench dataset, use the following command: ```bash -./evaluation/benchmarks/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] -# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 +./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] +# e.g., ./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 ``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. 
@@ -30,8 +30,8 @@ You can replace `eval_gpt4_1106_preview` with any model you set up in `config.to To score the evaluation output, use the following command: ```bash -./evaluation/benchmarks/ml_bench/scripts/summarise_results.py [eval_output_dir] -# e.g., ./evaluation/benchmarks/ml_bench/scripts/summarise_results.py evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 +./evaluation/benchmarks/benchmarks/ml_bench/scripts/summarise_results.py [eval_output_dir] +# e.g., ./evaluation/benchmarks/benchmarks/ml_bench/scripts/summarise_results.py evaluation/benchmarks/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 ``` ## Run Error Analysis on ML-Bench @@ -39,8 +39,8 @@ To score the evaluation output, use the following command: To run error analysis on the ML-Bench dataset, use the following command: ```bash -./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] -# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview +./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] +# e.g., ./evaluation/benchmarks/benchmarks/ml_bench/scripts/run_analysis.sh evaluation/benchmarks/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview ``` This command generates a report on the evaluation output and provides insights into the agent's performance. @@ -60,7 +60,7 @@ Here's an example of the evaluation output for a single task instance: "agent_class": "CodeActAgent", "model_name": "gpt-4-1106-preview", "max_iterations": 10, - "eval_output_dir": "evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5", + "eval_output_dir": "evaluation/benchmarks/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5", "start_time": "2024-05-26 17:39:59", "git_commit": "dd8ee9044a94a213dc2e31d2085dbf2924ee80a1" }, @@ -105,7 +105,7 @@ The `metrics` field contains the parsed evaluation metrics from the `eval_output ## Customization -You can customize the evaluation script by modifying the `evaluation/benchmarks/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. +You can customize the evaluation script by modifying the `evaluation/benchmarks/benchmarks/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. Feel free to adjust the configuration, logging, and output formatting to suit your needs. 
diff --git a/evaluation/benchmarks/ml_bench/run_analysis.py b/evaluation/benchmarks/ml_bench/run_analysis.py index eda8fd4bdd45..665b1080302e 100644 --- a/evaluation/benchmarks/ml_bench/run_analysis.py +++ b/evaluation/benchmarks/ml_bench/run_analysis.py @@ -120,7 +120,7 @@ def classify_error(llm: LLM, failed_case: dict) -> str: ) args, _ = parser.parse_known_args() - # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/swe_bench/README.md#configure-openhands-and-your-llm + # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks/swe_bench/README.md#configure-openhands-and-your-llm # for details of how to set `llm_config` if args.llm_config: specified_llm_config = get_llm_config_arg(args.llm_config) diff --git a/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh index d5fe6365ca86..6dea03a52417 100644 --- a/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh @@ -17,7 +17,7 @@ fi echo "MODEL_CONFIG: $MODEL_CONFIG" echo "RESULT_FILE: $RESULT_FILE" -COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/ml_bench/run_analysis.py \ --llm-config $MODEL_CONFIG \ --json_file_path $RESULT_FILE" diff --git a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh index 97ff0003fc5c..0e7d2957ce4e 100755 --- a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/ml_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/benchmarks/scienceagentbench/Dockerfile b/evaluation/benchmarks/scienceagentbench/Dockerfile index 70ed92cc4dc8..6a22f6797084 100644 --- a/evaluation/benchmarks/scienceagentbench/Dockerfile +++ b/evaluation/benchmarks/scienceagentbench/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.11-bookworm # For OpenHands agents to explore the dataset directories, please download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG) and unzip it with password `scienceagentbench`. # **Please DO NOT redistribute the unzipped data files online.** # It will download a benchmark.zip file to the current directory. 
-# unzip it and put the benchmark folder under evaluation/scienceagentbench/ +# unzip it and put the benchmark folder under evaluation/benchmarks/scienceagentbench/ RUN mkdir -p /benchmark COPY benchmark /benchmark diff --git a/evaluation/benchmarks/scienceagentbench/README.md b/evaluation/benchmarks/scienceagentbench/README.md index 4d979177215b..84ca6393dc0e 100644 --- a/evaluation/benchmarks/scienceagentbench/README.md +++ b/evaluation/benchmarks/scienceagentbench/README.md @@ -13,10 +13,10 @@ To prevent benchmark data contamination, we only provide the annotation sheet on ## Run Inference on ScienceAgentBench ```bash -./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 +./evaluation/benchmarks/benchmarks/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 ``` where `model_config` is mandatory, and the rest are optional. @@ -45,9 +45,9 @@ After the inference is completed, you may use the following command to extract n ```bash python post_proc.py [log_fname] ``` -- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent. +- `log_fname`, e.g. `evaluation/benchmarks/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent. -Output will be write to e.g. `evaluation/.../output.converted.jsonl` +Output will be write to e.g. `evaluation/benchmarks/.../output.converted.jsonl` ### Run evaluation diff --git a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh index 970f10ed2fef..3b162265365f 100755 --- a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh +++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/scienceagentbench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --use_knowledge $USE_KNOWLEDGE \ diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md index b69a7389555c..9c4ab8374a38 100644 --- a/evaluation/benchmarks/swe_bench/README.md +++ b/evaluation/benchmarks/swe_bench/README.md @@ -27,10 +27,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. 
```bash -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test ``` where `model_config` is mandatory, and the rest are optional. @@ -62,7 +62,7 @@ Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and then your command would be: ```bash -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -70,23 +70,23 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ -./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test +./evaluation/benchmarks/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer If you would like to specify a list of tasks you'd like to benchmark on, you could -create a `config.toml` under `./evaluation/benchmarks/swe_bench/` folder, and put a list +create a `config.toml` under `./evaluation/benchmarks/benchmarks/swe_bench/` folder, and put a list attribute named `selected_ids`, e.g. 
```toml @@ -105,19 +105,19 @@ After running the inference, you will obtain a `output.jsonl` (by default it wil **(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running: ```bash -evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance +evaluation/benchmarks/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance ``` If you want to save disk space a bit (e.g., with ~50GB free disk space), while speeding up the image pre-build process, you can pull the environment-level docker images: ```bash -evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env +evaluation/benchmarks/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env ``` If you want to evaluate on the full SWE-Bench test set: ```bash -evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full +evaluation/benchmarks/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full ``` ### Run evaluation @@ -136,10 +136,10 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag Then you can run the following: ```bash -./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] +./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] # Example -./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl +./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/benchmarks/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl ``` The script now accepts optional arguments: @@ -150,12 +150,12 @@ The script now accepts optional arguments: For example, to evaluate a specific instance with a custom dataset and split: ```bash -./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test +./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test ``` -> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. +> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/benchmarks/benchmarks/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. -The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory: +The final results will be saved to `evaluation/benchmarks/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory: - `README.md`: a report showing what are the instances that passed, failed, etc. 
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
@@ -166,17 +166,17 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
-./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
+./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]

 # Example - This evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
 ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
-evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 ```

 To clean-up all existing runtimes that you've already started, run:

 ```bash
-ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
 ```

diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 3ffc08d29bfb..e98428877050 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -70,7 +70,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name)
     else:
         # Instruction based on Anthropic's official trajectory
         # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
         instruction = (
             '\n'
             f'/workspace/{workspace_dir_name}\n'
diff --git a/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
index 52e2ea4cb141..79a5c23eb2d4 100644
--- a/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
+++ b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
@@ -19,7 +19,7 @@
 To push the docker images for "princeton-nlp/SWE-bench_Lite" test set to the docker hub
 (e.g., under `docker.io/xingyaoww/`), run:

 ```bash
-EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
+EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
 ```
 """
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
index 044f9972f4eb..009a0a211d1a 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
@@ -5,7 +5,7 @@ NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
 mkdir -p $NEW_FOLDER_PATH

 # Build all_preds.jsonl
-poetry run python evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
+poetry run python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
 mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl

 # Build trajs/
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
index 13ef271671a5..977807de084b 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
@@ -58,7 +58,7 @@ else
     # ==== Convert OH format to SWE-bench format ====
     echo "Merged output file with fine-grained report will be saved to $FILE_DIR"

-    poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
+    poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
     # replace .jsonl with .swebench.jsonl in filename
     SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
     echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
@@ -106,7 +106,7 @@ if [ -z "$INSTANCE_ID" ]; then
         rm -rf $RESULT_OUTPUT_DIR/eval_outputs
     fi

     mv logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
     mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs

     echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt
@@ -125,7 +125,7 @@ if [ -z "$INSTANCE_ID" ]; then
         mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
     fi

-    poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
+    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
 else
     echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
index 68280978368e..fc3718d1b5bd 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh
@@ -28,7 +28,7 @@ fi

 echo "... Evaluating on $INPUT_FILE ..."
-COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/benchmarks/swe_bench/eval_infer.py \ --eval-num-workers $NUM_WORKERS \ --input-file $INPUT_FILE \ --dataset $DATASET \ @@ -43,4 +43,4 @@ fi eval $COMMAND # update the output with evaluation results -poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE +poetry run python evaluation/benchmarks/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index a27bd7cdbb14..a4917f80691a 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -source "evaluation/utils/version_control.sh" +source "evaluation/benchmarks/utils/version_control.sh" MODEL_CONFIG=$1 COMMIT_HASH=$2 @@ -84,7 +84,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/benchmarks/swe_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh index 7091b6f586b7..8bb160ff98b8 100755 --- a/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh +++ b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -EVAL_WORKSPACE="evaluation/benchmarks/swe_bench/eval_workspace" +EVAL_WORKSPACE="evaluation/benchmarks/benchmarks/swe_bench/eval_workspace" mkdir -p $EVAL_WORKSPACE # 1. Prepare REPO diff --git a/evaluation/benchmarks/toolqa/README.md b/evaluation/benchmarks/toolqa/README.md index eda478f4489f..c9bee9d13283 100644 --- a/evaluation/benchmarks/toolqa/README.md +++ b/evaluation/benchmarks/toolqa/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -bash evaluation/benchmarks/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] +bash evaluation/benchmarks/benchmarks/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] ``` where `model_config` is mandatory, while all other arguments are optional. 
@@ -40,5 +40,5 @@ Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `coffee
 then your command would be:

 ```bash
-bash evaluation/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy
+bash evaluation/benchmarks/toolqa/scripts/run_infer.sh llm HEAD CodeActAgent 10 coffee easy
 ```
diff --git a/evaluation/benchmarks/toolqa/scripts/run_infer.sh b/evaluation/benchmarks/toolqa/scripts/run_infer.sh
index bfe3471f4f6a..8ecc88c19ac3 100755
--- a/evaluation/benchmarks/toolqa/scripts/run_infer.sh
+++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -eo pipefail

-source "evaluation/utils/version_control.sh"
+source "evaluation/benchmarks/utils/version_control.sh"

 MODEL_CONFIG=$1
 COMMIT_HASH=$2
@@ -47,7 +47,7 @@ echo "DATASET: $DATASET"
 echo "HARDNESS: $HARDNESS"
 echo "WOLFRAM_APPID: $WOLFRAM_APPID"

-COMMAND="poetry run python evaluation/toolqa/run_infer.py \
+COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
     --agent-cls $AGENT \
     --llm-config $MODEL_CONFIG \
     --max-iterations 30 \
diff --git a/evaluation/benchmarks/webarena/README.md b/evaluation/benchmarks/webarena/README.md
index 3e403d5a7f46..7b60cac9caff 100644
--- a/evaluation/benchmarks/webarena/README.md
+++ b/evaluation/benchmarks/webarena/README.md
@@ -24,15 +24,15 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie
 ```bash
 export WEBARENA_BASE_URL=
 export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
-bash evaluation/webarena/scripts/run_infer.sh
+bash evaluation/benchmarks/webarena/scripts/run_infer.sh
 ```

 Results will be in `evaluation/evaluation_outputs/outputs/webarena/`

 To calculate the success rate, run:

 ```sh
-poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
+poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
 ```

 ## Submit your evaluation results
diff --git a/evaluation/benchmarks/webarena/scripts/run_infer.sh b/evaluation/benchmarks/webarena/scripts/run_infer.sh
index 22372b82d781..aa245344646b 100755
--- a/evaluation/benchmarks/webarena/scripts/run_infer.sh
+++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 set -eo pipefail

-source "evaluation/utils/version_control.sh"
+source "evaluation/benchmarks/utils/version_control.sh"

 # configure webarena websites and environment
-source evaluation/webarena/scripts/webarena_env.sh
+source evaluation/benchmarks/webarena/scripts/webarena_env.sh

 # configure browsing agent
 export USE_NAV="false"
@@ -35,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

 EVAL_NOTE="$AGENT_VERSION"

-COMMAND="poetry run python evaluation/webarena/run_infer.py \
+COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
     --agent-cls $AGENT \
     --llm-config $MODEL_CONFIG \
     --max-iterations 15 \