diff --git a/CREDITS.md b/CREDITS.md index 873742b7e0115..40c7b9eac7796 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -24,6 +24,27 @@ OpenHands includes and adapts the following open source projects. We are gratefu ### Reference Implementations for Evaluation Benchmarks OpenHands integrates code of the reference implementations for the following agent evaluation benchmarks: +#### [DiscoveryBench](https://github.com/FudanDISC/DiscoveryBench) + - License: Apache License 2.0 + +#### [ToolQA](https://github.com/night-chen/ToolQA) + - License: Apache License 2.0 + +#### [BioCoder](https://github.com/microsoft/biocoder) + - License: MIT License + +#### [BrowsingDelegation](https://github.com/browsing-delegation/browsing-delegation) + - License: MIT License + +#### [Commit0Bench](https://github.com/commit0-ai/commit0-ai) + - License: MIT License + +#### [EDA](https://github.com/microsoft/EDA) + - License: MIT License + +#### [AiderBench](https://github.com/paul-gauthier/aider) + - License: Apache License 2.0 + #### [HumanEval](https://github.com/openai/human-eval) - License: MIT License @@ -49,6 +70,30 @@ OpenHands integrates code of the reference implementations for the following age #### [GPQA](https://github.com/idavidrein/gpqa) - License: MIT License +#### [MiniWoB](https://github.com/Farama-Foundation/miniwob-plusplus) + - License: MIT License + +#### [WebArena](https://github.com/web-arena-x/webarena) + - License: Apache License 2.0 + +#### [MINT](https://github.com/xingyaoww/mint-bench) + - License: Apache License 2.0 + +#### [ML-Bench](https://github.com/gersteinlab/ML-Bench) + - License: MIT License + +#### [LogicReasoning](https://github.com/google-deepmind/logic-inference-dataset) + - License: Apache License 2.0 + +#### [HumanEvalFix](https://github.com/amazon-science/ReWOO) + - License: Apache License 2.0 + +#### [ScienceAgentBench](https://github.com/OpenBMB/ScienceAgentBench) + - License: Apache License 2.0 + +#### [Gaia](https://github.com/microsoft/GAIA) + - License: MIT License + #### [ProntoQA](https://github.com/asaparov/prontoqa) + - License: Apache License 2.0 diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh index aedfe01a0c604..5ad012d07deac 100755 --- a/evaluation/benchmarks/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "LEVELS: $LEVELS" -COMMAND="poetry run python ./evaluation/gaia/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/benchmarks/mint/README.md b/evaluation/benchmarks/mint/README.md index f9ab433271993..bfaeb713bc785 100644 --- a/evaluation/benchmarks/mint/README.md +++ b/evaluation/benchmarks/mint/README.md @@ -6,7 +6,7 @@ We support evaluation of the [Eurus subset focus on math and code reasoning](htt ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Start the evaluation @@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`. For example, ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 +./evaluation/benchmarks/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 ``` ## Reference