diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh
index aedfe01a0c60..5ad012d07dea 100755
--- a/evaluation/benchmarks/gaia/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh
@@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "LEVELS: $LEVELS"

-COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
+COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 30 \
diff --git a/evaluation/benchmarks/mint/README.md b/evaluation/benchmarks/mint/README.md
index f9ab43327199..bfaeb713bc78 100644
--- a/evaluation/benchmarks/mint/README.md
+++ b/evaluation/benchmarks/mint/README.md
@@ -6,7 +6,7 @@ We support evaluation of the [Eurus subset focus on math and code reasoning](htt

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.

 ## Start the evaluation

@@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`.
 For example,

 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3
+./evaluation/benchmarks/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3
 ```

 ## Reference