diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh
index aedfe01a0c60..5ad012d07dea 100755
--- a/evaluation/benchmarks/gaia/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh
@@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "LEVELS: $LEVELS"

-COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
+COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 30 \
diff --git a/evaluation/benchmarks/mint/README.md b/evaluation/benchmarks/mint/README.md
index f9ab43327199..bfaeb713bc78 100644
--- a/evaluation/benchmarks/mint/README.md
+++ b/evaluation/benchmarks/mint/README.md
@@ -6,7 +6,7 @@ We support evaluation of the [Eurus subset focus on math and code reasoning](htt

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.

 ## Start the evaluation

@@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`.
 For example,

 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3
+./evaluation/benchmarks/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3
 ```

 ## Reference