Skip to content

Commit

Permalink
Fix pr #5223: Fix issue #5222: [Refactor]: Refactor the evaluation directory
Browse files Browse the repository at this point in the history
  • Loading branch information
openhands-agent committed Nov 23, 2024
1 parent b07b554 commit 655af80
Show file tree
Hide file tree
Showing 18 changed files with 19 additions and 19 deletions.
2 changes: 1 addition & 1 deletion evaluation/benchmarks/EDA/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

COMMAND="poetry run python evaluation/EDA/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--dataset $DATASET \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/agent_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/aider_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then
EVAL_NOTE=$EVAL_NOTE-w-test
fi

COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/biocoder/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

COMMAND="poetry run python evaluation/biocoder/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bird/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/bird/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 1 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ fi

function run_eval() {
local eval_note=$1
COMMAND="poetry run python evaluation/commit0_bench/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/discoverybench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/discoverybench/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/gorilla/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "HUBS: $HUBS"

COMMAND="poetry run python evaluation/gorilla/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/gpqa/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/gpqa/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--dataset $DATASET \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/miniwob/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="${AGENT_VERSION}_${NOTE}"

COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/ml_bench/scripts/run_analysis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fi
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "RESULT_FILE: $RESULT_FILE"

COMMAND="poetry run python evaluation/ml_bench/run_analysis.py \
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \
--llm-config $MODEL_CONFIG \
--json_file_path $RESULT_FILE"

Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/ml_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/ml_bench/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--use_knowledge $USE_KNOWLEDGE \
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/toolqa/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ echo "DATASET: $DATASET"
echo "HARDNESS: $HARDNESS"
echo "WOLFRAM_APPID: $WOLFRAM_APPID"

COMMAND="poetry run python evaluation/toolqa/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/webarena/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -eo pipefail
source "evaluation/utils/version_control.sh"

# configure webarena websites and environment
source evaluation/webarena/scripts/webarena_env.sh
source evaluation/benchmarks/webarena/scripts/webarena_env.sh

# configure browsing agent
export USE_NAV="false"
Expand Down Expand Up @@ -35,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

COMMAND="poetry run python evaluation/webarena/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 15 \
Expand Down

0 comments on commit 655af80

Please sign in to comment.