From 0f00ea6ea3f034d3ac22a1bb03339280e3eaeb84 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Fri, 10 Jan 2025 03:44:09 +0100 Subject: [PATCH] fix iterations --- .github/workflows/integration-runner.yml | 8 ++++---- evaluation/integration_tests/run_infer.py | 2 +- evaluation/integration_tests/scripts/run_infer.sh | 5 +++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 7ceae21d4479..8d9b69e022af 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -134,10 +134,10 @@ jobs: env: SANDBOX_FORCE_REBUILD_RUNTIME: True run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run' + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run' # Find and export the delegator test results - REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1) + REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1) echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU" echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<> $GITHUB_ENV cat $REPORT_FILE_DELEGATOR_HAIKU >> $GITHUB_ENV @@ -166,10 +166,10 @@ jobs: env: SANDBOX_FORCE_REBUILD_RUNTIME: True run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run' + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run' # Find and export the delegator test results - REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1) + REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1) echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK" echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<> $GITHUB_ENV cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 5036cc34b541..b7018d0b04d1 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -245,7 +245,7 @@ def load_integration_tests() -> pd.DataFrame: ) # capture the top-level error if present, per instance - df['error_message'] = df['error'] + df['error_message'] = df.get('error', None) logger.info(f'Total cost: USD {df["cost"].sum():.2f}') diff --git a/evaluation/integration_tests/scripts/run_infer.sh b/evaluation/integration_tests/scripts/run_infer.sh index e5ae35e849d2..32702afa9013 100755 --- a/evaluation/integration_tests/scripts/run_infer.sh +++ b/evaluation/integration_tests/scripts/run_infer.sh @@ -7,8 +7,9 @@ MODEL_CONFIG=$1 COMMIT_HASH=$2 AGENT=$3 EVAL_LIMIT=$4 -NUM_WORKERS=$5 -EVAL_IDS=$6 +MAX_ITERATIONS=$5 +NUM_WORKERS=$6 +EVAL_IDS=$7 if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1