From 0f00ea6ea3f034d3ac22a1bb03339280e3eaeb84 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Fri, 10 Jan 2025 03:44:09 +0100
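Subject: [PATCH] fix iterations: add MAX_ITERATIONS arg to run_infer.sh

- run_infer.sh: take MAX_ITERATIONS as the 5th positional argument;
  NUM_WORKERS and EVAL_IDS move to the 6th and 7th positions.
- integration-runner.yml: pass 30 as the iteration limit for the
  DelegatorAgent runs and look for reports under *_maxiter_30_N*
  instead of *_maxiter_10_N*.
- run_infer.py: read the 'error' column with df.get('error', None)
  instead of indexing df['error'] directly.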

---
 .github/workflows/integration-runner.yml          | 8 ++++----
 evaluation/integration_tests/run_infer.py         | 2 +-
 evaluation/integration_tests/scripts/run_infer.sh | 5 +++--
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
index 7ceae21d4479..8d9b69e022af 100644
--- a/.github/workflows/integration-runner.yml
+++ b/.github/workflows/integration-runner.yml
@@ -134,10 +134,10 @@ jobs:
         env:
           SANDBOX_FORCE_REBUILD_RUNTIME: True
         run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'
 
           # Find and export the delegator test results
-          REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
           echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"
           echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<EOF" >> $GITHUB_ENV
           cat $REPORT_FILE_DELEGATOR_HAIKU >> $GITHUB_ENV
@@ -166,10 +166,10 @@ jobs:
         env:
           SANDBOX_FORCE_REBUILD_RUNTIME: True
         run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'
 
           # Find and export the delegator test results
-          REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
           echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"
           echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<EOF" >> $GITHUB_ENV
           cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 5036cc34b541..b7018d0b04d1 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -245,7 +245,7 @@ def load_integration_tests() -> pd.DataFrame:
     )
 
     # capture the top-level error if present, per instance
-    df['error_message'] = df['error']
+    df['error_message'] = df.get('error', None)
 
     logger.info(f'Total cost: USD {df["cost"].sum():.2f}')
 
diff --git a/evaluation/integration_tests/scripts/run_infer.sh b/evaluation/integration_tests/scripts/run_infer.sh
index e5ae35e849d2..32702afa9013 100755
--- a/evaluation/integration_tests/scripts/run_infer.sh
+++ b/evaluation/integration_tests/scripts/run_infer.sh
@@ -7,8 +7,9 @@ MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
 EVAL_LIMIT=$4
-NUM_WORKERS=$5
-EVAL_IDS=$6
+MAX_ITERATIONS=$5
+NUM_WORKERS=$6
+EVAL_IDS=$7
 
 if [ -z "$NUM_WORKERS" ]; then
   NUM_WORKERS=1