From eaf30578b1c6fd46e2bf0ca148574d47c6d934f2 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 16 Nov 2024 00:39:39 +0000 Subject: [PATCH 01/34] Fix issue #5076: Integration test github action --- .github/workflows/eval-runner.yml | 23 +--- .github/workflows/integration-runner.yml | 135 +++++++++++++++++++++++ 2 files changed, 136 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/integration-runner.yml diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml index f788cf78d2f8..6ebfb0ec6ad9 100644 --- a/.github/workflows/eval-runner.yml +++ b/.github/workflows/eval-runner.yml @@ -1,4 +1,4 @@ -name: Run Evaluation +name: Run SWE-Bench Evaluation on: pull_request: @@ -60,24 +60,6 @@ jobs: echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml echo "temperature = 0.0" >> config.toml - - name: Run integration test evaluation - env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - RUNTIME: remote - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images - - run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES - - # get evaluation report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1) - echo "REPORT_FILE: $REPORT_FILE" - echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV - cat $REPORT_FILE >> $GITHUB_ENV - echo >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - name: Run SWE-Bench evaluation env: ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} @@ -145,9 +127,6 @@ jobs: **SWE-Bench Evaluation Report** ${{ env.SWEBENCH_REPORT }} --- - **Integration Tests Evaluation Report** - ${{ env.INTEGRATION_TEST_REPORT }} - --- You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). - name: Post to a Slack channel diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml new file mode 100644 index 000000000000..4c3a04ffdb9f --- /dev/null +++ b/.github/workflows/integration-runner.yml @@ -0,0 +1,135 @@ +name: Run Integration Tests + +on: + pull_request: + types: [labeled] + workflow_dispatch: + inputs: + reason: + description: "Reason for manual trigger" + required: true + default: "" + +env: + N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation + +jobs: + run-integration-tests: + if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + permissions: + contents: "read" + id-token: "write" + pull-requests: "write" + issues: "write" + strategy: + matrix: + python-version: ["3.12"] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install poetry via pipx + run: pipx install poetry + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "poetry" + + - name: Comment on PR if 'integration-test' label is present + if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test' + uses: KeisukeYamashita/create-comment@v1 + with: + unique: false + comment: | + Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
+ + - name: Install Python dependencies using Poetry + run: poetry install + + - name: Configure config.toml for evaluation + env: + DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }} + run: | + echo "[llm.eval]" > config.toml + echo "model = \"deepseek/deepseek-chat\"" >> config.toml + echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml + echo "temperature = 0.0" >> config.toml + + - name: Run integration test evaluation + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + RUNTIME: remote + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + + run: | + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + + # get evaluation report + REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE" + echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV + cat $REPORT_FILE >> $GITHUB_ENV + echo >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Create tar.gz of evaluation outputs + run: | + TIMESTAMP=$(date +'%y-%m-%d-%H-%M') + tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + + - name: Upload evaluation results as artifact + uses: actions/upload-artifact@v4 + id: upload_results_artifact + with: + name: integration-test-outputs + path: evaluation_outputs_*.tar.gz + + - name: Get artifact URL + run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + + - name: Authenticate to Google Cloud + uses: 'google-github-actions/auth@v2' + with: + credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} + + - name: Set timestamp and trigger reason + run: | + echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV + else + echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV + fi + + - name: Upload evaluation results to Google Cloud Storage + uses: 'google-github-actions/upload-cloud-storage@v2' + with: + path: 'evaluation/evaluation_outputs/outputs' + destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' + + - name: Comment with evaluation results and artifact link + id: create_comment + uses: KeisukeYamashita/create-comment@v1 + with: + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }} + unique: false + comment: | + Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} + Commit: ${{ github.sha }} + **Integration Tests Evaluation Report** + ${{ env.INTEGRATION_TEST_REPORT }} + --- + You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). 
+ + - name: Post to a Slack channel + id: slack + uses: slackapi/slack-github-action@v1.27.0 + with: + channel-id: 'C07SVQSCR6F' + slack-message: "*Integration Tests Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})" + env: + SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }} From 14fe4c6a6bf5f4eaec1c292e8467d6aa9b0e6799 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 03:03:01 +0100 Subject: [PATCH 02/34] Update integration-runner.yml --- .github/workflows/integration-runner.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4c3a04ffdb9f..b15938848d6f 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -124,12 +124,4 @@ jobs: ${{ env.INTEGRATION_TEST_REPORT }} --- You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). - - - name: Post to a Slack channel - id: slack - uses: slackapi/slack-github-action@v1.27.0 - with: - channel-id: 'C07SVQSCR6F' - slack-message: "*Integration Tests Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})" - env: - SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }} + From b415ad2a69b6230c1f1b0a1eeb63f67cd396b94e Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 03:11:15 +0100 Subject: [PATCH 03/34] Update integration-runner.yml --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index b15938848d6f..ecd440510413 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -11,7 +11,7 @@ on: default: "" env: - N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation + N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation jobs: run-integration-tests: @@ -115,7 +115,7 @@ jobs: id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }} + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} From 0fd1ddff64032a252e5aa6932a2b7e99e4042f6b Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 15:31:49 +0100 Subject: [PATCH 04/34] update variables --- .github/workflows/integration-runner.yml | 26 ++++++++++++------------ 1 file changed, 13 
insertions(+), 13 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index ecd440510413..dd1657dfe6ac 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -51,11 +51,12 @@ jobs: - name: Configure config.toml for evaluation env: - DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} run: | echo "[llm.eval]" > config.toml - echo "model = \"deepseek/deepseek-chat\"" >> config.toml - echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml + echo "api_key = \"$LLM_API_KEY\"" >> config.toml echo "temperature = 0.0" >> config.toml - name: Run integration test evaluation @@ -91,10 +92,10 @@ jobs: - name: Get artifact URL run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV - - name: Authenticate to Google Cloud - uses: 'google-github-actions/auth@v2' - with: - credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} + # - name: Authenticate to Google Cloud + # uses: 'google-github-actions/auth@v2' + # with: + # credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} - name: Set timestamp and trigger reason run: | @@ -105,11 +106,11 @@ jobs: echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV fi - - name: Upload evaluation results to Google Cloud Storage - uses: 'google-github-actions/upload-cloud-storage@v2' - with: - path: 'evaluation/evaluation_outputs/outputs' - destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' + # - name: Upload evaluation results to Google Cloud Storage + # uses: 'google-github-actions/upload-cloud-storage@v2' + # with: + # path: 'evaluation/evaluation_outputs/outputs' + # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - name: Comment with evaluation results and artifact link id: create_comment @@ -124,4 +125,3 @@ jobs: ${{ env.INTEGRATION_TEST_REPORT }} --- You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). 
- From bc3f13657652f099134e97be19e17ee6cd0502ec Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 15:39:29 +0100 Subject: [PATCH 05/34] use haiku --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index dd1657dfe6ac..c2238495db7f 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -51,11 +51,11 @@ jobs: - name: Configure config.toml for evaluation env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} run: | echo "[llm.eval]" > config.toml - echo "model = \"$LLM_MODEL\"" >> config.toml + echo "model = \"$HAIKU_LLM_MODEL\"" >> config.toml echo "api_key = \"$LLM_API_KEY\"" >> config.toml echo "temperature = 0.0" >> config.toml From 73e88370d2693c349e29d5f1079c926caaf79c64 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 17:49:22 +0100 Subject: [PATCH 06/34] use base url --- .github/workflows/integration-runner.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index c2238495db7f..fb5ea65b3e1b 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -53,10 +53,12 @@ jobs: env: LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} run: | echo "[llm.eval]" > config.toml - echo "model = \"$HAIKU_LLM_MODEL\"" >> config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml - name: Run integration test evaluation From 7af35189c7b5eb3702c063a988f6587ea207bdf7 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 18:03:56 +0100 Subject: [PATCH 07/34] fix report name --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index fb5ea65b3e1b..5946dc3e54bd 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -72,7 +72,7 @@ jobs: poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES # get evaluation report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1) + REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) echo "REPORT_FILE: $REPORT_FILE" echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV cat $REPORT_FILE >> $GITHUB_ENV From dcd4681a3ffac876ebbbe06df61471071c240d33 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 15:39:09 +0000 Subject: [PATCH 08/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- .github/workflows/integration-tests.yml | 44 +++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000000..c435fc0eae89 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,44 @@ +name: 
Integration Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 2 * * *' # Runs at 2 AM UTC every day + +jobs: + integration-tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '18' + + - name: Install Poetry + run: | + pip install poetry + + - name: Install backend dependencies + run: | + poetry install + + - name: Install frontend dependencies + working-directory: ./frontend + run: | + npm install + + - name: Run integration tests + run: | + make build + poetry run pytest tests/integration From 1a24a946d7fa0e3b5320a5cd41fe50f0159a139d Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:09:30 +0100 Subject: [PATCH 09/34] Revert "Fix pr #8: Integration tests (openhands fix issue 5076)" This reverts commit dcd4681a3ffac876ebbbe06df61471071c240d33. --- .github/workflows/integration-tests.yml | 44 ------------------------- 1 file changed, 44 deletions(-) delete mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml deleted file mode 100644 index c435fc0eae89..000000000000 --- a/.github/workflows/integration-tests.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Integration Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - schedule: - - cron: '0 2 * * *' # Runs at 2 AM UTC every day - -jobs: - integration-tests: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.10' - - - name: Set up Node.js - uses: actions/setup-node@v3 - with: - node-version: '18' - - - name: Install Poetry - run: | - pip install poetry - - - name: Install backend dependencies - run: | - poetry install - - - name: Install frontend dependencies - working-directory: ./frontend - run: | - npm install - - - name: Run integration tests - run: | - make build - poetry run pytest tests/integration From 5e5eb0ff5a34fc10ab48d014a76421893c32fd6a Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 16:13:54 +0000 Subject: [PATCH 10/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- .github/workflows/integration-runner.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 5946dc3e54bd..1449bb31c722 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -9,13 +9,15 @@ on: description: "Reason for manual trigger" required: true default: "" + schedule: + - cron: '0 0 * * *' # Runs at midnight UTC every day env: N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation jobs: run-integration-tests: - if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' + if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' runs-on: ubuntu-latest permissions: contents: "read" @@ -104,8 +106,10 @@ jobs: echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV - else + elif [[ "${{ github.event_name }}" == 
"workflow_dispatch" ]]; then echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV + else + echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV fi # - name: Upload evaluation results to Google Cloud Storage @@ -121,7 +125,7 @@ jobs: number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} unique: false comment: | - Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} + Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} Commit: ${{ github.sha }} **Integration Tests Evaluation Report** ${{ env.INTEGRATION_TEST_REPORT }} From 1f908675f9ff1646bdf4ce3aef2ba050c8c3305a Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:41:53 +0100 Subject: [PATCH 11/34] use haiku explicitly, in results too --- .github/workflows/integration-runner.yml | 29 +++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 5946dc3e54bd..5e8eb2e14669 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -3,6 +3,8 @@ name: Run Integration Tests on: pull_request: types: [labeled] + schedule: + - cron: "0 1 * * *" # 1 AM UTC every day workflow_dispatch: inputs: reason: @@ -11,7 +13,7 @@ on: default: "" env: - N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation + N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation jobs: run-integration-tests: @@ -47,9 +49,9 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
- name: Install Python dependencies using Poetry - run: poetry install + run: poetry install --without evaluation, llama-index - - name: Configure config.toml for evaluation + - name: Configure config.toml for testing with Haiku env: LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -71,10 +73,10 @@ jobs: run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES - # get evaluation report + # get integration tests report REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) echo "REPORT_FILE: $REPORT_FILE" - echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV + echo "INTEGRATION_TEST_REPORT_HAIKU<> $GITHUB_ENV cat $REPORT_FILE >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV @@ -82,17 +84,17 @@ jobs: - name: Create tar.gz of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') - tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + tar -czvf outputs_haiku_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 id: upload_results_artifact with: name: integration-test-outputs - path: evaluation_outputs_*.tar.gz + path: outputs_haiku_*.tar.gz - name: Get artifact URL - run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + run: echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV # - name: Authenticate to Google Cloud # uses: 'google-github-actions/auth@v2' @@ -114,16 +116,17 @@ jobs: # path: 'evaluation/evaluation_outputs/outputs' # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - - name: Comment with evaluation results and artifact link + - name: Comment with results and artifact link id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} + # if triggered by PR, use PR number, otherwise use 9 as fallback issue number for manual triggers + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} Commit: ${{ github.sha }} - **Integration Tests Evaluation Report** - ${{ env.INTEGRATION_TEST_REPORT }} + **Integration Tests Report (Haiku)** + ${{ env.INTEGRATION_TEST_REPORT_HAIKU }} --- - You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). + You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL_HAIKU }}). 
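A note on the report-forwarding pattern used in the step above: the run writes the generated report.md into $GITHUB_ENV with GitHub Actions' multi-line value syntax (NAME<<DELIMITER ... DELIMITER), which is what lets the later "Comment with results and artifact link" step expand ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}. A minimal shell sketch of that pattern, assuming the output layout produced by run_infer.sh in this workflow; the braces group the writes into one redirection, equivalent to the repeated ">> $GITHUB_ENV" lines in the step itself:

    # Locate the first report.md produced by the run (path glob taken from the workflow step).
    REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* \
      -name "report.md" -type f | head -n 1)

    # Export the report as a multi-line environment value readable by later steps.
    {
      echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF"   # opening delimiter
      cat "$REPORT_FILE_HAIKU"                    # report body
      echo                                        # trailing newline
      echo "EOF"                                  # closing delimiter
    } >> "$GITHUB_ENV"
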
From fa9e65191cfdd5599679d088d279384ad39f778f Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:44:36 +0100 Subject: [PATCH 12/34] remove duplicate --- .github/workflows/integration-runner.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index c13b55db08ef..2bfaf0ac1641 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -3,8 +3,6 @@ name: Run Integration Tests on: pull_request: types: [labeled] - schedule: - - cron: "0 1 * * *" # 1 AM UTC every day workflow_dispatch: inputs: reason: From 7e7200e514961239cd4bb31169febdfef75101de Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:54:37 +0100 Subject: [PATCH 13/34] Update .github/workflows/integration-runner.yml --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 2bfaf0ac1641..ded969e50cf5 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -49,7 +49,7 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. - name: Install Python dependencies using Poetry - run: poetry install --without evaluation, llama-index + run: poetry install - name: Configure config.toml for testing with Haiku env: From 96ef986d88c90b9845073ce7a4bad62d3881f1b8 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:00:49 +0100 Subject: [PATCH 14/34] Revert "Update .github/workflows/integration-runner.yml" This reverts commit 7e7200e514961239cd4bb31169febdfef75101de. --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index ded969e50cf5..2bfaf0ac1641 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -49,7 +49,7 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. - name: Install Python dependencies using Poetry - run: poetry install + run: poetry install --without evaluation, llama-index - name: Configure config.toml for testing with Haiku env: From 7c2db5bbcd289e988f6812ea5fc6b2689ef3b62b Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:01:19 +0100 Subject: [PATCH 15/34] funny space --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 2bfaf0ac1641..dc5d28a34524 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -49,7 +49,7 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
- name: Install Python dependencies using Poetry - run: poetry install --without evaluation, llama-index + run: poetry install --without evaluation,llama-index - name: Configure config.toml for testing with Haiku env: From 76df32e5d7331c49be7043fb2fa77a948313829e Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 17:05:19 +0000 Subject: [PATCH 16/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- .github/workflows/integration-runner.yml | 55 +++++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index dc5d28a34524..b9cd0b7dca50 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -63,7 +63,7 @@ jobs: echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml - - name: Run integration test evaluation + - name: Run integration test evaluation for Haiku env: ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} RUNTIME: remote @@ -81,20 +81,57 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - - name: Create tar.gz of evaluation outputs + - name: Configure config.toml for testing with DeepSeek + env: + LLM_MODEL: "deepseek/deepseek-chat" + LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} + run: | + echo "[llm.eval]" > config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml + echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "temperature = 0.0" >> config.toml + + - name: Run integration test evaluation for DeepSeek + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + RUNTIME: remote + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + + run: | + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + + # get integration tests report + REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE" + echo "INTEGRATION_TEST_REPORT_DEEPSEEK<> $GITHUB_ENV + cat $REPORT_FILE >> $GITHUB_ENV + echo >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Create tar.gz of Haiku evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') tar -czvf outputs_haiku_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + - name: Create tar.gz of DeepSeek evaluation outputs + run: | + TIMESTAMP=$(date +'%y-%m-%d-%H-%M') + tar -czvf outputs_deepseek_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 id: upload_results_artifact with: name: integration-test-outputs - path: outputs_haiku_*.tar.gz + path: | + outputs_haiku_*.tar.gz + outputs_deepseek_*.tar.gz - - name: Get artifact URL - run: echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + - name: Get artifact URLs + run: | + echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + echo "ARTIFACT_URL_DEEPSEEK=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV # - name: Authenticate to Google Cloud # uses: 'google-github-actions/auth@v2' @@ -129,6 +166,12 @@ jobs: Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || 
(github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} Commit: ${{ github.sha }} **Integration Tests Report (Haiku)** + Haiku LLM Test Results: ${{ env.INTEGRATION_TEST_REPORT_HAIKU }} --- - You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL_HAIKU }}). + **Integration Tests Report (DeepSeek)** + DeepSeek LLM Test Results: + ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }} + --- + Haiku Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_HAIKU }}) + DeepSeek Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_DEEPSEEK }}) From 78951201cd826d7874efe974d4d3e763fdb4e3dd Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:21:55 +0100 Subject: [PATCH 17/34] artifact fix --- .github/workflows/integration-runner.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index b9cd0b7dca50..fcf124f203ba 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -123,7 +123,8 @@ jobs: uses: actions/upload-artifact@v4 id: upload_results_artifact with: - name: integration-test-outputs + # using a single artifact with both archives since they are related to same test run + name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }} path: | outputs_haiku_*.tar.gz outputs_deepseek_*.tar.gz @@ -173,5 +174,4 @@ jobs: DeepSeek LLM Test Results: ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }} --- - Haiku Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_HAIKU }}) - DeepSeek Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_DEEPSEEK }}) + Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }}) From 4e178d532ea9c3dc42d111bf66d83e6338cc14a2 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:34:41 +0100 Subject: [PATCH 18/34] clean up remote runtimes --- .github/workflows/integration-runner.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index fcf124f203ba..a4298cfa3237 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -156,6 +156,14 @@ jobs: # path: 'evaluation/evaluation_outputs/outputs' # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' + - name: Cleanup remote runtimes + if: always() # run this step even if previous steps failed + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + run: | + poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + - name: Comment with results and artifact link id: create_comment uses: KeisukeYamashita/create-comment@v1 From fa2544581dd848dcb03ea6d3e8caa9b72f1ad0d2 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 19:16:57 +0100 Subject: [PATCH 19/34] clean up runtimes more aggressively - a bit unexpected though --- .github/workflows/integration-runner.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index a4298cfa3237..9015c76ea9f4 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -69,7 +69,6 @@ jobs: RUNTIME: remote 
SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images - run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES @@ -81,6 +80,16 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV + - name: Cleanup Haiku runtimes + if: always() + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + run: | + poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + # Add a small delay to ensure cleanup is complete + sleep 10 + - name: Configure config.toml for testing with DeepSeek env: LLM_MODEL: "deepseek/deepseek-chat" From 4ceda73522a761de4eeaf51a1630b424fc3c2bad Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 18:42:19 +0000 Subject: [PATCH 20/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- evaluation/integration_tests/run_infer.py | 8 ++++ .../tests/t05_simple_browsing.py | 33 +++++++++----- .../tests/t06_github_pr_browsing.py | 45 ++++++++++++------- 3 files changed, 60 insertions(+), 26 deletions(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 5e3205fefe2e..ba8b2f124754 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -130,6 +130,14 @@ def process_instance( # # ============================================= histories = [event_to_dict(event) for event in state.history] + + # Debug logging + logger.info(f"Total events in history: {len(histories)}") + for event in histories: + logger.info(f"Event type: {event.get('type', 'Unknown')}") + if 'content' in event: + logger.info(f"Event content: {event['content']}") + test_result: TestResult = test_class.verify_result(runtime, histories) metrics = state.metrics.get() if state.metrics else None diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index 8f08cb4e7250..d08aff81f883 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -108,6 +108,11 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: + # Log all events for debugging + from openhands.core.logger import openhands_logger as logger + logger.info("Verifying simple browsing test result") + logger.info(f"Total events: {len(histories)}") + # check if the "The answer is OpenHands is all you need!" 
is in any message message_actions = [ event @@ -116,18 +121,26 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] + logger.info(f"Total message-like events: {len(message_actions)}") + for event in message_actions: - if isinstance(event, AgentDelegateObservation): - content = event.content - elif isinstance(event, AgentFinishAction): - content = event.outputs.get('content', '') - elif isinstance(event, MessageAction): - content = event.content - else: - raise ValueError(f'Unknown event type: {type(event)}') + try: + if isinstance(event, AgentDelegateObservation): + content = event.get('content', '') + elif isinstance(event, AgentFinishAction): + content = event.get('outputs', {}).get('content', '') + elif isinstance(event, MessageAction): + content = event.get('content', '') + else: + logger.warning(f'Unknown event type: {type(event)}') + continue + + logger.info(f"Checking event content: {content}") + if 'OpenHands is all you need!' in content: + return TestResult(success=True) + except Exception as e: + logger.error(f"Error processing event: {e}") - if 'OpenHands is all you need!' in content: - return TestResult(success=True) return TestResult( success=False, reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 52ec927cd334..0f95a3ead73c 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -14,7 +14,12 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: - # check if the "The answer is OpenHands is all you need!" 
is in any message + # Log all events for debugging + from openhands.core.logger import openhands_logger as logger + logger.info("Verifying GitHub PR browsing test result") + logger.info(f"Total events: {len(histories)}") + + # check if the license information is in any message message_actions = [ event for event in histories @@ -22,22 +27,30 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] + logger.info(f"Total message-like events: {len(message_actions)}") + for event in message_actions: - if isinstance(event, AgentDelegateObservation): - content = event.content - elif isinstance(event, AgentFinishAction): - content = event.outputs.get('content', '') - elif isinstance(event, MessageAction): - content = event.content - else: - raise ValueError(f'Unknown event type: {type(event)}') - - if ( - 'non-commercial' in content - or 'MIT' in content - or 'Apache 2.0' in content - ): - return TestResult(success=True) + try: + if isinstance(event, AgentDelegateObservation): + content = event.get('content', '') + elif isinstance(event, AgentFinishAction): + content = event.get('outputs', {}).get('content', '') + elif isinstance(event, MessageAction): + content = event.get('content', '') + else: + logger.warning(f'Unknown event type: {type(event)}') + continue + + logger.info(f"Checking event content: {content}") + if ( + 'non-commercial' in content + or 'MIT' in content + or 'Apache 2.0' in content + ): + return TestResult(success=True) + except Exception as e: + logger.error(f"Error processing event: {e}") + return TestResult( success=False, reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', From 194a1fb74204b61265ed21e20cc9d8a9a2990f29 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 20:07:25 +0100 Subject: [PATCH 21/34] fix type issue that was preventing checking results --- evaluation/integration_tests/run_infer.py | 16 ++++++---------- .../tests/t05_simple_browsing.py | 10 +++------- .../tests/t06_github_pr_browsing.py | 10 +++------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index ba8b2f124754..f1016c7c48fe 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -26,7 +26,6 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction -from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime from openhands.utils.async_utils import call_async_from_sync @@ -129,15 +128,12 @@ def process_instance( # # result evaluation # # ============================================= - histories = [event_to_dict(event) for event in state.history] - - # Debug logging - logger.info(f"Total events in history: {len(histories)}") - for event in histories: - logger.info(f"Event type: {event.get('type', 'Unknown')}") - if 'content' in event: - logger.info(f"Event content: {event['content']}") - + histories = state.history + + # some basic check + logger.info(f'Total events in history: {len(histories)}') + assert len(histories) > 0, 'History should not be empty' + test_result: TestResult = test_class.verify_result(runtime, histories) metrics = state.metrics.get() if state.metrics else None diff --git 
a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index d08aff81f883..e3c624684eb8 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -108,10 +108,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: - # Log all events for debugging from openhands.core.logger import openhands_logger as logger - logger.info("Verifying simple browsing test result") - logger.info(f"Total events: {len(histories)}") # check if the "The answer is OpenHands is all you need!" is in any message message_actions = [ @@ -121,7 +118,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] - logger.info(f"Total message-like events: {len(message_actions)}") + logger.debug(f'Total message-like events: {len(message_actions)}') for event in message_actions: try: @@ -132,14 +129,13 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: elif isinstance(event, MessageAction): content = event.get('content', '') else: - logger.warning(f'Unknown event type: {type(event)}') + logger.warning(f'Unexpected event type: {type(event)}') continue - logger.info(f"Checking event content: {content}") if 'OpenHands is all you need!' in content: return TestResult(success=True) except Exception as e: - logger.error(f"Error processing event: {e}") + logger.error(f'Error processing event: {e}') return TestResult( success=False, diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 0f95a3ead73c..697ff49df371 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -14,10 +14,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: - # Log all events for debugging from openhands.core.logger import openhands_logger as logger - logger.info("Verifying GitHub PR browsing test result") - logger.info(f"Total events: {len(histories)}") # check if the license information is in any message message_actions = [ @@ -27,7 +24,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] - logger.info(f"Total message-like events: {len(message_actions)}") + logger.info(f'Total message-like events: {len(message_actions)}') for event in message_actions: try: @@ -38,10 +35,9 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: elif isinstance(event, MessageAction): content = event.get('content', '') else: - logger.warning(f'Unknown event type: {type(event)}') + logger.warning(f'Unexpected event type: {type(event)}') continue - logger.info(f"Checking event content: {content}") if ( 'non-commercial' in content or 'MIT' in content @@ -49,7 +45,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: ): return TestResult(success=True) except Exception as e: - logger.error(f"Error processing event: {e}") + logger.error(f'Error processing event: {e}') return TestResult( success=False, From 57d590665fa566954c25fc383549a3e65ff07a81 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: 
Mon, 25 Nov 2024 20:33:20 +0100 Subject: [PATCH 22/34] try with waiting time --- .github/workflows/integration-runner.yml | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 9015c76ea9f4..d38918230fb9 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -80,15 +80,18 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - - name: Cleanup Haiku runtimes - if: always() - env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - run: | - poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - # Add a small delay to ensure cleanup is complete - sleep 10 + #- name: Cleanup Haiku runtimes + # if: always() + # env: + # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + # run: | + # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + # # Add a small delay to ensure cleanup is complete + # sleep 10 + + - name: Wait a little bit + run: sleep 40 - name: Configure config.toml for testing with DeepSeek env: From cafedcb63ec7619af6594b81ada2845b8d9f1bb8 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 20:38:13 +0100 Subject: [PATCH 23/34] add eval notes --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index d38918230fb9..6a420f4c997a 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -70,7 +70,7 @@ jobs: SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' # get integration tests report REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) @@ -111,7 +111,7 @@ jobs: EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' # get integration tests report REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) From f935f0df740fce773bc800a20bb9e755d0376310 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:18:36 +0100 Subject: [PATCH 24/34] increase timeouts --- evaluation/integration_tests/run_infer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index f1016c7c48fe..bbcd00ed8191 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -47,9 +47,13 @@ def get_config( # use default base_container_image enable_auto_lint=True, use_host_network=False, - 
timeout=100, + timeout=300, + # Add platform to the sandbox config to solve issue 4401 + platform='linux/amd64', api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=3600, ), # do not mount workspace workspace_base=None, From 34a30eeacbe2922fce14dd25370ee4c6e6306c02 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:33:14 +0100 Subject: [PATCH 25/34] try with CI local builds --- .github/workflows/integration-runner.yml | 38 +++++++++++++----------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 6a420f4c997a..ef4e2ef2bb54 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -63,12 +63,16 @@ jobs: echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml + - name: Build environment + run: make build + - name: Run integration test evaluation for Haiku env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - RUNTIME: remote - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + #RUNTIME: remote + #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' @@ -91,7 +95,7 @@ jobs: # sleep 10 - name: Wait a little bit - run: sleep 40 + run: sleep 10 - name: Configure config.toml for testing with DeepSeek env: @@ -105,11 +109,11 @@ jobs: - name: Run integration test evaluation for DeepSeek env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - RUNTIME: remote - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images - + #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + #RUNTIME: remote + #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' @@ -168,13 +172,13 @@ jobs: # path: 'evaluation/evaluation_outputs/outputs' # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - - name: Cleanup remote runtimes - if: always() # run this step even if previous steps failed - env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - run: | - poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + # - name: Cleanup remote runtimes + # if: always() # run this step even if previous steps failed + # env: + # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + # run: | + # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - name: Comment with results and artifact link id: create_comment 
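The "try with CI local builds" change above drops the remote evaluation runtime in favor of a runtime built inside the CI job: the remote-only settings (RUNTIME=remote, SANDBOX_REMOTE_RUNTIME_API_URL, ALLHANDS_API_KEY, EVAL_DOCKER_IMAGE_PREFIX) are commented out, a "Build environment" step runs make build, and SANDBOX_FORCE_REBUILD_RUNTIME forces a fresh sandbox image. A rough shell sketch of the two modes, assuming these variables are consumed by run_infer.sh and the runtime layer exactly as they appear in this workflow; the USE_REMOTE_RUNTIME toggle is only for illustration and is not taken from the patches:

    # Hypothetical toggle between the two configurations seen in these patches.
    if [ "${USE_REMOTE_RUNTIME:-false}" = "true" ]; then
      # Remote runtime, as in the earlier revisions of the workflow.
      export RUNTIME=remote
      export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
      export ALLHANDS_API_KEY="<secrets.ALLHANDS_EVAL_RUNTIME_API_KEY>"   # placeholder for the CI secret
      export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images"
    else
      # Local CI build, as introduced in this patch.
      export SANDBOX_FORCE_REBUILD_RUNTIME=True
      make build
    fi

    # Invocation as used by the Haiku step (positional arguments copied from the workflow).
    poetry run ./evaluation/integration_tests/scripts/run_infer.sh \
      llm.eval HEAD CodeActAgent '' "$N_PROCESSES" '' 'haiku_run'
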
From d48fac004c9afeae4c0e0b349cff4213509aab47 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:47:27 +0100 Subject: [PATCH 26/34] fix eval output --- evaluation/integration_tests/run_infer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index bbcd00ed8191..19df57fe8c33 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -26,6 +26,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime from openhands.utils.async_utils import call_async_from_sync @@ -147,7 +148,7 @@ def process_instance( instance=instance.to_dict(), instruction=instruction, metadata=metadata, - history=histories, + history=[event_to_dict(event) for event in histories], metrics=metrics, error=state.last_error if state and state.last_error else None, test_result=test_result.model_dump(), From d4a21d0871b182b27323a4ae681be4f6d32fcf0b Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:55:33 +0100 Subject: [PATCH 27/34] set debug --- evaluation/integration_tests/run_infer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 19df57fe8c33..8a21b12ae5b2 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -59,6 +59,8 @@ def get_config( # do not mount workspace workspace_base=None, workspace_mount_path=None, + # debug + debug=True, ) config.set_llm_config( update_llm_config_for_completions_logging( From e391604231afacba2795515bfc30129767120819 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:58:27 +0100 Subject: [PATCH 28/34] fix tests! 
--- evaluation/integration_tests/tests/t05_simple_browsing.py | 6 +++--- .../integration_tests/tests/t06_github_pr_browsing.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index e3c624684eb8..3c4cf875cc90 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -123,11 +123,11 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: for event in message_actions: try: if isinstance(event, AgentDelegateObservation): - content = event.get('content', '') + content = event.content elif isinstance(event, AgentFinishAction): - content = event.get('outputs', {}).get('content', '') + content = event.outputs.get('content', '') elif isinstance(event, MessageAction): - content = event.get('content', '') + content = event.content else: logger.warning(f'Unexpected event type: {type(event)}') continue diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 697ff49df371..1797e6b6beed 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -29,11 +29,11 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: for event in message_actions: try: if isinstance(event, AgentDelegateObservation): - content = event.get('content', '') + content = event.content elif isinstance(event, AgentFinishAction): - content = event.get('outputs', {}).get('content', '') + content = event.outputs.get('content', '') elif isinstance(event, MessageAction): - content = event.get('content', '') + content = event.content else: logger.warning(f'Unexpected event type: {type(event)}') continue From 6ff6fe2a018edf223118e0d403c1c32294d42605 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 22:22:11 +0100 Subject: [PATCH 29/34] fix outputs --- .github/workflows/integration-runner.yml | 30 +++++++++--------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index ef4e2ef2bb54..15e5c81e5728 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -77,10 +77,10 @@ jobs: poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' # get integration tests report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) - echo "REPORT_FILE: $REPORT_FILE" + REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE_HAIKU" echo "INTEGRATION_TEST_REPORT_HAIKU<> $GITHUB_ENV - cat $REPORT_FILE >> $GITHUB_ENV + cat $REPORT_FILE_HAIKU >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV @@ -118,37 +118,29 @@ jobs: poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' # get integration tests report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) - echo "REPORT_FILE: $REPORT_FILE" + 
REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK" echo "INTEGRATION_TEST_REPORT_DEEPSEEK<> $GITHUB_ENV - cat $REPORT_FILE >> $GITHUB_ENV + cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - - name: Create tar.gz of Haiku evaluation outputs + - name: Create archive of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') - tar -czvf outputs_haiku_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs - - - name: Create tar.gz of DeepSeek evaluation outputs - run: | - TIMESTAMP=$(date +'%y-%m-%d-%H-%M') - tar -czvf outputs_deepseek_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + cd evaluation/evaluation_outputs/outputs # Change to the outputs directory + tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 id: upload_results_artifact with: - # using a single artifact with both archives since they are related to same test run name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }} - path: | - outputs_haiku_*.tar.gz - outputs_deepseek_*.tar.gz + path: integration_tests_*.tar.gz - name: Get artifact URLs run: | - echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV - echo "ARTIFACT_URL_DEEPSEEK=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV # - name: Authenticate to Google Cloud # uses: 'google-github-actions/auth@v2' From 1956f06755d87c05bbb26d4038b69da96cb978aa Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 22:37:11 +0100 Subject: [PATCH 30/34] keep details in logs, not github comment --- evaluation/integration_tests/tests/t05_simple_browsing.py | 5 ++++- evaluation/integration_tests/tests/t06_github_pr_browsing.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index 3c4cf875cc90..96bb47875aec 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -137,7 +137,10 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: except Exception as e: logger.error(f'Error processing event: {e}') + logger.debug( + f'Total messages: {len(message_actions)}. Messages: {message_actions}' + ) return TestResult( success=False, - reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', + reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.', ) diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 1797e6b6beed..2dc1a01ecd97 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -47,7 +47,10 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: except Exception as e: logger.error(f'Error processing event: {e}') + logger.debug( + f'Total messages: {len(message_actions)}. 
Messages: {message_actions}' + ) return TestResult( success=False, - reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', + reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.', ) From b5c2519e4ac627164fafd8048beb052b42a474ec Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 22:53:54 +0100 Subject: [PATCH 31/34] tweak schedule --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 15e5c81e5728..efc8a6e20cc5 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -10,7 +10,7 @@ on: required: true default: "" schedule: - - cron: '0 0 * * *' # Runs at midnight UTC every day + - cron: '0 22 * * *' # Runs at 10pm UTC every day env: N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation From 0c2218156988d29c352c85a8390c82db4db6e2a9 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 23:16:53 +0100 Subject: [PATCH 32/34] lint-y --- .github/workflows/integration-runner.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index efc8a6e20cc5..e698decd9be7 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -6,11 +6,11 @@ on: workflow_dispatch: inputs: reason: - description: "Reason for manual trigger" + description: 'Reason for manual trigger' required: true - default: "" + default: '' schedule: - - cron: '0 22 * * *' # Runs at 10pm UTC every day + - cron: '30 22 * * *' # Runs at 10:30pm UTC every day env: N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation From 605a24f7f165717d8d0ad2645c3d20934fec2f5d Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Wed, 27 Nov 2024 00:33:42 +0100 Subject: [PATCH 33/34] clean up --- .github/workflows/integration-runner.yml | 41 ++---------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index e698decd9be7..b8ff30248511 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -68,10 +68,6 @@ jobs: - name: Run integration test evaluation for Haiku env: - #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - #RUNTIME: remote - #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' @@ -84,16 +80,6 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - #- name: Cleanup Haiku runtimes - # if: always() - # env: - # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - # run: | - # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - # # Add a small delay to ensure cleanup is complete - # sleep 10 - - name: Wait a little bit run: sleep 10 @@ -109,10 +95,6 @@ jobs: - name: Run integration test evaluation for DeepSeek env: - #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - 
#RUNTIME: remote - #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' @@ -142,11 +124,6 @@ jobs: run: | echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV - # - name: Authenticate to Google Cloud - # uses: 'google-github-actions/auth@v2' - # with: - # credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} - - name: Set timestamp and trigger reason run: | echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV @@ -158,26 +135,12 @@ jobs: echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV fi - # - name: Upload evaluation results to Google Cloud Storage - # uses: 'google-github-actions/upload-cloud-storage@v2' - # with: - # path: 'evaluation/evaluation_outputs/outputs' - # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - - # - name: Cleanup remote runtimes - # if: always() # run this step even if previous steps failed - # env: - # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - # run: | - # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - - name: Comment with results and artifact link id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - # if triggered by PR, use PR number, otherwise use 9 as fallback issue number for manual triggers - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }} + # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} From e5b5bf0421c971df340fa30cf10d3038d3f3a16d Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Wed, 27 Nov 2024 19:02:05 +0100 Subject: [PATCH 34/34] set up llms --- .github/workflows/integration-runner.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index b8ff30248511..4a41ab28c979 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -53,7 +53,7 @@ jobs: - name: Configure config.toml for testing with Haiku env: - LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} + LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022" LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} run: | @@ -85,12 +85,14 @@ jobs: - name: Configure config.toml for testing with DeepSeek env: - LLM_MODEL: "deepseek/deepseek-chat" - LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} + LLM_MODEL: "litellm_proxy/deepseek-chat" + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} run: | echo "[llm.eval]" > config.toml echo "model = \"$LLM_MODEL\"" >> config.toml echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> 
config.toml - name: Run integration test evaluation for DeepSeek