From eaf30578b1c6fd46e2bf0ca148574d47c6d934f2 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 16 Nov 2024 00:39:39 +0000 Subject: [PATCH 01/34] Fix issue #5076: Integration test github action --- .github/workflows/eval-runner.yml | 23 +--- .github/workflows/integration-runner.yml | 135 +++++++++++++++++++++++ 2 files changed, 136 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/integration-runner.yml diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml index f788cf78d2f8..6ebfb0ec6ad9 100644 --- a/.github/workflows/eval-runner.yml +++ b/.github/workflows/eval-runner.yml @@ -1,4 +1,4 @@ -name: Run Evaluation +name: Run SWE-Bench Evaluation on: pull_request: @@ -60,24 +60,6 @@ jobs: echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml echo "temperature = 0.0" >> config.toml - - name: Run integration test evaluation - env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - RUNTIME: remote - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images - - run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES - - # get evaluation report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1) - echo "REPORT_FILE: $REPORT_FILE" - echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV - cat $REPORT_FILE >> $GITHUB_ENV - echo >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - name: Run SWE-Bench evaluation env: ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} @@ -145,9 +127,6 @@ jobs: **SWE-Bench Evaluation Report** ${{ env.SWEBENCH_REPORT }} --- - **Integration Tests Evaluation Report** - ${{ env.INTEGRATION_TEST_REPORT }} - --- You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). - name: Post to a Slack channel diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml new file mode 100644 index 000000000000..4c3a04ffdb9f --- /dev/null +++ b/.github/workflows/integration-runner.yml @@ -0,0 +1,135 @@ +name: Run Integration Tests + +on: + pull_request: + types: [labeled] + workflow_dispatch: + inputs: + reason: + description: "Reason for manual trigger" + required: true + default: "" + +env: + N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation + +jobs: + run-integration-tests: + if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + permissions: + contents: "read" + id-token: "write" + pull-requests: "write" + issues: "write" + strategy: + matrix: + python-version: ["3.12"] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install poetry via pipx + run: pipx install poetry + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "poetry" + + - name: Comment on PR if 'integration-test' label is present + if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test' + uses: KeisukeYamashita/create-comment@v1 + with: + unique: false + comment: | + Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
+ + - name: Install Python dependencies using Poetry + run: poetry install + + - name: Configure config.toml for evaluation + env: + DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }} + run: | + echo "[llm.eval]" > config.toml + echo "model = \"deepseek/deepseek-chat\"" >> config.toml + echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml + echo "temperature = 0.0" >> config.toml + + - name: Run integration test evaluation + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + RUNTIME: remote + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + + run: | + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + + # get evaluation report + REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE" + echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV + cat $REPORT_FILE >> $GITHUB_ENV + echo >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Create tar.gz of evaluation outputs + run: | + TIMESTAMP=$(date +'%y-%m-%d-%H-%M') + tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + + - name: Upload evaluation results as artifact + uses: actions/upload-artifact@v4 + id: upload_results_artifact + with: + name: integration-test-outputs + path: evaluation_outputs_*.tar.gz + + - name: Get artifact URL + run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + + - name: Authenticate to Google Cloud + uses: 'google-github-actions/auth@v2' + with: + credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} + + - name: Set timestamp and trigger reason + run: | + echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV + else + echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV + fi + + - name: Upload evaluation results to Google Cloud Storage + uses: 'google-github-actions/upload-cloud-storage@v2' + with: + path: 'evaluation/evaluation_outputs/outputs' + destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' + + - name: Comment with evaluation results and artifact link + id: create_comment + uses: KeisukeYamashita/create-comment@v1 + with: + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }} + unique: false + comment: | + Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} + Commit: ${{ github.sha }} + **Integration Tests Evaluation Report** + ${{ env.INTEGRATION_TEST_REPORT }} + --- + You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). 
+ + - name: Post to a Slack channel + id: slack + uses: slackapi/slack-github-action@v1.27.0 + with: + channel-id: 'C07SVQSCR6F' + slack-message: "*Integration Tests Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})" + env: + SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }} From 14fe4c6a6bf5f4eaec1c292e8467d6aa9b0e6799 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 03:03:01 +0100 Subject: [PATCH 02/34] Update integration-runner.yml --- .github/workflows/integration-runner.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4c3a04ffdb9f..b15938848d6f 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -124,12 +124,4 @@ jobs: ${{ env.INTEGRATION_TEST_REPORT }} --- You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). - - - name: Post to a Slack channel - id: slack - uses: slackapi/slack-github-action@v1.27.0 - with: - channel-id: 'C07SVQSCR6F' - slack-message: "*Integration Tests Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})" - env: - SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }} + From b415ad2a69b6230c1f1b0a1eeb63f67cd396b94e Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 03:11:15 +0100 Subject: [PATCH 03/34] Update integration-runner.yml --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index b15938848d6f..ecd440510413 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -11,7 +11,7 @@ on: default: "" env: - N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation + N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation jobs: run-integration-tests: @@ -115,7 +115,7 @@ jobs: id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }} + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} From 0fd1ddff64032a252e5aa6932a2b7e99e4042f6b Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 15:31:49 +0100 Subject: [PATCH 04/34] update variables --- .github/workflows/integration-runner.yml | 26 ++++++++++++------------ 1 file changed, 13 
insertions(+), 13 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index ecd440510413..dd1657dfe6ac 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -51,11 +51,12 @@ jobs: - name: Configure config.toml for evaluation env: - DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} run: | echo "[llm.eval]" > config.toml - echo "model = \"deepseek/deepseek-chat\"" >> config.toml - echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml + echo "api_key = \"$LLM_API_KEY\"" >> config.toml echo "temperature = 0.0" >> config.toml - name: Run integration test evaluation @@ -91,10 +92,10 @@ jobs: - name: Get artifact URL run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV - - name: Authenticate to Google Cloud - uses: 'google-github-actions/auth@v2' - with: - credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} + # - name: Authenticate to Google Cloud + # uses: 'google-github-actions/auth@v2' + # with: + # credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} - name: Set timestamp and trigger reason run: | @@ -105,11 +106,11 @@ jobs: echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV fi - - name: Upload evaluation results to Google Cloud Storage - uses: 'google-github-actions/upload-cloud-storage@v2' - with: - path: 'evaluation/evaluation_outputs/outputs' - destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' + # - name: Upload evaluation results to Google Cloud Storage + # uses: 'google-github-actions/upload-cloud-storage@v2' + # with: + # path: 'evaluation/evaluation_outputs/outputs' + # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - name: Comment with evaluation results and artifact link id: create_comment @@ -124,4 +125,3 @@ jobs: ${{ env.INTEGRATION_TEST_REPORT }} --- You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). 
- From bc3f13657652f099134e97be19e17ee6cd0502ec Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 15:39:29 +0100 Subject: [PATCH 05/34] use haiku --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index dd1657dfe6ac..c2238495db7f 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -51,11 +51,11 @@ jobs: - name: Configure config.toml for evaluation env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} run: | echo "[llm.eval]" > config.toml - echo "model = \"$LLM_MODEL\"" >> config.toml + echo "model = \"$HAIKU_LLM_MODEL\"" >> config.toml echo "api_key = \"$LLM_API_KEY\"" >> config.toml echo "temperature = 0.0" >> config.toml From 73e88370d2693c349e29d5f1079c926caaf79c64 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 17:49:22 +0100 Subject: [PATCH 06/34] use base url --- .github/workflows/integration-runner.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index c2238495db7f..fb5ea65b3e1b 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -53,10 +53,12 @@ jobs: env: LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} run: | echo "[llm.eval]" > config.toml - echo "model = \"$HAIKU_LLM_MODEL\"" >> config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml - name: Run integration test evaluation From 7af35189c7b5eb3702c063a988f6587ea207bdf7 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 23 Nov 2024 18:03:56 +0100 Subject: [PATCH 07/34] fix report name --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index fb5ea65b3e1b..5946dc3e54bd 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -72,7 +72,7 @@ jobs: poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES # get evaluation report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1) + REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) echo "REPORT_FILE: $REPORT_FILE" echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV cat $REPORT_FILE >> $GITHUB_ENV From dcd4681a3ffac876ebbbe06df61471071c240d33 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 15:39:09 +0000 Subject: [PATCH 08/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- .github/workflows/integration-tests.yml | 44 +++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000000..c435fc0eae89 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,44 @@ +name: 
Integration Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 2 * * *' # Runs at 2 AM UTC every day + +jobs: + integration-tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '18' + + - name: Install Poetry + run: | + pip install poetry + + - name: Install backend dependencies + run: | + poetry install + + - name: Install frontend dependencies + working-directory: ./frontend + run: | + npm install + + - name: Run integration tests + run: | + make build + poetry run pytest tests/integration From 1a24a946d7fa0e3b5320a5cd41fe50f0159a139d Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:09:30 +0100 Subject: [PATCH 09/34] Revert "Fix pr #8: Integration tests (openhands fix issue 5076)" This reverts commit dcd4681a3ffac876ebbbe06df61471071c240d33. --- .github/workflows/integration-tests.yml | 44 ------------------------- 1 file changed, 44 deletions(-) delete mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml deleted file mode 100644 index c435fc0eae89..000000000000 --- a/.github/workflows/integration-tests.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Integration Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - schedule: - - cron: '0 2 * * *' # Runs at 2 AM UTC every day - -jobs: - integration-tests: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.10' - - - name: Set up Node.js - uses: actions/setup-node@v3 - with: - node-version: '18' - - - name: Install Poetry - run: | - pip install poetry - - - name: Install backend dependencies - run: | - poetry install - - - name: Install frontend dependencies - working-directory: ./frontend - run: | - npm install - - - name: Run integration tests - run: | - make build - poetry run pytest tests/integration From 5e5eb0ff5a34fc10ab48d014a76421893c32fd6a Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 16:13:54 +0000 Subject: [PATCH 10/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- .github/workflows/integration-runner.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 5946dc3e54bd..1449bb31c722 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -9,13 +9,15 @@ on: description: "Reason for manual trigger" required: true default: "" + schedule: + - cron: '0 0 * * *' # Runs at midnight UTC every day env: N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation jobs: run-integration-tests: - if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' + if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' runs-on: ubuntu-latest permissions: contents: "read" @@ -104,8 +106,10 @@ jobs: echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV - else + elif [[ "${{ github.event_name }}" == 
"workflow_dispatch" ]]; then echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV + else + echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV fi # - name: Upload evaluation results to Google Cloud Storage @@ -121,7 +125,7 @@ jobs: number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} unique: false comment: | - Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} + Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} Commit: ${{ github.sha }} **Integration Tests Evaluation Report** ${{ env.INTEGRATION_TEST_REPORT }} From 1f908675f9ff1646bdf4ce3aef2ba050c8c3305a Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:41:53 +0100 Subject: [PATCH 11/34] use haiku explicitly, in results too --- .github/workflows/integration-runner.yml | 29 +++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 5946dc3e54bd..5e8eb2e14669 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -3,6 +3,8 @@ name: Run Integration Tests on: pull_request: types: [labeled] + schedule: + - cron: "0 1 * * *" # 1 AM UTC every day workflow_dispatch: inputs: reason: @@ -11,7 +13,7 @@ on: default: "" env: - N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation + N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation jobs: run-integration-tests: @@ -47,9 +49,9 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
- name: Install Python dependencies using Poetry - run: poetry install + run: poetry install --without evaluation, llama-index - - name: Configure config.toml for evaluation + - name: Configure config.toml for testing with Haiku env: LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -71,10 +73,10 @@ jobs: run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES - # get evaluation report + # get integration tests report REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) echo "REPORT_FILE: $REPORT_FILE" - echo "INTEGRATION_TEST_REPORT<> $GITHUB_ENV + echo "INTEGRATION_TEST_REPORT_HAIKU<> $GITHUB_ENV cat $REPORT_FILE >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV @@ -82,17 +84,17 @@ jobs: - name: Create tar.gz of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') - tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + tar -czvf outputs_haiku_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 id: upload_results_artifact with: name: integration-test-outputs - path: evaluation_outputs_*.tar.gz + path: outputs_haiku_*.tar.gz - name: Get artifact URL - run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + run: echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV # - name: Authenticate to Google Cloud # uses: 'google-github-actions/auth@v2' @@ -114,16 +116,17 @@ jobs: # path: 'evaluation/evaluation_outputs/outputs' # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - - name: Comment with evaluation results and artifact link + - name: Comment with results and artifact link id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} + # if triggered by PR, use PR number, otherwise use 9 as fallback issue number for manual triggers + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || format('Manual Trigger: {0}', github.event.inputs.reason) }} Commit: ${{ github.sha }} - **Integration Tests Evaluation Report** - ${{ env.INTEGRATION_TEST_REPORT }} + **Integration Tests Report (Haiku)** + ${{ env.INTEGRATION_TEST_REPORT_HAIKU }} --- - You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}). + You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL_HAIKU }}). 
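A note on the report-forwarding pattern used in the step above: the run writes the generated report.md into $GITHUB_ENV with GitHub Actions' multi-line value syntax (NAME<<DELIMITER ... DELIMITER), which is what lets the later "Comment with results and artifact link" step expand ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}. A minimal shell sketch of that pattern, assuming the output layout produced by run_infer.sh in this workflow; the braces group the writes into one redirection, equivalent to the repeated ">> $GITHUB_ENV" lines in the step itself:

    # Locate the first report.md produced by the run (path glob taken from the workflow step).
    REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* \
      -name "report.md" -type f | head -n 1)

    # Export the report as a multi-line environment value readable by later steps.
    {
      echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF"   # opening delimiter
      cat "$REPORT_FILE_HAIKU"                    # report body
      echo                                        # trailing newline
      echo "EOF"                                  # closing delimiter
    } >> "$GITHUB_ENV"
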
From fa9e65191cfdd5599679d088d279384ad39f778f Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:44:36 +0100 Subject: [PATCH 12/34] remove duplicate --- .github/workflows/integration-runner.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index c13b55db08ef..2bfaf0ac1641 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -3,8 +3,6 @@ name: Run Integration Tests on: pull_request: types: [labeled] - schedule: - - cron: "0 1 * * *" # 1 AM UTC every day workflow_dispatch: inputs: reason: From 7e7200e514961239cd4bb31169febdfef75101de Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 17:54:37 +0100 Subject: [PATCH 13/34] Update .github/workflows/integration-runner.yml --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 2bfaf0ac1641..ded969e50cf5 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -49,7 +49,7 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. - name: Install Python dependencies using Poetry - run: poetry install --without evaluation, llama-index + run: poetry install - name: Configure config.toml for testing with Haiku env: From 96ef986d88c90b9845073ce7a4bad62d3881f1b8 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:00:49 +0100 Subject: [PATCH 14/34] Revert "Update .github/workflows/integration-runner.yml" This reverts commit 7e7200e514961239cd4bb31169febdfef75101de. --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index ded969e50cf5..2bfaf0ac1641 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -49,7 +49,7 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. - name: Install Python dependencies using Poetry - run: poetry install + run: poetry install --without evaluation, llama-index - name: Configure config.toml for testing with Haiku env: From 7c2db5bbcd289e988f6812ea5fc6b2689ef3b62b Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:01:19 +0100 Subject: [PATCH 15/34] funny space --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 2bfaf0ac1641..dc5d28a34524 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -49,7 +49,7 @@ jobs: Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
- name: Install Python dependencies using Poetry - run: poetry install --without evaluation, llama-index + run: poetry install --without evaluation,llama-index - name: Configure config.toml for testing with Haiku env: From 76df32e5d7331c49be7043fb2fa77a948313829e Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 17:05:19 +0000 Subject: [PATCH 16/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- .github/workflows/integration-runner.yml | 55 +++++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index dc5d28a34524..b9cd0b7dca50 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -63,7 +63,7 @@ jobs: echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml - - name: Run integration test evaluation + - name: Run integration test evaluation for Haiku env: ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} RUNTIME: remote @@ -81,20 +81,57 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - - name: Create tar.gz of evaluation outputs + - name: Configure config.toml for testing with DeepSeek + env: + LLM_MODEL: "deepseek/deepseek-chat" + LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} + run: | + echo "[llm.eval]" > config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml + echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "temperature = 0.0" >> config.toml + + - name: Run integration test evaluation for DeepSeek + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + RUNTIME: remote + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + + run: | + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + + # get integration tests report + REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE" + echo "INTEGRATION_TEST_REPORT_DEEPSEEK<> $GITHUB_ENV + cat $REPORT_FILE >> $GITHUB_ENV + echo >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Create tar.gz of Haiku evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') tar -czvf outputs_haiku_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + - name: Create tar.gz of DeepSeek evaluation outputs + run: | + TIMESTAMP=$(date +'%y-%m-%d-%H-%M') + tar -czvf outputs_deepseek_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 id: upload_results_artifact with: name: integration-test-outputs - path: outputs_haiku_*.tar.gz + path: | + outputs_haiku_*.tar.gz + outputs_deepseek_*.tar.gz - - name: Get artifact URL - run: echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + - name: Get artifact URLs + run: | + echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + echo "ARTIFACT_URL_DEEPSEEK=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV # - name: Authenticate to Google Cloud # uses: 'google-github-actions/auth@v2' @@ -129,6 +166,12 @@ jobs: Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || 
(github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} Commit: ${{ github.sha }} **Integration Tests Report (Haiku)** + Haiku LLM Test Results: ${{ env.INTEGRATION_TEST_REPORT_HAIKU }} --- - You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL_HAIKU }}). + **Integration Tests Report (DeepSeek)** + DeepSeek LLM Test Results: + ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }} + --- + Haiku Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_HAIKU }}) + DeepSeek Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_DEEPSEEK }}) From 78951201cd826d7874efe974d4d3e763fdb4e3dd Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:21:55 +0100 Subject: [PATCH 17/34] artifact fix --- .github/workflows/integration-runner.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index b9cd0b7dca50..fcf124f203ba 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -123,7 +123,8 @@ jobs: uses: actions/upload-artifact@v4 id: upload_results_artifact with: - name: integration-test-outputs + # using a single artifact with both archives since they are related to same test run + name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }} path: | outputs_haiku_*.tar.gz outputs_deepseek_*.tar.gz @@ -173,5 +174,4 @@ jobs: DeepSeek LLM Test Results: ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }} --- - Haiku Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_HAIKU }}) - DeepSeek Evaluation Outputs: [Download](${{ env.ARTIFACT_URL_DEEPSEEK }}) + Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }}) From 4e178d532ea9c3dc42d111bf66d83e6338cc14a2 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 18:34:41 +0100 Subject: [PATCH 18/34] clean up remote runtimes --- .github/workflows/integration-runner.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index fcf124f203ba..a4298cfa3237 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -156,6 +156,14 @@ jobs: # path: 'evaluation/evaluation_outputs/outputs' # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' + - name: Cleanup remote runtimes + if: always() # run this step even if previous steps failed + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + run: | + poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + - name: Comment with results and artifact link id: create_comment uses: KeisukeYamashita/create-comment@v1 From fa2544581dd848dcb03ea6d3e8caa9b72f1ad0d2 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 19:16:57 +0100 Subject: [PATCH 19/34] clean up runtimes more aggressively - a bit unexpected though --- .github/workflows/integration-runner.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index a4298cfa3237..9015c76ea9f4 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -69,7 +69,6 @@ jobs: RUNTIME: remote 
SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images - run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES @@ -81,6 +80,16 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV + - name: Cleanup Haiku runtimes + if: always() + env: + ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + run: | + poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + # Add a small delay to ensure cleanup is complete + sleep 10 + - name: Configure config.toml for testing with DeepSeek env: LLM_MODEL: "deepseek/deepseek-chat" From 4ceda73522a761de4eeaf51a1630b424fc3c2bad Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 25 Nov 2024 18:42:19 +0000 Subject: [PATCH 20/34] Fix pr #8: Integration tests (openhands fix issue 5076) --- evaluation/integration_tests/run_infer.py | 8 ++++ .../tests/t05_simple_browsing.py | 33 +++++++++----- .../tests/t06_github_pr_browsing.py | 45 ++++++++++++------- 3 files changed, 60 insertions(+), 26 deletions(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 5e3205fefe2e..ba8b2f124754 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -130,6 +130,14 @@ def process_instance( # # ============================================= histories = [event_to_dict(event) for event in state.history] + + # Debug logging + logger.info(f"Total events in history: {len(histories)}") + for event in histories: + logger.info(f"Event type: {event.get('type', 'Unknown')}") + if 'content' in event: + logger.info(f"Event content: {event['content']}") + test_result: TestResult = test_class.verify_result(runtime, histories) metrics = state.metrics.get() if state.metrics else None diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index 8f08cb4e7250..d08aff81f883 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -108,6 +108,11 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: + # Log all events for debugging + from openhands.core.logger import openhands_logger as logger + logger.info("Verifying simple browsing test result") + logger.info(f"Total events: {len(histories)}") + # check if the "The answer is OpenHands is all you need!" 
is in any message message_actions = [ event @@ -116,18 +121,26 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] + logger.info(f"Total message-like events: {len(message_actions)}") + for event in message_actions: - if isinstance(event, AgentDelegateObservation): - content = event.content - elif isinstance(event, AgentFinishAction): - content = event.outputs.get('content', '') - elif isinstance(event, MessageAction): - content = event.content - else: - raise ValueError(f'Unknown event type: {type(event)}') + try: + if isinstance(event, AgentDelegateObservation): + content = event.get('content', '') + elif isinstance(event, AgentFinishAction): + content = event.get('outputs', {}).get('content', '') + elif isinstance(event, MessageAction): + content = event.get('content', '') + else: + logger.warning(f'Unknown event type: {type(event)}') + continue + + logger.info(f"Checking event content: {content}") + if 'OpenHands is all you need!' in content: + return TestResult(success=True) + except Exception as e: + logger.error(f"Error processing event: {e}") - if 'OpenHands is all you need!' in content: - return TestResult(success=True) return TestResult( success=False, reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 52ec927cd334..0f95a3ead73c 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -14,7 +14,12 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: - # check if the "The answer is OpenHands is all you need!" 
is in any message + # Log all events for debugging + from openhands.core.logger import openhands_logger as logger + logger.info("Verifying GitHub PR browsing test result") + logger.info(f"Total events: {len(histories)}") + + # check if the license information is in any message message_actions = [ event for event in histories @@ -22,22 +27,30 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] + logger.info(f"Total message-like events: {len(message_actions)}") + for event in message_actions: - if isinstance(event, AgentDelegateObservation): - content = event.content - elif isinstance(event, AgentFinishAction): - content = event.outputs.get('content', '') - elif isinstance(event, MessageAction): - content = event.content - else: - raise ValueError(f'Unknown event type: {type(event)}') - - if ( - 'non-commercial' in content - or 'MIT' in content - or 'Apache 2.0' in content - ): - return TestResult(success=True) + try: + if isinstance(event, AgentDelegateObservation): + content = event.get('content', '') + elif isinstance(event, AgentFinishAction): + content = event.get('outputs', {}).get('content', '') + elif isinstance(event, MessageAction): + content = event.get('content', '') + else: + logger.warning(f'Unknown event type: {type(event)}') + continue + + logger.info(f"Checking event content: {content}") + if ( + 'non-commercial' in content + or 'MIT' in content + or 'Apache 2.0' in content + ): + return TestResult(success=True) + except Exception as e: + logger.error(f"Error processing event: {e}") + return TestResult( success=False, reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', From 194a1fb74204b61265ed21e20cc9d8a9a2990f29 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 20:07:25 +0100 Subject: [PATCH 21/34] fix type issue that was preventing checking results --- evaluation/integration_tests/run_infer.py | 16 ++++++---------- .../tests/t05_simple_browsing.py | 10 +++------- .../tests/t06_github_pr_browsing.py | 10 +++------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index ba8b2f124754..f1016c7c48fe 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -26,7 +26,6 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction -from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime from openhands.utils.async_utils import call_async_from_sync @@ -129,15 +128,12 @@ def process_instance( # # result evaluation # # ============================================= - histories = [event_to_dict(event) for event in state.history] - - # Debug logging - logger.info(f"Total events in history: {len(histories)}") - for event in histories: - logger.info(f"Event type: {event.get('type', 'Unknown')}") - if 'content' in event: - logger.info(f"Event content: {event['content']}") - + histories = state.history + + # some basic check + logger.info(f'Total events in history: {len(histories)}') + assert len(histories) > 0, 'History should not be empty' + test_result: TestResult = test_class.verify_result(runtime, histories) metrics = state.metrics.get() if state.metrics else None diff --git 
a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index d08aff81f883..e3c624684eb8 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -108,10 +108,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: - # Log all events for debugging from openhands.core.logger import openhands_logger as logger - logger.info("Verifying simple browsing test result") - logger.info(f"Total events: {len(histories)}") # check if the "The answer is OpenHands is all you need!" is in any message message_actions = [ @@ -121,7 +118,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] - logger.info(f"Total message-like events: {len(message_actions)}") + logger.debug(f'Total message-like events: {len(message_actions)}') for event in message_actions: try: @@ -132,14 +129,13 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: elif isinstance(event, MessageAction): content = event.get('content', '') else: - logger.warning(f'Unknown event type: {type(event)}') + logger.warning(f'Unexpected event type: {type(event)}') continue - logger.info(f"Checking event content: {content}") if 'OpenHands is all you need!' in content: return TestResult(success=True) except Exception as e: - logger.error(f"Error processing event: {e}") + logger.error(f'Error processing event: {e}') return TestResult( success=False, diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 0f95a3ead73c..697ff49df371 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -14,10 +14,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: - # Log all events for debugging from openhands.core.logger import openhands_logger as logger - logger.info("Verifying GitHub PR browsing test result") - logger.info(f"Total events: {len(histories)}") # check if the license information is in any message message_actions = [ @@ -27,7 +24,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: event, (MessageAction, AgentFinishAction, AgentDelegateObservation) ) ] - logger.info(f"Total message-like events: {len(message_actions)}") + logger.info(f'Total message-like events: {len(message_actions)}') for event in message_actions: try: @@ -38,10 +35,9 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: elif isinstance(event, MessageAction): content = event.get('content', '') else: - logger.warning(f'Unknown event type: {type(event)}') + logger.warning(f'Unexpected event type: {type(event)}') continue - logger.info(f"Checking event content: {content}") if ( 'non-commercial' in content or 'MIT' in content @@ -49,7 +45,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: ): return TestResult(success=True) except Exception as e: - logger.error(f"Error processing event: {e}") + logger.error(f'Error processing event: {e}') return TestResult( success=False, From 57d590665fa566954c25fc383549a3e65ff07a81 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: 
Mon, 25 Nov 2024 20:33:20 +0100 Subject: [PATCH 22/34] try with waiting time --- .github/workflows/integration-runner.yml | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 9015c76ea9f4..d38918230fb9 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -80,15 +80,18 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - - name: Cleanup Haiku runtimes - if: always() - env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - run: | - poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - # Add a small delay to ensure cleanup is complete - sleep 10 + #- name: Cleanup Haiku runtimes + # if: always() + # env: + # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + # run: | + # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + # # Add a small delay to ensure cleanup is complete + # sleep 10 + + - name: Wait a little bit + run: sleep 40 - name: Configure config.toml for testing with DeepSeek env: From cafedcb63ec7619af6594b81ada2845b8d9f1bb8 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 20:38:13 +0100 Subject: [PATCH 23/34] add eval notes --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index d38918230fb9..6a420f4c997a 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -70,7 +70,7 @@ jobs: SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' # get integration tests report REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) @@ -111,7 +111,7 @@ jobs: EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images run: | - poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' # get integration tests report REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) From f935f0df740fce773bc800a20bb9e755d0376310 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:18:36 +0100 Subject: [PATCH 24/34] increase timeouts --- evaluation/integration_tests/run_infer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index f1016c7c48fe..bbcd00ed8191 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -47,9 +47,13 @@ def get_config( # use default base_container_image enable_auto_lint=True, use_host_network=False, - 
timeout=100, + timeout=300, + # Add platform to the sandbox config to solve issue 4401 + platform='linux/amd64', api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=3600, ), # do not mount workspace workspace_base=None, From 34a30eeacbe2922fce14dd25370ee4c6e6306c02 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:33:14 +0100 Subject: [PATCH 25/34] try with CI local builds --- .github/workflows/integration-runner.yml | 38 +++++++++++++----------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 6a420f4c997a..ef4e2ef2bb54 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -63,12 +63,16 @@ jobs: echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml + - name: Build environment + run: make build + - name: Run integration test evaluation for Haiku env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - RUNTIME: remote - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + #RUNTIME: remote + #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' @@ -91,7 +95,7 @@ jobs: # sleep 10 - name: Wait a little bit - run: sleep 40 + run: sleep 10 - name: Configure config.toml for testing with DeepSeek env: @@ -105,11 +109,11 @@ jobs: - name: Run integration test evaluation for DeepSeek env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - RUNTIME: remote - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images - + #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + #RUNTIME: remote + #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images + SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' @@ -168,13 +172,13 @@ jobs: # path: 'evaluation/evaluation_outputs/outputs' # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - - name: Cleanup remote runtimes - if: always() # run this step even if previous steps failed - env: - ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - run: | - poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh + # - name: Cleanup remote runtimes + # if: always() # run this step even if previous steps failed + # env: + # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} + # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev + # run: | + # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - name: Comment with results and artifact link id: create_comment 
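The "try with CI local builds" change above drops the remote evaluation runtime in favor of a runtime built inside the CI job: the remote-only settings (RUNTIME=remote, SANDBOX_REMOTE_RUNTIME_API_URL, ALLHANDS_API_KEY, EVAL_DOCKER_IMAGE_PREFIX) are commented out, a "Build environment" step runs make build, and SANDBOX_FORCE_REBUILD_RUNTIME forces a fresh sandbox image. A rough shell sketch of the two modes, assuming these variables are consumed by run_infer.sh and the runtime layer exactly as they appear in this workflow; the USE_REMOTE_RUNTIME toggle is only for illustration and is not taken from the patches:

    # Hypothetical toggle between the two configurations seen in these patches.
    if [ "${USE_REMOTE_RUNTIME:-false}" = "true" ]; then
      # Remote runtime, as in the earlier revisions of the workflow.
      export RUNTIME=remote
      export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
      export ALLHANDS_API_KEY="<secrets.ALLHANDS_EVAL_RUNTIME_API_KEY>"   # placeholder for the CI secret
      export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images"
    else
      # Local CI build, as introduced in this patch.
      export SANDBOX_FORCE_REBUILD_RUNTIME=True
      make build
    fi

    # Invocation as used by the Haiku step (positional arguments copied from the workflow).
    poetry run ./evaluation/integration_tests/scripts/run_infer.sh \
      llm.eval HEAD CodeActAgent '' "$N_PROCESSES" '' 'haiku_run'
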
From d48fac004c9afeae4c0e0b349cff4213509aab47 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:47:27 +0100 Subject: [PATCH 26/34] fix eval output --- evaluation/integration_tests/run_infer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index bbcd00ed8191..19df57fe8c33 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -26,6 +26,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime from openhands.utils.async_utils import call_async_from_sync @@ -147,7 +148,7 @@ def process_instance( instance=instance.to_dict(), instruction=instruction, metadata=metadata, - history=histories, + history=[event_to_dict(event) for event in histories], metrics=metrics, error=state.last_error if state and state.last_error else None, test_result=test_result.model_dump(), From d4a21d0871b182b27323a4ae681be4f6d32fcf0b Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:55:33 +0100 Subject: [PATCH 27/34] set debug --- evaluation/integration_tests/run_infer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 19df57fe8c33..8a21b12ae5b2 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -59,6 +59,8 @@ def get_config( # do not mount workspace workspace_base=None, workspace_mount_path=None, + # debug + debug=True, ) config.set_llm_config( update_llm_config_for_completions_logging( From e391604231afacba2795515bfc30129767120819 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 21:58:27 +0100 Subject: [PATCH 28/34] fix tests! 
--- evaluation/integration_tests/tests/t05_simple_browsing.py | 6 +++--- .../integration_tests/tests/t06_github_pr_browsing.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index e3c624684eb8..3c4cf875cc90 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -123,11 +123,11 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: for event in message_actions: try: if isinstance(event, AgentDelegateObservation): - content = event.get('content', '') + content = event.content elif isinstance(event, AgentFinishAction): - content = event.get('outputs', {}).get('content', '') + content = event.outputs.get('content', '') elif isinstance(event, MessageAction): - content = event.get('content', '') + content = event.content else: logger.warning(f'Unexpected event type: {type(event)}') continue diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 697ff49df371..1797e6b6beed 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -29,11 +29,11 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: for event in message_actions: try: if isinstance(event, AgentDelegateObservation): - content = event.get('content', '') + content = event.content elif isinstance(event, AgentFinishAction): - content = event.get('outputs', {}).get('content', '') + content = event.outputs.get('content', '') elif isinstance(event, MessageAction): - content = event.get('content', '') + content = event.content else: logger.warning(f'Unexpected event type: {type(event)}') continue From 6ff6fe2a018edf223118e0d403c1c32294d42605 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 22:22:11 +0100 Subject: [PATCH 29/34] fix outputs --- .github/workflows/integration-runner.yml | 30 +++++++++--------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index ef4e2ef2bb54..15e5c81e5728 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -77,10 +77,10 @@ jobs: poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' # get integration tests report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) - echo "REPORT_FILE: $REPORT_FILE" + REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE_HAIKU" echo "INTEGRATION_TEST_REPORT_HAIKU<> $GITHUB_ENV - cat $REPORT_FILE >> $GITHUB_ENV + cat $REPORT_FILE_HAIKU >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV @@ -118,37 +118,29 @@ jobs: poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' # get integration tests report - REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*_maxiter_10_N* -name "report.md" -type f | head -n 1) - echo "REPORT_FILE: $REPORT_FILE" + 
REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1) + echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK" echo "INTEGRATION_TEST_REPORT_DEEPSEEK<> $GITHUB_ENV - cat $REPORT_FILE >> $GITHUB_ENV + cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - - name: Create tar.gz of Haiku evaluation outputs + - name: Create archive of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') - tar -czvf outputs_haiku_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs - - - name: Create tar.gz of DeepSeek evaluation outputs - run: | - TIMESTAMP=$(date +'%y-%m-%d-%H-%M') - tar -czvf outputs_deepseek_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs + cd evaluation/evaluation_outputs/outputs # Change to the outputs directory + tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 id: upload_results_artifact with: - # using a single artifact with both archives since they are related to same test run name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }} - path: | - outputs_haiku_*.tar.gz - outputs_deepseek_*.tar.gz + path: integration_tests_*.tar.gz - name: Get artifact URLs run: | - echo "ARTIFACT_URL_HAIKU=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV - echo "ARTIFACT_URL_DEEPSEEK=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV + echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV # - name: Authenticate to Google Cloud # uses: 'google-github-actions/auth@v2' From 1956f06755d87c05bbb26d4038b69da96cb978aa Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 22:37:11 +0100 Subject: [PATCH 30/34] keep details in logs, not github comment --- evaluation/integration_tests/tests/t05_simple_browsing.py | 5 ++++- evaluation/integration_tests/tests/t06_github_pr_browsing.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index 3c4cf875cc90..96bb47875aec 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -137,7 +137,10 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: except Exception as e: logger.error(f'Error processing event: {e}') + logger.debug( + f'Total messages: {len(message_actions)}. Messages: {message_actions}' + ) return TestResult( success=False, - reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', + reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.', ) diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 1797e6b6beed..2dc1a01ecd97 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -47,7 +47,10 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: except Exception as e: logger.error(f'Error processing event: {e}') + logger.debug( + f'Total messages: {len(message_actions)}. 
Messages: {message_actions}' + ) return TestResult( success=False, - reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}', + reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.', ) From b5c2519e4ac627164fafd8048beb052b42a474ec Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 22:53:54 +0100 Subject: [PATCH 31/34] tweak schedule --- .github/workflows/integration-runner.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 15e5c81e5728..efc8a6e20cc5 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -10,7 +10,7 @@ on: required: true default: "" schedule: - - cron: '0 0 * * *' # Runs at midnight UTC every day + - cron: '0 22 * * *' # Runs at 10pm UTC every day env: N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation From 0c2218156988d29c352c85a8390c82db4db6e2a9 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 25 Nov 2024 23:16:53 +0100 Subject: [PATCH 32/34] lint-y --- .github/workflows/integration-runner.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index efc8a6e20cc5..e698decd9be7 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -6,11 +6,11 @@ on: workflow_dispatch: inputs: reason: - description: "Reason for manual trigger" + description: 'Reason for manual trigger' required: true - default: "" + default: '' schedule: - - cron: '0 22 * * *' # Runs at 10pm UTC every day + - cron: '30 22 * * *' # Runs at 10:30pm UTC every day env: N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation From 605a24f7f165717d8d0ad2645c3d20934fec2f5d Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Wed, 27 Nov 2024 00:33:42 +0100 Subject: [PATCH 33/34] clean up --- .github/workflows/integration-runner.yml | 41 ++---------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index e698decd9be7..b8ff30248511 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -68,10 +68,6 @@ jobs: - name: Run integration test evaluation for Haiku env: - #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - #RUNTIME: remote - #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run' @@ -84,16 +80,6 @@ jobs: echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - #- name: Cleanup Haiku runtimes - # if: always() - # env: - # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - # run: | - # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - # # Add a small delay to ensure cleanup is complete - # sleep 10 - - name: Wait a little bit run: sleep 10 @@ -109,10 +95,6 @@ jobs: - name: Run integration test evaluation for DeepSeek env: - #ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - 
#RUNTIME: remote - #SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - #EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images SANDBOX_FORCE_REBUILD_RUNTIME: True run: | poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run' @@ -142,11 +124,6 @@ jobs: run: | echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV - # - name: Authenticate to Google Cloud - # uses: 'google-github-actions/auth@v2' - # with: - # credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }} - - name: Set timestamp and trigger reason run: | echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV @@ -158,26 +135,12 @@ jobs: echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV fi - # - name: Upload evaluation results to Google Cloud Storage - # uses: 'google-github-actions/upload-cloud-storage@v2' - # with: - # path: 'evaluation/evaluation_outputs/outputs' - # destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}' - - # - name: Cleanup remote runtimes - # if: always() # run this step even if previous steps failed - # env: - # ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }} - # SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev - # run: | - # poetry run ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh - - name: Comment with results and artifact link id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - # if triggered by PR, use PR number, otherwise use 9 as fallback issue number for manual triggers - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9 }} + # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} From e5b5bf0421c971df340fa30cf10d3038d3f3a16d Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Wed, 27 Nov 2024 19:02:05 +0100 Subject: [PATCH 34/34] set up llms --- .github/workflows/integration-runner.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index b8ff30248511..4a41ab28c979 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -53,7 +53,7 @@ jobs: - name: Configure config.toml for testing with Haiku env: - LLM_MODEL: ${{ secrets.HAIKU_LLM_MODEL }} + LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022" LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} run: | @@ -85,12 +85,14 @@ jobs: - name: Configure config.toml for testing with DeepSeek env: - LLM_MODEL: "deepseek/deepseek-chat" - LLM_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} + LLM_MODEL: "litellm_proxy/deepseek-chat" + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} run: | echo "[llm.eval]" > config.toml echo "model = \"$LLM_MODEL\"" >> config.toml echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> 
config.toml - name: Run integration test evaluation for DeepSeek