diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml
index 9b2576a2644e..b1ace09b35de 100644
--- a/.github/workflows/eval-runner.yml
+++ b/.github/workflows/eval-runner.yml
@@ -1,4 +1,4 @@
-name: Run Evaluation
+name: Run SWE-Bench Evaluation

 on:
   pull_request:
@@ -58,24 +58,6 @@ jobs:
           echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
           echo "temperature = 0.0" >> config.toml

-      - name: Run integration test evaluation
-        env:
-          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
-          RUNTIME: remote
-          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
-          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
-        run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
-
-          # get evaluation report
-          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
-          echo "REPORT_FILE: $REPORT_FILE"
-          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
-          cat $REPORT_FILE >> $GITHUB_ENV
-          echo >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
       - name: Run SWE-Bench evaluation
         env:
           ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
@@ -143,9 +125,6 @@ jobs:
             **SWE-Bench Evaluation Report**
             ${{ env.SWEBENCH_REPORT }}
            ---
-            **Integration Tests Evaluation Report**
-            ${{ env.INTEGRATION_TEST_REPORT }}
-            ---
            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

       - name: Post to a Slack channel
diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
new file mode 100644
index 000000000000..4a41ab28c979
--- /dev/null
+++ b/.github/workflows/integration-runner.yml
@@ -0,0 +1,158 @@
+name: Run Integration Tests
+
+on:
+  pull_request:
+    types: [labeled]
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: 'Reason for manual trigger'
+        required: true
+        default: ''
+  schedule:
+    - cron: '30 22 * * *' # Runs at 10:30pm UTC every day
+
+env:
+  N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation
+
+jobs:
+  run-integration-tests:
+    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: "read"
+      id-token: "write"
+      pull-requests: "write"
+      issues: "write"
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+
+      - name: Comment on PR if 'integration-test' label is present
+        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+
+      - name: Configure config.toml for testing with Haiku
+        env:
+          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Build environment
+        run: make build
+
+      - name: Run integration test evaluation for Haiku
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
+
+          # get integration tests report
+          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Wait a little bit
+        run: sleep 10
+
+      - name: Configure config.toml for testing with DeepSeek
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DeepSeek
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
+
+          # get integration tests report
+          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Create archive of evaluation outputs
+        run: |
+          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
+          cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories
+
+      - name: Upload evaluation results as artifact
+        uses: actions/upload-artifact@v4
+        id: upload_results_artifact
+        with:
+          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
+          path: integration_tests_*.tar.gz
+
+      - name: Get artifact URLs
+        run: |
+          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
+
+      - name: Set timestamp and trigger reason
+        run: |
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
+          else
+            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
+          fi
+
+      - name: Comment with results and artifact link
+        id: create_comment
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
+          unique: false
+          comment: |
+            Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
+            Commit: ${{ github.sha }}
+            **Integration Tests Report (Haiku)**
+            Haiku LLM Test Results:
+            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
+            ---
+            **Integration Tests Report (DeepSeek)**
+            DeepSeek LLM Test Results:
+            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+            ---
+            Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 5e3205fefe2e..8a21b12ae5b2 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -48,13 +48,19 @@ def get_config(
             # use default base_container_image
             enable_auto_lint=True,
             use_host_network=False,
-            timeout=100,
+            timeout=300,
+            # Add platform to the sandbox config to solve issue 4401
+            platform='linux/amd64',
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=3600,
         ),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
+        # debug
+        debug=True,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(
@@ -129,7 +135,12 @@ def process_instance(
     # # result evaluation
     # # =============================================
-    histories = [event_to_dict(event) for event in state.history]
+    histories = state.history
+
+    # some basic check
+    logger.info(f'Total events in history: {len(histories)}')
+    assert len(histories) > 0, 'History should not be empty'
+
     test_result: TestResult = test_class.verify_result(runtime, histories)
     metrics = state.metrics.get() if state.metrics else None
@@ -139,7 +150,7 @@ def process_instance(
         instance=instance.to_dict(),
         instruction=instruction,
         metadata=metadata,
-        history=histories,
+        history=[event_to_dict(event) for event in histories],
         metrics=metrics,
         error=state.last_error if state and state.last_error else None,
         test_result=test_result.model_dump(),
diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py
index 8f08cb4e7250..96bb47875aec 100644
--- a/evaluation/integration_tests/tests/t05_simple_browsing.py
+++ b/evaluation/integration_tests/tests/t05_simple_browsing.py
@@ -108,6 +108,8 @@ def initialize_runtime(cls, runtime: Runtime) -> None:

     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        from openhands.core.logger import openhands_logger as logger
+
         # check if the "The answer is OpenHands is all you need!" is in any message
         message_actions = [
             event
@@ -116,19 +118,29 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
                 event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
             )
         ]
+        logger.debug(f'Total message-like events: {len(message_actions)}')
+
         for event in message_actions:
-            if isinstance(event, AgentDelegateObservation):
-                content = event.content
-            elif isinstance(event, AgentFinishAction):
-                content = event.outputs.get('content', '')
-            elif isinstance(event, MessageAction):
-                content = event.content
-            else:
-                raise ValueError(f'Unknown event type: {type(event)}')
+            try:
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue

-            if 'OpenHands is all you need!' in content:
-                return TestResult(success=True)
+                if 'OpenHands is all you need!' in content:
+                    return TestResult(success=True)
+            except Exception as e:
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
         return TestResult(
             success=False,
-            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
         )
diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
index 52ec927cd334..2dc1a01ecd97 100644
--- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py
+++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
@@ -14,7 +14,9 @@ def initialize_runtime(cls, runtime: Runtime) -> None:

     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
-        # check if the "The answer is OpenHands is all you need!" is in any message
+        from openhands.core.logger import openhands_logger as logger
+
+        # check if the license information is in any message
         message_actions = [
             event
             for event in histories
@@ -22,23 +24,33 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
                 event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
             )
         ]
+        logger.info(f'Total message-like events: {len(message_actions)}')
+
         for event in message_actions:
-            if isinstance(event, AgentDelegateObservation):
-                content = event.content
-            elif isinstance(event, AgentFinishAction):
-                content = event.outputs.get('content', '')
-            elif isinstance(event, MessageAction):
-                content = event.content
-            else:
-                raise ValueError(f'Unknown event type: {type(event)}')
-
-            if (
-                'non-commercial' in content
-                or 'MIT' in content
-                or 'Apache 2.0' in content
-            ):
-                return TestResult(success=True)
+            try:
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
+
+                if (
+                    'non-commercial' in content
+                    or 'MIT' in content
+                    or 'Apache 2.0' in content
+                ):
+                    return TestResult(success=True)
+            except Exception as e:
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
         return TestResult(
             success=False,
-            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
         )
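
Review note: after this patch, verify_result in t05_simple_browsing.py and t06_github_pr_browsing.py carries a nearly identical extract-and-log loop. The sketch below shows one way that shared logic could be factored into a helper. It is illustrative only; the helper name iter_message_contents and where it would live are hypothetical, and the import paths are assumed to match the ones the tests and this patch already use.

# Hypothetical helper (not part of this patch): shared content extraction for
# verify_result() in t05/t06, mirroring the try/except pattern added above.
from typing import Iterator

from openhands.core.logger import openhands_logger as logger
from openhands.events.action import AgentFinishAction, MessageAction
from openhands.events.event import Event
from openhands.events.observation import AgentDelegateObservation


def iter_message_contents(histories: list[Event]) -> Iterator[str]:
    """Yield the text content of message-like events, skipping anything unexpected."""
    message_actions = [
        event
        for event in histories
        if isinstance(
            event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
        )
    ]
    logger.debug(f'Total message-like events: {len(message_actions)}')
    for event in message_actions:
        try:
            if isinstance(event, AgentDelegateObservation):
                yield event.content
            elif isinstance(event, AgentFinishAction):
                yield event.outputs.get('content', '')
            elif isinstance(event, MessageAction):
                yield event.content
            else:
                logger.warning(f'Unexpected event type: {type(event)}')
        except Exception as e:
            logger.error(f'Error processing event: {e}')

Each test's success check would then reduce to something like: any('OpenHands is all you need!' in content for content in iter_message_contents(histories)).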