diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
index 4a41ab28c979..120572aa0cdd 100644
--- a/.github/workflows/integration-runner.yml
+++ b/.github/workflows/integration-runner.yml
@@ -141,8 +141,8 @@ jobs:
         id: create_comment
         uses: KeisukeYamashita/create-comment@v1
         with:
-          # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
-          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
+          # if triggered by PR, use PR number, otherwise use 5318 as fallback issue number for manual triggers
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
           unique: false
           comment: |
             Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
@@ -155,4 +155,4 @@ jobs:
             DeepSeek LLM Test Results: ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
             ---
-            Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
+            Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 3b6f1c6ff2cc..2da68b9b82b9 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -218,6 +218,8 @@ def load_integration_tests() -> pd.DataFrame:
     )
 
     df = pd.read_json(output_file, lines=True, orient='records')
+
+    # record success and reason for failure for the final report
     df['success'] = df['test_result'].apply(lambda x: x['success'])
     df['reason'] = df['test_result'].apply(lambda x: x['reason'])
     logger.info('-' * 100)
@@ -231,9 +233,16 @@
     )
     logger.info('-' * 100)
+
+    # record cost for each instance, with 3 decimal places
+    df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))
+    logger.info(f'Total cost: USD {df["cost"].sum():.2f}')
+
     report_file = os.path.join(metadata.eval_output_dir, 'report.md')
     with open(report_file, 'w') as f:
         f.write(
             f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
         )
-        f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
+        f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
+        f.write(
+            df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False)
+        )
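
Two notes for reviewers, neither of which is part of the patch itself.

First, on the workflow hunks: `${{ cond && a || b }}` is the standard GitHub Actions expression idiom for a ternary. It yields `a` when `cond` is true (provided `a` is itself truthy) and `b` otherwise, which is why the fallback issue number sits after `||`.

Second, the new pandas logic in `run_infer.py` can be exercised in isolation. The following is a minimal sketch, assuming two made-up `output.jsonl`-style records (the instance IDs, failure reason, and costs are hypothetical); it reproduces the `success`/`reason`/`cost` flattening and the summary strings the patch writes to `report.md`. Note that `DataFrame.to_markdown` requires the optional `tabulate` package.

```python
import pandas as pd

# Hypothetical records mimicking the shape of output.jsonl: each row carries
# a nested test_result dict and a metrics dict with an accumulated_cost.
rows = [
    {
        'instance_id': 't01_example',
        'test_result': {'success': True, 'reason': ''},
        'metrics': {'accumulated_cost': 0.0421},
    },
    {
        'instance_id': 't02_example',
        'test_result': {'success': False, 'reason': 'agent hit max iterations'},
        'metrics': {'accumulated_cost': 0.1377},
    },
]
df = pd.DataFrame(rows)

# Flatten the nested dicts into top-level columns, as the patch does.
df['success'] = df['test_result'].apply(lambda x: x['success'])
df['reason'] = df['test_result'].apply(lambda x: x['reason'])
df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))

# These match the strings the patch logs and writes to report.md.
print(f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})')
print(f'Total cost: USD {df["cost"].sum():.2f}')
print(df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False))
```

One behavioral detail worth knowing: because each per-instance cost is rounded to three decimals before summing, the reported total can differ from the raw accumulated total by a fraction of a cent, which is acceptable for a smoke-test report.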