Evals #699
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Display name of this workflow.
name: Evals

# Triggers: pushes to main, pull requests when opened or updated with new
# commits, and a recurring schedule.
on:
  push:
    branches: [main]
  pull_request:
    types: [opened, synchronize]
  schedule:
    # every 6 hours
    - cron: '0 */6 * * *'
# Workflow-level environment, exported to every job.
# Each value is a comma-separated list. No step in this file references these
# names directly; presumably the eval scripts read them (TODO confirm).
env:
  EVAL_MODELS: 'gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest'
  EXPERIMENTAL_EVAL_MODELS: 'gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest,o1-mini,o1-preview'
  EVAL_CATEGORIES: 'observe,act,combination,extract,experimental,text_extract'
# Keep at most one run of this workflow per ref; a newer run cancels the one
# still in progress. Concurrency groups are shared across the whole
# repository, so the workflow name is part of the key: keying on the ref alone
# would let a different workflow using the same group cancel (or be cancelled
# by) these runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs: | |
# Lint job. This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-lint:
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    # NOTE(review): --no-frozen-lockfile looks like a pnpm/yarn-style flag and
    # is not a documented npm option; confirm whether it is still needed.
    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Run Lint
      run: npm run lint
# Build job. This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-build:
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    # Fails the job if the project does not build.
    - name: Run Build
      run: npm run build
# End-to-end test job; starts only after lint and build have succeeded.
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-e2e-tests:
  needs:
    - run-lint
    - run-build
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    # Downloads the Playwright browsers plus their OS-level dependencies.
    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run E2E Tests
      run: npm run e2e
# Runs the `extract` eval category twice (domExtract, then textExtract), logs
# both scores, and fails when the domExtract score is under 80.
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-extract-evals:
  needs:
    - run-lint
    - run-build
    - run-e2e-tests
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    # Pass 1: extract category using domExtract.
    - name: Run Extract Evals (domExtract)
      run: npm run evals category extract -- --extract-method=domExtract

    # Rename the summary so the second pass cannot overwrite it.
    - name: Save Extract Dom Results
      run: mv eval-summary.json eval-summary-extract-dom.json

    # Pass 2: same category using textExtract.
    - name: Run Extract Evals (textExtract)
      run: npm run evals category extract -- --extract-method=textExtract

    - name: Save Extract Text Results
      run: mv eval-summary.json eval-summary-extract-text.json

    # Report both passes; only the domExtract score gates CI.
    - name: Log and Compare Extract Evals Performance
      run: |
        dom_experiment=$(jq -r '.experimentName' eval-summary-extract-dom.json)
        dom_pct=$(jq '.categories.extract' eval-summary-extract-dom.json)
        echo "DomExtract Extract category score: $dom_pct%"
        echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${dom_experiment}"

        text_experiment=$(jq -r '.experimentName' eval-summary-extract-text.json)
        text_pct=$(jq '.categories.extract' eval-summary-extract-text.json)
        echo "TextExtract Extract category score: $text_pct%"
        echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${text_experiment}"

        # bc prints 1 when the comparison holds; (( 1 )) is true in bash.
        if (( $(bc -l <<< "$dom_pct < 80") )); then
          echo "DomExtract extract category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the `text_extract` eval category twice (textExtract, then domExtract),
# logs both scores, and fails when the textExtract score is under 80.
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-text-extract-evals:
  needs:
    - run-extract-evals
  runs-on: ubuntu-latest
  timeout-minutes: 120
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    # Pass 1: text_extract category using textExtract.
    - name: Run text_extract Evals (textExtract)
      run: npm run evals category text_extract -- --extract-method=textExtract

    # Rename the summary so the second pass cannot overwrite it.
    - name: Save text_extract Text Results
      run: mv eval-summary.json eval-summary-text_extract-text.json

    # Pass 2: same category using domExtract.
    - name: Run text_extract Evals (domExtract)
      run: npm run evals category text_extract -- --extract-method=domExtract

    - name: Save text_extract Dom Results
      run: mv eval-summary.json eval-summary-text_extract-dom.json

    # Report both passes; only the textExtract score gates CI.
    - name: Log and Compare text_extract Evals Performance
      run: |
        text_experiment=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
        text_pct=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
        echo "TextExtract text_extract category score: $text_pct%"
        echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${text_experiment}"

        dom_experiment=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
        dom_pct=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
        echo "DomExtract text_extract category score: $dom_pct%"
        echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${dom_experiment}"

        # bc prints 1 when the comparison holds; (( 1 )) is true in bash.
        if (( $(bc -l <<< "$text_pct < 80") )); then
          echo "textExtract text_extract category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the `act` eval category and fails when its score is under 80.
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-act-evals:
  runs-on: ubuntu-latest
  timeout-minutes: 25
  needs: [run-text-extract-evals]
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Act Evals
      run: npm run evals category act

    - name: Log Act Evals Performance
      run: |
        # Verify the summary exists before jq reads it, so a missing file
        # yields the explicit message below instead of a jq error.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for act category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        act_score=$(jq '.categories.act' eval-summary.json)
        echo "Act category score: $act_score%"

        # bc prints 1 when the comparison holds; (( 1 )) is true in bash.
        if (( $(echo "$act_score < 80" | bc -l) )); then
          echo "Act category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the `observe` eval category and fails when its score is under 80.
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-observe-evals:
  runs-on: ubuntu-latest
  timeout-minutes: 25
  needs: [run-act-evals]
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Observe Evals
      run: npm run evals category observe

    - name: Log Observe Evals Performance
      run: |
        # Verify the summary exists before jq reads it, so a missing file
        # yields the explicit message below instead of a jq error.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for observe category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        observe_score=$(jq '.categories.observe' eval-summary.json)
        echo "Observe category score: $observe_score%"

        # bc prints 1 when the comparison holds; (( 1 )) is true in bash.
        if (( $(echo "$observe_score < 80" | bc -l) )); then
          echo "Observe category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the `combination` eval category and fails when its score is under 85.
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-combination-evals:
  runs-on: ubuntu-latest
  timeout-minutes: 40
  needs: [run-observe-evals]
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Combination Evals
      run: npm run evals category combination

    - name: Log Combination Evals Performance
      run: |
        # Verify the summary exists before jq reads it, so a missing file
        # yields the explicit message below instead of a jq error.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for combination category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        combination_score=$(jq '.categories.combination' eval-summary.json)
        echo "Combination category score: $combination_score%"

        # This category uses a stricter bar (85) than the other gated ones.
        if (( $(echo "$combination_score < 85" | bc -l) )); then
          echo "Combination category score is below 85%. Failing CI."
          exit 1
        fi
# Runs the `experimental` eval category and logs its score. No score threshold
# is applied; the job fails only when the summary file is missing (or an
# earlier step fails).
# This mapping is an entry of the workflow's `jobs:` map; the source
# indentation was flattened, so nest it one level under `jobs:`.
run-experimental-evals:
  runs-on: ubuntu-latest
  timeout-minutes: 120
  needs: [run-text-extract-evals]
  # Only runs when the triggering ref is the main or dev branch.
  if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    # Environment values reach the process as strings, so spell it as one.
    HEADLESS: 'true'
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '20'

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Experimental Evals
      run: npm run evals category experimental

    - name: Log Experimental Evals Performance
      run: |
        # Verify the summary exists before jq reads it, so a missing file
        # yields the explicit message below instead of a jq error.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for experimental category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        experimental_score=$(jq '.categories.experimental' eval-summary.json)
        echo "Experimental category score: $experimental_score%"