Skip to content

Commit eefe829

Browse files
committed
test: add gpt-4o-mini to smoke test github workflow
- Add gpt-4o-mini to the smoke test GitHub workflow
- Add gpt-4o-mini smoke test golden files
- Remove the outdated gpt-4o-2024-05-13 and gpt-4-turbo-2024-04-09 smoke test config and golden files
- Add golden files for gpt-4o-2024-08-06
- Regenerate golden files for existing models to drop callProgress events (we weren't comparing these anyway)

Signed-off-by: Nick Hale <[email protected]>
1 parent 9e3893c commit eefe829

15 files changed

+2609
-12337
lines changed

.github/workflows/smoke.yaml

+8-8
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ jobs:
5959
6060
echo "run_smoke_tests=false" >> $GITHUB_OUTPUT
6161
62-
gpt-4o-2024-05-13:
62+
gpt-4o-2024-08-06:
6363
needs: check-label
6464
if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
6565
runs-on: ubuntu-22.04
@@ -81,14 +81,14 @@ jobs:
8181
go-version: "1.21"
8282
- env:
8383
OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
84-
GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-05-13
85-
name: Run smoke test for gpt-4o-2024-05-13
84+
GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-08-06
85+
name: Run smoke test for gpt-4o-2024-08-06
8686
run: |
87-
echo "Running smoke test for model gpt-4o-2024-05-13"
87+
echo "Running smoke test for model gpt-4o-2024-08-06"
8888
export PATH="$(pwd)/bin:${PATH}"
8989
make smoke
9090
91-
gpt-4-turbo-2024-04-09:
91+
gpt-4o-mini-2024-07-18:
9292
needs: check-label
9393
if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
9494
runs-on: ubuntu-22.04
@@ -110,10 +110,10 @@ jobs:
110110
go-version: "1.21"
111111
- env:
112112
OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
113-
GPTSCRIPT_DEFAULT_MODEL: gpt-4-turbo-2024-04-09
114-
name: Run smoke test for gpt-4-turbo-2024-04-09
113+
GPTSCRIPT_DEFAULT_MODEL: gpt-4o-mini-2024-07-18
114+
name: Run smoke test for gpt-4o-mini-2024-07-18
115115
run: |
116-
echo "Running smoke test for model gpt-4-turbo-2024-04-09"
116+
echo "Running smoke test for model gpt-4o-mini-2024-07-18"
117117
export PATH="$(pwd)/bin:${PATH}"
118118
make smoke
119119

pkg/tests/judge/judge.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,10 @@ func New[T any](client *openai.Client) (*Judge[T], error) {
8686
}
8787

8888
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
89-
comparisonJSON, err := json.MarshalIndent(&comparison[T]{
89+
comparisonJSON, err := json.Marshal(&comparison[T]{
9090
Expected: expected,
9191
Actual: actual,
92-
}, "", " ")
92+
})
9393
if err != nil {
9494
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
9595
}

pkg/tests/smoke/smoke_test.go

+5
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,11 @@ func getActualEvents(t *testing.T, eventsFile string) []event {
175175

176176
var e event
177177
require.NoError(t, json.Unmarshal([]byte(line), &e))
178+
179+
if e.Type == runner.EventTypeCallProgress {
180+
continue
181+
}
182+
178183
events = append(events, e)
179184
}
180185

pkg/tests/smoke/testdata/Bob/claude-3-5-sonnet-20240620-expected.json

+69-379
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)