Evaluation - Simulate usage using examples produced from sessions (#253)

# Use Simulation For Evaluation This PR completely overhauls how we do evaluation as outlined in [TN011EVALDATA](https://foyle.io/docs/tech-notes/tn011_eval_data/). One of the major pain points in our approach to evaluation has been building up a sufficiently large dataset for evaluation. This PR solves this problem by using examples generated from sessions produced by actual usage. This ensures that the more we use Foyle the more data we have available for evaluation. Another challenge for evaluation has been what do we use for our set of learned examples during evaluation? Using actual sessions solves this problem because sessions are ordered in time. During evaluation we start out with no learned examples. We then replay the sessions in the same order they occurred. Foyle can then learn from those sessions using its learning process to improve accuracy on subsequent examples. # Making the Evaluator a Simulator In order to achieve this we redo our Evaluator to act more like a simulator that simulates what a user would do by using the sessions as examples of intent and actions. We refactor the Evaluator to follow the pattern we first used in the AssertJob of having the experiment driver (the evaluator) interact with the Agent via RPC. This makes it easy to set up and configure an independent instance of the Agent with the suitable parameters for the experiment. # Use sqlite for storing the results We rewrite the evaluator to use sqlite to store the evaluation results rather than using pebble. This gives much better querying capabilities for exploring the evaluation results. We store the EvalResult proto in JSON not binary format so that we can use sqlite's capabilities to query the data. # Level 1 Evals This PR deletes the Assertor code because it is rendered out of date by all the changes. In a subsequent PR we should integrate the level 1 assertions into the evaluator. 
Tracked in #261 # Code Cleanup Delete code for computing the distance between expected and actual programs. We have switched to LLM as judge. That metric is likely not useful anymore because generated code is often multi-line mini programs that the metric couldn't handle. Delete the data/eval directory. These were handcrafted evaluation examples expressed as markdown files. With this PR we are making two changes: 1. Store EvalExamples as protos to allow richer data representations 2. Produce evaluation datasets from logs and actual usage Fix #140
jlewi · Sep 27, 2024 · f4cdbd6 · f4cdbd6
1 parent dbbfaf8
commit f4cdbd6
Show file tree

Hide file tree

Showing 71 changed files with 1,875 additions and 1,801 deletions.
diff --git a/app/api/assert.go b/app/api/assert.go
diff --git a/app/api/experiment.go b/app/api/experiment.go
@@ -13,18 +13,13 @@ type Experiment struct {
 }
 
 type ExperimentSpec struct {
-	// EvalDir is the directory containing the evaluation the evaluation input
-	EvalDir string `json:"evalDir" yaml:"evalDir"`
-
-	// DBDir is the directory for the pebble database that will store the results
-	DBDir string `json:"dbDir" yaml:"dbDir"`
+	// AgentAddress is the address of the agent to use to generate completions
+	AgentAddress string `json:"agentAddress" yaml:"agentAddress"`
 
-	// SheetID is the ID of the Google Sheet to update with the results.
-	SheetID string `json:"sheetID" yaml:"sheetID"`
-
-	// SheetName is the name of the sheet to update.
-	SheetName string `json:"sheetName" yaml:"sheetName"`
+	// EvalDir is the directory containing the evaluation examples.
+	// These should be EvalExample protos.
+	EvalDir string `json:"evalDir" yaml:"evalDir"`
 
-	// Agent is the configuration for the agent
-	Agent *AgentConfig `json:"agent,omitempty" yaml:"agent,omitempty"`
+	// OutputDB is the path to the file to store the results in.
+	OutputDB string `json:"outputDB" yaml:"outputDB"`
 }
diff --git a/app/cmd/protos.go b/app/cmd/protos.go
@@ -0,0 +1,67 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/go-logr/zapr"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+	"github.com/pkg/errors"
+	"github.com/spf13/cobra"
+	"go.uber.org/zap"
+	"google.golang.org/protobuf/encoding/protojson"
+	"google.golang.org/protobuf/proto"
+)
+
+// NewProtoToJsonCmd creates a command for converting a proto to json.
+//
+// The command reads the binary proto file named by its first argument, picks the message type from the
+// file suffix (".evalexample.binpb" -> EvalExample, ".example.binpb" -> Example), and writes the decoded
+// message to stdout as JSON. If any step fails the error is printed and the process exits with status 1.
+func NewProtoToJsonCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "prototojson <file>",
+		Short: "Dump the binary proto file to json",
+		Run: func(cmd *cobra.Command, args []string) {
+			err := func() error {
+				log := zapr.NewLogger(zap.L())
+				// Bail out when no path was supplied; without this return the args[0] access below
+				// panics with an index out of range error.
+				if len(args) == 0 {
+					log.Info("prototojson takes at least one argument which should be the path of the proto to dump.")
+					return errors.New("Missing argument: the path of the proto file to dump")
+				}
+
+				file := args[0]
+
+				// The suffixes are mutually exclusive, so the order of the cases does not matter.
+				var message proto.Message
+				var typeName string
+				switch {
+				case strings.HasSuffix(file, ".evalexample.binpb"):
+					message = &v1alpha1.EvalExample{}
+					typeName = "EvalExample"
+				case strings.HasSuffix(file, ".example.binpb"):
+					message = &v1alpha1.Example{}
+					typeName = "Example"
+				default:
+					return errors.Errorf("The type of proto could not be determined from the path suffix for file: %s", file)
+				}
+
+				data, err := os.ReadFile(file)
+				if err != nil {
+					return errors.Wrapf(err, "Error reading file %s", file)
+				}
+
+				if err := proto.Unmarshal(data, message); err != nil {
+					return errors.Wrapf(err, "Error unmarshalling proto of type %s from file %s", typeName, file)
+				}
+
+				jsonP := protojson.Format(message)
+				fmt.Fprintf(os.Stdout, "%s\n", jsonP)
+				return nil
+			}()
+			if err != nil {
+				fmt.Printf("Error running convert;\n %+v\n", err)
+				os.Exit(1)
+			}
+		},
+	}
+
+	return cmd
+}
diff --git a/app/cmd/root.go b/app/cmd/root.go
@@ -31,5 +31,6 @@ func NewRootCmd() *cobra.Command {
 	rootCmd.AddCommand(NewConfigCmd())
 	rootCmd.AddCommand(NewLogsCmd())
 	rootCmd.AddCommand(NewApplyCmd())
+	rootCmd.AddCommand(NewProtoToJsonCmd())
 	return rootCmd
 }
diff --git a/app/go.mod b/app/go.mod
@@ -37,7 +37,7 @@ require (
 	github.com/oklog/ulid/v2 v2.1.0
 	github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c
 	github.com/pkg/errors v0.9.1
-	github.com/sashabaranov/go-openai v1.29.0
+	github.com/sashabaranov/go-openai v1.30.3
 	github.com/spf13/cobra v1.8.0
 	github.com/spf13/viper v1.18.2
 	github.com/stateful/runme/v3 v3.3.1-0.20240515132033-7fd1591498c6

diff --git a/app/go.sum b/app/go.sum
@@ -548,6 +548,8 @@ github.com/sashabaranov/go-openai v1.20.4 h1:095xQ/fAtRa0+Rj21sezVJABgKfGPNbyx/s
 github.com/sashabaranov/go-openai v1.20.4/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/sashabaranov/go-openai v1.29.0 h1:eBH6LSjtX4md5ImDCX8hNhHQvaRf22zujiERoQpsvLo=
 github.com/sashabaranov/go-openai v1.29.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/sashabaranov/go-openai v1.30.3 h1:TEdRP3otRXX2A7vLoU+kI5XpoSo7VUUlM/rEttUqgek=
+github.com/sashabaranov/go-openai v1.30.3/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8=
 github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
 github.com/sethvargo/go-envconfig v0.9.0 h1:Q6FQ6hVEeTECULvkJZakq3dZMeBQ3JUpcKMfPQbKMDE=

diff --git a/app/pkg/agent/agent.go b/app/pkg/agent/agent.go
@@ -463,7 +463,10 @@ func (a *Agent) GenerateCells(ctx context.Context, req *connect.Request[v1alpha1
 		Cells: cells,
 	}
 
-	return connect.NewResponse[v1alpha1.GenerateCellsResponse](resp), nil
+	// We need to attach the traceId to the response.
+	cResp := connect.NewResponse[v1alpha1.GenerateCellsResponse](resp)
+	cResp.Header().Set(TraceIDHeader, span.SpanContext().TraceID().String())
+	return cResp, nil
 }
 
 // createCompletion is a helper function to create a single completion as part of a stream.

diff --git a/app/pkg/agent/const.go b/app/pkg/agent/const.go
@@ -0,0 +1,5 @@
+package agent
+
+const (
+	TraceIDHeader = "Foyle-Trace-ID" // response header in which the agent's GenerateCells returns the request's trace ID
+)
diff --git a/app/pkg/analyze/fsql/eval_query.sql b/app/pkg/analyze/fsql/eval_query.sql
@@ -0,0 +1,19 @@
+-- name: UpdateResult :exec
+INSERT OR REPLACE INTO results
+(id, time, proto_json)
+VALUES
+(?, ?, ?);
+
+-- name: GetResult :one
+SELECT * FROM results
+WHERE id = ?;
+
+
+-- name: ListResults :many
+-- This queries for results.
+-- Results are listed in descending order of time (most recent first) because the primary use is for resuming
+-- in the evaluator
+SELECT * FROM results
+WHERE (:cursor = '' OR time < :cursor)
+ORDER BY time DESC
+    LIMIT :page_size;
diff --git a/app/pkg/analyze/fsql/eval_query.sql.go b/app/pkg/analyze/fsql/eval_query.sql.go
diff --git a/app/pkg/analyze/fsql/models.go b/app/pkg/analyze/fsql/models.go
diff --git a/app/pkg/analyze/fsql/query.sql b/app/pkg/analyze/fsql/query.sql
@@ -20,4 +20,5 @@ WHERE contextID = ?;
 INSERT OR REPLACE INTO sessions 
 (contextID, startTime, endTime, selectedId, selectedKind, total_input_tokens, total_output_tokens, num_generate_traces, proto)
 VALUES 
-(?, ?, ?, ?, ?, ?, ?, ?, ?);
+(?, ?, ?, ?, ?, ?, ?, ?, ?);
+
diff --git a/app/pkg/analyze/fsql/schema.sql b/app/pkg/analyze/fsql/schema.sql
@@ -23,3 +23,14 @@ CREATE TABLE IF NOT EXISTS sessions (
     -- TODO(jeremy): Should we store the proto in JSON format so that we can run SQL queries on values in it?
     proto BLOB
 );
+
+-- Results contains evaluation results
+CREATE TABLE IF NOT EXISTS results (
+    id VARCHAR(255) PRIMARY KEY,
+    -- time is the time of the evaluation example
+    -- protobufs can't have null timestamps so no point allowing nulls
+    time TIMESTAMP NOT NULL,
+
+    -- The JSON serialization of the proto.
+    proto_json TEXT NOT NULL
+);
diff --git a/app/pkg/analyze/fsql/sqlc.yaml b/app/pkg/analyze/fsql/sqlc.yaml
@@ -1,7 +1,9 @@
 version: "2"
 sql:
   - engine: "sqlite"
-    queries: "query.sql"
+    queries:
+      - "eval_query.sql"
+      - "query.sql"
     schema: "schema.sql"
     gen:
       go:

diff --git a/app/pkg/analyze/session_manager.go b/app/pkg/analyze/session_manager.go
@@ -28,6 +28,13 @@ const (
 	SQLLiteDriver = "sqlite"
 )
 
+// GetDDL returns the DDL for the database.
+// This is a hack because the DDL statements for the sessions and eval results tables are in the same file and package.
+// The Evaluator needs to be able to get the DDL in order to create the eval results table. We should clean this up.
+func GetDDL() string {
+	return ddl
+}
+
 // SessionUpdater is a function that updates a session.
 type SessionUpdater func(session *logspb.Session) error
 
@@ -39,6 +46,7 @@ type SessionsManager struct {
 
 func NewSessionsManager(db *sql.DB) (*SessionsManager, error) {
 	// create tables
+	// TODO(jeremy): I think this creates the evalresults table as well because we don't separate the DDL statements.
 	if _, err := db.ExecContext(context.TODO(), ddl); err != nil {
 		return nil, err
 	}
@@ -283,6 +291,10 @@ func (m *SessionsManager) DumpExamples(ctx context.Context, request *connect.Req
 }
 
 // protoToRow converts from the proto representation of a session to the database row representation.
+//
+// TODO(jeremy): I think it would be better to make the return type fsql.UpdateSessionParams. Right now the only
+// place this function gets called is in the Update method and the returned value is immediately converted to
+// fsql.UpdateSessionParams.
 func protoToRow(session *logspb.Session) (*fsql.Session, error) {
 	log := logs.NewLogger()
 	protoBytes, err := proto.Marshal(session)
@@ -303,7 +315,6 @@ func protoToRow(session *logspb.Session) (*fsql.Session, error) {
 		}
 	}
 
-	// TODO: How do we deal with the end/starttime? In sqlc should we specify the type as timestamp?
 	return &fsql.Session{
 		Contextid:         session.ContextId,
 		Starttime:         session.StartTime.AsTime(),
@@ -376,6 +387,7 @@ func getExampleFromSession(s *logspb.Session) (*v1alpha1.EvalExample, error) {
 		Id:            s.ContextId,
 		ExpectedCells: expectedCells,
 		FullContext:   newContext,
+		Time:          s.GetStartTime(),
 	}
 
 	return example, nil