diff --git a/app/api/assert.go b/app/api/assert.go deleted file mode 100644 index 28c7006f..00000000 --- a/app/api/assert.go +++ /dev/null @@ -1,39 +0,0 @@ -package api - -import "k8s.io/apimachinery/pkg/runtime/schema" - -var ( - AssertJobGVK = schema.FromAPIVersionAndKind(Group+"/"+Version, "AssertJob") -) - -// AssertJob is a struct that represents an assert job. This is a job that runs level one evaluations. -type AssertJob struct { - Metadata Metadata `json:"metadata" yaml:"metadata"` - Spec AssertJobSpec `json:"spec" yaml:"spec"` -} - -type AssertJobSpec struct { - // Sources is a list of sources to get the data from - Sources []EvalSource `json:"sources" yaml:"sources"` - - // AgentAddress is the address of the agent to use to generate completions - AgentAddress string `json:"agentAddress" yaml:"agentAddress"` - - // DBDir is the directory for the pebble database that will store the results - DBDir string `json:"dbDir" yaml:"dbDir"` - - // SheetID is the ID of the Google Sheet to update with the results. - SheetID string `json:"sheetID" yaml:"sheetID"` - - // SheetName is the name of the sheet to update. - SheetName string `json:"sheetName" yaml:"sheetName"` -} - -type EvalSource struct { - MarkdownSource *MarkdownSource `json:"markdownSource,omitempty" yaml:"markdownSource,omitempty"` -} - -type MarkdownSource struct { - // Path to the markdown files to use as evaluation data. 
- Path string `json:"path" yaml:"path"` -} diff --git a/app/api/experiment.go b/app/api/experiment.go index 3a8595cb..d198f5fd 100644 --- a/app/api/experiment.go +++ b/app/api/experiment.go @@ -13,18 +13,13 @@ type Experiment struct { } type ExperimentSpec struct { - // EvalDir is the directory containing the evaluation the evaluation input - EvalDir string `json:"evalDir" yaml:"evalDir"` - - // DBDir is the directory for the pebble database that will store the results - DBDir string `json:"dbDir" yaml:"dbDir"` + // AgentAddress is the address of the agent to use to generate completions + AgentAddress string `json:"agentAddress" yaml:"agentAddress"` - // SheetID is the ID of the Google Sheet to update with the results. - SheetID string `json:"sheetID" yaml:"sheetID"` - - // SheetName is the name of the sheet to update. - SheetName string `json:"sheetName" yaml:"sheetName"` + // EvalDir is the directory containing the evaluation examples. + // These should be EvalExample protos. + EvalDir string `json:"evalDir" yaml:"evalDir"` - // Agent is the configuration for the agent - Agent *AgentConfig `json:"agent,omitempty" yaml:"agent,omitempty"` + // OutputDB is the path to the file to store the results in. 
+ OutputDB string `json:"outputDB" yaml:"outputDB"` } diff --git a/app/cmd/protos.go b/app/cmd/protos.go new file mode 100644 index 00000000..71a761f6 --- /dev/null +++ b/app/cmd/protos.go @@ -0,0 +1,67 @@ +package cmd + +import ( + "fmt" + "os" + "strings" + + "github.com/go-logr/zapr" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" + "github.com/pkg/errors" + "github.com/spf13/cobra" + "go.uber.org/zap" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" +) + +// NewProtoToJsonCmd creates a command for converting a proto to json +func NewProtoToJsonCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "prototojson ", + Short: "Dump the binary proto file to json", + Run: func(cmd *cobra.Command, args []string) { + err := func() error { + log := zapr.NewLogger(zap.L()) + if len(args) == 0 { + log.Info("prototojson takes at least one argument which should be the path of the proto to dump.") + } + + file := args[0] + + var message proto.Message + var typeName string + if strings.HasSuffix(file, ".evalexample.binpb") { + message = &v1alpha1.EvalExample{} + typeName = "EvalExample" + } + + if strings.HasSuffix(file, ".example.binpb") { + message = &v1alpha1.Example{} + typeName = "Example" + } + + if message == nil { + return errors.Errorf("The type of proto could not be determined from the path suffix for file: %s", file) + } + data, err := os.ReadFile(file) + if err != nil { + return errors.Wrapf(err, "Error reading file %s", file) + } + + if err := proto.Unmarshal(data, message); err != nil { + return errors.Wrapf(err, "Error unmarshalling proto of type %s from file %s", typeName, file) + } + + jsonP := protojson.Format(message) + fmt.Fprintf(os.Stdout, "%s\n", jsonP) + return nil + }() + if err != nil { + fmt.Printf("Error running convert;\n %+v\n", err) + os.Exit(1) + } + }, + } + + return cmd +} diff --git a/app/cmd/root.go b/app/cmd/root.go index 585719a5..754fad27 100644 --- a/app/cmd/root.go +++ b/app/cmd/root.go @@ 
-31,5 +31,6 @@ func NewRootCmd() *cobra.Command { rootCmd.AddCommand(NewConfigCmd()) rootCmd.AddCommand(NewLogsCmd()) rootCmd.AddCommand(NewApplyCmd()) + rootCmd.AddCommand(NewProtoToJsonCmd()) return rootCmd } diff --git a/app/go.mod b/app/go.mod index 9c94bb03..a5c4b849 100644 --- a/app/go.mod +++ b/app/go.mod @@ -37,7 +37,7 @@ require ( github.com/oklog/ulid/v2 v2.1.0 github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c github.com/pkg/errors v0.9.1 - github.com/sashabaranov/go-openai v1.29.0 + github.com/sashabaranov/go-openai v1.30.3 github.com/spf13/cobra v1.8.0 github.com/spf13/viper v1.18.2 github.com/stateful/runme/v3 v3.3.1-0.20240515132033-7fd1591498c6 diff --git a/app/go.sum b/app/go.sum index 62276dca..8208fbb8 100644 --- a/app/go.sum +++ b/app/go.sum @@ -548,6 +548,8 @@ github.com/sashabaranov/go-openai v1.20.4 h1:095xQ/fAtRa0+Rj21sezVJABgKfGPNbyx/s github.com/sashabaranov/go-openai v1.20.4/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= github.com/sashabaranov/go-openai v1.29.0 h1:eBH6LSjtX4md5ImDCX8hNhHQvaRf22zujiERoQpsvLo= github.com/sashabaranov/go-openai v1.29.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= +github.com/sashabaranov/go-openai v1.30.3 h1:TEdRP3otRXX2A7vLoU+kI5XpoSo7VUUlM/rEttUqgek= +github.com/sashabaranov/go-openai v1.30.3/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sethvargo/go-envconfig v0.9.0 h1:Q6FQ6hVEeTECULvkJZakq3dZMeBQ3JUpcKMfPQbKMDE= diff --git a/app/pkg/agent/agent.go b/app/pkg/agent/agent.go index 9dbe5325..a1d652e5 100644 --- a/app/pkg/agent/agent.go +++ b/app/pkg/agent/agent.go @@ -463,7 +463,10 @@ func (a *Agent) GenerateCells(ctx context.Context, req *connect.Request[v1alpha1 Cells: cells, } - return 
connect.NewResponse[v1alpha1.GenerateCellsResponse](resp), nil + // We need to attach the traceId to the response. + cResp := connect.NewResponse[v1alpha1.GenerateCellsResponse](resp) + cResp.Header().Set(TraceIDHeader, span.SpanContext().TraceID().String()) + return cResp, nil } // createCompletion is a helper function to create a single completion as part of a stream. diff --git a/app/pkg/agent/const.go b/app/pkg/agent/const.go new file mode 100644 index 00000000..62593e3e --- /dev/null +++ b/app/pkg/agent/const.go @@ -0,0 +1,5 @@ +package agent + +const ( + TraceIDHeader = "Foyle-Trace-ID" +) diff --git a/app/pkg/analyze/fsql/eval_query.sql b/app/pkg/analyze/fsql/eval_query.sql new file mode 100644 index 00000000..49200b3b --- /dev/null +++ b/app/pkg/analyze/fsql/eval_query.sql @@ -0,0 +1,19 @@ +-- name: UpdateResult :exec +INSERT OR REPLACE INTO results +(id, time, proto_json) +VALUES +(?, ?, ?); + +-- name: GetResult :one +SELECT * FROM results +WHERE id = ?; + + +-- name: ListResults :many +-- This queries for results. +-- Results are listed in descending order of time (most recent first) because the primary use is for resuming +-- in the evaluator +SELECT * FROM results +WHERE (:cursor = '' OR time < :cursor) +ORDER BY time DESC + LIMIT :page_size; \ No newline at end of file diff --git a/app/pkg/analyze/fsql/eval_query.sql.go b/app/pkg/analyze/fsql/eval_query.sql.go new file mode 100644 index 00000000..af3e7756 --- /dev/null +++ b/app/pkg/analyze/fsql/eval_query.sql.go @@ -0,0 +1,79 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.27.0 +// source: eval_query.sql + +package fsql + +import ( + "context" + "time" +) + +const getResult = `-- name: GetResult :one +SELECT id, time, proto_json FROM results +WHERE id = ? 
+` + +func (q *Queries) GetResult(ctx context.Context, id string) (Result, error) { + row := q.db.QueryRowContext(ctx, getResult, id) + var i Result + err := row.Scan(&i.ID, &i.Time, &i.ProtoJson) + return i, err +} + +const listResults = `-- name: ListResults :many +SELECT id, time, proto_json FROM results +WHERE (?1 = '' OR time < ?1) +ORDER BY time DESC + LIMIT ?2 +` + +type ListResultsParams struct { + Cursor interface{} + PageSize int64 +} + +// This queries for results. +// Results are listed in descending order of time (most recent first) because the primary use is for resuming +// in the evaluator +func (q *Queries) ListResults(ctx context.Context, arg ListResultsParams) ([]Result, error) { + rows, err := q.db.QueryContext(ctx, listResults, arg.Cursor, arg.PageSize) + if err != nil { + return nil, err + } + defer rows.Close() + var items []Result + for rows.Next() { + var i Result + if err := rows.Scan(&i.ID, &i.Time, &i.ProtoJson); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const updateResult = `-- name: UpdateResult :exec +INSERT OR REPLACE INTO results +(id, time, proto_json) +VALUES +(?, ?, ?) 
+` + +type UpdateResultParams struct { + ID string + Time time.Time + ProtoJson string +} + +func (q *Queries) UpdateResult(ctx context.Context, arg UpdateResultParams) error { + _, err := q.db.ExecContext(ctx, updateResult, arg.ID, arg.Time, arg.ProtoJson) + return err +} diff --git a/app/pkg/analyze/fsql/models.go b/app/pkg/analyze/fsql/models.go index e9290994..acf66881 100644 --- a/app/pkg/analyze/fsql/models.go +++ b/app/pkg/analyze/fsql/models.go @@ -8,6 +8,12 @@ import ( "time" ) +type Result struct { + ID string + Time time.Time + ProtoJson string +} + type Session struct { Contextid string Starttime time.Time diff --git a/app/pkg/analyze/fsql/query.sql b/app/pkg/analyze/fsql/query.sql index 36804361..98bc70bd 100644 --- a/app/pkg/analyze/fsql/query.sql +++ b/app/pkg/analyze/fsql/query.sql @@ -20,4 +20,5 @@ WHERE contextID = ?; INSERT OR REPLACE INTO sessions (contextID, startTime, endTime, selectedId, selectedKind, total_input_tokens, total_output_tokens, num_generate_traces, proto) VALUES -(?, ?, ?, ?, ?, ?, ?, ?, ?); \ No newline at end of file +(?, ?, ?, ?, ?, ?, ?, ?, ?); + diff --git a/app/pkg/analyze/fsql/schema.sql b/app/pkg/analyze/fsql/schema.sql index f23de11a..38d3106c 100644 --- a/app/pkg/analyze/fsql/schema.sql +++ b/app/pkg/analyze/fsql/schema.sql @@ -23,3 +23,14 @@ CREATE TABLE IF NOT EXISTS sessions ( -- TODO(jeremy): Should we store the proto in JSON format so that we can run SQL queries on values in it? proto BLOB ); + +-- Results contains evaluation results +CREATE TABLE IF NOT EXISTS results ( + id VARCHAR(255) PRIMARY KEY, + -- time is the time of the evaluation example + -- protobufs can't have null timestamps so no point allowing nulls + time TIMESTAMP NOT NULL, + + -- The JSON serialization of the proto. 
+ proto_json TEXT NOT NULL +); \ No newline at end of file diff --git a/app/pkg/analyze/fsql/sqlc.yaml b/app/pkg/analyze/fsql/sqlc.yaml index 67caeb53..311cce97 100644 --- a/app/pkg/analyze/fsql/sqlc.yaml +++ b/app/pkg/analyze/fsql/sqlc.yaml @@ -1,7 +1,9 @@ version: "2" sql: - engine: "sqlite" - queries: "query.sql" + queries: + - "eval_query.sql" + - "query.sql" schema: "schema.sql" gen: go: diff --git a/app/pkg/analyze/session_manager.go b/app/pkg/analyze/session_manager.go index 41d9e577..ce4aa0b5 100644 --- a/app/pkg/analyze/session_manager.go +++ b/app/pkg/analyze/session_manager.go @@ -28,6 +28,13 @@ const ( SQLLiteDriver = "sqlite" ) +// GetDDL return the DDL for the database. +// This is a hack because the DDL statements for the sessions and eval results tables are in the same file and package. +// The Evaluator needs to be able to get the DDL in order to create the eval results table. We should clean this up +func GetDDL() string { + return ddl +} + // SessionUpdater is a function that updates a session. type SessionUpdater func(session *logspb.Session) error @@ -39,6 +46,7 @@ type SessionsManager struct { func NewSessionsManager(db *sql.DB) (*SessionsManager, error) { // create tables + // TODO(jeremy): I think this creates the evalresults table as well because we don't separate the DDL statements. if _, err := db.ExecContext(context.TODO(), ddl); err != nil { return nil, err } @@ -283,6 +291,10 @@ func (m *SessionsManager) DumpExamples(ctx context.Context, request *connect.Req } // protoToRow converts from the proto representation of a session to the database row representation. +// +// TODO(jeremy): I think it would be better to make the return type fsql.UpdateSessionParams. Right now the only +// place this function gets called is in the Update method and the returned value is immediately converted to +// fsql.UpdateSessionParams. 
func protoToRow(session *logspb.Session) (*fsql.Session, error) { log := logs.NewLogger() protoBytes, err := proto.Marshal(session) @@ -303,7 +315,6 @@ func protoToRow(session *logspb.Session) (*fsql.Session, error) { } } - // TODO: How do we deal with the end/starttime? In sqlc should we specify the type as timestamp? return &fsql.Session{ Contextid: session.ContextId, Starttime: session.StartTime.AsTime(), @@ -376,6 +387,7 @@ func getExampleFromSession(s *logspb.Session) (*v1alpha1.EvalExample, error) { Id: s.ContextId, ExpectedCells: expectedCells, FullContext: newContext, + Time: s.GetStartTime(), } return example, nil diff --git a/app/pkg/analyze/session_manager_test.go b/app/pkg/analyze/session_manager_test.go index f738442a..ab8b1a13 100644 --- a/app/pkg/analyze/session_manager_test.go +++ b/app/pkg/analyze/session_manager_test.go @@ -75,6 +75,7 @@ func Test_SessionsCRUD(t *testing.T) { var ( session1 = &logspb.Session{ ContextId: "1", + StartTime: timeMustParse(time.RFC3339, "2021-01-01T00:01:00Z"), FullContext: &v1alpha1.FullContext{ Notebook: &parserv1.Notebook{ Cells: []*parserv1.Cell{ @@ -124,7 +125,8 @@ func Test_getExampleFromSession(t *testing.T) { name: "Basic", session: session1, expected: &v1alpha1.EvalExample{ - Id: "1", + Id: "1", + Time: timeMustParse(time.RFC3339, "2021-01-01T00:01:00Z"), FullContext: &v1alpha1.FullContext{ Notebook: &parserv1.Notebook{ Cells: []*parserv1.Cell{ @@ -153,7 +155,7 @@ func Test_getExampleFromSession(t *testing.T) { t.Fatalf("Error getting example from session: %v", err) } - comparer := cmpopts.IgnoreUnexported(v1alpha1.EvalExample{}, v1alpha1.FullContext{}, parserv1.Notebook{}, parserv1.Cell{}) + comparer := cmpopts.IgnoreUnexported(v1alpha1.EvalExample{}, v1alpha1.FullContext{}, parserv1.Notebook{}, parserv1.Cell{}, timestamppb.Timestamp{}) if d := cmp.Diff(actual, c.expected, comparer); d != "" { t.Fatalf("Unexpected diff between expected and actual example:\n%v", d) } diff --git a/app/pkg/application/app.go 
b/app/pkg/application/app.go index 0abcea46..6dbeb986 100644 --- a/app/pkg/application/app.go +++ b/app/pkg/application/app.go @@ -271,13 +271,6 @@ func (a *App) SetupRegistry() error { return err } - assertor, err := eval.NewAssertRunner(*a.Config) - if err != nil { - return err - } - if err := a.Registry.Register(api.AssertJobGVK, assertor); err != nil { - return err - } return nil } diff --git a/app/pkg/eval/assertor.go b/app/pkg/eval/assertor.go deleted file mode 100644 index 548b15f6..00000000 --- a/app/pkg/eval/assertor.go +++ /dev/null @@ -1,249 +0,0 @@ -package eval - -import ( - "context" - "crypto/tls" - "net" - "net/http" - "os" - - "github.com/jlewi/foyle/app/pkg/dbutil" - "github.com/jlewi/foyle/app/pkg/docs" - - "github.com/cockroachdb/pebble" - "github.com/jlewi/foyle/app/api" - "github.com/jlewi/foyle/app/pkg/config" - "github.com/jlewi/foyle/app/pkg/logs" - "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" - "github.com/jlewi/foyle/protos/go/foyle/v1alpha1/v1alpha1connect" - "github.com/jlewi/monogo/helpers" - "github.com/pkg/errors" - "golang.org/x/net/http2" - "google.golang.org/protobuf/proto" - "sigs.k8s.io/kustomize/kyaml/yaml" -) - -// AssertRunner runs assertions in batch mode -type AssertRunner struct { - config config.Config - - assertions []Assertion -} - -func NewAssertRunner(config config.Config) (*AssertRunner, error) { - runner := &AssertRunner{config: config} - - // Load the assertions - runner.assertions = make([]Assertion, 0, 10) - runner.assertions = append(runner.assertions, &AssertCodeAfterMarkdown{}) - runner.assertions = append(runner.assertions, &AssertOneCodeCell{}) - runner.assertions = append(runner.assertions, &AssertEndsWithCodeCell{}) - return runner, nil -} - -func newHTTPClient() *http.Client { - // N.B. 
We need to use HTTP2 if we want to support bidirectional streaming - //http.DefaultClient, - return &http.Client{ - Transport: &http2.Transport{ - AllowHTTP: true, - DialTLSContext: func(ctx context.Context, network, addr string, _ *tls.Config) (net.Conn, error) { - // Use the standard Dial function to create a plain TCP connection - return net.Dial(network, addr) - }, - }, - } -} -func newGenerateClient(baseURL string) v1alpha1connect.GenerateServiceClient { - // Create a new client - client := v1alpha1connect.NewGenerateServiceClient( - newHTTPClient(), - baseURL, - ) - return client -} - -func (r *AssertRunner) ReconcileNode(ctx context.Context, node *yaml.RNode) error { - job := &api.AssertJob{} - if err := node.YNode().Decode(job); err != nil { - return errors.Wrapf(err, "Failed to decode AssertJob") - } - - return r.Reconcile(ctx, *job) -} - -func (r *AssertRunner) Reconcile(ctx context.Context, job api.AssertJob) error { - log := logs.FromContext(ctx).WithValues("job", job.Metadata.Name) - log.Info("Opening database", "database", job.Spec.DBDir) - db, err := pebble.Open(job.Spec.DBDir, &pebble.Options{}) - if err != nil { - return err - } - defer helpers.DeferIgnoreError(db.Close) - - if job.Spec.AgentAddress == "" { - return errors.New("AgentAddress is required") - } - - if len(job.Spec.Sources) == 0 { - return errors.New("Sources must be specified") - } - - client := newGenerateClient(job.Spec.AgentAddress) - - // Process all the sources - for _, source := range job.Spec.Sources { - if source.MarkdownSource == nil { - return errors.New("Only MarkdownSource is supported") - } - files, err := listEvalFiles(ctx, source.MarkdownSource.Path) - if err != nil { - return err - } - - log.Info("Found eval files", "numFiles", len(files)) - - // Now iterate over the DB and figure out which files haven't been loaded into the db. 
- - unloadedFiles, err := findUnloadedFiles(ctx, db, files) - if err != nil { - return err - } - log.Info("Found unloaded files", "numFiles", len(unloadedFiles)) - - // We need to load the evaluation data into the database. - if err := loadMarkdownFiles(ctx, db, unloadedFiles); err != nil { - return err - } - } - - // Now generate predictions for any results that are missing them. - if err := reconcilePredictions(ctx, db, client); err != nil { - return err - } - - if err := reconcileAssertions(ctx, r.assertions, db); err != nil { - return err - } - return nil -} - -// reconcileAssertions reconciles the assertions with the results -func reconcileAssertions(ctx context.Context, assertions []Assertion, db *pebble.DB) error { - olog := logs.FromContext(ctx) - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return err - } - defer iter.Close() - - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { - break - } - - log := olog.WithValues("id", string(key)) - value, err := iter.ValueAndErr() - if err != nil { - return errors.Wrapf(err, "Failed to read value for key %s", string(key)) - } - - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) - } - - actual := make(map[string]bool) - for _, a := range result.GetAssertions() { - actual[a.GetName()] = true - } - - if result.Assertions == nil { - result.Assertions = make([]*v1alpha1.Assertion, 0, len(assertions)) - } - - for _, a := range assertions { - if _, ok := actual[a.Name()]; ok { - continue - } - - // Run the assertion - newA, err := a.Assert(ctx, result.Example.Query, nil, result.Actual) - - if err != nil { - log.Error(err, "Failed to run assertion", "name", a.Name()) - } - - result.Assertions = append(result.Assertions, newA) - } - - if err := updateResult(ctx, string(key), result, db); err != nil { - return err - } - } - return nil -} - -// 
loadMarkdownFiles loads a bunch of markdown files into example protos. -// Unlike loadMarkdownAnswerFiles this function doesn't load any answers. -func loadMarkdownFiles(ctx context.Context, db *pebble.DB, files []string) error { - oLog := logs.FromContext(ctx) - - allErrors := &helpers.ListOfErrors{} - for _, path := range files { - log := oLog.WithValues("path", path) - log.Info("Processing file") - - contents, err := os.ReadFile(path) - if err != nil { - log.Error(err, "Failed to read file") - allErrors.AddCause(err) - // Keep going - continue - } - - doc := &v1alpha1.Doc{} - - blocks, err := docs.MarkdownToBlocks(string(contents)) - if err != nil { - log.Error(err, "Failed to convert markdown to blocks") - allErrors.AddCause(err) - // Keep going - continue - } - - doc.Blocks = blocks - - if len(doc.GetBlocks()) < 2 { - log.Info("Skipping doc; too few blocks; at least two are required") - continue - } - - // We generate a stable ID for the example by hashing the contents of the document. - example := &v1alpha1.Example{ - Query: doc, - } - example.Id = HashExample(example) - - result := &v1alpha1.EvalResult{ - Example: example, - ExampleFile: path, - // initialize distance to a negative value so we can tell when it hasn't been computed - Distance: uninitializedDistance, - } - - if err := dbutil.SetProto(db, example.GetId(), result); err != nil { - log.Error(err, "Failed to write result to DB") - allErrors.AddCause(err) - // Keep going - continue - } - } - - if len(allErrors.Causes) > 0 { - return allErrors - } - - return nil -} diff --git a/app/pkg/eval/clients.go b/app/pkg/eval/clients.go new file mode 100644 index 00000000..54d498f2 --- /dev/null +++ b/app/pkg/eval/clients.go @@ -0,0 +1,24 @@ +package eval + +import ( + "context" + "crypto/tls" + "net" + "net/http" + + "golang.org/x/net/http2" +) + +func newHTTPClient() *http.Client { + // N.B. 
We need to use HTTP2 if we want to support bidirectional streaming + //http.DefaultClient, + return &http.Client{ + Transport: &http2.Transport{ + AllowHTTP: true, + DialTLSContext: func(ctx context.Context, network, addr string, _ *tls.Config) (net.Conn, error) { + // Use the standard Dial function to create a plain TCP connection + return net.Dial(network, addr) + }, + }, + } +} diff --git a/app/pkg/eval/distance.go b/app/pkg/eval/distance.go deleted file mode 100644 index a863e020..00000000 --- a/app/pkg/eval/distance.go +++ /dev/null @@ -1,178 +0,0 @@ -package eval - -import ( - "errors" - "math" - "strings" - - "github.com/agnivade/levenshtein" - "github.com/jlewi/foyle/app/pkg/executor" -) - -const ( - alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -) - -type command struct { - unnamed []string - named map[string]string -} - -type DistanceResult struct { - Distance int - Max int - Normalized float32 -} - -// Distance computes the distance between two instructions -// -// For details refer to tn003_learning_eval.md. -func Distance(left executor.Instruction, right executor.Instruction) (DistanceResult, error) { - // Split each instruction into named and unnamed arguments - leftArgs := splitInstruction(left) - rightArgs := splitInstruction(right) - - result := DistanceResult{ - Distance: uninitializedDistance, - Max: uninitializedDistance, - Normalized: uninitializedDistance, - } - - // Compute the distance of the unnamed arguments - unamedDistance, err := editDistance(leftArgs.unnamed, rightArgs.unnamed) - if err != nil { - return result, err - } - - // Compute the distance of the named arguments - namedDistance := dictDistance(leftArgs.named, rightArgs.named) - - totalDistance := unamedDistance + namedDistance - - result.Distance = totalDistance - - // Compute the max distance. 
- // For the unnamed arguments the maximum distance is the length of which ever command is longer - // For the named arguments the maximum distance is the number of unique keys in the dictionaries. - max := int(math.Max(float64(len(leftArgs.unnamed)), float64(len(rightArgs.unnamed)))) - - // Need to count the number of unique keys in the dictionaries. - unique := map[string]string{} - - for k := range leftArgs.named { - unique[k] = "" - } - for k := range rightArgs.named { - unique[k] = "" - } - - max += len(unique) - normalizedDistance := float32(totalDistance) / float32(max) - - result.Max = max - result.Normalized = normalizedDistance - return result, nil -} - -// editDistance computes the edit distance between two slices of strings. -// Each string in the slice is considered a single token. -func editDistance(left []string, right []string) (int, error) { - // Our levenstein distance function operates on strings. - // So we need to map our tokens to single character strings. - // We do this by building up a dictionary mapping the tokens to single character strings. - // We currently use a fixed alphabet of 62 characters. We should be able to easily extend this to 100s or thousands - // of characters because our levenstein library works with UTF8. Just using 1 byte we should be able to represent - // 128 characters. I wanted to keep it to printable characters. Seems unlikely we will have commands - // of more than 62 tokens. - - t := tokenizer{ - index: 0, - dict: map[string]string{}, - } - leftVal, err := t.tokenize(left) - if err != nil { - return 0, err - } - - rightVal, err := t.tokenize(right) - if err != nil { - return 0, err - } - // I picked this particular library because - // Its code was readable and pretty compact. - // https://github.com/ka-weihe/fast-levenshtein claims to be 15 times faster but its code is unreadable because - // its so heavily optimized. Its also not thread safe although it would be trivial to make it so. 
- return levenshtein.ComputeDistance(leftVal, rightVal), nil -} - -type tokenizer struct { - index int - dict map[string]string -} - -func (t *tokenizer) tokenize(vals []string) (string, error) { - result := "" - for _, l := range vals { - if _, ok := t.dict[l]; !ok { - t.dict[l] = string(alphabet[t.index]) - t.index++ - if t.index >= len(alphabet) { - return "", errors.New("Too many tokens") - } - } - result += t.dict[l] - } - return result, nil -} - -// dictDistance computes the distance between two dictionaries. -func dictDistance(left map[string]string, right map[string]string) int { - // Each key in one dictionary but not the other contributes 1 to the distance. - distance := 0 - distance += countKeysNotInRight(left, right) - distance += countKeysNotInRight(right, left) - - // Now we need to check the values of the keys that are in both dictionaries. - // If the values don't match then we need to add 1 to the distance. - for k := range left { - if _, ok := right[k]; !ok { - continue - } - - if left[k] != right[k] { - distance += 1 - } - } - - return distance -} - -func countKeysNotInRight(left map[string]string, right map[string]string) int { - d := 0 - for k := range left { - if _, ok := right[k]; !ok { - d += 1 - } - } - return d -} - -func splitInstruction(instruction executor.Instruction) command { - c := command{ - unnamed: []string{instruction.Command.Name}, - named: map[string]string{}, - } - - for _, arg := range instruction.Command.Args { - if strings.HasPrefix(arg, "--") { - pieces := strings.Split(arg, "=") - if len(pieces) >= 2 { - c.named[pieces[0]] = strings.Join(pieces[1:], "=") - continue - } - } - c.unnamed = append(c.unnamed, arg) - } - - return c -} diff --git a/app/pkg/eval/distance_test.go b/app/pkg/eval/distance_test.go deleted file mode 100644 index fc571f30..00000000 --- a/app/pkg/eval/distance_test.go +++ /dev/null @@ -1,232 +0,0 @@ -package eval - -import ( - "testing" - - "github.com/go-cmd/cmd" - "github.com/google/go-cmp/cmp" - 
"github.com/jlewi/foyle/app/pkg/executor" -) - -func Test_Distance(t *testing.T) { - type testCase struct { - name string - left []string - right []string - expected DistanceResult - } - - cases := []testCase{ - { - name: "equal", - left: []string{"gcloud", "-p", "acme", "--foo=bar", "baz"}, - right: []string{"gcloud", "-p", "acme", "--foo=bar", "baz"}, - expected: DistanceResult{ - Distance: 0, - Max: 5, - Normalized: 0, - }, - }, - { - name: "notequal", - left: []string{"gcloud", "-p", "acme", "--foo=bar", "baz"}, - right: []string{"gcloud", "acme", "--foo=lab", "baz"}, - expected: DistanceResult{ - Distance: 2, - Max: 5, - Normalized: 0.4, - }, - }, - { - name: "maxdist", - left: []string{"gcloud", "logging", "read", "logName=\"projects/foyle-dev/logs/hydros\" jsonPayload.image=\"carabou\"", "--freshness=1d", "--project=foyle-dev"}, - right: []string{"docker", "build", "--progress=plain", "-t", "carabou", "."}, - expected: DistanceResult{ - Distance: 8, - // longest unnamed has 5 arguments and then there are 3 unique named arguments - Max: 8, - Normalized: 1.0, - }, - }, - { - name: "empty-array", - left: []string{""}, - right: []string{"gcloud", "-p", "acme", "baz"}, - expected: DistanceResult{ - Distance: 4, - Max: 4, - Normalized: 1.0, - }, - }, - } - - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - left := executor.Instruction{ - Command: cmd.NewCmd(c.left[0], c.left[1:]...), - } - right := executor.Instruction{ - Command: cmd.NewCmd(c.right[0], c.right[1:]...), - } - actual, err := Distance(left, right) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - if actual.Distance != c.expected.Distance { - t.Errorf("Expected %d but got %d", c.expected.Distance, actual.Distance) - } - if actual.Max != c.expected.Max { - t.Errorf("Expected %d but got %d", c.expected.Max, actual.Max) - } - if actual.Normalized != c.expected.Normalized { - t.Errorf("Expected normalized %f but got %f", c.expected.Normalized, actual.Normalized) - } - }) - } 
-} - -func Test_SplitInstruction(t *testing.T) { - type testCase struct { - name string - args []string - expected command - } - - cases := []testCase{ - { - name: "simple", - args: []string{"gcloud", "-p", "acme", "--foo=bar", "baz"}, - expected: command{ - unnamed: []string{"gcloud", "-p", "acme", "baz"}, - named: map[string]string{ - "--foo": "bar", - }, - }, - }, - { - name: "equals-in-value", - args: []string{"foyle", "config", "--foo=bar=baz"}, - expected: command{ - unnamed: []string{"foyle", "config"}, - named: map[string]string{ - "--foo": "bar=baz", - }, - }, - }, - } - - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - instruction := executor.Instruction{ - Command: cmd.NewCmd(c.args[0], c.args[1:]...), - } - actual := splitInstruction(instruction) - if d := cmp.Diff(c.expected, actual, cmp.AllowUnexported(command{})); d != "" { - t.Errorf("Unexpected result (-want +got):\n%s", d) - } - }) - } -} - -func Test_editDistance(t *testing.T) { - type testCase struct { - name string - left []string - right []string - expected int - } - - cases := []testCase{ - { - name: "simple", - left: []string{"gcloud", "-p", "acme", "baz"}, - right: []string{"gcloud", "-p", "acme", "baz"}, - expected: 0, - }, - { - name: "simple", - left: []string{"-p", "acme", "baz", "extra"}, - right: []string{"gcloud", "-p", "acme", "baz"}, - expected: 2, - }, - { - name: "substitution", - left: []string{"acme", "foo", "baz"}, - right: []string{"acme", "bar", "baz"}, - expected: 1, - }, - } - - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - actual, err := editDistance(c.left, c.right) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - if actual != c.expected { - t.Errorf("Expected %d but got %d", c.expected, actual) - } - }) - } -} - -func Test_dictDistance(t *testing.T) { - type testCase struct { - name string - left map[string]string - right map[string]string - expected int - } - - cases := []testCase{ - { - name: "equal", - left: 
map[string]string{ - "a": "1", - "b": "2", - }, - right: map[string]string{ - "a": "1", - "b": "2", - }, - expected: 0, - }, - { - name: "extra", - left: map[string]string{ - "a": "1", - "b": "2", - "e": "3", - }, - right: map[string]string{ - "a": "1", - "b": "2", - "f": "4", - }, - expected: 2, - }, - { - name: "diff", - left: map[string]string{ - "a": "1", - "b": "2", - "c": "3", - }, - right: map[string]string{ - "a": "1", - "b": "2", - "c": "4", - }, - expected: 1, - }, - } - - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - actual := dictDistance(c.left, c.right) - if actual != c.expected { - t.Errorf("Expected %d but got %d", c.expected, actual) - } - }) - } -} diff --git a/app/pkg/eval/evaluator.go b/app/pkg/eval/evaluator.go index 2317c635..45361781 100644 --- a/app/pkg/eval/evaluator.go +++ b/app/pkg/eval/evaluator.go @@ -4,42 +4,38 @@ import ( "context" "os" "path/filepath" + "sort" + "time" + "connectrpc.com/connect" + "github.com/jlewi/foyle/app/pkg/agent" + "github.com/jlewi/foyle/app/pkg/oai" + "github.com/jlewi/foyle/app/pkg/runme/converters" + "github.com/jlewi/foyle/app/pkg/runme/ulid" logspb "github.com/jlewi/foyle/protos/go/foyle/logs" - - "github.com/go-cmd/cmd" - - "github.com/jlewi/foyle/app/pkg/dbutil" + "github.com/jlewi/foyle/protos/go/foyle/logs/logspbconnect" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1/v1alpha1connect" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" "github.com/jlewi/foyle/app/api" - "github.com/jlewi/foyle/app/pkg/agent" - "github.com/jlewi/foyle/app/pkg/oai" "sigs.k8s.io/kustomize/kyaml/yaml" - "github.com/cockroachdb/pebble" "github.com/jlewi/foyle/app/pkg/config" - "github.com/jlewi/foyle/app/pkg/docs" "github.com/jlewi/foyle/app/pkg/executor" "github.com/jlewi/foyle/app/pkg/logs" "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" - "github.com/jlewi/monogo/helpers" "github.com/pkg/errors" - "google.golang.org/api/googleapi" - "google.golang.org/api/impersonate" - 
"google.golang.org/api/option" - "google.golang.org/api/sheets/v4" "google.golang.org/protobuf/proto" ) -const ( - uninitializedDistance = -1 -) - type Evaluator struct { config config.Config parser *executor.BashishParser } +// N.B. One issue with noise in the simulation is that the speed of log processing affects whether example +// has been learned from by the next time it is processed. + // NewEvaluator creates a new Evaluator // The evaluator assumes that the analyzer is already running in the background and processing logs. // TODO(https://github.com/jlewi/foyle/issues/140): The evaluator may need to be updated now that we continuously @@ -71,22 +67,33 @@ func (e *Evaluator) ReconcileNode(ctx context.Context, node *yaml.RNode) error { func (e *Evaluator) Reconcile(ctx context.Context, experiment api.Experiment) error { log := logs.FromContext(ctx).WithValues("experiment", experiment.Metadata.Name) - log.Info("Opening database", "database", experiment.Spec.DBDir) - db, err := pebble.Open(experiment.Spec.DBDir, &pebble.Options{}) - if err != nil { - return err + + if experiment.Spec.AgentAddress == "" { + return errors.New("AgentAddress is required") } - defer helpers.DeferIgnoreError(db.Close) - if experiment.Spec.Agent == nil { - return errors.New("Agent is required") + if experiment.Spec.OutputDB == "" { + return errors.New("OutputDB is required") } - agent, err := e.setupAgent(ctx, *experiment.Spec.Agent) + + if experiment.Spec.EvalDir == "" { + return errors.New("EvalDir is required") + } + + aiClient := newAIServiceClient(experiment.Spec.AgentAddress) + + logsClient := logspbconnect.NewLogsServiceClient( + newHTTPClient(), + experiment.Spec.AgentAddress, + ) + + manager, err := openResultsManager(experiment.Spec.OutputDB) if err != nil { - return err + return errors.Wrapf(err, "Failed to open results manager from file %s", experiment.Spec.OutputDB) } - // List all the files + // Find all the binary protobuf files in the eval directory. 
+ // This should contain EvalExample protos. files, err := listEvalFiles(ctx, experiment.Spec.EvalDir) if err != nil { return err @@ -94,497 +101,449 @@ func (e *Evaluator) Reconcile(ctx context.Context, experiment api.Experiment) er log.Info("Found eval files", "numFiles", len(files)) - // Now iterate over the DB and figure out which files haven't been loaded into the db. - - unloadedFiles, err := findUnloadedFiles(ctx, db, files) + // Now we need to get the id of the last processed example so we can skip over examples that have already been + // processed + lastProcessedTime, err := getLastProcessedTime(ctx, manager) if err != nil { - return err + return errors.Wrapf(err, "Failed to get last processed time") } - log.Info("Found unloaded files", "numFiles", len(unloadedFiles)) - // We need to load the evaluation data into the database. - if err := loadMarkdownAnswerFiles(ctx, db, unloadedFiles); err != nil { - return err - } + // N.B. Since we set lastProcessedTime to the time of the last processed result we won't reprocess that result. + // We might want to rethink that if we want to be able to reprocess an example that failed on an error and + // we want to retry that - // Now generate predictions for any results that are missing them. - if err := e.reconcilePredictions(ctx, db, agent); err != nil { - return err - } + log.Info("Processing eval examples", "lastProcessedTime", lastProcessedTime) - // TODO(jeremy): We should get the traces via API because only one process can access the pebble DB at a time. - // And the agent needs access to the pebble DB traces. 
- tracesDB, err := pebble.Open(e.config.GetTracesDBDir(), &pebble.Options{}) - if err != nil { - return err - } - defer helpers.DeferIgnoreError(tracesDB.Close) + // Loop over the eval examples and load them + examples := make([]*v1alpha1.EvalExample, 0, len(files)) + for _, exampleFile := range files { + b, err := os.ReadFile(exampleFile) + if err != nil { + // TODO(jeremy): We should probably store the error in the DB. + log.Error(err, "Failed to read file", "file", exampleFile) + continue + } - if err := e.reconcileBestRAGResult(ctx, db, tracesDB); err != nil { - return err + example := &v1alpha1.EvalExample{} + if err := proto.Unmarshal(b, example); err != nil { + log.Error(err, "Failed to unmarshal example", "file", exampleFile) + continue + } + examples = append(examples, example) } - // Compute the distance - if err := e.reconcileDistance(ctx, db); err != nil { - return err - } + // Now sort the examples in time order so we can process them in the same order they actually occurred + sortEvalExamplesInTime(examples) - // Update the Google Sheet - if err := e.updateGoogleSheet(ctx, experiment, db); err != nil { + // Now generate predictions for any results that are missing them. + if err := e.processExamples(ctx, examples, lastProcessedTime, aiClient, logsClient, manager); err != nil { return err } + return nil } -func (e *Evaluator) setupAgent(ctx context.Context, agentConfig api.AgentConfig) (*agent.Agent, error) { - cfg := e.config.DeepCopy() - - // Swap out the AgentConfig - cfg.Agent = &agentConfig - - // Ensure we are in evaluation mode. 
- cfg.Agent.EvalMode = true - - client, err := oai.NewClient(cfg) - if err != nil { - return nil, err - } +func (e *Evaluator) processExamples(ctx context.Context, examples []*v1alpha1.EvalExample, lastProcessedTime time.Time, client v1alpha1connect.AIServiceClient, logsClient logspbconnect.LogsServiceClient, manager *ResultsManager) error { + oLog := logs.FromContext(ctx) - // TODO(jeremy): This will need to be updated when we support other configurations. - completer, err := oai.NewCompleter(cfg, client) + oaiClient, err := oai.NewClient(e.config) if err != nil { - return nil, err + return errors.Wrapf(err, "Failed to create OpenAI client") } - log := logs.FromContext(ctx) - log.Info("Creating agen without inMemoryExampleDB", "config", cfg.Agent) - if cfg.Agent.RAG != nil && cfg.Agent.RAG.Enabled { - return nil, errors.New("RAG is enabled but eval code needs to be updated to ddeal with streaming logs") - } - - // TODO(jeremy): How should we construct inMemoryExampleDB? In the eval case? - agent, err := agent.NewAgent(cfg, completer, nil) + judge, err := NewJudge(oaiClient) if err != nil { - return nil, err + return errors.Wrapf(err, "Failed to create Judge") } - return agent, nil -} -// TODO(jeremy): We should use reconcilePredictions which uses the client to generate the predictions. -func (e *Evaluator) reconcilePredictions(ctx context.Context, db *pebble.DB, agent *agent.Agent) error { - olog := logs.FromContext(ctx) - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return err - } - defer iter.Close() + // Now iterate over the examples and process them. 
+ for _, example := range examples { + log := oLog.WithValues("exampleId", example.GetId()) - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { - break - } - - log := olog.WithValues("id", string(key)) - value, err := iter.ValueAndErr() - if err != nil { - return errors.Wrapf(err, "Failed to read value for key %s", string(key)) - } - - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) - } - - if len(result.GetActual()) > 0 { - log.Info("Skipping; already have answer", "path", result.ExampleFile) - // We have the answer so we don't need to generate it. + // TODO(jeremy): Should we just read the row from the database and check if it exists and has been completed? + // Finding the lastProcessed time and then using that seems like maybe its premature optimization? But maybe + // since I wrote it might as well keep it. + exampleTime := example.GetTime().AsTime() + if exampleTime.Before(lastProcessedTime) || exampleTime == lastProcessedTime { + log.V(logs.Debug).Info("Skipping example; already processed") continue } - if len(result.Actual) == 0 { - // Initialize a trace - resp, err := func() (*v1alpha1.GenerateResponse, error) { - newCtx, span := tracer().Start(ctx, "(*Evaluator).reconcilePredictions") - defer span.End() - - // We need to generate the answer. - return agent.Generate(newCtx, &v1alpha1.GenerateRequest{ - Doc: result.Example.Query, - }) - }() - if err != nil { - result.Error = err.Error() - result.Status = v1alpha1.EvalResultStatus_ERROR - continue - } + var processErr error - result.Actual = resp.GetBlocks() - result.GenTraceId = resp.GetTraceId() + uErr := manager.Update(ctx, example.GetId(), func(result *v1alpha1.EvalResult) error { + processErr = e.processResult(ctx, result, example, client, logsClient, judge) + // We need to return for the transaction to be committed. 
+ return nil + }) - log.Info("Writing result to DB") - if err := updateResult(ctx, string(key), result, db); err != nil { - return errors.Wrapf(err, "Failed to write result to DB") - } + if processErr != nil { + log.Error(processErr, "Failed to process example") + // For now we abort on error to see what's going on. + return processErr } - } - return nil -} - -func updateResult(ctx context.Context, id string, result *v1alpha1.EvalResult, db *pebble.DB) error { - b, err := proto.Marshal(result) - if err != nil { - return errors.Wrapf(err, "Failed to marshal result") - } - if err := db.Set([]byte(id), b, nil); err != nil { - return errors.Wrapf(err, "Failed to write result to DB") - } - return nil -} -func (e *Evaluator) reconcileDistance(ctx context.Context, db *pebble.DB) error { - olog := logs.FromContext(ctx) - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return err - } - defer iter.Close() - - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { - break + if uErr != nil { + log.Error(uErr, "Failed to update result") + // For now we abort on error to see what's going on. + return uErr } - log := olog.WithValues("id", string(key)) - value, err := iter.ValueAndErr() + result, err := manager.Get(ctx, example.GetId()) if err != nil { - return errors.Wrapf(err, "Failed to read value for key %s", string(key)) - } - - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) + return errors.Wrapf(err, "Failed to get latest result for example %s", example.GetId()) } - if result.Distance >= 0 && result.Status != v1alpha1.EvalResultStatus_UNKNOWN_EVAL_RESULT_STATUS { - log.Info("Skipping; distance already computed") + if result.Error != "" { + // Generating a completion failed for this example so we should keep going. + // There won't be a blocklog to wait for. 
continue } - updateEvalResultDistance(ctx, e.parser, result) - log.Info("Updating distance", "distance", result.Distance) - if err := updateResult(ctx, string(key), result, db); err != nil { - log.Error(err, "Failed to update result") + if err := e.waitForBlockLog(ctx, result, logsClient); err != nil { + log.Error(err, "Failed to wait for block log") + // For now we abort on error to see what's going on. + return errors.Wrapf(err, "Failed to get block log for example %s", example.GetId()) } - } - return nil -} -func (e *Evaluator) reconcileBestRAGResult(ctx context.Context, db *pebble.DB, traces *pebble.DB) error { - olog := logs.FromContext(ctx) - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return err - } - defer iter.Close() + var ragErr error + // Getting the bestRAG result depends on the trace having been processed so we run after waiting for the BlockLog + uErr = manager.Update(ctx, example.GetId(), func(result *v1alpha1.EvalResult) error { + ragErr = e.reconcileBestRAGResult(ctx, result, logsClient) + return nil + }) - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { - break + if ragErr != nil { + log.Error(ragErr, "Failed to reconcile best RAG result") + // For now we abort on error to see what's going on. + return ragErr } - log := olog.WithValues("id", string(key)) - value, err := iter.ValueAndErr() - if err != nil { - return errors.Wrapf(err, "Failed to read value for key %s", string(key)) + if uErr != nil { + log.Error(uErr, "Failed to update result") + // For now we abort on error to see what's going on. 
+ return uErr } - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) - } + } + return nil +} - // TODO(jeremy): How do we skip this step in the case where the experiment didn't involve RAG - if result.BestRagResult != nil { - log.Info("Skipping; best RAG result already computed") - continue - } +// processResult processes the result. It is updated in place +func (e *Evaluator) processResult(ctx context.Context, result *v1alpha1.EvalResult, example *v1alpha1.EvalExample, client v1alpha1connect.AIServiceClient, logsClient logspbconnect.LogsServiceClient, judge *Judge) error { + result.Example = example - genTrace := &logspb.Trace{} - if err := dbutil.GetProto(traces, result.GenTraceId, genTrace); err != nil { - log.Error(err, "Failed to read gen trace", "id", result.GenTraceId) - continue - } + if err := runGenerate(ctx, result, client); err != nil { + return err + } - for _, span := range genTrace.Spans { - if span.GetRag() == nil { - continue - } - rag := span.GetRag() - if rag.Results == nil { - continue - } + if result.Error != "" { + // Since an error occurred generating a completion for this example we can't continue to + // process this example + // We return nil because we want the evaluator to continue with other examples + return nil + } - for _, ragResult := range rag.Results { - if ragResult.Example == nil { - continue - } - if result.BestRagResult == nil { - result.BestRagResult = ragResult - continue - } - - if result.BestRagResult.Score < ragResult.Score { - result.BestRagResult = ragResult - } - } - } + if err := runExecute(ctx, result, client); err != nil { + return err + } - if result.BestRagResult == nil { - continue - } - if err := updateResult(ctx, string(key), result, db); err != nil { - log.Error(err, "Failed to update result") - } + if err := judge.Score(ctx, result); err != nil { + err := errors.Wrapf(err, "Failed to judge 
example %s", example.GetId()) + result.Error = err.Error() + return err } + return nil } -func updateEvalResultDistance(ctx context.Context, parser *executor.BashishParser, result *v1alpha1.EvalResult) { - log := logs.FromContext(ctx).WithValues("id", result.GetExample().GetId()) - var actualBlock *v1alpha1.Block +// runGenerate runs the generate step for the example. +// +// runGenerate returns an error if there is a problem that should cause evaluation to abort rather than processing +// other examples (e.g. unable to contact the agent). If there is a problem generating a completion for this specific +// example then the returned error will be nil but result.Error will be set +func runGenerate(ctx context.Context, result *v1alpha1.EvalResult, client v1alpha1connect.AIServiceClient) error { + log := logs.FromContext(ctx) + // ID for the generate session + genSessionID := ulid.GenerateID() + + // We need to send a session event to the agent to simulate the session starting. + // This is because SessionStart event will contain the full context used with the execution + logEventReq := &v1alpha1.LogEventsRequest{} + logEventReq.Events = append(logEventReq.Events, &v1alpha1.LogEvent{ + Type: v1alpha1.LogEventType_SESSION_START, + ContextId: genSessionID, + SelectedIndex: result.Example.GetFullContext().GetSelected(), + }) - for _, b := range result.Actual { - if b.Kind == v1alpha1.BlockKind_CODE { - actualBlock = b - break - } + _, err := client.LogEvents(ctx, connect.NewRequest(logEventReq)) + if err != nil { + log.Error(err, "Failed to log events") + // For now abort on error to see what's going on. + return errors.Wrapf(err, "Failed to log events") } - if len(result.Example.GetAnswer()) > 1 { - log.Info("Warning; expected answer more than one answer block. 
Only the first is used") + request := &v1alpha1.GenerateCellsRequest{ + Notebook: result.Example.GetFullContext().GetNotebook(), } - expected, err := parser.Parse(result.Example.Answer[0].GetContents()) + resp, err := client.GenerateCells(ctx, connect.NewRequest(request)) if err != nil { - log.Error(err, "Failed to parse expected answer to command") - result.Error = err.Error() - result.Status = v1alpha1.EvalResultStatus_ERROR - return - } - - var actual []executor.Instruction - if actualBlock != nil { - parsed, err := parser.Parse(actualBlock.GetContents()) - if err != nil { - log.Error(err, "Failed to parse actual answer to command") + if connectErr := new(connect.Error); errors.As(err, &connectErr) { + // TODO(https://github.com/jlewi/foyle/issues/257) + // Currently GenerateCells returns a connect.Error if the completer can't generate a completion + // because of too many tokens. + if connect.CodeOf(err) == connect.CodeUnknown { + result.Error = err.Error() + // We return nil because the problem is specific to this example so the evaluator should move on + // to other examples + return nil + } + } else { + log.Error(err, "Failed to generate cells") result.Error = err.Error() - result.Status = v1alpha1.EvalResultStatus_ERROR - return - } - actual = parsed - } else { - // Since there is no code block. Initialize actual to an empty command. 
- // This will cause the distance computed to be the maximum possible distance which is what we want - actual = []executor.Instruction{ - { - Command: cmd.NewCmd(""), - }, + return err } } - distance, err := Distance(expected[0], actual[0]) + result.ActualCells = resp.Msg.GetCells() - if err != nil { - log.Error(err, "Failed to compute distance") - result.Error = err.Error() - result.Status = v1alpha1.EvalResultStatus_ERROR - return + traceParent := resp.Header().Get(agent.TraceIDHeader) + if traceParent == "" { + return errors.New("GenerateCells response didn't contain traceparent header") } + result.GenTraceId = traceParent - if distance.Max < distance.Distance { - log.Error(errors.New("Distance is greater than max distance"), "Distance is greater than max distance", "distance", distance.Distance, "max", distance.Max) + // We need to close the generate session. + endEventsReq := &v1alpha1.LogEventsRequest{ + Events: []*v1alpha1.LogEvent{ + { + ContextId: genSessionID, + Type: v1alpha1.LogEventType_SESSION_END, + }, + }, } - result.Distance = int32(distance.Distance) - result.NormalizedDistance = distance.Normalized - result.Status = v1alpha1.EvalResultStatus_DONE + _, err = client.LogEvents(ctx, connect.NewRequest(endEventsReq)) + if err != nil { + log.Error(err, "Failed to log events") + // For now abort on error to see what's going on. + return errors.Wrapf(err, "Failed to log events") + } + return nil } -func (e *Evaluator) updateGoogleSheet(ctx context.Context, experiment api.Experiment, db *pebble.DB) error { +func runExecute(ctx context.Context, result *v1alpha1.EvalResult, client v1alpha1connect.AIServiceClient) error { log := logs.FromContext(ctx) - if e.config.Eval == nil || e.config.Eval.GCPServiceAccount == "" { - return errors.New("GCPServiceAccount is required to update Google Sheet") - } + // We need to send a LOG event to the agent to simulate the cells being executed. 
+ executeEventReq := &v1alpha1.LogEventsRequest{} - sheetName := experiment.Spec.SheetName - sheetID := experiment.Spec.SheetID + // Start a session to execute the cell + execSessionID := ulid.GenerateID() - if sheetID == "" { - return errors.New("SheetID is required to update Google Sheet") + if len(result.Example.ExpectedCells) != 1 { + return errors.New("Expected cells isn't 1; How did this make it into the evaluation dataset? Shouldn't all examples in the eval set have 1 expected cell") } - if sheetName == "" { - return errors.New("SheetName is required to update Google Sheet") + if len(result.ActualCells) < 1 { + // In this case the LLM failed to generate a cell. There's no point sending an execution event because + // There's no cellId to link the executed cell to the generation event. + // Currently, Foyle doesn't have a way of learning when the LLM fails to generate a cell. Learning + // only occurs if 1) Foyle generates a cell, 2) user edits cell 3) user executes the cell + return nil } - log = log.WithValues("spreadsheetID", sheetID, "sheetName", sheetName) - log.Info("Updating Google Sheet") - credentialsConfig := &impersonate.CredentialsConfig{ - TargetPrincipal: e.config.Eval.GCPServiceAccount, - Scopes: []string{"https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"}, - } + cell := result.Example.ExpectedCells[0] - credentials, err := impersonate.CredentialsTokenSource(ctx, *credentialsConfig) - if err != nil { - log.Error(err, "Unable to create impersonated credentials") - return err + if cell.Kind != parserv1.CellKind_CELL_KIND_CODE { + return errors.New("The expected cell in the example isn't of type CELL_KIND_CODE. How did this make it into the evaluation dataset? 
Shouldn't all examples in the eval set have 1 expected cell of type CELL_KIND_CODE") } - srv, err := sheets.NewService(ctx, option.WithTokenSource(credentials)) - if err != nil { - log.Error(err, "Unable to retrieve Sheets client") - return err + actualID := converters.GetCellID(result.ActualCells[0]) + if actualID == "" { + return errors.New("Actual cell ID is empty") } - // Create the sheet if it doesn't exist - batchUpdateRequest := &sheets.BatchUpdateSpreadsheetRequest{ - Requests: []*sheets.Request{ - { - AddSheet: &sheets.AddSheetRequest{ - Properties: &sheets.SheetProperties{ - Title: experiment.Spec.SheetName, - }, - }, - }, + converters.SetCellID(cell, actualID) + + executeEventReq.Events = append(executeEventReq.Events, &v1alpha1.LogEvent{ + Type: v1alpha1.LogEventType_SESSION_START, + ContextId: execSessionID, + }) + + executeEventReq.Events = append(executeEventReq.Events, &v1alpha1.LogEvent{ + ContextId: execSessionID, + Type: v1alpha1.LogEventType_EXECUTE, + Cells: []*parserv1.Cell{ + cell, }, + SelectedIndex: 0, + SelectedId: converters.GetCellID(cell), + }) + + executeEventReq.Events = append(executeEventReq.Events, &v1alpha1.LogEvent{ + Type: v1alpha1.LogEventType_SESSION_END, + ContextId: execSessionID, + }) + + if _, err := client.LogEvents(ctx, connect.NewRequest(executeEventReq)); err != nil { + log.Error(err, "Failed to log events") + result.Error = errors.Wrapf(err, "Failed to log events").Error() + return errors.Wrapf(err, "Failed to log events") } + return nil +} - _, err = srv.Spreadsheets.BatchUpdate(experiment.Spec.SheetID, batchUpdateRequest).Context(ctx).Do() - if err != nil { - apiErr, ok := err.(*googleapi.Error) - if ok { - if apiErr.Code == 400 { - log.V(1).Info("Sheet already exists") - } else { - log.Error(err, "Unable to create new sheet ") - return errors.Wrapf(err, "Unable to create new sheet named: %s", sheetName) - } - } else { - return errors.Wrapf(err, "Unable to create new sheet named: %s", sheetName) +func (e *Evaluator) 
waitForBlockLog(ctx context.Context, result *v1alpha1.EvalResult, client logspbconnect.LogsServiceClient) error { + // We need to wait for the block log to be processed. + // This is done to + // 1. Increase the likelihood we have learned from the block + // 2. To verify that the evaluator properly sends the data needed for the agent to learn from the block. + log := logs.FromContext(ctx) + if len(result.GetActualCells()) == 0 { + // Since no cells were actually generated there won't be any blocklog to wait for. + return nil + } + + // TODO(jeremy): What should we do if there's more than 1 code cell? + var codeCell *parserv1.Cell + for _, cell := range result.GetActualCells() { + if cell.Kind == parserv1.CellKind_CELL_KIND_CODE { + codeCell = cell + break } } - // Prepare the value range to write - writeRange := sheetName - values := [][]interface{}{{"id", "file", "prompt", "actual", "expected", "distance", "normalized_distance", "best_rag"}} + if codeCell == nil { + // Since there is no code cell there's no blockLog to fetch + return nil + } - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return err + cellID := converters.GetCellID(codeCell) + if cellID == "" { + return errors.New("Cell ID is empty") } - defer iter.Close() - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { - break - } + timeOut := time.Now().Add(3 * time.Minute) + + var blockLog *logspb.BlockLog + for time.Now().Before(timeOut) { + + resp, err := client.GetBlockLog(ctx, connect.NewRequest(&logspb.GetBlockLogRequest{ + Id: cellID, + })) - value, err := iter.ValueAndErr() if err != nil { - return errors.Wrapf(err, "Failed to read value for key %s", string(key)) + log.Info("Failed to get block log", "err", err) + time.Sleep(5 * time.Second) + continue } - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) + blockLog = 
resp.Msg.GetBlockLog() + if blockLog.ExecutedBlock == nil || blockLog.GeneratedBlock == nil { + log.Info("Block log isn't ready yet") + time.Sleep(5 * time.Second) + continue } - prompt := docs.DocToMarkdown(result.Example.Query) - row := []interface{}{result.Example.Id, result.ExampleFile, prompt, docs.BlocksToMarkdown(result.Actual), docs.BlocksToMarkdown(result.Example.Answer), result.Distance, result.NormalizedDistance} - - bestRAG := "" - if result.BestRagResult != nil { - if result.BestRagResult.Example.Query != nil { - bestRAG = docs.DocToMarkdown(result.BestRagResult.Example.Query) - } + if blockLog.GeneratedBlock.GetContents() != result.ActualCells[0].Value { + return errors.Errorf("BlockLog generated block doesn't match actual cell. This means the result of GenerateCells returned to the evaluator doesn't match the result that the Agent read from the BlockLogs and stored in its BlockLog; want: %s; got %s", result.ActualCells[0].Value, blockLog.GeneratedBlock.GetContents()) } - row = append(row, bestRAG) - values = append(values, row) - } - valueRange := &sheets.ValueRange{ - Values: values, - } - // Write the value range to the sheet - _, err = srv.Spreadsheets.Values.Update(sheetID, writeRange, valueRange). - ValueInputOption("USER_ENTERED"). - Context(ctx). - Do() - if err != nil { - log.Error(err, "Unable to write data to sheet") - return errors.Wrapf(err, "Unable to write data to sheet") + return nil } - return nil + return errors.New("Timed out waiting for block log. 
This could indicate we aren't properly sending the events needed to generate a BlockLog suitable for learning.") } -func findUnloadedFiles(ctx context.Context, db *pebble.DB, files []string) ([]string, error) { - unprocessed := map[string]bool{} - - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return nil, err +func (e *Evaluator) reconcileBestRAGResult(ctx context.Context, evalResult *v1alpha1.EvalResult, client logspbconnect.LogsServiceClient) error { + if evalResult.GenTraceId == "" { + return errors.WithStack(errors.New("GenTraceId is empty")) } - defer iter.Close() - for _, file := range files { - unprocessed[file] = true - } + timeOut := time.Now().Add(3 * time.Minute) + var genTrace *logspb.Trace + for { + if time.Now().After(timeOut) { + return errors.Errorf("Timed out waiting for traceId to be ready; traceId: %s", evalResult.GenTraceId) + } + + resp, err := client.GetTrace(ctx, connect.NewRequest(&logspb.GetTraceRequest{ + Id: evalResult.GenTraceId, + })) - // Iterate over the files in the DB and remove them from the list of files to load. - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { + if err == nil { + genTrace = resp.Msg.GetTrace() break } - value, err := iter.ValueAndErr() - if err != nil { - // Should we ignore the error? 
- return nil, errors.Wrapf(err, "Failed to read value for key %s", string(key)) + // Check if the error is a "not found" error + // We want to retry if the trace isn't found because it might not have been processed yet + if connect.CodeOf(err) != connect.CodeNotFound { + // If it's any other error, consider it a permanent error + return errors.Wrapf(err, "Failed to get trace %s", evalResult.GenTraceId) } - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return nil, errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) - } + time.Sleep(5 * time.Second) + } - delete(unprocessed, result.ExampleFile) + // TODO(jeremy): Should we update EvalResult to indicate the failure + // What should we do if the experiment doesn't involve learning + if genTrace == nil { + return errors.WithStack(errors.Errorf("Trace %s is nil", evalResult.GenTraceId)) } - toProcess := make([]string, 0, len(unprocessed)) - for file := range unprocessed { - toProcess = append(toProcess, file) + for _, span := range genTrace.Spans { + if span.GetRag() == nil { + continue + } + rag := span.GetRag() + if rag.Results == nil { + continue + } + + for _, ragResult := range rag.Results { + if ragResult.Example == nil { + continue + } + if evalResult.BestRagResult == nil { + evalResult.BestRagResult = ragResult + continue + } + + if evalResult.BestRagResult.Score < ragResult.Score { + evalResult.BestRagResult = ragResult + } + } } - return toProcess, nil + return nil +} + +// isSortedByTimeDescending checks if the slice is sorted by Time in descending order +func isSortedByTimeDescending(slice []*v1alpha1.EvalResult) bool { + for i := 1; i < len(slice); i++ { + if slice[i-1].Example.GetTime().AsTime().Before(slice[i].Example.GetTime().AsTime()) { + return false + } + } + return true } -// listEvalFiles returns a list of the all the markdown files in the eval directory. 
+// listEvalFiles returns a list of all the binary protobuf files in the directory evalDir. func listEvalFiles(ctx context.Context, evalDir string) ([]string, error) { examples := make([]string, 0, 100) + if evalDir == "" { + return examples, errors.Wrapf(errors.New("evalDir is empty"), "evalDir is empty") + } err := filepath.Walk(evalDir, func(path string, info os.FileInfo, err error) error { if info.IsDir() { return nil } - if filepath.Ext(path) != ".md" { + if filepath.Ext(path) != ".binpb" { return nil } @@ -595,73 +554,45 @@ func listEvalFiles(ctx context.Context, evalDir string) ([]string, error) { return examples, err } -// loadMarkdownFiles loads a bunch of markdown files representing evaluation data and converts them into example -// protos. The final block in the markdown file is treated as the answer. -func loadMarkdownAnswerFiles(ctx context.Context, db *pebble.DB, files []string) error { - oLog := logs.FromContext(ctx) - - allErrors := &helpers.ListOfErrors{} - for _, path := range files { - log := oLog.WithValues("path", path) - log.Info("Processing file") - - contents, err := os.ReadFile(path) - if err != nil { - log.Error(err, "Failed to read file") - allErrors.AddCause(err) - // Keep going - continue - } +// getLastProcessedTime returns the time of the last processed example +func getLastProcessedTime(ctx context.Context, manager *ResultsManager) (time.Time, error) { + // Default the time of the lastProcessedEval example to some time in the past. 
+ // This way all examples should be after it and get reprocessed + lastProcessedTime := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC) - doc := &v1alpha1.Doc{} + alreadyProcessed, _, err := manager.ListResults(ctx, nil, 10) - blocks, err := docs.MarkdownToBlocks(string(contents)) - if err != nil { - log.Error(err, "Failed to convert markdown to blocks") - allErrors.AddCause(err) - // Keep going - continue - } - - doc.Blocks = blocks - - if len(doc.GetBlocks()) < 2 { - log.Info("Skipping doc; too few blocks; at least two are required") - continue - } + if err != nil { + return lastProcessedTime, errors.Wrapf(err, "Failed to list already processed results") + } - answer := doc.GetBlocks()[len(doc.GetBlocks())-1] - doc.Blocks = doc.Blocks[:len(doc.GetBlocks())-1] - if answer.Kind != v1alpha1.BlockKind_CODE { - log.Info("Skipping doc; last block must be code") - continue - } + if len(alreadyProcessed) == 0 { + return lastProcessedTime, nil + } - // We generate a stable ID for the example by hashing the contents of the document. 
- example := &v1alpha1.Example{ - Query: doc, - Answer: []*v1alpha1.Block{answer}, - } - example.Id = HashExample(example) + if !isSortedByTimeDescending(alreadyProcessed) { + return lastProcessedTime, errors.New("Results aren't sorted by time in descending order") + } - result := &v1alpha1.EvalResult{ - Example: example, - ExampleFile: path, - // initialize distance to a negative value so we can tell when it hasn't been computed - Distance: uninitializedDistance, - } + return alreadyProcessed[0].Example.GetTime().AsTime(), nil +} - if err := dbutil.SetProto(db, example.GetId(), result); err != nil { - log.Error(err, "Failed to write result to DB") - allErrors.AddCause(err) - // Keep going - continue - } - } +func sortEvalExamplesInTime(examples []*v1alpha1.EvalExample) { + sort.Slice(examples, func(i, j int) bool { + // Convert the Time field to time.Time objects + timeI := examples[i].Time.AsTime() + timeJ := examples[j].Time.AsTime() - if len(allErrors.Causes) > 0 { - return allErrors - } + // Compare the times + return timeI.Before(timeJ) + }) +} - return nil +func newAIServiceClient(baseURL string) v1alpha1connect.AIServiceClient { + // Create a new client + client := v1alpha1connect.NewAIServiceClient( + newHTTPClient(), + baseURL, + ) + return client } diff --git a/app/pkg/eval/evaluator_test.go b/app/pkg/eval/evaluator_test.go index c77135d4..c06694e6 100644 --- a/app/pkg/eval/evaluator_test.go +++ b/app/pkg/eval/evaluator_test.go @@ -6,15 +6,16 @@ import ( "path/filepath" "testing" - "github.com/jlewi/foyle/app/pkg/executor" + "connectrpc.com/connect" + "github.com/go-logr/zapr" + "github.com/jlewi/foyle/app/pkg/agent" + "github.com/jlewi/foyle/app/pkg/runme/converters" "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" - - "github.com/jlewi/foyle/app/api" "github.com/pkg/errors" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" - "github.com/cockroachdb/pebble" + "github.com/jlewi/foyle/app/api" 
"github.com/jlewi/foyle/app/pkg/config" - "github.com/jlewi/monogo/helpers" "go.uber.org/zap" ) @@ -23,7 +24,8 @@ func Test_Evaluator(t *testing.T) { t.Skipf("Test is skipped in GitHub actions") } - t.Fatalf("Evaluator test needs to be updated per https://github.com/jlewi/foyle/issues/140") + // This test assumes you have already started an agent with the appropriate configuration that you + // want to evaluate. log, err := zap.NewDevelopmentConfig().Build() if err != nil { @@ -47,126 +49,176 @@ func Test_Evaluator(t *testing.T) { } if err := e.Reconcile(context.Background(), *experiment); err != nil { - t.Fatalf("Error reconciling; %v", err) + t.Fatalf("Error reconciling; %+v", err) } } -func Test_Evaluator_Google_Sheets(t *testing.T) { - if os.Getenv("GITHUB_ACTIONS") != "" { - t.Skipf("Test is skipped in GitHub actions") +func Test_Evaluator_RunGenerate(t *testing.T) { + result := &v1alpha1.EvalResult{ + Example: &v1alpha1.EvalExample{ + FullContext: &v1alpha1.FullContext{ + Notebook: &parserv1.Notebook{ + Cells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_MARKUP, + Value: "RunSomeCode", + }, + }, + }, + }, + ExpectedCells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_CODE, + Value: "gcloud builds list", + }, + }, + }, } + fake := &fakeClient{ + GenerateCellsResponse: &v1alpha1.GenerateCellsResponse{ + Cells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_CODE, + Value: "some command", + }, + }, + }, - t.Fatalf("Evaluator test needs to be updated per https://github.com/jlewi/foyle/issues/140") + generateTraceID: "someTrace", + } + if err := runGenerate(context.Background(), result, fake); err != nil { + t.Fatalf("Error running execute; %v+", err) + } - log, err := zap.NewDevelopmentConfig().Build() - if err != nil { - t.Fatalf("Error creating logger; %v", err) + if result.ActualCells[0].Value != "some command" { + t.Errorf("Expected actual cell to be 'some command' but got %v", result.ActualCells[0].Value) } - 
zap.ReplaceGlobals(log) - if err := config.InitViper(nil); err != nil { - t.Fatalf("Error initializing Viper; %v", err) + if result.GetGenTraceId() != "someTrace" { + t.Errorf("Expected trace id to be 'some trace' but got %v", result.GetGenTraceId()) } - cfg := config.GetConfig() - e, err := NewEvaluator(*cfg) - if err != nil { - t.Fatalf("Error creating evaluator; %v", err) + // Make sure the events are correct + if fake.Events[0].Type != v1alpha1.LogEventType_SESSION_START { + t.Errorf("Expected first event to be a session start but got %v", fake.Events[0].Type) } - experiment, err := experimentForTesting() - if err != nil { - t.Fatalf("Error creating experiment; %v", err) + if fake.Events[1].Type != v1alpha1.LogEventType_SESSION_END { + t.Errorf("Expected last event to be a session end but got %v", fake.Events[0].Type) } +} - db, err := pebble.Open(experiment.Spec.DBDir, &pebble.Options{}) - if err != nil { - t.Fatalf("Error opening DB; %v", err) +func Test_Evaluator_RunExecute(t *testing.T) { + result := &v1alpha1.EvalResult{ + Example: &v1alpha1.EvalExample{ + ExpectedCells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_CODE, + Value: "gcloud executed command", + Metadata: map[string]string{ + converters.IdField: "idFieldShouldBeIgnored", + converters.RunmeIdField: "runmeIdFieldShouldBeIgnored", + }, + }, + }, + }, + ActualCells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_CODE, + Value: "gcloud predicted command", + Metadata: map[string]string{ + converters.IdField: "idOfActualCell", + }, + }, + }, + } + + fake := &fakeClient{} + if err := runExecute(context.Background(), result, fake); err != nil { + t.Fatalf("Error running execute; %v+", err) + } + + // Make sure the events are correct + if fake.Events[0].Type != v1alpha1.LogEventType_SESSION_START { + t.Errorf("Expected first event to be a session start but got %v", fake.Events[0].Type) + } + + if fake.Events[1].Type != v1alpha1.LogEventType_EXECUTE { + t.Errorf("Expected event to 
be a execution event but got %v", fake.Events[0].Type) } - defer helpers.DeferIgnoreError(db.Close) - if err := e.updateGoogleSheet(context.Background(), *experiment, db); err != nil { - t.Fatalf("Error updating Google Sheet; %v", err) + + if fake.Events[1].SelectedId != "idOfActualCell" { + t.Errorf("SelectedID is not correct") + } + + if converters.GetCellID(fake.Events[1].Cells[0]) != "idOfActualCell" { + t.Errorf("ID of cell is not correct") + } + + if fake.Events[1].Cells[0].Value != "gcloud executed command" { + t.Errorf("Executed cell is not the expected cell ") + } + + if fake.Events[2].Type != v1alpha1.LogEventType_SESSION_END { + t.Errorf("Expected last event to be a session end but got %v", fake.Events[0].Type) } } -func experimentForTesting() (*api.Experiment, error) { - cwd, err := os.Getwd() - if err != nil { - return nil, errors.Wrapf(err, "Error getting working directory") +type fakeClient struct { + Events []*v1alpha1.LogEvent + GenerateCellsResponse *v1alpha1.GenerateCellsResponse + generateTraceID string +} + +func (f *fakeClient) StreamGenerate(context.Context) *connect.BidiStreamForClient[v1alpha1.StreamGenerateRequest, v1alpha1.StreamGenerateResponse] { + //TODO implement me + panic("implement me") +} + +func (f *fakeClient) GenerateCells(ctx context.Context, req *connect.Request[v1alpha1.GenerateCellsRequest]) (*connect.Response[v1alpha1.GenerateCellsResponse], error) { + if f.GenerateCellsResponse == nil { + return connect.NewResponse(&v1alpha1.GenerateCellsResponse{}), nil } - evalDir, err := filepath.Abs(filepath.Join(cwd, "..", "..", "..", "data", "eval")) + resp := connect.NewResponse(f.GenerateCellsResponse) + resp.Header().Set(agent.TraceIDHeader, f.generateTraceID) + + return resp, nil +} + +func (f *fakeClient) GetExample(context.Context, *connect.Request[v1alpha1.GetExampleRequest]) (*connect.Response[v1alpha1.GetExampleResponse], error) { + //TODO implement me + panic("implement me") +} + +func (f *fakeClient) 
Status(context.Context, *connect.Request[v1alpha1.StatusRequest]) (*connect.Response[v1alpha1.StatusResponse], error) { + //TODO implement me + panic("implement me") +} + +func (f *fakeClient) LogEvents(ctx context.Context, req *connect.Request[v1alpha1.LogEventsRequest]) (*connect.Response[v1alpha1.LogEventsResponse], error) { + if f.Events == nil { + f.Events = make([]*v1alpha1.LogEvent, 0, 100) + } + f.Events = append(f.Events, req.Msg.Events...) + return connect.NewResponse(&v1alpha1.LogEventsResponse{}), nil +} + +func experimentForTesting() (*api.Experiment, error) { + log := zapr.NewLogger(zap.L()) + oDir, err := os.MkdirTemp("", "testOutput") if err != nil { - return nil, errors.Wrapf(err, "Error getting eval directory") + return nil, errors.Wrapf(err, "Error creating temp directory") } + dbFile := filepath.Join(oDir, "results.sqlite") + log.Info("Output database", "database", dbFile) + return &api.Experiment{ Spec: api.ExperimentSpec{ - EvalDir: evalDir, - DBDir: "/tmp/foyle/eval", - SheetID: "1O0thD-p9DBF4G_shGMniivBB3pdaYifgSzWXBxELKqE", - SheetName: "Results", - Agent: &api.AgentConfig{ - Model: config.DefaultModel, - // No need to test RAG as part of testing evaluation. 
- RAG: &api.RAGConfig{ - Enabled: false, - }, - }, + // EvalDir is the directory containing the eval example protos + EvalDir: "/Users/jlewi/tmp/examples-for-testing", + AgentAddress: "http://localhost:10777/api", + OutputDB: dbFile, }, }, nil } - -func Test_updateEvalResultDistance(t *testing.T) { - type testCase struct { - name string - result *v1alpha1.EvalResult - expectedDistance int32 - expectedNormalized float32 - } - - cases := []testCase{ - { - // Test the case where the actual answer contains no codeblocks - name: "nocodeblocks", - result: &v1alpha1.EvalResult{ - Example: &v1alpha1.Example{ - Id: "1234", - Answer: []*v1alpha1.Block{ - { - Kind: v1alpha1.BlockKind_CODE, - Contents: "gcloud builds list", - }, - }, - }, - ExampleFile: "", - Actual: []*v1alpha1.Block{ - { - Kind: v1alpha1.BlockKind_MARKUP, - Contents: "Not a code cell", - }, - }, - }, - expectedDistance: 3, - expectedNormalized: 1.0, - }, - } - parser, err := executor.NewBashishParser() - if err != nil { - t.Fatalf("Error creating parser; %v", err) - } - - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - updateEvalResultDistance(context.Background(), parser, c.result) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - if c.result.Distance != c.expectedDistance { - t.Errorf("Expected distance %d but got %d", c.expectedDistance, c.result.Distance) - } - if c.result.NormalizedDistance != c.expectedNormalized { - t.Errorf("Expected normalized distance %f but got %f", c.expectedNormalized, c.result.NormalizedDistance) - } - }) - } -} diff --git a/app/pkg/eval/judge.go b/app/pkg/eval/judge.go new file mode 100644 index 00000000..61173ace --- /dev/null +++ b/app/pkg/eval/judge.go @@ -0,0 +1,155 @@ +package eval + +import ( + "context" + _ "embed" + "encoding/json" + "strings" + "text/template" + + "github.com/jlewi/foyle/app/pkg/docs" + "github.com/jlewi/foyle/app/pkg/runme/converters" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" + "github.com/pkg/errors" + 
"github.com/sashabaranov/go-openai" + "github.com/sashabaranov/go-openai/jsonschema" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" +) + +//go:embed judge_prompt.tmpl +var promptTemplateString string + +var ( + promptTemplate = template.Must(template.New("judge_prompt").Parse(promptTemplateString)) +) + +const ( + temperature = 0.9 +) + +type promptArgs struct { + Expected string + Actual string +} + +func NewJudge(client *openai.Client) (*Judge, error) { + return &Judge{ + client: client, + }, nil +} + +// Judge is an LLM judge +type Judge struct { + client *openai.Client +} + +func (j *Judge) Score(ctx context.Context, result *v1alpha1.EvalResult) error { + if len(result.GetExample().ExpectedCells) != 1 { + return errors.New("expected a single expected cell") + } + + // We don't check if actualCells is 1 because if its empty then the program is wrong and the judge + // should hopefully detect that. + + // Convert the cells to markdown + expectedNB := &parserv1.Notebook{ + Cells: result.GetExample().ExpectedCells, + } + + actualNB := &parserv1.Notebook{ + Cells: result.GetActualCells(), + } + + expectedDoc, err := converters.NotebookToDoc(expectedNB) + if err != nil { + return errors.Wrap(err, "Failed to convert expected cells to doc") + } + + actualDoc, err := converters.NotebookToDoc(actualNB) + if err != nil { + return errors.Wrap(err, "Failed to convert actual cells to doc") + } + + expectedMD := docs.DocToMarkdown(expectedDoc) + actualMD := docs.DocToMarkdown(actualDoc) + + args := promptArgs{ + Expected: expectedMD, + Actual: actualMD, + } + + var sb strings.Builder + if err := promptTemplate.Execute(&sb, args); err != nil { + return errors.Wrapf(err, "Failed to execute prompt template") + } + + messages := []openai.ChatCompletionMessage{ + //{Role: openai.ChatMessageRoleSystem, + // Content: systemPrompt, + //}, + {Role: openai.ChatMessageRoleUser, + Content: sb.String(), + }, + } + + // TODO(jeremy): Use ResponseFormat to 
enforce JSON output + // https://platform.openai.com/docs/guides/structured-outputs/how-to-use?context=without_parse + request := openai.ChatCompletionRequest{ + Model: openai.GPT4o20240806, + Messages: messages, + MaxTokens: 2000, + Temperature: temperature, + ResponseFormat: &openai.ChatCompletionResponseFormat{ + Type: openai.ChatCompletionResponseFormatTypeJSONSchema, + JSONSchema: &openai.ChatCompletionResponseFormatJSONSchema{ + Name: "JudgeOutput", + Schema: &jsonschema.Definition{ + Type: jsonschema.Object, + Description: "", + Enum: nil, + // TODO( + Properties: map[string]jsonschema.Definition{ + "equivalent": { + Type: jsonschema.Boolean, + }, + "reason": { + Type: jsonschema.String, + }, + }, + Required: []string{"equivalent", "reason"}, + }, + }, + }, + } + + resp, err := j.client.CreateChatCompletion(ctx, request) + if err != nil { + return err + } + + if len(resp.Choices) == 0 { + return errors.New("No choices in response") + } + + choice := resp.Choices[0] + output := &JudgeOutput{} + if err := json.Unmarshal([]byte(choice.Message.Content), output); err != nil { + return errors.Wrap(err, "Failed to unmarshal output") + } + + if output.Equivalent { + result.CellsMatchResult = v1alpha1.CellsMatchResult_MATCH + } else { + result.CellsMatchResult = v1alpha1.CellsMatchResult_MISMATCH + } + + result.JudgeExplanation = output.Reason + + return nil +} + +// JudgeOutput is the JSON output we expect the judge to emit +type JudgeOutput struct { + Equivalent bool `json:"equivalent"` + Reason string `json:"reason"` +} diff --git a/app/pkg/eval/judge_prompt.tmpl b/app/pkg/eval/judge_prompt.tmpl new file mode 100644 index 00000000..53116100 --- /dev/null +++ b/app/pkg/eval/judge_prompt.tmpl @@ -0,0 +1,22 @@ +You will be given code blocks containing two bash programs. Your task is to decide whether the two programs +are equivalent. Emit the output as a JSON dictionary with two fields `equivalent` and `reason`. 
The `equivalent` field +should be a boolen which is true if the programs are equivalent and false otherwise. The `reason` field should be a +string with a human-readable explanation of why the programs are equivalent or not. + +When deciding whether the programs are equivalent you should apply the following rules + +* When comparing two CLI invocations ignore the order of the arguments. For example `ls -l -a` is equivalent to `ls -a -l`. +* When comparing two CLI invocations if one invocation uses the long form of an argument and the other uses the short form + then the two invocations are not equivalent. For example `ls -l` is not equivalent to `ls --long`. +* If two CLI invocations use different binaries but are functionally similar then they are not equivalent. For example + `ls -l` is not equivalent to `cat -n`. + + +{{.Expected}} + + + +{{.Actual}} + + +Are these two programs equivalent? diff --git a/app/pkg/eval/judge_test.go b/app/pkg/eval/judge_test.go new file mode 100644 index 00000000..2087777d --- /dev/null +++ b/app/pkg/eval/judge_test.go @@ -0,0 +1,58 @@ +package eval + +import ( + "context" + "os" + "testing" + + "github.com/jlewi/foyle/app/pkg/config" + "github.com/jlewi/foyle/app/pkg/oai" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" +) + +func Test_Judge(t *testing.T) { + if os.Getenv("GITHUB_ACTIONS") != "" { + t.Skip("Skipping test; this test doesn't run in GitHub Actions") + } + + if err := config.InitViper(nil); err != nil { + t.Fatalf("Failed to initialize Viper: %v", err) + } + + cfg := config.GetConfig() + + client, err := oai.NewClient(*cfg) + if err != nil { + t.Fatalf("Failed to create OpenAI client: %v", err) + } + + judge, err := NewJudge(client) + if err != nil { + t.Fatalf("Failed to create Judge: %v", err) + } + + result := &v1alpha1.EvalResult{ + Example: &v1alpha1.EvalExample{ + ExpectedCells: []*parserv1.Cell{ + { + Kind: 
parserv1.CellKind_CELL_KIND_CODE, + Value: "kubectl get pods", + }, + }, + }, + ActualCells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_CODE, + Value: "kubectl -n foyle get pods", + }, + }, + } + + if err := judge.Score(context.TODO(), result); err != nil { + t.Fatalf("Failed to score: %+v", err) + } + + t.Logf("Judge Equivalent: %v", result.CellsMatchResult.String()) + t.Logf("Judge Explanation:\n%v", result.JudgeExplanation) +} diff --git a/app/pkg/eval/reconcilers.go b/app/pkg/eval/reconcilers.go deleted file mode 100644 index ba84cb5b..00000000 --- a/app/pkg/eval/reconcilers.go +++ /dev/null @@ -1,83 +0,0 @@ -package eval - -import ( - "context" - - "connectrpc.com/connect" - "github.com/cockroachdb/pebble" - "github.com/jlewi/foyle/app/pkg/logs" - "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" - "github.com/jlewi/foyle/protos/go/foyle/v1alpha1/v1alpha1connect" - "github.com/pkg/errors" - "google.golang.org/protobuf/proto" -) - -// reconcilePredictions reconciles predictions for examples in the database. -func reconcilePredictions(ctx context.Context, db *pebble.DB, client v1alpha1connect.GenerateServiceClient) error { - olog := logs.FromContext(ctx) - iter, err := db.NewIterWithContext(ctx, nil) - if err != nil { - return err - } - defer iter.Close() - - for iter.First(); iter.Valid(); iter.Next() { - key := iter.Key() - if key == nil { - break - } - - log := olog.WithValues("id", string(key)) - value, err := iter.ValueAndErr() - if err != nil { - return errors.Wrapf(err, "Failed to read value for key %s", string(key)) - } - - result := &v1alpha1.EvalResult{} - if err := proto.Unmarshal(value, result); err != nil { - return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key)) - } - - if len(result.GetActual()) > 0 { - log.V(logs.Debug).Info("not generating a completion; already have answer", "path", result.ExampleFile) - // We have the answer so we don't need to generate it. 
- continue - } - - if len(result.Actual) == 0 { - // Initialize a trace - resp, err := func() (*connect.Response[v1alpha1.GenerateResponse], error) { - newCtx, span := tracer().Start(ctx, "(*Evaluator).reconcilePredictions") - defer span.End() - - req := connect.NewRequest(&v1alpha1.GenerateRequest{ - Doc: result.Example.Query, - }) - // We need to generate the answer. - return client.Generate(newCtx, req) - }() - - if err != nil { - connectErr, ok := err.(*connect.Error) - if ok { - // If this is a permanent error we want to abort with an error - if connectErr.Code() == connect.CodeUnavailable || connectErr.Code() == connect.CodeUnimplemented { - return errors.Wrap(err, "Unable to connect to the agent.") - } - } - result.Error = err.Error() - result.Status = v1alpha1.EvalResultStatus_ERROR - continue - } - - result.Actual = resp.Msg.GetBlocks() - result.GenTraceId = resp.Msg.GetTraceId() - - log.Info("Writing result to DB") - if err := updateResult(ctx, string(key), result, db); err != nil { - return errors.Wrapf(err, "Failed to write result to DB") - } - } - } - return nil -} diff --git a/app/pkg/eval/results_manager.go b/app/pkg/eval/results_manager.go new file mode 100644 index 00000000..80512c5e --- /dev/null +++ b/app/pkg/eval/results_manager.go @@ -0,0 +1,226 @@ +package eval + +import ( + "context" + "database/sql" + _ "embed" + "os" + "path/filepath" + "time" + + "github.com/jlewi/foyle/app/pkg/analyze" + "github.com/jlewi/foyle/app/pkg/analyze/fsql" + "github.com/jlewi/foyle/app/pkg/logs" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" + "github.com/jlewi/monogo/helpers" + "github.com/pkg/errors" + "google.golang.org/protobuf/encoding/protojson" +) + +// ResultsManager manages the database containing the evaluation results +type ResultsManager struct { + queries *fsql.Queries + db *sql.DB +} + +// EvalResultUpdater is a function that updates an evaluation result. 
+type EvalResultUpdater func(result *v1alpha1.EvalResult) error + +func openResultsManager(dbFile string) (*ResultsManager, error) { + stat, err := os.Stat(dbFile) + if err == nil && stat.IsDir() { + return nil, errors.Wrapf(err, "Can't open database: %v; it is a directory", dbFile) + } + dbDir := filepath.Dir(dbFile) + if err := os.MkdirAll(dbDir, helpers.UserGroupAllPerm); err != nil { + return nil, errors.Wrapf(err, "Failed to create directory: %v", dbDir) + } + + db, err := sql.Open(analyze.SQLLiteDriver, dbFile) + + if err != nil { + return nil, errors.Wrapf(err, "Failed to open database: %v", dbFile) + } + + manager, err := NewResultsManager(db) + if err != nil { + return nil, err + } + return manager, nil +} + +func NewResultsManager(db *sql.DB) (*ResultsManager, error) { + // create tables + // TODO(jeremy): This creates the analyzer and ResultsManager table because we don't separate the DDL statements. + // We might want to refactor to support that. + if _, err := db.ExecContext(context.TODO(), analyze.GetDDL()); err != nil { + return nil, err + } + + // Create the dbtx from the actual database + queries := fsql.New(db) + + return &ResultsManager{ + queries: queries, + db: db, + }, nil +} + +// Get retrieves an example with the given id +func (m *ResultsManager) Get(ctx context.Context, id string) (*v1alpha1.EvalResult, error) { + queries := m.queries + + // Read the record + row, err := queries.GetResult(ctx, id) + + if err != nil { + return nil, err + } + + result := &v1alpha1.EvalResult{} + if err := protojson.Unmarshal([]byte(row.ProtoJson), result); err != nil { + return nil, errors.Wrapf(err, "Failed to deserialize EvalResult") + } + + return result, nil +} + +// Update updates an evaluation result. Update performs a read-modify-write operation on the results with the given id. +// The updateFunc is called with the example to be updated. The updateFunc should modify the session in place. 
+// If the updateFunc returns an error then the example is not updated. +// If the given id doesn't exist then an empty Session is passed to updateFunc and the result will be +// inserted if the updateFunc returns nil. If the session result exists then the result is passed to updateFunc +// and the updated value is then written to the database +// +// TODO(jeremy): How should the update function signal an error that shouldn't block the update and should be reported +// by Update. For example, when processing a result; we might have an error processing an example (e.g. generating +// a completion). We still want to update the database though and signal to caller of the Update that the error failed. +// Should the EvalResultUpdater return a boolean indicating whether to commit or rollback the transaction? +// Should Update wrap the EvalResultUpdater in a error that stores the error returned by updateFunc? +func (m *ResultsManager) Update(ctx context.Context, id string, updateFunc EvalResultUpdater) error { + log := logs.FromContext(ctx) + if id == "" { + return errors.WithStack(errors.New("id must be non-empty")) + } + log = log.WithValues("exampleId", id) + + tx, err := m.db.BeginTx(ctx, &sql.TxOptions{}) + if err != nil { + return errors.Wrapf(err, "Failed to start transaction") + } + + queries := m.queries.WithTx(tx) + // Read the record + row, err := queries.GetResult(ctx, id) + + // If the session doesn't exist then we do nothing because session is initializeed to empty session + rowPb := &v1alpha1.EvalResult{ + // Initialize the id. 
+ Example: &v1alpha1.EvalExample{ + Id: id, + }, + } + if err != nil { + if err != sql.ErrNoRows { + if txErr := tx.Rollback(); txErr != nil { + log.Error(txErr, "Failed to rollback transaction") + } + return errors.Wrapf(err, "Failed to get result with id %v", id) + } + } else { + // Deserialize the proto + if err := protojson.Unmarshal([]byte(row.ProtoJson), rowPb); err != nil { + if txErr := tx.Rollback(); txErr != nil { + log.Error(txErr, "Failed to rollback transaction") + } + return errors.Wrapf(err, "Failed to deserialize result") + } + } + + if err := updateFunc(rowPb); err != nil { + if txErr := tx.Rollback(); txErr != nil { + log.Error(txErr, "Failed to rollback transaction") + } + return errors.Wrapf(err, "Failed to update result") + } + + update, err := protoToRowUpdate(rowPb) + if err != nil { + if txErr := tx.Rollback(); txErr != nil { + log.Error(txErr, "Failed to rollback transaction") + } + return errors.Wrapf(err, "Failed to convert EvalResult proto to table row") + } + + if update.ID != id { + if txErr := tx.Rollback(); txErr != nil { + log.Error(txErr, "Failed to rollback transaction") + } + return errors.WithStack(errors.Errorf("id in EvalResult doesn't match id. Update was called with ID: %v but session has ID: %v", id, update.ID)) + } + + if err := queries.UpdateResult(ctx, *update); err != nil { + if txErr := tx.Rollback(); txErr != nil { + log.Error(txErr, "Failed to rollback transaction") + } + return errors.Wrapf(err, "Failed to update session") + } + + if err := tx.Commit(); err != nil { + return errors.Wrapf(err, "Failed to commit transaction") + } + + return nil +} + +// ListResults lists the results in the database if cursor is nil then the first page is returned. +// If cursor is non-nil then the next page is returned. +// The cursor is the time. 
+func (m *ResultsManager) ListResults(ctx context.Context, cursor *time.Time, pageSize int) ([]*v1alpha1.EvalResult, *time.Time, error) { + params := fsql.ListResultsParams{ + PageSize: int64(pageSize), + } + + if cursor != nil { + params.Cursor = *cursor + } else { + params.Cursor = "" + } + + rows, err := m.queries.ListResults(ctx, params) + + if err != nil { + return nil, nil, errors.Wrapf(err, "Failed to list results") + } + + results := make([]*v1alpha1.EvalResult, 0) + + // ListResults return nil if there are no results + if rows == nil { + return results, nil, nil + } + + for _, row := range rows { + result := &v1alpha1.EvalResult{} + if err := protojson.Unmarshal([]byte(row.ProtoJson), result); err != nil { + return nil, nil, errors.Wrapf(err, "Failed to deserialize EvalResult") + } + results = append(results, result) + } + lastTime := &time.Time{} + *lastTime = rows[len(rows)-1].Time + return results, lastTime, nil +} + +func protoToRowUpdate(result *v1alpha1.EvalResult) (*fsql.UpdateResultParams, error) { + protoJson, err := protojson.Marshal(result) + if err != nil { + return nil, errors.Wrapf(err, "Failed to serialize EvalResult to JSON") + } + + return &fsql.UpdateResultParams{ + ID: result.GetExample().GetId(), + Time: result.Example.GetTime().AsTime(), + ProtoJson: string(protoJson), + }, nil +} diff --git a/app/pkg/eval/results_manager_test.go b/app/pkg/eval/results_manager_test.go new file mode 100644 index 00000000..3e24f8e0 --- /dev/null +++ b/app/pkg/eval/results_manager_test.go @@ -0,0 +1,149 @@ +package eval + +import ( + "context" + "database/sql" + "os" + "path/filepath" + "testing" + "time" + + "github.com/jlewi/foyle/app/pkg/analyze" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/timestamppb" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" + "google.golang.org/protobuf/encoding/protojson" +) + +func Test_protoToRowUpdate(t 
*testing.T) { + type testCase struct { + name string + result *v1alpha1.EvalResult + } + + cases := []testCase{ + { + name: "Basic", + result: &v1alpha1.EvalResult{ + Example: &v1alpha1.EvalExample{ + Id: "1", + }, + }, + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + actual, err := protoToRowUpdate(c.result) + if err != nil { + t.Fatalf("Error converting EvalResult to row update: %v", err) + } + + if actual.ID != c.result.Example.Id { + t.Fatalf("Expected ID to be %v but got %v", c.result.Example.Id, actual.ID) + } + + if actual.Time != c.result.Example.Time.AsTime() { + t.Fatalf("Expected Time to be %v but got %v", c.result.Example.Time, actual.Time) + } + + // We can't compare the serialized protos because the JSON serialization is non-deterministic + + actualPB := &v1alpha1.EvalResult{} + if err := protojson.Unmarshal([]byte(actual.ProtoJson), actualPB); err != nil { + t.Fatalf("Error deserializing actual result: %v", err) + } + + comparer := cmpopts.IgnoreUnexported(v1alpha1.EvalResult{}, v1alpha1.EvalExample{}, time.Time{}) + if d := cmp.Diff(c.result, actualPB, comparer); d != "" { + t.Fatalf("Unexpected diff between expected and actual EvalResults:\n%v", d) + } + }) + } +} + +func Test_ListResults(t *testing.T) { + tempDir, err := os.MkdirTemp("", "Test_ListResults") + defer os.RemoveAll(tempDir) + if err != nil { + t.Fatalf("Error creating temp dir: %v", err) + } + + dbFile := filepath.Join(tempDir, "results.db") + + db, err := sql.Open(analyze.SQLLiteDriver, dbFile) + if err != nil { + t.Fatalf("Error creating database: %v", err) + } + + m, err := NewResultsManager(db) + if err != nil { + t.Fatalf("Error creating ResultsManager: %v", err) + } + + // Try listing the results when there are no results + rows, _, err := m.ListResults(context.Background(), nil, 10) + if err != nil { + t.Fatalf("Error listing results: %v", err) + } + + if len(rows) != 0 { + t.Fatalf("Expected no results but got %v", len(rows)) + } + + // Now insert 
some rows. + baseTime := time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC) + results := []*v1alpha1.EvalResult{ + { + Example: &v1alpha1.EvalExample{ + Id: "1", + Time: timestamppb.New(baseTime), + }, + }, + { + Example: &v1alpha1.EvalExample{ + Id: "2", + Time: timestamppb.New(baseTime.Add(time.Hour)), + }, + }, + { + Example: &v1alpha1.EvalExample{ + Id: "3", + Time: timestamppb.New(baseTime.Add(-1 * time.Hour)), + }, + }, + } + + for _, r := range results { + uErr := m.Update(context.Background(), r.Example.Id, func(result *v1alpha1.EvalResult) error { + proto.Merge(result, r) + return nil + }) + if uErr != nil { + t.Fatalf("Error inserting result: %v", err) + } + } + + // List the results + rows, cursor, err := m.ListResults(context.Background(), nil, 10) + + if err != nil { + t.Fatalf("Error listing results: %v", err) + } + + if len(rows) != 3 { + t.Fatalf("Expected 3 results but got %v", len(rows)) + } + + if !isSortedByTimeDescending(rows) { + t.Fatalf("Results are not sorted by time") + } + + expected := baseTime.Add(-1 * time.Hour) + if *cursor != baseTime.Add(-1*time.Hour) { + t.Fatalf("Cursor is invalid; Got %v; Want %v", *cursor, expected) + } +} diff --git a/app/pkg/eval/service.go b/app/pkg/eval/service.go index 22b097b8..854e5de9 100644 --- a/app/pkg/eval/service.go +++ b/app/pkg/eval/service.go @@ -3,8 +3,11 @@ package eval import ( "context" - "github.com/go-logr/zapr" "github.com/jlewi/foyle/app/pkg/docs" + "github.com/jlewi/foyle/app/pkg/runme/converters" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" + + "github.com/go-logr/zapr" "go.uber.org/zap" "connectrpc.com/connect" @@ -140,11 +143,25 @@ func toAssertionRow(result *v1alpha1.EvalResult) (*v1alpha1.AssertionRow, error) row := &v1alpha1.AssertionRow{ Id: result.Example.GetId(), - ExampleFile: result.GetExampleFile(), + ExampleFile: result.GetExample().FullContext.NotebookUri, + } + + doc, err := 
converters.NotebookToDoc(result.GetExample().GetFullContext().GetNotebook()) + + if err != nil { + return nil, errors.Wrapf(err, "Failed to convert notebook to doc") + } + + actualDoc, err := converters.NotebookToDoc(&parserv1.Notebook{ + Cells: result.GetActualCells(), + }) + + if err != nil { + return nil, errors.Wrapf(err, "Failed to convert actual cells to doc") } - row.DocMd = docs.DocToMarkdown(result.GetExample().GetQuery()) - row.AnswerMd = docs.BlocksToMarkdown(result.GetActual()) + row.DocMd = docs.DocToMarkdown(doc) + row.AnswerMd = docs.DocToMarkdown(actualDoc) for _, a := range result.GetAssertions() { switch a.Name { diff --git a/app/pkg/eval/service_test.go b/app/pkg/eval/service_test.go index e4e30bda..355b91dd 100644 --- a/app/pkg/eval/service_test.go +++ b/app/pkg/eval/service_test.go @@ -4,6 +4,8 @@ import ( "fmt" "testing" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" ) @@ -16,21 +18,23 @@ func Test_ToAssertRow(t *testing.T) { cases := []testCase{ { evalResult: &v1alpha1.EvalResult{ - Example: &v1alpha1.Example{ + Example: &v1alpha1.EvalExample{ Id: "1234", - Query: &v1alpha1.Doc{ - Blocks: []*v1alpha1.Block{ - { - Kind: v1alpha1.BlockKind_MARKUP, - Contents: "Hello world", + FullContext: &v1alpha1.FullContext{ + Notebook: &parserv1.Notebook{ + Cells: []*parserv1.Cell{ + { + Kind: parserv1.CellKind_CELL_KIND_MARKUP, + Value: "Hello world", + }, }, }, }, }, - Actual: []*v1alpha1.Block{ + ActualCells: []*parserv1.Cell{ { - Kind: v1alpha1.BlockKind_MARKUP, - Contents: "word", + Kind: parserv1.CellKind_CELL_KIND_MARKUP, + Value: "word", }, }, Assertions: []*v1alpha1.Assertion{ diff --git a/app/pkg/eval/tracer.go b/app/pkg/eval/tracer.go deleted file mode 100644 index c440b11c..00000000 --- a/app/pkg/eval/tracer.go +++ /dev/null @@ -1,10 +0,0 @@ -package eval - -import ( - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/trace" -) - -func tracer() trace.Tracer 
{ - return otel.Tracer("github.com/jlewi/foyle/app/pkg/eval") -} diff --git a/app/pkg/logsviewer/eval_viewer.go b/app/pkg/logsviewer/eval_viewer.go index f25339ea..a824e6b2 100644 --- a/app/pkg/logsviewer/eval_viewer.go +++ b/app/pkg/logsviewer/eval_viewer.go @@ -6,6 +6,8 @@ import ( "net/http" "strings" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" + "connectrpc.com/connect" "github.com/go-logr/zapr" "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" @@ -191,9 +193,11 @@ func (c *EvalResultsTable) Render() app.UI { } row := app.Tr().Class(rowStyle).Body( app.Td().Text(resultSet.data[i].GetExample().GetId()), - app.Td().Text(resultSet.data[i].GetExampleFile()), - app.Td().Text(resultSet.data[i].GetDistance()), - app.Td().Text(resultSet.data[i].GetNormalizedDistance()), + // TODO(jeremy): These fields were removed as part of refactoring how we do evaluation. Can we just + // delete the code? Do other things need to be updated? + //app.Td().Text(resultSet.data[i].GetExampleFile()), + //app.Td().Text(resultSet.data[i].GetDistance()), + //app.Td().Text(resultSet.data[i].GetNormalizedDistance()), ) // For each row we add a click handler to display the corresponding example. 
@@ -251,7 +255,7 @@ func (m *evalView) handleSetEvalView(ctx app.Context, action app.Action) { m.HTMLContent = "No evaluation result is currently selected" break } - value, err := docToHTML(current.Example.Query) + value, err := nbToHTML(current.GetExample().GetFullContext().GetNotebook()) if err == nil { m.HTMLContent = value } else { @@ -264,10 +268,11 @@ func (m *evalView) handleSetEvalView(ctx app.Context, action app.Action) { m.HTMLContent = "No evaluation result is currently selected" break } - doc := &v1alpha1.Doc{ - Blocks: current.Actual, + + nb := &parserv1.Notebook{ + Cells: current.GetActualCells(), } - value, err := docToHTML(doc) + value, err := nbToHTML(nb) if err == nil { m.HTMLContent = value } else { @@ -280,10 +285,10 @@ func (m *evalView) handleSetEvalView(ctx app.Context, action app.Action) { m.HTMLContent = "No evaluation result is currently selected" break } - doc := &v1alpha1.Doc{ - Blocks: current.Example.Answer, + nb := &parserv1.Notebook{ + Cells: current.GetExample().GetExpectedCells(), } - value, err := docToHTML(doc) + value, err := nbToHTML(nb) if err == nil { m.HTMLContent = value } else { diff --git a/app/pkg/logsviewer/views.go b/app/pkg/logsviewer/views.go index e579f84f..b00ec799 100644 --- a/app/pkg/logsviewer/views.go +++ b/app/pkg/logsviewer/views.go @@ -3,6 +3,9 @@ package logsviewer import ( "bytes" + "github.com/jlewi/foyle/app/pkg/runme/converters" + parserv1 "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" + logspb "github.com/jlewi/foyle/protos/go/foyle/logs" "github.com/jlewi/foyle/app/pkg/docs" @@ -54,6 +57,19 @@ func renderExecutedBlock(block *logspb.BlockLog) (string, error) { return buf.String(), nil } +// nbToHTML returns the notebook as html +func nbToHTML(nb *parserv1.Notebook) (string, error) { + if nb == nil { + return "", errors.New("notebook is nil") + } + doc, err := converters.NotebookToDoc(nb) + if err != nil { + return "", errors.Wrapf(err, "Failed to convert notebook to doc") + } + + return docToHTML(doc) +} + // 
docToHTML returns the dock as html func docToHTML(doc *v1alpha1.Doc) (string, error) { if doc == nil { diff --git a/app/pkg/runme/converters/cells_to_blocks.go b/app/pkg/runme/converters/cells_to_blocks.go index 87f684d3..a8de188d 100644 --- a/app/pkg/runme/converters/cells_to_blocks.go +++ b/app/pkg/runme/converters/cells_to_blocks.go @@ -79,6 +79,21 @@ func GetCellID(cell *parserv1.Cell) string { } return "" } + +// SetCellID stores id as the cell's ID under RunmeIdField, removing any ID previously +// stored under IdField or RunmeIdField. +func SetCellID(cell *parserv1.Cell, id string) { + // Assigning into a nil map panics, so allocate Metadata before writing to it. + if cell.Metadata == nil { + cell.Metadata = make(map[string]string) + } + // Delete any existing IDs + for _, idField := range []string{IdField, RunmeIdField} { + delete(cell.Metadata, idField) + } + cell.Metadata[RunmeIdField] = id +} + func CellKindToBlockKind(kind parserv1.CellKind) v1alpha1.BlockKind { switch kind { case parserv1.CellKind_CELL_KIND_CODE: diff --git a/data/eval/foyle/block_log.md b/data/eval/foyle/block_log.md deleted file mode 100644 index 33ab61c8..00000000 --- a/data/eval/foyle/block_log.md +++ /dev/null @@ -1,5 +0,0 @@ -Get the log for block 01HZ0N1ZZ8NJ7PSRYB6WEMH08M - -```sh {"id":"01HZ2HG6A2PHAEHG95CNG4MK9Y"} -curl http://localhost:8080/api/blocklogs/01HZ0N1ZZ8NJ7PSRYB6WEMH08M | jq . 
-``` \ No newline at end of file diff --git a/data/eval/foyle/block_to_execute_trace_id.md b/data/eval/foyle/block_to_execute_trace_id.md deleted file mode 100644 index c970bf6d..00000000 --- a/data/eval/foyle/block_to_execute_trace_id.md +++ /dev/null @@ -1,5 +0,0 @@ -Get the ids of the execution traces for block 01HZ0W9X2XF914XMG6REX1WVWG - -```sh {"id":"01HZ2HPXBNPNZXBVZZDX1JYEYS"} -curl http://localhost:8080/api/blocklogs/01HZ0W9X2XF914XMG6REX1WVWG | jq .execTraceIds -``` \ No newline at end of file diff --git a/data/eval/foyle/block_to_generate_trace_id.md b/data/eval/foyle/block_to_generate_trace_id.md deleted file mode 100644 index 8dae6147..00000000 --- a/data/eval/foyle/block_to_generate_trace_id.md +++ /dev/null @@ -1,5 +0,0 @@ -Get the trace that generated block 01HZ0W9X2XF914XMG6REX1WVWG - -```sh {"id":"01HZ2HMT3M3SCDKRDDQJ9B225S"} -curl http://localhost:8080/api/blocklogs/01HZ0W9X2XF914XMG6REX1WVWG | jq .genTraceId -``` \ No newline at end of file diff --git a/data/eval/foyle/eval_size.md b/data/eval/foyle/eval_size.md deleted file mode 100644 index 91a1f579..00000000 --- a/data/eval/foyle/eval_size.md +++ /dev/null @@ -1,5 +0,0 @@ -How big is foyle's evaluation data set? - -```sh {"id":"01HZ2HSY4ME2N3PNYMGC8VV871"} -find ~/git_foyle/data -name "*.md" | wc -l -``` \ No newline at end of file diff --git a/data/eval/foyle/extension_version.md b/data/eval/foyle/extension_version.md deleted file mode 100644 index 0eb43f0e..00000000 --- a/data/eval/foyle/extension_version.md +++ /dev/null @@ -1,5 +0,0 @@ -What version of RunMe is installed? 
- -```sh {"id":"01HZ31FENV04M6B32S2VGT7HVY"} -code --list-extensions --show-versions | grep stateful.runme -``` \ No newline at end of file diff --git a/data/eval/foyle/foyle_serve.md b/data/eval/foyle/foyle_serve.md deleted file mode 100644 index ea755832..00000000 --- a/data/eval/foyle/foyle_serve.md +++ /dev/null @@ -1,5 +0,0 @@ -Start foyle - -```sh {"id":"01HZ2JH67WNWZT68QMNGBSWS4E"} -foyle serve -``` \ No newline at end of file diff --git a/data/eval/foyle/foyle_train.md b/data/eval/foyle/foyle_train.md deleted file mode 100644 index 1585d031..00000000 --- a/data/eval/foyle/foyle_train.md +++ /dev/null @@ -1,6 +0,0 @@ -Run foyle training - -```sh {"id":"01HZ2HDG52NE65Q4R0RN0113MJ"} -foyle logs process -foyle learn -``` \ No newline at end of file diff --git a/data/eval/foyle/how_man_tokens.md b/data/eval/foyle/how_man_tokens.md deleted file mode 100644 index b08b8730..00000000 --- a/data/eval/foyle/how_man_tokens.md +++ /dev/null @@ -1,5 +0,0 @@ -How many characters are in the foyle codebase? - -```sh {"id":"01HZ50XMEPVBMNVDQYVZTF16S1"} -find ~/git_foyle/ -name "*.go" | xargs cat | wc -c -``` \ No newline at end of file diff --git a/data/eval/foyle/releases.md b/data/eval/foyle/releases.md deleted file mode 100644 index 39476f9f..00000000 --- a/data/eval/foyle/releases.md +++ /dev/null @@ -1,6 +0,0 @@ -What's the latest release of foyle? 
- -```sh {"id":"01HZ5YW9V71ADTGKER90FV7VWK"} -cd ~/git_foyle -gh release list -``` \ No newline at end of file diff --git a/data/eval/gcp/gcb_hydros.md b/data/eval/gcp/gcb_hydros.md deleted file mode 100755 index 29a7caaa..00000000 --- a/data/eval/gcp/gcb_hydros.md +++ /dev/null @@ -1,5 +0,0 @@ -Get the cloud build jobs for commit abc1234 - -```bash {"id":"01HZ2GPZR77SNCQ83KV7HB9Z2M"} -gcloud builds list --limit=10 --format="value(ID,createTime,duration,tags,logUrl,status)" --project=foyle-public --filter="tags:commit-abc1234" -``` diff --git a/data/eval/gcp/gcb_hydros_by_image.md b/data/eval/gcp/gcb_hydros_by_image.md deleted file mode 100755 index 619aecf7..00000000 --- a/data/eval/gcp/gcb_hydros_by_image.md +++ /dev/null @@ -1,5 +0,0 @@ -List the GCB jobs that build image backend/caribou - -```bash {"id":"01HZ2GP8BFCQ5BKGHEYRFW44YD"} -gcloud builds list --limit=10 --filter='tags="us-west1-docker.pkg.dev_foyle-public_images_backend_caribou"' --format="value(ID,createTime,duration,tags,logUrl,status)" --project=foyle-public -``` diff --git a/data/eval/gcp/gcb_list.md b/data/eval/gcp/gcb_list.md deleted file mode 100755 index bcda6b92..00000000 --- a/data/eval/gcp/gcb_list.md +++ /dev/null @@ -1,5 +0,0 @@ -List the most recent image builds - -```bash {"id":"01HZ2GQWVPYWJD70GWFPBMT9ZF"} -gcloud builds list --project=foyle-public -``` diff --git a/data/eval/gcp/gke_list.md b/data/eval/gcp/gke_list.md deleted file mode 100755 index 8aa249e2..00000000 --- a/data/eval/gcp/gke_list.md +++ /dev/null @@ -1,5 +0,0 @@ -Describe the dev cluster? 
- -```bash {"id":"01HZ2GR94SQ241XAPBBEARWEF1"} -gcloud container clusters describe --region=us-west1 --project=foyle-dev dev -``` diff --git a/data/eval/gcp/gpu_quota.md b/data/eval/gcp/gpu_quota.md deleted file mode 100755 index cfc4e8ff..00000000 --- a/data/eval/gcp/gpu_quota.md +++ /dev/null @@ -1,5 +0,0 @@ -Check for preemptible A100 quota in us-central1 - -```bash {"id":"01HZ2GRHWRZG8BT9CANA3G2RH6"} -gcloud compute regions describe us-central1 --format=json | jq '.quotas[] | select(.metric | contains("NVIDIA_A100"))' -``` diff --git a/data/eval/gcp/images_list.md b/data/eval/gcp/images_list.md deleted file mode 100755 index 850fa06f..00000000 --- a/data/eval/gcp/images_list.md +++ /dev/null @@ -1,5 +0,0 @@ -List images - -```bash {"id":"01HZ2GS0RV84FTSZS6ARARW7TH"} -gcloud artifacts docker images list --sort-by=~create_time --include-tags us-west1-docker.pkg.dev/foyle-public/images -``` diff --git a/data/eval/git/pr_description.md b/data/eval/git/pr_description.md deleted file mode 100755 index 1f2499ca..00000000 --- a/data/eval/git/pr_description.md +++ /dev/null @@ -1,5 +0,0 @@ -Create a PR description - -```bash {"id":"01HZ2GSBYQ0JDE5CM5NEWNDV2Z"} -git diff origin/main | llm --model=gpt-4-0125-preview -s "Create a PR description from the following diff" -``` diff --git a/data/eval/git/tag_vscode_web.md b/data/eval/git/tag_vscode_web.md deleted file mode 100755 index db751846..00000000 --- a/data/eval/git/tag_vscode_web.md +++ /dev/null @@ -1,4 +0,0 @@ -Add the tag 6f19eac45ccb88cc176776ea79411f834a12a575 to the image ghcr.io/jlewi/vscode-web-assets:v20240403t185418 -```bash -gcrane tag ghcr.io/jlewi/vscode-web-assets:v20240403t185418 6f19eac45ccb88cc176776ea79411f834a12a575 -``` diff --git a/data/eval/gitops/flux.md b/data/eval/gitops/flux.md deleted file mode 100644 index 8098c8ed..00000000 --- a/data/eval/gitops/flux.md +++ /dev/null @@ -1,5 +0,0 @@ -Sync the manifests to the dev cluster - -```sh {"id":"01HZ315CWSZPNMJCRJJFMB4S8S"} -flux reconcile 
kustomization dev-cluster --with-source -``` \ No newline at end of file diff --git a/data/eval/honeycomb/nl_to_query.md b/data/eval/honeycomb/nl_to_query.md deleted file mode 100755 index 678a2ec3..00000000 --- a/data/eval/honeycomb/nl_to_query.md +++ /dev/null @@ -1,4 +0,0 @@ -Generate a honeycomb query to count the number of traces for the last 7 days broken down by region in the foyle dataset -```bash -/Users/jlewi/git_hccli/hccli nltoq --nlq=\"Count the number of traces for the last 7 days broken down by regions\" --dataset=foyle -``` diff --git a/data/eval/hydros/image_logs.md b/data/eval/hydros/image_logs.md deleted file mode 100755 index be07569c..00000000 --- a/data/eval/hydros/image_logs.md +++ /dev/null @@ -1,4 +0,0 @@ -Get the logs for building the image carabou -```bash -gcloud logging read 'logName="projects/foyle-dev/logs/hydros" jsonPayload.image="carabou"' --freshness=1d --project=foyle-dev -``` diff --git a/data/eval/iac/pulumi_drift.md b/data/eval/iac/pulumi_drift.md deleted file mode 100644 index 765febe2..00000000 --- a/data/eval/iac/pulumi_drift.md +++ /dev/null @@ -1,6 +0,0 @@ -Show any drift in the dev infrastructure - -```sh {"id":"01HZ2J3RT6G1NY3YX4H0TW5VX8"} -pulumi -C /Users/jlewi/git_foyle/iac/dev refresh -y -pulumi -C /Users/jlewi/git_foyle/iac/dev preview --diff -``` \ No newline at end of file diff --git a/data/eval/iac/pulumi_up.md b/data/eval/iac/pulumi_up.md deleted file mode 100755 index 66488162..00000000 --- a/data/eval/iac/pulumi_up.md +++ /dev/null @@ -1,6 +0,0 @@ -Sync the dev infra - -```bash {"id":"01HZ2GTQAA8ECXR36P75EJBGK4"} -pulumi -C /Users/jlewi/git_foyle/iac/dev refresh -y -pulumi -C /Users/jlewi/git_foyle/iac/dev up --diff --skip-preview --non-interactive -``` diff --git a/data/eval/istio/istio.md b/data/eval/istio/istio.md deleted file mode 100644 index 2749c690..00000000 --- a/data/eval/istio/istio.md +++ /dev/null @@ -1,5 +0,0 @@ -Dump the istio routes for the pod jupyter in namespace kubeflow - -```sh 
{"id":"01HZ312QN5N9HMJR86Y9VATT04"} -istioctl proxy-config route jupyter.kubeflow -``` \ No newline at end of file diff --git a/data/eval/replicate/push.md b/data/eval/replicate/push.md deleted file mode 100755 index 07891e8d..00000000 --- a/data/eval/replicate/push.md +++ /dev/null @@ -1,4 +0,0 @@ -Push the honeycomb nl to query model to replicate -```bash -cog push r8.im/jlewi/honeycomb -``` diff --git a/data/eval/replicate/secret.md b/data/eval/replicate/secret.md deleted file mode 100644 index 4d1207af..00000000 --- a/data/eval/replicate/secret.md +++ /dev/null @@ -1,5 +0,0 @@ -Fetch the replicate API token - -```sh {"id":"01HZ318TVX3C2ET61QQXGHYTDG"} -gcloud secrets versions access latest --secret=replicate-api-token --project=foyle-dev -``` \ No newline at end of file diff --git a/data/eval/runme/logs.md b/data/eval/runme/logs.md deleted file mode 100644 index c5eb157c..00000000 --- a/data/eval/runme/logs.md +++ /dev/null @@ -1,5 +0,0 @@ -Check the runme logs for an execution for the block 01HYZXS2Q5XYX7P3PT1KH5Q881 - -```sh {"id":"01HYZY4V5NNQN5Q2RDZ5G15JY8"} -grep -r 01HYZXS2Q5XYX7P3PT1KH5Q881 "/Users/jlewi/Library/Application Support/runme/logs/" -``` \ No newline at end of file diff --git a/developer_guides/eval.md b/developer_guides/eval.md deleted file mode 100644 index 05e990a8..00000000 --- a/developer_guides/eval.md +++ /dev/null @@ -1,99 +0,0 @@ -## Running Evaluation - -## Running Level 1 Evaluation - -Level 1 evaluations are assertions that run on AI responses. - -To evaluate changes to the agent first setup an instance of the agent with the changes you want. -Be sure to configure it so that it stores logs and responses in a different directory than your production -agent because you don't want the evaluation data to contaminate the learning process. 
- -```sh {"id":"01J4DJT0G24YH9K4F8YRTSZD8N"} -export REPOROOT=$(git rev-parse --show-toplevel) -export RUNDIR=${REPOROOT}/experiments/runs/$(date +%Y%m%d_%H%M%S) -echo "Using run directory: ${RUNDIR}" -``` - -### Setup the configuration for the agent in this run - -```sh {"id":"01J4DKE3M85ETKNHFH4G0HT0M6"} -mkdir -p ${RUNDIR} -cp ~/.foyle/config.yaml ${RUNDIR}/config.yaml -``` - -* Adjust the ports used by the agent to avoid conflicts with the production agent - -```sh {"id":"01J4DKK0N36XN2HV4GQK7YRXCC"} -yq e '.server.httpPort = 55080' -i ${RUNDIR}/config.yaml -yq e '.server.grpcPort = 55090' -i ${RUNDIR}/config.yaml -``` - -* We make a copy of the training directory to a new directory for this evaluation run. - -```sh {"id":"01J4DKP9P59GCGNG6QXX6KR9AF"} -cp -r ~/.foyle/training ${RUNDIR}/ -``` - -```sh {"id":"01J4DKQXXB8P7CV7VS4YS5DHDD"} -yq e ".learner.exampleDirs=[\"${RUNDIR}/training\"]" -i ${RUNDIR}/config.yaml -``` - -* Remove the RunMe directory for the extra log directory -* We don't want to reprocess RunMe logs -* Since we aren't actually using the Frontend there are no RunMe logs to process anyway - -```sh {"id":"01J4F79ZE8YAAKV252G2T7XD25"} -yq e ".learner.logDirs=[]" -i ${RUNDIR}/config.yaml -``` - -* configure the assertions - -```sh {"id":"01J4F896JP8FZ3N8BGVPZ7VHJ4"} -cp -f ${REPOROOT}/experiments/assertions.yaml ${RUNDIR}/assertions.yaml -yq e ".spec.agentAddress=http://localhost:55080/api" -i ${RUNDIR}/assertions.yaml -yq e ".spec.dbDir=\"${RUNDIR}/evalDB\"" -i ${RUNDIR}/assertions.yaml - -``` - -### Run the agent - -* Start the agent containing the changes you want to evaluate - -```sh {"id":"01J4DM107F0GJWJKFV4P77TAQY"} -cd ${REPOROOT}/app -export CONFIGFILE=${RUNDIR}/config.yaml -go run github.com/jlewi/foyle/app serve --config=${CONFIGFILE} -``` - -### Run evaluation driver - -```sh {"id":"01J4F8KQ7N5DE3JQRX33T60BB0"} -cd ${REPOROOT}/app -export CONFIGFILE=${RUNDIR}/config.yaml -go run github.com/jlewi/foyle/app apply --config=${CONFIGFILE} 
${RUNDIR}/assertions.yaml -``` - -### Analyze the results - -```sh {"id":"01J4HN72G5EY98MYPCZG7V02WZ","interactive":"false","mimeType":"application/json"} -curl -s -H "Content-Type: application/json" http://localhost:55080/api/EvalService/AssertionTable -d "{\"database\":\"${RUNDIR}/evalDB\"}" | jq .rows -``` - -## Run baseline experiment - -```sh {"id":"01HZ38BC6WJF5RB9ZYTXBJE38M"} -foyle apply ~/git_foyle/experiments/norag.yaml -``` - -## Run experiment with RAG - -```sh {"id":"01HZ38QWPZ565XH11CCKYCF1M7"} -foyle apply ~/git_foyle/experiments/rag.yaml -``` - -### Adding Level 1 Evals - -1. Define the Assertion in [eval/assertions.go](../app/pkg/eval/assertions.go) -2. Update [Assertor in assertor.go](../app/pkg/eval/assertor.go) to include the new assertion -3. Update [AssertRow proto](../protos/eval/eval.proto) to include the new assertion -4. Update [toAssertionRow](../app/pkg/eval/service.go) to include the new assertion in `AssertRow` \ No newline at end of file diff --git a/docs/content/en/docs/contributor/_index.md b/docs/content/en/docs/contributor/_index.md new file mode 100644 index 00000000..484fec62 --- /dev/null +++ b/docs/content/en/docs/contributor/_index.md @@ -0,0 +1,7 @@ +--- +description: Documentation for contributors to Foyle +title: Contributor +weight: 20 +--- + +Documentation about contributing and developing Foyle. \ No newline at end of file diff --git a/docs/content/en/docs/contributor/assertions.md b/docs/content/en/docs/contributor/assertions.md new file mode 100644 index 00000000..8d14773d --- /dev/null +++ b/docs/content/en/docs/contributor/assertions.md @@ -0,0 +1,12 @@ +--- +description: Adding Level 1 Assertions +title: Adding Assertions +weight: 20 +--- + +## Adding Level 1 Evals + +1. Define the Assertion in [eval/assertions.go](../../../../../app/pkg/eval/assertions.go) +2. Update [Assertor in assertor.go](../app/pkg/eval/assertor.go) to include the new assertion +3. 
Update [AssertionRow proto](../../../../../protos/foyle/v1alpha1/eval.proto) to include the new assertion +4. Update [toAssertionRow](../../../../../app/pkg/eval/service.go) to include the new assertion in `AssertionRow` \ No newline at end of file diff --git a/docs/content/en/docs/contributor/eval.md b/docs/content/en/docs/contributor/eval.md new file mode 100644 index 00000000..b6f862de --- /dev/null +++ b/docs/content/en/docs/contributor/eval.md @@ -0,0 +1,110 @@ +--- +description: Documentation for contributors to Foyle +title: Evaluation +weight: 10 +--- + + +## What You'll Learn + +* How to set up and run experiments to evaluate the quality of the AI responses in Foyle + +## Produce an evaluation dataset + +In order to evaluate Foyle you need a dataset of examples that consist of notebooks and the expected cells to +be appended to the notebook. If you've been using Foyle then you can produce a dataset of examples from the logs. + +```bash +DATA_DIR= +curl -X POST http://localhost:8877/api/foyle.logs.SessionsService/DumpExamples -H "Content-Type: application/json" -d "{\"output\": \"$DATA_DIR\"}" +``` +This assumes you are running Foyle on the default port of 8877. If you are running Foyle on a different port you will need +to adjust the URL accordingly. + +Every time you execute a cell it is logged to Foyle. Foyle turns this into an example where the input is all the cells +in the notebook before the cell you executed and the output is the cell you executed. This allows us to evaluate +how well Foyle does at generating the executed cell given the preceding cells in the notebook. + +## Setup Foyle + +Create a Foyle configuration with the parameters you want to test. + +Create a directory to store the Foyle configuration. + +```bash +mkdir -p ${EXPERIMENTS_DIR}/${NAME} +``` + +Edit `${EXPERIMENTS_DIR}/${NAME}/config.yaml` to set the parameters you want to test; e.g. 
+ +* Assign a different port to the agent to avoid conflicting with other experiments or the production agent +* Configure the Model and Model Provider +* Configure RAG + +## Configure the Experiment + +Create the file `${EXPERIMENTS_DIR}/${NAME}/experiment.yaml` + +```yaml +kind: Experiment +apiVersion: foyle.io/v1alpha1 +metadata: + name: "gpt35" +spec: + evalDir: + agentAddress: "http://localhost:/api" + outputDB: "${EXPERIMENTS_DIR}/${NAME}/results.sqlite" +``` + +* Set evalDir to the directory where you dumped the sessions as evaluation examples +* Set agentAddress to the address of the agent you want to evaluate + * Use the port you assigned to the agent in `config.yaml` +* Set outputDB to the path of the sqlite database to store the results in + + +## Running the Experiment + +Start an instance of the agent with the configuration you want to evaluate. + +```bash +foyle serve --config=${EXPERIMENTS_DIR}/${NAME}/config.yaml +``` + +Run the experiment + +```bash +foyle apply ${EXPERIMENTS_DIR}/${NAME}/experiment.yaml +``` + +## Analyzing the Results + +You can use sqlite to query the results. 
+ +The queries below compute the following + +* The number of results in the dataset +* The number of results where an error prevented a response from being generated +* The distribution of the `cellsMatchResult` field in the results + * A value of `MATCH` indicates the generated cell matches the expected cell + * A value of `MISMATCH` indicates the generated cell doesn't match the expected cell + * A value of `""` (empty string) indicates no value was computed most likely because an error occurred + +```bash +# Count the total number of results +sqlite3 --header --column ${RESULTSDB} "SELECT COUNT(*) FROM results" + +# Count the number of errors +sqlite3 --header --column ${RESULTSDB} "SELECT COUNT(*) FROM results WHERE json_extract(proto_json, '$.error') IS NOT NULL" + +# Group results by cellsMatchResult +sqlite3 --header --column ${RESULTSDB} "SELECT json_extract(proto_json, '$.cellsMatchResult') as match_result, COUNT(*) as count FROM results GROUP BY match_result" +``` + +You can use the following query to look at the errors + +``` +sqlite3 ${RESULTSDB} "SELECT +id, +json_extract(proto_json, '$.error') as error +FROM results WHERE json_extract(proto_json, '$.error') IS NOT NULL;" +``` diff --git a/docs/content/en/docs/observability/ai.md b/docs/content/en/docs/observability/ai.md index 4f5ca3e8..da6dbc45 100644 --- a/docs/content/en/docs/observability/ai.md +++ b/docs/content/en/docs/observability/ai.md @@ -40,8 +40,8 @@ fi * **Note** There appears to be a bug right now in the HTML rendering causing a bunch of newlines to be introduced relative to what's in the actual markdown in the JSON request ```bash -jq -r '.responseHtml' /tmp/response.json > /tmp/response.html -cat /tmp/response.html +jq -r '.requestHtml' /tmp/response.json > /tmp/request.html +cat /tmp/request.html ``` * To view the response diff --git a/experiments/rag.yaml b/experiments/rag.yaml index 2e9bffad..e96f2186 100644 --- a/experiments/rag.yaml +++ b/experiments/rag.yaml @@ -6,8 +6,8 @@ 
metadata: spec: evalDir: /Users/jlewi/git_foyle/data/eval dbDir: /Users/jlewi/foyle_experiments/20250530-1612/learning - sheetID: "1iJbkdUSxEkEX24xMH2NYpxqYcM_7-0koSRuANccDAb8" - sheetName: "WithRAG" +# sheetID: "1iJbkdUSxEkEX24xMH2NYpxqYcM_7-0koSRuANccDAb8" +# sheetName: "WithRAG" agent: model: gpt-3.5-turbo-0125 rag: diff --git a/protos/foyle/v1alpha1/eval.proto b/protos/foyle/v1alpha1/eval.proto index 9ba66dc7..f043165a 100644 --- a/protos/foyle/v1alpha1/eval.proto +++ b/protos/foyle/v1alpha1/eval.proto @@ -4,7 +4,7 @@ import "foyle/v1alpha1/agent.proto"; import "foyle/v1alpha1/doc.proto"; import "foyle/v1alpha1/trainer.proto"; import "runme/parser/v1/parser.proto"; - +import "google/protobuf/timestamp.proto"; import "google/protobuf/struct.proto"; option go_package = "github.com/jlewi/foyle/protos/go/foyle/v1alpha1"; @@ -22,20 +22,20 @@ enum AssertResult { SKIPPED = 3; } +enum CellsMatchResult { + UNKNOWN_CellsMatchResult = 0; + MATCH = 1; + MISMATCH = 2; +} + // EvalResult represents an evaluation result message EvalResult { // Example is the answer and expected result - Example example = 1; + EvalExample example = 1; - // example_file is the file containing the example - string example_file = 2; + repeated runme.parser.v1.Cell actual_cells = 11; - // Actual response - repeated Block actual = 3; - - // The distance between the actual and expected response - int32 distance = 4; - float normalized_distance = 7; + // Error indicates an error generating the completion. 
string error = 5; // Status of the evaluation @@ -48,6 +48,24 @@ message EvalResult { RAGResult best_rag_result = 9; repeated Assertion assertions = 10; + + + // cells_match_result is the LLM judge's evaluation of whether the actual and expected response match + // We use an enum so we can encode unknown + CellsMatchResult cells_match_result = 12; + + // Explanation given by the LLM judge + string judge_explanation = 13; + + // Removed fields + // example_file is the file containing the example + // string example_file = 2; + // Actual response + // repeated Block actual = 3; + // The distance between the actual and expected response + // int32 distance = 4; + // float normalized_distance = 7; + reserved 2, 3, 4, 7; } message Assertion { @@ -94,7 +112,14 @@ message AssertionTableRequest { // EvalExample is a datapoint for evaluation message EvalExample { + // TODO(jeremy): Right now we are using the id to encode the sessionId that the eval example is associated with. + // Should we add a sessionId field and not make them the same? string id = 1; + + // time is the time corresponding to the example. 
+ // Examples need to be replayed in the same order they actually occurred to avoid cheatin + google.protobuf.Timestamp time = 4; + // FullContext is the context used as input FullContext full_context = 2; @@ -102,6 +127,7 @@ message EvalExample { repeated runme.parser.v1.Cell expected_cells = 3; } + message AssertionTableResponse { repeated AssertionRow rows = 1; } diff --git a/protos/go/foyle/logs/traces.zap.go b/protos/go/foyle/logs/traces.zap.go index d3c580cf..cf49d5c1 100644 --- a/protos/go/foyle/logs/traces.zap.go +++ b/protos/go/foyle/logs/traces.zap.go @@ -7,11 +7,11 @@ import ( fmt "fmt" math "math" proto "github.com/golang/protobuf/proto" - _ "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" _ "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" _ "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/runner/v1" _ "google.golang.org/protobuf/types/known/structpb" _ "google.golang.org/protobuf/types/known/timestamppb" + _ "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" go_uber_org_zap_zapcore "go.uber.org/zap/zapcore" github_com_golang_protobuf_ptypes "github.com/golang/protobuf/ptypes" ) diff --git a/protos/go/foyle/v1alpha1/eval.pb.go b/protos/go/foyle/v1alpha1/eval.pb.go index f2ba41c8..517e8523 100644 --- a/protos/go/foyle/v1alpha1/eval.pb.go +++ b/protos/go/foyle/v1alpha1/eval.pb.go @@ -11,6 +11,7 @@ import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" _ "google.golang.org/protobuf/types/known/structpb" + timestamppb "google.golang.org/protobuf/types/known/timestamppb" reflect "reflect" sync "sync" ) @@ -123,6 +124,55 @@ func (AssertResult) EnumDescriptor() ([]byte, []int) { return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{1} } +type CellsMatchResult int32 + +const ( + CellsMatchResult_UNKNOWN_CellsMatchResult CellsMatchResult = 0 + CellsMatchResult_MATCH CellsMatchResult = 1 + CellsMatchResult_MISMATCH CellsMatchResult = 2 +) + 
+// Enum value maps for CellsMatchResult. +var ( + CellsMatchResult_name = map[int32]string{ + 0: "UNKNOWN_CellsMatchResult", + 1: "MATCH", + 2: "MISMATCH", + } + CellsMatchResult_value = map[string]int32{ + "UNKNOWN_CellsMatchResult": 0, + "MATCH": 1, + "MISMATCH": 2, + } +) + +func (x CellsMatchResult) Enum() *CellsMatchResult { + p := new(CellsMatchResult) + *p = x + return p +} + +func (x CellsMatchResult) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (CellsMatchResult) Descriptor() protoreflect.EnumDescriptor { + return file_foyle_v1alpha1_eval_proto_enumTypes[2].Descriptor() +} + +func (CellsMatchResult) Type() protoreflect.EnumType { + return &file_foyle_v1alpha1_eval_proto_enumTypes[2] +} + +func (x CellsMatchResult) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use CellsMatchResult.Descriptor instead. +func (CellsMatchResult) EnumDescriptor() ([]byte, []int) { + return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{2} +} + // EvalResult represents an evaluation result type EvalResult struct { state protoimpl.MessageState @@ -130,15 +180,9 @@ type EvalResult struct { unknownFields protoimpl.UnknownFields // Example is the answer and expected result - Example *Example `protobuf:"bytes,1,opt,name=example,proto3" json:"example,omitempty"` - // example_file is the file containing the example - ExampleFile string `protobuf:"bytes,2,opt,name=example_file,json=exampleFile,proto3" json:"example_file,omitempty"` - // Actual response - Actual []*Block `protobuf:"bytes,3,rep,name=actual,proto3" json:"actual,omitempty"` - // The distance between the actual and expected response - Distance int32 `protobuf:"varint,4,opt,name=distance,proto3" json:"distance,omitempty"` - NormalizedDistance float32 `protobuf:"fixed32,7,opt,name=normalized_distance,json=normalizedDistance,proto3" json:"normalized_distance,omitempty"` - Error string 
`protobuf:"bytes,5,opt,name=error,proto3" json:"error,omitempty"` + Example *EvalExample `protobuf:"bytes,1,opt,name=example,proto3" json:"example,omitempty"` + ActualCells []*v1.Cell `protobuf:"bytes,11,rep,name=actual_cells,json=actualCells,proto3" json:"actual_cells,omitempty"` + Error string `protobuf:"bytes,5,opt,name=error,proto3" json:"error,omitempty"` // Status of the evaluation Status EvalResultStatus `protobuf:"varint,6,opt,name=status,proto3,enum=EvalResultStatus" json:"status,omitempty"` // The ID of the generate trace @@ -146,6 +190,11 @@ type EvalResult struct { // Best matching RAG result BestRagResult *RAGResult `protobuf:"bytes,9,opt,name=best_rag_result,json=bestRagResult,proto3" json:"best_rag_result,omitempty"` Assertions []*Assertion `protobuf:"bytes,10,rep,name=assertions,proto3" json:"assertions,omitempty"` + // cells_match_result is the LLM judge's evaluation of whether the actual and expected response match + // We use an enum so we can encode unknown + CellsMatchResult CellsMatchResult `protobuf:"varint,12,opt,name=cells_match_result,json=cellsMatchResult,proto3,enum=CellsMatchResult" json:"cells_match_result,omitempty"` + // Explanation given by the LLM judge + JudgeExplanation string `protobuf:"bytes,13,opt,name=judge_explanation,json=judgeExplanation,proto3" json:"judge_explanation,omitempty"` } func (x *EvalResult) Reset() { @@ -180,41 +229,20 @@ func (*EvalResult) Descriptor() ([]byte, []int) { return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{0} } -func (x *EvalResult) GetExample() *Example { +func (x *EvalResult) GetExample() *EvalExample { if x != nil { return x.Example } return nil } -func (x *EvalResult) GetExampleFile() string { - if x != nil { - return x.ExampleFile - } - return "" -} - -func (x *EvalResult) GetActual() []*Block { +func (x *EvalResult) GetActualCells() []*v1.Cell { if x != nil { - return x.Actual + return x.ActualCells } return nil } -func (x *EvalResult) GetDistance() int32 { - if x != nil { - return 
x.Distance - } - return 0 -} - -func (x *EvalResult) GetNormalizedDistance() float32 { - if x != nil { - return x.NormalizedDistance - } - return 0 -} - func (x *EvalResult) GetError() string { if x != nil { return x.Error @@ -250,6 +278,20 @@ func (x *EvalResult) GetAssertions() []*Assertion { return nil } +func (x *EvalResult) GetCellsMatchResult() CellsMatchResult { + if x != nil { + return x.CellsMatchResult + } + return CellsMatchResult_UNKNOWN_CellsMatchResult +} + +func (x *EvalResult) GetJudgeExplanation() string { + if x != nil { + return x.JudgeExplanation + } + return "" +} + type Assertion struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -565,7 +607,12 @@ type EvalExample struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // TODO(jeremy): Right now we are using the id to encode the sessionId that the eval example is associated with. + // Should we add a sessionId field and not make them the same? Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // time is the time corresponding to the example. 
+ // Examples need to be replayed in the same order they actually occurred to avoid cheatin + Time *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=time,proto3" json:"time,omitempty"` // FullContext is the context used as input FullContext *FullContext `protobuf:"bytes,2,opt,name=full_context,json=fullContext,proto3" json:"full_context,omitempty"` // Expected cells is the expected value for generation @@ -611,6 +658,13 @@ func (x *EvalExample) GetId() string { return "" } +func (x *EvalExample) GetTime() *timestamppb.Timestamp { + if x != nil { + return x.Time + } + return nil +} + func (x *EvalExample) GetFullContext() *FullContext { if x != nil { return x.FullContext @@ -683,91 +737,104 @@ var file_foyle_v1alpha1_eval_proto_rawDesc = []byte{ 0x6f, 0x1a, 0x1c, 0x66, 0x6f, 0x79, 0x6c, 0x65, 0x2f, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x31, 0x2f, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1c, 0x72, 0x75, 0x6e, 0x6d, 0x65, 0x2f, 0x70, 0x61, 0x72, 0x73, 0x65, 0x72, 0x2f, 0x76, 0x31, - 0x2f, 0x70, 0x61, 0x72, 0x73, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1c, 0x67, - 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x73, - 0x74, 0x72, 0x75, 0x63, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x83, 0x03, 0x0a, 0x0a, - 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x22, 0x0a, 0x07, 0x65, 0x78, - 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x08, 0x2e, 0x45, 0x78, - 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x52, 0x07, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x21, - 0x0a, 0x0c, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x02, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x46, 0x69, 0x6c, - 0x65, 0x12, 0x1e, 0x0a, 0x06, 0x61, 0x63, 0x74, 0x75, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x03, 0x28, - 0x0b, 0x32, 0x06, 0x2e, 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x52, 0x06, 0x61, 
0x63, 0x74, 0x75, 0x61, - 0x6c, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x69, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x18, 0x04, 0x20, - 0x01, 0x28, 0x05, 0x52, 0x08, 0x64, 0x69, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x12, 0x2f, 0x0a, - 0x13, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x64, 0x5f, 0x64, 0x69, 0x73, 0x74, - 0x61, 0x6e, 0x63, 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x02, 0x52, 0x12, 0x6e, 0x6f, 0x72, 0x6d, - 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x64, 0x44, 0x69, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x12, 0x14, - 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x12, 0x29, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x06, - 0x20, 0x01, 0x28, 0x0e, 0x32, 0x11, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, - 0x20, 0x0a, 0x0c, 0x67, 0x65, 0x6e, 0x5f, 0x74, 0x72, 0x61, 0x63, 0x65, 0x5f, 0x69, 0x64, 0x18, - 0x08, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x67, 0x65, 0x6e, 0x54, 0x72, 0x61, 0x63, 0x65, 0x49, - 0x64, 0x12, 0x32, 0x0a, 0x0f, 0x62, 0x65, 0x73, 0x74, 0x5f, 0x72, 0x61, 0x67, 0x5f, 0x72, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x18, 0x09, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0a, 0x2e, 0x52, 0x41, 0x47, - 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x0d, 0x62, 0x65, 0x73, 0x74, 0x52, 0x61, 0x67, 0x52, - 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x2a, 0x0a, 0x0a, 0x61, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, - 0x6f, 0x6e, 0x73, 0x18, 0x0a, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0a, 0x2e, 0x41, 0x73, 0x73, 0x65, - 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x0a, 0x61, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, - 0x73, 0x22, 0x5e, 0x0a, 0x09, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x12, - 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, - 0x6d, 0x65, 0x12, 0x25, 0x0a, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x0e, 
0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x52, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x64, 0x65, 0x74, - 0x61, 0x69, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x64, 0x65, 0x74, 0x61, 0x69, - 0x6c, 0x22, 0x33, 0x0a, 0x15, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, - 0x69, 0x73, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x61, - 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x64, 0x61, - 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x22, 0x3b, 0x0a, 0x16, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, - 0x12, 0x21, 0x0a, 0x05, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, - 0x0b, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x05, 0x69, 0x74, - 0x65, 0x6d, 0x73, 0x22, 0xa4, 0x02, 0x0a, 0x0c, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, - 0x6e, 0x52, 0x6f, 0x77, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x02, 0x69, 0x64, 0x12, 0x20, 0x0a, 0x0b, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x46, - 0x69, 0x6c, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x65, 0x78, 0x61, 0x6d, 0x70, - 0x6c, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x15, 0x0a, 0x06, 0x64, 0x6f, 0x63, 0x5f, 0x6d, 0x64, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x64, 0x6f, 0x63, 0x4d, 0x64, 0x12, 0x1b, 0x0a, - 0x09, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x5f, 0x6d, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x08, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x4d, 0x64, 0x12, 0x3d, 0x0a, 0x13, 0x63, 0x6f, - 0x64, 0x65, 0x5f, 0x61, 0x66, 0x74, 0x65, 0x72, 0x5f, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, - 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, - 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x11, 
0x63, 0x6f, 0x64, 0x65, 0x41, 0x66, 0x74, 0x65, - 0x72, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x12, 0x31, 0x0a, 0x0d, 0x6f, 0x6e, 0x65, - 0x5f, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x63, 0x65, 0x6c, 0x6c, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, - 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, - 0x0b, 0x6f, 0x6e, 0x65, 0x43, 0x6f, 0x64, 0x65, 0x43, 0x65, 0x6c, 0x6c, 0x12, 0x3c, 0x0a, 0x13, - 0x65, 0x6e, 0x64, 0x73, 0x5f, 0x77, 0x69, 0x74, 0x68, 0x5f, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x63, - 0x65, 0x6c, 0x6c, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, - 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x10, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, - 0x74, 0x68, 0x43, 0x6f, 0x64, 0x65, 0x43, 0x65, 0x6c, 0x6c, 0x22, 0x33, 0x0a, 0x15, 0x41, 0x73, - 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x61, 0x62, 0x6c, 0x65, 0x52, 0x65, 0x71, 0x75, - 0x65, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x22, - 0x8c, 0x01, 0x0a, 0x0b, 0x45, 0x76, 0x61, 0x6c, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x12, - 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, - 0x2f, 0x0a, 0x0c, 0x66, 0x75, 0x6c, 0x6c, 0x5f, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x18, - 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x46, 0x75, 0x6c, 0x6c, 0x43, 0x6f, 0x6e, 0x74, - 0x65, 0x78, 0x74, 0x52, 0x0b, 0x66, 0x75, 0x6c, 0x6c, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, - 0x12, 0x3c, 0x0a, 0x0e, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x63, 0x65, 0x6c, - 0x6c, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x15, 0x2e, 0x72, 0x75, 0x6e, 0x6d, 0x65, - 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x72, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x65, 0x6c, 0x6c, 0x52, - 0x0d, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x43, 0x65, 0x6c, 0x6c, 0x73, 
0x22, 0x3b, - 0x0a, 0x16, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x61, 0x62, 0x6c, 0x65, - 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, 0x04, 0x72, 0x6f, 0x77, 0x73, - 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, - 0x6f, 0x6e, 0x52, 0x6f, 0x77, 0x52, 0x04, 0x72, 0x6f, 0x77, 0x73, 0x2a, 0x47, 0x0a, 0x10, 0x45, - 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, - 0x1e, 0x0a, 0x1a, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, 0x45, 0x56, 0x41, 0x4c, 0x5f, - 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x10, 0x00, 0x12, - 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x45, 0x52, 0x52, - 0x4f, 0x52, 0x10, 0x02, 0x2a, 0x4d, 0x0a, 0x0c, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x14, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, - 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x10, 0x00, 0x12, 0x0a, - 0x0a, 0x06, 0x50, 0x41, 0x53, 0x53, 0x45, 0x44, 0x10, 0x01, 0x12, 0x0a, 0x0a, 0x06, 0x46, 0x41, - 0x49, 0x4c, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x53, 0x4b, 0x49, 0x50, 0x50, 0x45, - 0x44, 0x10, 0x03, 0x32, 0x8d, 0x01, 0x0a, 0x0b, 0x45, 0x76, 0x61, 0x6c, 0x53, 0x65, 0x72, 0x76, + 0x2f, 0x70, 0x61, 0x72, 0x73, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1f, 0x67, + 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x74, + 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1c, + 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, + 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xb7, 0x03, 0x0a, + 0x0a, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x26, 0x0a, 0x07, 0x65, + 0x78, 0x61, 0x6d, 0x70, 
0x6c, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x45, + 0x76, 0x61, 0x6c, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x52, 0x07, 0x65, 0x78, 0x61, 0x6d, + 0x70, 0x6c, 0x65, 0x12, 0x38, 0x0a, 0x0c, 0x61, 0x63, 0x74, 0x75, 0x61, 0x6c, 0x5f, 0x63, 0x65, + 0x6c, 0x6c, 0x73, 0x18, 0x0b, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x15, 0x2e, 0x72, 0x75, 0x6e, 0x6d, + 0x65, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x72, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x65, 0x6c, 0x6c, + 0x52, 0x0b, 0x61, 0x63, 0x74, 0x75, 0x61, 0x6c, 0x43, 0x65, 0x6c, 0x6c, 0x73, 0x12, 0x14, 0x0a, + 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, + 0x72, 0x6f, 0x72, 0x12, 0x29, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x06, 0x20, + 0x01, 0x28, 0x0e, 0x32, 0x11, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, + 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x20, + 0x0a, 0x0c, 0x67, 0x65, 0x6e, 0x5f, 0x74, 0x72, 0x61, 0x63, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x08, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x67, 0x65, 0x6e, 0x54, 0x72, 0x61, 0x63, 0x65, 0x49, 0x64, + 0x12, 0x32, 0x0a, 0x0f, 0x62, 0x65, 0x73, 0x74, 0x5f, 0x72, 0x61, 0x67, 0x5f, 0x72, 0x65, 0x73, + 0x75, 0x6c, 0x74, 0x18, 0x09, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0a, 0x2e, 0x52, 0x41, 0x47, 0x52, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x0d, 0x62, 0x65, 0x73, 0x74, 0x52, 0x61, 0x67, 0x52, 0x65, + 0x73, 0x75, 0x6c, 0x74, 0x12, 0x2a, 0x0a, 0x0a, 0x61, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x18, 0x0a, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0a, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, + 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x0a, 0x61, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x73, + 0x12, 0x3f, 0x0a, 0x12, 0x63, 0x65, 0x6c, 0x6c, 0x73, 0x5f, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5f, + 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x11, 0x2e, 0x43, + 0x65, 0x6c, 0x6c, 0x73, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 
0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, + 0x10, 0x63, 0x65, 0x6c, 0x6c, 0x73, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x12, 0x2b, 0x0a, 0x11, 0x6a, 0x75, 0x64, 0x67, 0x65, 0x5f, 0x65, 0x78, 0x70, 0x6c, 0x61, + 0x6e, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x10, 0x6a, 0x75, + 0x64, 0x67, 0x65, 0x45, 0x78, 0x70, 0x6c, 0x61, 0x6e, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4a, 0x04, + 0x08, 0x02, 0x10, 0x03, 0x4a, 0x04, 0x08, 0x03, 0x10, 0x04, 0x4a, 0x04, 0x08, 0x04, 0x10, 0x05, + 0x4a, 0x04, 0x08, 0x07, 0x10, 0x08, 0x22, 0x5e, 0x0a, 0x09, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, + 0x69, 0x6f, 0x6e, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x25, 0x0a, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, + 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x16, + 0x0a, 0x06, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, + 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x22, 0x33, 0x0a, 0x15, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, + 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, + 0x1a, 0x0a, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x22, 0x3b, 0x0a, 0x16, 0x45, + 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, 0x05, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x18, 0x01, + 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0b, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x52, 0x05, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x22, 0xa4, 0x02, 0x0a, 0x0c, 0x41, 0x73, 0x73, + 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x6f, 0x77, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, + 
0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x20, 0x0a, 0x0b, 0x65, 0x78, 0x61, + 0x6d, 0x70, 0x6c, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, + 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x15, 0x0a, 0x06, 0x64, + 0x6f, 0x63, 0x5f, 0x6d, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x64, 0x6f, 0x63, + 0x4d, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x5f, 0x6d, 0x64, 0x18, + 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x4d, 0x64, 0x12, + 0x3d, 0x0a, 0x13, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x61, 0x66, 0x74, 0x65, 0x72, 0x5f, 0x6d, 0x61, + 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x41, + 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x11, 0x63, 0x6f, 0x64, + 0x65, 0x41, 0x66, 0x74, 0x65, 0x72, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x12, 0x31, + 0x0a, 0x0d, 0x6f, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x63, 0x65, 0x6c, 0x6c, 0x18, + 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, + 0x73, 0x75, 0x6c, 0x74, 0x52, 0x0b, 0x6f, 0x6e, 0x65, 0x43, 0x6f, 0x64, 0x65, 0x43, 0x65, 0x6c, + 0x6c, 0x12, 0x3c, 0x0a, 0x13, 0x65, 0x6e, 0x64, 0x73, 0x5f, 0x77, 0x69, 0x74, 0x68, 0x5f, 0x63, + 0x6f, 0x64, 0x65, 0x5f, 0x63, 0x65, 0x6c, 0x6c, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, + 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x10, 0x65, + 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x43, 0x6f, 0x64, 0x65, 0x43, 0x65, 0x6c, 0x6c, 0x22, + 0x33, 0x0a, 0x15, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x61, 0x62, 0x6c, + 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x61, 0x74, 0x61, + 0x62, 0x61, 0x73, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61, + 0x62, 0x61, 0x73, 0x65, 0x22, 0xbc, 
0x01, 0x0a, 0x0b, 0x45, 0x76, 0x61, 0x6c, 0x45, 0x78, 0x61, + 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x02, 0x69, 0x64, 0x12, 0x2e, 0x0a, 0x04, 0x74, 0x69, 0x6d, 0x65, 0x18, 0x04, 0x20, 0x01, + 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x04, + 0x74, 0x69, 0x6d, 0x65, 0x12, 0x2f, 0x0a, 0x0c, 0x66, 0x75, 0x6c, 0x6c, 0x5f, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x78, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x46, 0x75, 0x6c, + 0x6c, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x52, 0x0b, 0x66, 0x75, 0x6c, 0x6c, 0x43, 0x6f, + 0x6e, 0x74, 0x65, 0x78, 0x74, 0x12, 0x3c, 0x0a, 0x0e, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, + 0x64, 0x5f, 0x63, 0x65, 0x6c, 0x6c, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x15, 0x2e, + 0x72, 0x75, 0x6e, 0x6d, 0x65, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x72, 0x2e, 0x76, 0x31, 0x2e, + 0x43, 0x65, 0x6c, 0x6c, 0x52, 0x0d, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x43, 0x65, + 0x6c, 0x6c, 0x73, 0x22, 0x3b, 0x0a, 0x16, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, + 0x54, 0x61, 0x62, 0x6c, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, + 0x04, 0x72, 0x6f, 0x77, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x41, 0x73, + 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x6f, 0x77, 0x52, 0x04, 0x72, 0x6f, 0x77, 0x73, + 0x2a, 0x47, 0x0a, 0x10, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x53, 0x74, + 0x61, 0x74, 0x75, 0x73, 0x12, 0x1e, 0x0a, 0x1a, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, + 0x45, 0x56, 0x41, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x54, + 0x55, 0x53, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x09, + 0x0a, 0x05, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, 0x02, 0x2a, 0x4d, 0x0a, 
0x0c, 0x41, 0x73, 0x73, + 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x14, 0x55, 0x4e, 0x4b, + 0x4e, 0x4f, 0x57, 0x4e, 0x5f, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x10, 0x00, 0x12, 0x0a, 0x0a, 0x06, 0x50, 0x41, 0x53, 0x53, 0x45, 0x44, 0x10, 0x01, 0x12, + 0x0a, 0x0a, 0x06, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x53, + 0x4b, 0x49, 0x50, 0x50, 0x45, 0x44, 0x10, 0x03, 0x2a, 0x49, 0x0a, 0x10, 0x43, 0x65, 0x6c, 0x6c, + 0x73, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x1c, 0x0a, 0x18, + 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, 0x43, 0x65, 0x6c, 0x6c, 0x73, 0x4d, 0x61, 0x74, + 0x63, 0x68, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x10, 0x00, 0x12, 0x09, 0x0a, 0x05, 0x4d, 0x41, + 0x54, 0x43, 0x48, 0x10, 0x01, 0x12, 0x0c, 0x0a, 0x08, 0x4d, 0x49, 0x53, 0x4d, 0x41, 0x54, 0x43, + 0x48, 0x10, 0x02, 0x32, 0x8d, 0x01, 0x0a, 0x0b, 0x45, 0x76, 0x61, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x39, 0x0a, 0x04, 0x4c, 0x69, 0x73, 0x74, 0x12, 0x16, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, @@ -795,48 +862,50 @@ func file_foyle_v1alpha1_eval_proto_rawDescGZIP() []byte { return file_foyle_v1alpha1_eval_proto_rawDescData } -var file_foyle_v1alpha1_eval_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_foyle_v1alpha1_eval_proto_enumTypes = make([]protoimpl.EnumInfo, 3) var file_foyle_v1alpha1_eval_proto_msgTypes = make([]protoimpl.MessageInfo, 8) var file_foyle_v1alpha1_eval_proto_goTypes = []interface{}{ (EvalResultStatus)(0), // 0: EvalResultStatus (AssertResult)(0), // 1: AssertResult - (*EvalResult)(nil), // 2: EvalResult - (*Assertion)(nil), // 3: Assertion - (*EvalResultListRequest)(nil), // 4: EvalResultListRequest - (*EvalResultListResponse)(nil), // 5: 
EvalResultListResponse - (*AssertionRow)(nil), // 6: AssertionRow - (*AssertionTableRequest)(nil), // 7: AssertionTableRequest - (*EvalExample)(nil), // 8: EvalExample - (*AssertionTableResponse)(nil), // 9: AssertionTableResponse - (*Example)(nil), // 10: Example - (*Block)(nil), // 11: Block + (CellsMatchResult)(0), // 2: CellsMatchResult + (*EvalResult)(nil), // 3: EvalResult + (*Assertion)(nil), // 4: Assertion + (*EvalResultListRequest)(nil), // 5: EvalResultListRequest + (*EvalResultListResponse)(nil), // 6: EvalResultListResponse + (*AssertionRow)(nil), // 7: AssertionRow + (*AssertionTableRequest)(nil), // 8: AssertionTableRequest + (*EvalExample)(nil), // 9: EvalExample + (*AssertionTableResponse)(nil), // 10: AssertionTableResponse + (*v1.Cell)(nil), // 11: runme.parser.v1.Cell (*RAGResult)(nil), // 12: RAGResult - (*FullContext)(nil), // 13: FullContext - (*v1.Cell)(nil), // 14: runme.parser.v1.Cell + (*timestamppb.Timestamp)(nil), // 13: google.protobuf.Timestamp + (*FullContext)(nil), // 14: FullContext } var file_foyle_v1alpha1_eval_proto_depIdxs = []int32{ - 10, // 0: EvalResult.example:type_name -> Example - 11, // 1: EvalResult.actual:type_name -> Block + 9, // 0: EvalResult.example:type_name -> EvalExample + 11, // 1: EvalResult.actual_cells:type_name -> runme.parser.v1.Cell 0, // 2: EvalResult.status:type_name -> EvalResultStatus 12, // 3: EvalResult.best_rag_result:type_name -> RAGResult - 3, // 4: EvalResult.assertions:type_name -> Assertion - 1, // 5: Assertion.result:type_name -> AssertResult - 2, // 6: EvalResultListResponse.items:type_name -> EvalResult - 1, // 7: AssertionRow.code_after_markdown:type_name -> AssertResult - 1, // 8: AssertionRow.one_code_cell:type_name -> AssertResult - 1, // 9: AssertionRow.ends_with_code_cell:type_name -> AssertResult - 13, // 10: EvalExample.full_context:type_name -> FullContext - 14, // 11: EvalExample.expected_cells:type_name -> runme.parser.v1.Cell - 6, // 12: AssertionTableResponse.rows:type_name -> 
AssertionRow - 4, // 13: EvalService.List:input_type -> EvalResultListRequest - 7, // 14: EvalService.AssertionTable:input_type -> AssertionTableRequest - 5, // 15: EvalService.List:output_type -> EvalResultListResponse - 9, // 16: EvalService.AssertionTable:output_type -> AssertionTableResponse - 15, // [15:17] is the sub-list for method output_type - 13, // [13:15] is the sub-list for method input_type - 13, // [13:13] is the sub-list for extension type_name - 13, // [13:13] is the sub-list for extension extendee - 0, // [0:13] is the sub-list for field type_name + 4, // 4: EvalResult.assertions:type_name -> Assertion + 2, // 5: EvalResult.cells_match_result:type_name -> CellsMatchResult + 1, // 6: Assertion.result:type_name -> AssertResult + 3, // 7: EvalResultListResponse.items:type_name -> EvalResult + 1, // 8: AssertionRow.code_after_markdown:type_name -> AssertResult + 1, // 9: AssertionRow.one_code_cell:type_name -> AssertResult + 1, // 10: AssertionRow.ends_with_code_cell:type_name -> AssertResult + 13, // 11: EvalExample.time:type_name -> google.protobuf.Timestamp + 14, // 12: EvalExample.full_context:type_name -> FullContext + 11, // 13: EvalExample.expected_cells:type_name -> runme.parser.v1.Cell + 7, // 14: AssertionTableResponse.rows:type_name -> AssertionRow + 5, // 15: EvalService.List:input_type -> EvalResultListRequest + 8, // 16: EvalService.AssertionTable:input_type -> AssertionTableRequest + 6, // 17: EvalService.List:output_type -> EvalResultListResponse + 10, // 18: EvalService.AssertionTable:output_type -> AssertionTableResponse + 17, // [17:19] is the sub-list for method output_type + 15, // [15:17] is the sub-list for method input_type + 15, // [15:15] is the sub-list for extension type_name + 15, // [15:15] is the sub-list for extension extendee + 0, // [0:15] is the sub-list for field type_name } func init() { file_foyle_v1alpha1_eval_proto_init() } @@ -950,7 +1019,7 @@ func file_foyle_v1alpha1_eval_proto_init() { File: 
protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_foyle_v1alpha1_eval_proto_rawDesc, - NumEnums: 2, + NumEnums: 3, NumMessages: 8, NumExtensions: 0, NumServices: 1, diff --git a/protos/go/foyle/v1alpha1/eval.zap.go b/protos/go/foyle/v1alpha1/eval.zap.go index dfa80def..571d7534 100644 --- a/protos/go/foyle/v1alpha1/eval.zap.go +++ b/protos/go/foyle/v1alpha1/eval.zap.go @@ -8,8 +8,10 @@ import ( math "math" proto "github.com/golang/protobuf/proto" _ "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/parser/v1" + _ "google.golang.org/protobuf/types/known/timestamppb" _ "google.golang.org/protobuf/types/known/structpb" go_uber_org_zap_zapcore "go.uber.org/zap/zapcore" + github_com_golang_protobuf_ptypes "github.com/golang/protobuf/ptypes" ) // Reference imports to suppress errors if they are not otherwise used. @@ -33,12 +35,9 @@ func (m *EvalResult) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) } } - keyName = "example_file" // field example_file = 2 - enc.AddString(keyName, m.ExampleFile) - - keyName = "actual" // field actual = 3 + keyName = "actual_cells" // field actual_cells = 11 enc.AddArray(keyName, go_uber_org_zap_zapcore.ArrayMarshalerFunc(func(aenc go_uber_org_zap_zapcore.ArrayEncoder) error { - for _, rv := range m.Actual { + for _, rv := range m.ActualCells { _ = rv if rv != nil { var vv interface{} = rv @@ -50,12 +49,6 @@ func (m *EvalResult) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) return nil })) - keyName = "distance" // field distance = 4 - enc.AddInt32(keyName, m.Distance) - - keyName = "normalized_distance" // field normalized_distance = 7 - enc.AddFloat32(keyName, m.NormalizedDistance) - keyName = "error" // field error = 5 enc.AddString(keyName, m.Error) @@ -87,6 +80,12 @@ func (m *EvalResult) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) return nil })) + keyName = "cells_match_result" // field cells_match_result = 12 + enc.AddString(keyName, 
m.CellsMatchResult.String()) + + keyName = "judge_explanation" // field judge_explanation = 13 + enc.AddString(keyName, m.JudgeExplanation) + return nil } @@ -206,6 +205,11 @@ func (m *EvalExample) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder keyName = "id" // field id = 1 enc.AddString(keyName, m.Id) + keyName = "time" // field time = 4 + if t, err := github_com_golang_protobuf_ptypes.Timestamp(m.Time); err == nil { + enc.AddTime(keyName, t) + } + keyName = "full_context" // field full_context = 2 if m.FullContext != nil { var vv interface{} = m.FullContext