Skip to content

Commit

Permalink
Learning Bug Fix: Need to enqueue blocks after execution events (#241)
Browse files Browse the repository at this point in the history
* We need to enqueue blocks in the Learner when we detect an execution
event
* If we don't then blocks don't get checked to see if we can use them
for learning.
* The reason we were occasionally seeing learning occur without this is
there's some small probabity
* The generation event for the block will be processed by an Analyzer
and enqueue the item in the learner
   * The execution event is processed by the Analyzer
   * The learner reconciles the generation event
* i.e. since the Analyzer and learner run in parallel its possible the
execution event gets detected before the learner has had a chance to
handle the block in which case the execution event will be part of the
blocklog when it is processed by the learner.

* We shouldn't notify the learner after processing a log entry about how
a block was generated.
* This is a legacy from when block generation was logged in Foyle logs
but cell executions were logged in RunMe logs
* In that case we didn't have any guarantee in the order in which block
generation and execution events would be processed
* Now that execution events are logged in Foyle Logs we can be
reasonably assured that log entries for cell execution will be logged
after the log for how the cell was generated. Therefore we can wait
until encountering an execution event to process the block in the
learner.

* This is most likely the most recent bug in #215
  • Loading branch information
jlewi authored Sep 14, 2024
1 parent 8ce6e49 commit e347283
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 20 deletions.
21 changes: 11 additions & 10 deletions app/pkg/analyze/analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ type Analyzer struct {

watcher *fsnotify.Watcher

blockNotifier PostBlockEvent
learnNotifier PostBlockEvent

handleLogFileIsDone sync.WaitGroup
handleBlocksIsDone sync.WaitGroup
Expand Down Expand Up @@ -140,10 +140,10 @@ type blockItem struct {
type PostBlockEvent func(id string) error

// Run runs the analyzer; continually processing logs.
// blockNotifier is an optional function that will be called when a block is updated.
// learnNotifier is an optional function that will be called when a block is updated.
// This should be non blocking.
func (a *Analyzer) Run(ctx context.Context, logDirs []string, blockNotifier PostBlockEvent) error {
a.blockNotifier = blockNotifier
func (a *Analyzer) Run(ctx context.Context, logDirs []string, learnNotifier PostBlockEvent) error {
a.learnNotifier = learnNotifier
// Find all the current files
jsonFiles, err := findLogFilesInDirs(ctx, logDirs)
if err != nil {
Expand Down Expand Up @@ -447,6 +447,12 @@ func (a *Analyzer) processLogEvent(ctx context.Context, entry *api.LogEntry) {
}); err != nil {
log.Error(err, "Failed to update block with execution", "blockId", bid)
}
// We need to enqueue the block for processing since it was executed.
if a.learnNotifier != nil {
if err := a.learnNotifier(bid); err != nil {
log.Error(err, "Error notifying block event", "blockId", bid)
}
}
case v1alpha1.LogEventType_ACCEPTED:
fallthrough
case v1alpha1.LogEventType_REJECTED:
Expand Down Expand Up @@ -637,12 +643,7 @@ func (a *Analyzer) handleBlockEvents(ctx context.Context) {
return buildBlockLog(ctx, block, a.tracesDB)
})
if err != nil {
log.Error(err, "Error processing block", "block", blockItem.id)
}
if a.blockNotifier != nil {
if err := a.blockNotifier(blockItem.id); err != nil {
log.Error(err, "Error notifying block event", "block", blockItem.id)
}
log.Error(err, "Error processing block", "blockId", blockItem.id)
}
if a.signalBlockDone != nil {
a.signalBlockDone <- blockItem.id
Expand Down
24 changes: 24 additions & 0 deletions app/pkg/learn/learner.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import (
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/jlewi/monogo/files"
"github.com/jlewi/monogo/helpers"

Expand All @@ -30,6 +33,21 @@ const (
fileSuffix = ".example.binpb"
)

var (
enqueuedCounter = promauto.NewCounter(prometheus.CounterOpts{
Name: "learner_enqueued_total",
Help: "Total number of enqueued blocks",
})

cellsProcessed = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "learner_blocks_processed",
Help: "Number of blocks processed by the learner",
},
[]string{"status"},
)
)

// Learner handles the learn loop to learn from past mistakes.
//
// TODO(jeremy): Should we call this a trainer?
Expand Down Expand Up @@ -74,6 +92,7 @@ func (l *Learner) Enqueue(id string) error {
return errors.New("Queue is shutting down; can't enqueue anymore items")
}
l.queue.Add(id)
enqueuedCounter.Inc()
return nil
}

Expand Down Expand Up @@ -130,26 +149,31 @@ func (l *Learner) Reconcile(ctx context.Context, id string) error {

if b.ExecutedBlock == nil {
// Skip unexecuted block
cellsProcessed.WithLabelValues("unexecuted").Inc()
return nil
}

if b.GeneratedBlock == nil {
// Block wasn't the result of AI generation
cellsProcessed.WithLabelValues("notgenerated").Inc()
return nil
}

if b.EvalMode {
log.V(logs.Debug).Info("Skipping block which was created as part of an eval", "id", b.GetId())
cellsProcessed.WithLabelValues("eval").Inc()
return nil
}

// TODO(jeremy): Should we use some sort of distance metric? e.g. edit distance? We could potentially
// Use the metric used for eval.
if strings.TrimSpace(b.ExecutedBlock.GetContents()) == strings.TrimSpace(b.GeneratedBlock.GetContents()) {
log.V(logs.Debug).Info("Skipping executed block which matches generated block", "id", b.GetId())
cellsProcessed.WithLabelValues("nochange").Inc()
return nil
}

cellsProcessed.WithLabelValues("learn").Inc()
expectedFiles := l.getExampleFiles(b.GetId())

log.Info("Found new training example", "blockId", b.GetId())
Expand Down
14 changes: 14 additions & 0 deletions app/pkg/server/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package server

import (
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

// prometheusHandler wraps the promhttp.Handler() and returns a Gin handler
func prometheusHandler() gin.HandlerFunc {
h := promhttp.Handler()
return func(c *gin.Context) {
h.ServeHTTP(c.Writer, c.Request)
}
}
2 changes: 2 additions & 0 deletions app/pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ func (s *Server) createGinEngine() error {

// TODO(jeremy): Should we turn this into a protobuf service and use connect?
router.GET("/healthz", s.healthCheck)
// Add the Prometheus metrics endpoint
router.GET("/metrics", prometheusHandler())
router.NoRoute(func(c *gin.Context) {
log.Info("Request for not found path", "path", c.Request.URL.Path)
c.JSON(http.StatusNotFound, gin.H{"message": "Not found", "path": c.Request.URL.Path})
Expand Down
40 changes: 30 additions & 10 deletions docs/content/en/docs/learning/troubleshoot_learning.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,11 @@ To get the cell ID for a given cell
2. Find code block containing your cell
3. Your cell will contain metadata which contains the ID e.g.

````
```bash {"id":"01J6DG428ER427GJNTKC15G6JM"}
CELLID=01J6DFZ9V3R29TCDEY2AYA8XTV
curl http://localhost:8080/api/blocklogs/${CELLID} | jq .
```sh
` ` `bash {"id":"01J6DG428ER427GJNTKC15G6JM"}
echo hello world
` ` `
```
````

## Did Block Logs Get Created

Expand All @@ -61,14 +60,14 @@ curl http://localhost:8080/api/blocklogs/${CELLID} | jq .
* The cell should be one that was generated by the AI and you think learning should have occurred on

```bash
CELLID=01J7KQPBYCT9VM2KFBY48JC7J0
CELLID=01J7S3QZMS5F742JFPWZDCTVRG
curl -X POST http://localhost:8877/api/foyle.logs.LogsService/GetBlockLog -H "Content-Type: application/json" -d "{\"id\": \"${CELLID}\"}" | jq .
```

* If this returns not found then no log was created for this cell and there is a problem with Log Processing
* The correct output should look like the following

```
```json
{
"blockLog": {
"id": "01J7KQPBYCT9VM2KFBY48JC7J0",
Expand All @@ -94,12 +93,14 @@ curl -X POST http://localhost:8877/api/foyle.logs.LogsService/GetBlockLog -H "Co

* Notably the output should include the following fields

* **generatedBlock** - This is the block that was generated by the AI
* **executedBlock** - This is the block that the user actually executed
* **generatedBlock** - This is the block that was generated by the AI
* **executedBlock** - This is the block that the user actually executed

* If the **generatedBlock** and **executedBlock** are the same then no learning occured
* If the **generatedBlock** is missing then this means the block wasn't generated by Foyle and learning won't occur
* This can happen if you insert a blank cell and manually enter a command

* This can happen if you insert a blank cell and manually enter a command

* If the **executedBlock** is missing then this means the block wasn't executed and learning won't occur

## Was a cell executed?
Expand Down Expand Up @@ -167,3 +168,22 @@ jq -c 'select(.message == "Building block log")' ${LASTLOG}
```bash
jq -c 'select(.severity == "error")' ${LASTLOG}
```

# Check Prometheus counters

Check to make sure blocks are being enqueued for learner processing

```bash
curl -s http://localhost:8877/metrics | grep learner_enqueued_total
```

* If the number is 0 please open an issue in GitHub because there is most likely a bug

Check the metrics for post processing of blocks

```bash
curl -s http://localhost:8877/metrics | grep learner_blocks_processed
```

* The value of `learner_blocks_processed{status="learn"}` is the number of blocks that contributed to learning
* The value of `learner_blocks_processed{status="unexecuted"}` is the number of blocks that were ignored because they were not executed

0 comments on commit e347283

Please sign in to comment.