From 18f4051cb0a3098f472af08461fd14f7eefd2ec1 Mon Sep 17 00:00:00 2001
From: Yash Anand <yashanand1910@gmail.com>
Date: Tue, 6 Aug 2024 12:29:25 +0000
Subject: [PATCH] Create checkpoint in worker

---
 pkg/types/scheduler.go           |  1 +
 pkg/worker/worker.go             | 89 +++++++++++++++++---------------
 sdk/src/beta9/runner/endpoint.py |  4 +-
 3 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/pkg/types/scheduler.go b/pkg/types/scheduler.go
index a070b5bf0..ba460dff2 100644
--- a/pkg/types/scheduler.go
+++ b/pkg/types/scheduler.go
@@ -80,6 +80,7 @@ type ContainerRequest struct {
 	Mounts       []Mount   `json:"mounts"`
 	RetryCount   int       `json:"retry_count"`
 	PoolSelector string    `json:"pool_selector"`
+  CheckpointEnabled bool `json:"checkpoint_enabled"`
 }
 
 const ContainerExitCodeTtlS int = 300
diff --git a/pkg/worker/worker.go b/pkg/worker/worker.go
index e48289714..f18bc0563 100644
--- a/pkg/worker/worker.go
+++ b/pkg/worker/worker.go
@@ -7,6 +7,7 @@ import (
 	"errors"
 	"fmt"
 	"log"
+	"net/http"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -163,14 +164,6 @@ func NewWorker() (*Worker, error) {
 		return nil, err
 	}
 
-	var cedanaClient *CedanaClient = nil
-	if config.Cedana.Enabled {
-		cedanaClient, err = NewCedanaClient(config.Cedana.HostName, "")
-		if err != nil {
-			log.Printf("Unable to create Cedana client, checkpoint/restore unavailable: %+v\n", err)
-		}
-	}
-
 	return &Worker{
 		ctx:                  ctx,
 		cancel:               cancel,
@@ -186,7 +179,6 @@ func NewWorker() (*Worker, error) {
 		redisClient:          redisClient,
 		podAddr:              podAddr,
 		imageClient:          imageClient,
-		cedanaClient:         cedanaClient,
 		podHostName:          podHostName,
 		eventBus:             nil,
 		workerId:             workerId,
@@ -393,46 +385,43 @@ func (s *Worker) createCheckpoint(request *types.ContainerRequest) {
 		return
 	}
 
-	_ = fmt.Sprintf("0.0.0.0:%d/health", instance.Port)
 	// TODO: we need a reliable way to detect that the container is completely booted
 
-	/*
-		   We can't just use health checks because one worker could be up while the others
-		   are still booting. After we have a reliable way of doing that,
-		   we should probably be polling whatever method that is here, and also checking for the
-		   containers existence. Something like this:
-
-		   elapsed := 0
-		   start := time.Now()
-		   containerReady := false
-		   timeout := time.Duration(time.Second * 120)
-		   for := range time.Ticker(time.Second) {
-		   	   // 1. check if container exists
-				instance, exists := s.containerInstances.Get(request.ContainerId)
-				if !exists {
-					return
-				}
-
-			   // 2. check if container is ready
-			   if weAreGood {
-			      containerReady := true
-				  break
-			   }
-
-			   if time.Since(start) > timeout {
-			 		break
-			   }
+	start := time.Now()
+	timeout := time.Duration(time.Second * 120)
+	for time.Since(start) < timeout {
+		if s.cedanaClient == nil {
+			cedanaClient, err := NewCedanaClient(context.TODO())
+			if err != nil {
+				log.Printf("<%s> - failed to create cedana client: %v\n", request.ContainerId, err)
 			}
-
-			if !containerReady {
+			s.cedanaClient = cedanaClient
+		} else {
+			instance, exists = s.containerInstances.Get(request.ContainerId)
+			if !exists {
 				return
 			}
-	*/
 
-	err := s.cedanaClient.Checkpoint(request.ContainerId)
+			// Endpoint already configured to ensure /health is successful only if all
+			resp, err := http.Get(fmt.Sprintf("0.0.0.0:%d/health", instance.Port))
+			if err == nil && resp.StatusCode == 200 {
+				break
+			}
+		}
+
+		time.Sleep(time.Second)
+	}
+
+	err := s.cedanaClient.Checkpoint(context.TODO(), request.ContainerId)
 	if err != nil {
 		log.Printf("<%s> - cedana checkpoint failed: %+v\n", request.ContainerId, err)
 	}
+
+	// Notify that the checkpoint was done
+	resp, err := http.Get(fmt.Sprintf("0.0.0.0:%d/checkpoint_done", instance.Port))
+	if err != nil || resp.StatusCode != 200 {
+		log.Printf("<%s> - failed to notify container of checkpoint: %v\n", request.ContainerId, err)
+	}
 }
 
 // Invoke a runc container using a predefined config spec
@@ -441,9 +430,7 @@ func (s *Worker) SpawnAsync(request *types.ContainerRequest, bundlePath string,
 
 	go s.containerWg.Add(1)
 
-	// TODO: also need to check the stub config here for experimental cedana flag
-	// which we may want to attach to the container request for ease of access
-	if s.config.Cedana.Enabled && s.cedanaClient != nil {
+	if request.CheckpointEnabled {
 		go s.createCheckpoint(request)
 	}
 
@@ -706,6 +693,7 @@ func (s *Worker) getContainerEnvironment(request *types.ContainerRequest, option
 		fmt.Sprintf("CONTAINER_ID=%s", request.ContainerId),
 		fmt.Sprintf("BETA9_GATEWAY_HOST=%s", os.Getenv("BETA9_GATEWAY_HOST")),
 		fmt.Sprintf("BETA9_GATEWAY_PORT=%s", os.Getenv("BETA9_GATEWAY_PORT")),
+		fmt.Sprintf("CHECKPOINT_ENABLED=%t", request.CheckpointEnabled),
 		"PYTHONUNBUFFERED=1",
 	}
 
@@ -779,6 +767,21 @@ func (s *Worker) specFromRequest(request *types.ContainerRequest, options *Conta
 		spec.Hooks.Prestart = nil
 	}
 
+	if request.CheckpointEnabled {
+		// XXX: Hook to spawn daemon inside the container. In later cedana versions, it should be possible
+		// to spawn and manage from the worker instead.
+		err = AddCedanaDaemonHook(request, &spec.Hooks.StartContainer, &s.config.Checkpointing.Cedana)
+		if err != nil {
+			log.Printf("failed to add cedana hook, checkpoint/restore unavailable: %v", err)
+		}
+
+		// XXX: Modify the entrypoint to start process using cedana. Won't be needed once daemon is started in
+		// worker as cedana will be able to checkpoint/restore the container directly.
+		originalArgsString := strings.Join(spec.Process.Args, " ")
+		// TODO: Use '-it' flag to keep STDIN open and attach pseudo-TTY
+		spec.Process.Args = []string{CedanaPath, "exec", originalArgsString, "-w", defaultContainerDirectory}
+	}
+
 	spec.Process.Env = append(spec.Process.Env, env...)
 	spec.Root.Readonly = false
 
diff --git a/sdk/src/beta9/runner/endpoint.py b/sdk/src/beta9/runner/endpoint.py
index da5adf884..b4c63f138 100644
--- a/sdk/src/beta9/runner/endpoint.py
+++ b/sdk/src/beta9/runner/endpoint.py
@@ -175,9 +175,11 @@ def __init__(self, logger: logging.Logger, worker: UvicornWorker) -> None:
         # to become ready first, while others wait on the lock. The client is expected to
         # call /checkpoint_done endpoint to signal that the checkpoint was done.
         # Once this worker handles a checkpoint_done request, it releases the lock and
-        # all workers proceed. The checkpoint barrier ensures that the first worker does not escape.
+        # all workers proceed. The checkpoint barrier at the end again ensures that the
+        # first worker does not escape.
 
         if cfg.checkpoint_enabled:
+            checkpointBarrier.wait()
             readyLock.acquire()
 
         @self.app.get("/health")