check processing upon setting health

hemilabs · Feb 3, 2025 · 30f8789 · 30f8789
1 parent b42d194
commit 30f8789
Showing 1 changed file with 76 additions and 72 deletions.
diff --git a/op-conductor/conductor/service.go b/op-conductor/conductor/service.go
@@ -520,6 +520,22 @@ func (oc *OpConductor) handleHealthUpdate(hcerr error) {
 		oc.queueAction()
 	}
 
+	if !healthy {
+		callCtx, cancel := context.WithTimeout(oc.shutdownCtx, 5*time.Second)
+		defer cancel()
+
+		processingBitcoin, err := oc.ctrl.GetCreatingBitcoinAttributesForNextBlock(callCtx)
+		if err != nil {
+			log.Error(fmt.Sprintf("could not determine if processing bitcoin: %s", err))
+		} else if processingBitcoin {
+			log.Info("the sequencer is determined to be unhealthy, " +
+				"but we're currently processing bitcoin attributes for the next block, assuming healthy")
+			oc.healthy.Store(true)
+			oc.hcerr = nil
+			return
+		}
+	}
+
 	oc.healthy.Store(healthy)
 	oc.hcerr = hcerr
 }
@@ -534,83 +550,71 @@ func (oc *OpConductor) action() {
 	status := NewState(oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load())
 	oc.log.Debug("entering action with status", "status", status)
 
-	callCtx, cancel := context.WithTimeout(oc.shutdownCtx, 5*time.Second)
-	defer cancel()
-	processingBitcoin, err := oc.ctrl.GetCreatingBitcoinAttributesForNextBlock(callCtx)
-	if err == nil {
-		log.Trace(fmt.Sprintf("are we processing bitcoin in the hvm? %x", processingBitcoin))
-
-		// exhaust all cases below for completeness, 3 state, 8 cases.
-		switch {
-		case !status.leader && !status.healthy && !status.active:
-			// if follower is not healthy and not sequencing, just log an error
-			oc.log.Error("server (follower) is not healthy", "server", oc.cons.ServerID())
-		case !status.leader && !status.healthy && status.active:
-			// sequencer is not leader, not healthy, but it is sequencing, stop it
-			err = oc.stopSequencer()
-		case !status.leader && status.healthy && !status.active:
-			// normal follower, do nothing
-		case !status.leader && status.healthy && status.active:
-			// stop sequencer, this happens when current server steps down as leader.
-			err = oc.stopSequencer()
-		case status.leader && !status.healthy && !status.active:
-			// There are 2 scenarios we need to handle:
-			// 1. current node is follower, active sequencer became unhealthy and started the leadership transfer process.
-			//    however if leadership transfer took longer than the time for health monitor to treat the node as unhealthy,
-			//    then basically the entire network is stalled and we need to start sequencing in this case.
-			if !oc.prevState.leader && !oc.prevState.active {
-				_, _, cerr := oc.compareUnsafeHead(oc.shutdownCtx)
-				if cerr == nil && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
-					// if unsafe in consensus is the same as unsafe in op-node, then it is scenario #1 and we should start sequencer.
-					err = oc.startSequencer()
-					break
-				}
-			}
-
-			// 2. for other cases, we should try to transfer leader to another node.
-			//    for example, if follower became a leader and unhealthy at the same time (just unhealthy itself), then we should transfer leadership.
-			err = oc.transferLeader()
-		case status.leader && !status.healthy && status.active:
-			// There are two scenarios we need to handle here:
-			// 1. we're transitioned from case status.leader && !status.healthy && !status.active, see description above
-			//    then we should continue to sequence blocks and try to bring ourselves back to healthy state.
-			//    note: we need to also make sure that the health error is not due to ErrSequencerConnectionDown
-			//    		because in this case, we should stop sequencing and transfer leadership to other nodes.
-			if oc.prevState.leader && !oc.prevState.healthy && !oc.prevState.active && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
-				err = errors.New("waiting for sequencing to become healthy by itself")
+	// exhaust all cases below for completeness, 3 state, 8 cases.
+	switch {
+	case !status.leader && !status.healthy && !status.active:
+		// if follower is not healthy and not sequencing, just log an error
+		oc.log.Error("server (follower) is not healthy", "server", oc.cons.ServerID())
+	case !status.leader && !status.healthy && status.active:
+		// sequencer is not leader, not healthy, but it is sequencing, stop it
+		err = oc.stopSequencer()
+	case !status.leader && status.healthy && !status.active:
+		// normal follower, do nothing
+	case !status.leader && status.healthy && status.active:
+		// stop sequencer, this happens when current server steps down as leader.
+		err = oc.stopSequencer()
+	case status.leader && !status.healthy && !status.active:
+		// There are 2 scenarios we need to handle:
+		// 1. current node is follower, active sequencer became unhealthy and started the leadership transfer process.
+		//    however if leadership transfer took longer than the time for health monitor to treat the node as unhealthy,
+		//    then basically the entire network is stalled and we need to start sequencing in this case.
+		if !oc.prevState.leader && !oc.prevState.active {
+			_, _, cerr := oc.compareUnsafeHead(oc.shutdownCtx)
+			if cerr == nil && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
+				// if unsafe in consensus is the same as unsafe in op-node, then it is scenario #1 and we should start sequencer.
+				err = oc.startSequencer()
 				break
 			}
+		}
 
-			if processingBitcoin {
-				err = errors.New("we're processing a large number of bitcoin attributes, refusing to stop sequencer at this time")
-				break
-			}
+		// 2. for other cases, we should try to transfer leader to another node.
+		//    for example, if follower became a leader and unhealthy at the same time (just unhealthy itself), then we should transfer leadership.
+		err = oc.transferLeader()
+	case status.leader && !status.healthy && status.active:
+		// There are two scenarios we need to handle here:
+		// 1. we're transitioned from case status.leader && !status.healthy && !status.active, see description above
+		//    then we should continue to sequence blocks and try to bring ourselves back to healthy state.
+		//    note: we need to also make sure that the health error is not due to ErrSequencerConnectionDown
+		//    		because in this case, we should stop sequencing and transfer leadership to other nodes.
+		if oc.prevState.leader && !oc.prevState.healthy && !oc.prevState.active && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
+			err = errors.New("waiting for sequencing to become healthy by itself")
+			break
+		}
 
-			// 2. we're here becasuse an healthy leader became unhealthy itself
-			//    then we should try to stop sequencing locally and transfer leadership.
-			var result *multierror.Error
-			// Try to stop sequencer first, but since sequencer is not healthy, we may not be able to stop it.
-			// In this case, it's fine to continue to try to transfer leadership to another server. This is safe because
-			// 1. if leadership transfer succeeded, then we'll retry and enter case !status.leader && status.healthy && status.active, which will try to stop sequencer.
-			// 2. even if the retry continues to fail and current server stays in active sequencing mode, it would be safe because our hook in op-node will prevent it from committing any new blocks to the network via p2p (if it's not leader any more)
-			if e := oc.stopSequencer(); e != nil {
-				result = multierror.Append(result, e)
-			}
-			// try to transfer leadership to another server despite if sequencer is stopped or not. There are 4 scenarios here:
-			// 1. [sequencer stopped, leadership transfer succeeded] which is the happy case and we handed over sequencing to another server.
-			// 2. [sequencer stopped, leadership transfer failed] we'll enter into case status.leader && !status.healthy && !status.active and retry transfer leadership.
-			// 3. [sequencer active, leadership transfer succeeded] we'll enter into case !status.leader && status.healthy && status.active and retry stop sequencer.
-			// 4. [sequencer active, leadership transfer failed] we're in the same state and will retry here again.
-			if e := oc.transferLeader(); e != nil {
-				result = multierror.Append(result, e)
-			}
-			err = result.ErrorOrNil()
-		case status.leader && status.healthy && !status.active:
-			// start sequencer
-			err = oc.startSequencer()
-		case status.leader && status.healthy && status.active:
-			// normal leader, do nothing
+		// 2. we're here because a healthy leader became unhealthy itself
+		//    then we should try to stop sequencing locally and transfer leadership.
+		var result *multierror.Error
+		// Try to stop sequencer first, but since sequencer is not healthy, we may not be able to stop it.
+		// In this case, it's fine to continue to try to transfer leadership to another server. This is safe because
+		// 1. if leadership transfer succeeded, then we'll retry and enter case !status.leader && status.healthy && status.active, which will try to stop sequencer.
+		// 2. even if the retry continues to fail and current server stays in active sequencing mode, it would be safe because our hook in op-node will prevent it from committing any new blocks to the network via p2p (if it's not leader any more)
+		if e := oc.stopSequencer(); e != nil {
+			result = multierror.Append(result, e)
+		}
+		// try to transfer leadership to another server regardless of whether the sequencer was stopped. There are 4 scenarios here:
+		// 1. [sequencer stopped, leadership transfer succeeded] which is the happy case and we handed over sequencing to another server.
+		// 2. [sequencer stopped, leadership transfer failed] we'll enter into case status.leader && !status.healthy && !status.active and retry transfer leadership.
+		// 3. [sequencer active, leadership transfer succeeded] we'll enter into case !status.leader && status.healthy && status.active and retry stop sequencer.
+		// 4. [sequencer active, leadership transfer failed] we're in the same state and will retry here again.
+		if e := oc.transferLeader(); e != nil {
+			result = multierror.Append(result, e)
 		}
+		err = result.ErrorOrNil()
+	case status.leader && status.healthy && !status.active:
+		// start sequencer
+		err = oc.startSequencer()
+	case status.leader && status.healthy && status.active:
+		// normal leader, do nothing
 	}
 
 	oc.log.Debug("exiting action with status and error", "status", status, "err", err)