Skip to content

Commit 74c3f11

Browse files
committed
[tmpnet] Enable runtime-specific restart behavior
Delegate responsibility for restart to the node runtime to allow the process runtime to continue to start/stop and the kube runtime to scale down/scale up.
1 parent 324a27a commit 74c3f11

File tree

3 files changed

+97
-71
lines changed

3 files changed

+97
-71
lines changed

tests/fixture/tmpnet/network.go

+55-54
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,16 @@ func RestartNetwork(ctx context.Context, log logging.Logger, dir string) error {
191191
return network.Restart(ctx)
192192
}
193193

194+
// Restart the provided nodes. Blocks on the nodes accepting API requests but not their health.
195+
func restartNodes(ctx context.Context, nodes ...*Node) error {
196+
for _, node := range nodes {
197+
if err := node.Restart(ctx); err != nil {
198+
return fmt.Errorf("failed to restart node %s: %w", node.NodeID, err)
199+
}
200+
}
201+
return nil
202+
}
203+
194204
// Reads a network from the provided directory.
195205
func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, error) {
196206
canonicalDir, err := toCanonicalDir(dir)
@@ -441,26 +451,20 @@ func (n *Network) Bootstrap(ctx context.Context, log logging.Logger) error {
441451
bootstrapNode.Flags[config.SybilProtectionEnabledKey] = *existingSybilProtectionValue
442452
}
443453

454+
// Ensure the bootstrap node is restarted to pick up subnet and chain configuration
455+
//
456+
// TODO(marun) This restart might be unnecessary if:
457+
// - sybil protection didn't change
458+
// - the node is not a subnet validator
444459
log.Info("restarting bootstrap node",
445460
zap.Stringer("nodeID", bootstrapNode.NodeID),
446461
)
447-
448-
if len(n.Nodes) == 1 {
449-
// Ensure the node is restarted to pick up subnet and chain configuration
450-
return n.RestartNode(ctx, bootstrapNode)
462+
if err := bootstrapNode.Restart(ctx); err != nil {
463+
return err
451464
}
452465

453-
// TODO(marun) This last restart of the bootstrap node might be unnecessary if:
454-
// - sybil protection didn't change
455-
// - the node is not a subnet validator
456-
457-
// Ensure the bootstrap node is restarted to pick up configuration changes. Avoid using
458-
// RestartNode since the node won't be able to report healthy until other nodes are started.
459-
if err := bootstrapNode.Stop(ctx); err != nil {
460-
return fmt.Errorf("failed to stop node %s: %w", bootstrapNode.NodeID, err)
461-
}
462-
if err := n.StartNode(ctx, bootstrapNode); err != nil {
463-
return fmt.Errorf("failed to start node %s: %w", bootstrapNode.NodeID, err)
466+
if len(n.Nodes) == 1 {
467+
return nil
464468
}
465469

466470
log.Info("starting remaining nodes")
@@ -486,31 +490,6 @@ func (n *Network) StartNode(ctx context.Context, node *Node) error {
486490
return nil
487491
}
488492

489-
// Restart a single node.
490-
func (n *Network) RestartNode(ctx context.Context, node *Node) error {
491-
runtimeConfig := node.getRuntimeConfig()
492-
if runtimeConfig.Process != nil && runtimeConfig.Process.ReuseDynamicPorts {
493-
// Attempt to save the API port currently being used so the
494-
// restarted node can reuse it. This may result in the node
495-
// failing to start if the operating system allocates the port
496-
// to a different process between node stop and start.
497-
if err := node.SaveAPIPort(); err != nil {
498-
return err
499-
}
500-
}
501-
502-
if err := node.Stop(ctx); err != nil {
503-
return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err)
504-
}
505-
if err := n.StartNode(ctx, node); err != nil {
506-
return fmt.Errorf("failed to start node %s: %w", node.NodeID, err)
507-
}
508-
n.log.Info("waiting for node to report healthy",
509-
zap.Stringer("nodeID", node.NodeID),
510-
)
511-
return node.WaitForHealthy(ctx)
512-
}
513-
514493
// Stops all nodes in the network.
515494
func (n *Network) Stop(ctx context.Context) error {
516495
// Ensure the node state is up-to-date
@@ -540,11 +519,22 @@ func (n *Network) Stop(ctx context.Context) error {
540519
return nil
541520
}
542521

543-
// Restarts all nodes in the network.
522+
// Restarts all non-ephemeral nodes in the network.
544523
func (n *Network) Restart(ctx context.Context) error {
545524
n.log.Info("restarting network")
546-
for _, node := range n.Nodes {
547-
if err := n.RestartNode(ctx, node); err != nil {
525+
if err := restartNodes(ctx, n.Nodes...); err != nil {
526+
return err
527+
}
528+
return WaitForHealthyNodes(ctx, n.log, n.Nodes...)
529+
}
530+
531+
// Waits for the provided nodes to become healthy.
532+
func WaitForHealthyNodes(ctx context.Context, log logging.Logger, nodes ...*Node) error {
533+
for _, node := range nodes {
534+
log.Info("waiting for node to become healthy",
535+
zap.Stringer("nodeID", node.NodeID),
536+
)
537+
if err := node.WaitForHealthy(ctx); err != nil {
548538
return err
549539
}
550540
}
@@ -669,15 +659,20 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI
669659
if restartRequired {
670660
log.Info("restarting node(s) to enable them to track the new subnet(s)")
671661

662+
runningNodes := make([]*Node, 0, len(reconfiguredNodes))
672663
for _, node := range reconfiguredNodes {
673-
if len(node.URI) == 0 {
674-
// Only running nodes should be restarted
675-
continue
676-
}
677-
if err := n.RestartNode(ctx, node); err != nil {
678-
return err
664+
if len(node.URI) > 0 {
665+
runningNodes = append(runningNodes, node)
679666
}
680667
}
668+
669+
if err := restartNodes(ctx, runningNodes...); err != nil {
670+
return err
671+
}
672+
673+
if err := WaitForHealthyNodes(ctx, n.log, runningNodes...); err != nil {
674+
return err
675+
}
681676
}
682677

683678
// Add validators for the subnet
@@ -738,15 +733,21 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI
738733
log.Info("restarting node(s) to pick up chain configuration")
739734

740735
// Restart nodes to allow configuration for the new chains to take effect
736+
nodesToRestart := make([]*Node, 0, len(n.Nodes))
741737
for _, node := range n.Nodes {
742-
if !validatorsToRestart.Contains(node.NodeID) {
743-
continue
744-
}
745-
if err := n.RestartNode(ctx, node); err != nil {
746-
return err
738+
if validatorsToRestart.Contains(node.NodeID) {
739+
nodesToRestart = append(nodesToRestart, node)
747740
}
748741
}
749742

743+
if err := restartNodes(ctx, nodesToRestart...); err != nil {
744+
return err
745+
}
746+
747+
if err := WaitForHealthyNodes(ctx, log, nodesToRestart...); err != nil {
748+
return err
749+
}
750+
750751
return nil
751752
}
752753

tests/fixture/tmpnet/node.go

+5-17
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"fmt"
1111
"io"
1212
"maps"
13-
"net"
1413
"net/http"
1514
"net/netip"
1615
"os"
@@ -58,6 +57,7 @@ type NodeRuntime interface {
5857
Start(ctx context.Context) error
5958
InitiateStop(ctx context.Context) error
6059
WaitForStopped(ctx context.Context) error
60+
Restart(ctx context.Context) error
6161
IsHealthy(ctx context.Context) (bool, error)
6262
}
6363

@@ -165,6 +165,10 @@ func (n *Node) WaitForStopped(ctx context.Context) error {
165165
return n.getRuntime().WaitForStopped(ctx)
166166
}
167167

168+
func (n *Node) Restart(ctx context.Context) error {
169+
return n.getRuntime().Restart(ctx)
170+
}
171+
168172
func (n *Node) readState(ctx context.Context) error {
169173
return n.getRuntime().readState(ctx)
170174
}
@@ -383,22 +387,6 @@ func (n *Node) composeFlags() (FlagsMap, error) {
383387
return flags, nil
384388
}
385389

386-
// Saves the currently allocated API port to the node's configuration
387-
// for use across restarts.
388-
func (n *Node) SaveAPIPort() error {
389-
hostPort := strings.TrimPrefix(n.URI, "http://")
390-
if len(hostPort) == 0 {
391-
// Without an API URI there is nothing to save
392-
return nil
393-
}
394-
_, port, err := net.SplitHostPort(hostPort)
395-
if err != nil {
396-
return err
397-
}
398-
n.Flags[config.HTTPPortKey] = port
399-
return nil
400-
}
401-
402390
// WaitForHealthy blocks until node health is true or an error (including context timeout) is observed.
403391
func (n *Node) WaitForHealthy(ctx context.Context) error {
404392
if _, ok := ctx.Deadline(); !ok {

tests/fixture/tmpnet/process_runtime.go

+37
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"io"
1313
"io/fs"
1414
"maps"
15+
"net"
1516
"net/netip"
1617
"os"
1718
"os/exec"
@@ -235,6 +236,26 @@ func (p *ProcessRuntime) WaitForStopped(ctx context.Context) error {
235236
}
236237
}
237238

239+
// Restarts the node
240+
func (p *ProcessRuntime) Restart(ctx context.Context) error {
241+
if p.getRuntimeConfig().ReuseDynamicPorts {
242+
// Attempt to save the API port currently being used so the
243+
// restarted node can reuse it. This may result in the node
244+
// failing to start if the operating system allocates the port
245+
// to a different process between node stop and start.
246+
if err := p.saveAPIPort(); err != nil {
247+
return err
248+
}
249+
}
250+
if err := p.node.Stop(ctx); err != nil {
251+
return fmt.Errorf("failed to stop node %s: %w", p.node.NodeID, err)
252+
}
253+
if err := p.Start(ctx); err != nil {
254+
return fmt.Errorf("failed to start node %s: %w", p.node.NodeID, err)
255+
}
256+
return nil
257+
}
258+
238259
func (p *ProcessRuntime) IsHealthy(ctx context.Context) (bool, error) {
239260
// Check that the node process is running as a precondition for
240261
// checking health. getProcess will also ensure that the node's
@@ -404,6 +425,22 @@ func (p *ProcessRuntime) GetLocalStakingAddress(_ context.Context) (netip.AddrPo
404425
return p.node.StakingAddress, func() {}, nil
405426
}
406427

428+
// Saves the currently allocated API port to the node's configuration
429+
// for use across restarts.
430+
func (p *ProcessRuntime) saveAPIPort() error {
431+
hostPort := strings.TrimPrefix(p.node.URI, "http://")
432+
if len(hostPort) == 0 {
433+
// Without an API URI there is nothing to save
434+
return nil
435+
}
436+
_, port, err := net.SplitHostPort(hostPort)
437+
if err != nil {
438+
return err
439+
}
440+
p.node.Flags[config.HTTPPortKey] = port
441+
return nil
442+
}
443+
407444
// watchLogFileForFatal waits for the specified file path to exist and then checks each of
408445
// its lines for the string 'FATAL' until such a line is observed or the provided context
409446
// is canceled. If line containing 'FATAL' is encountered, it will be provided as an error

0 commit comments

Comments
 (0)