From a6e5f10f888b7515eeaaf5fc6481173c411c6901 Mon Sep 17 00:00:00 2001 From: Eric Weber Date: Fri, 30 Aug 2024 15:22:40 -0500 Subject: [PATCH] fix: allow communication with instance managers in an unknown state Longhorn 6552 Signed-off-by: Eric Weber --- controller/engine_controller.go | 10 +++++----- controller/instance_manager_controller.go | 6 +++--- controller/replica_controller.go | 8 ++++---- engineapi/instance_manager.go | 23 ++++++++++++++++++----- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/controller/engine_controller.go b/controller/engine_controller.go index dd5cf77c6f..c4b3f0ae35 100644 --- a/controller/engine_controller.go +++ b/controller/engine_controller.go @@ -456,7 +456,7 @@ func (ec *EngineController) CreateInstance(obj interface{}) (*longhorn.InstanceP return nil, err } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return nil, err } @@ -578,7 +578,7 @@ func (ec *EngineController) DeleteInstance(obj interface{}) (err error) { return nil } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, true) if err != nil { return err } @@ -613,7 +613,7 @@ func (ec *EngineController) GetInstance(obj interface{}) (*longhorn.InstanceProc return nil, err } } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return nil, err } @@ -633,7 +633,7 @@ func (ec *EngineController) LogInstance(ctx context.Context, obj interface{}) (* return nil, nil, err } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return nil, nil, err } @@ -2130,7 +2130,7 @@ func (ec *EngineController) UpgradeEngineInstance(e *longhorn.Engine, log *logru return err } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return err } diff --git a/controller/instance_manager_controller.go b/controller/instance_manager_controller.go index 2683dead64..b08baa5172 100644 --- a/controller/instance_manager_controller.go +++ b/controller/instance_manager_controller.go @@ -83,7 +83,7 @@ type InstanceManagerMonitor struct { } func updateInstanceManagerVersion(im *longhorn.InstanceManager) error { - cli, err := engineapi.NewInstanceManagerClient(im) + cli, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return err } @@ -507,7 +507,7 @@ func (imc *InstanceManagerController) syncLogSettingsToInstanceManagerPod(im *lo return nil } - client, err := engineapi.NewInstanceManagerClient(im) + client, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return errors.Wrapf(err, "failed to create instance manager client for %v", im.Name) } @@ -1533,7 +1533,7 @@ func (imc *InstanceManagerController) startMonitoring(im *longhorn.InstanceManag } // TODO: #2441 refactor this when we do the resource monitoring refactor - client, err := engineapi.NewInstanceManagerClient(im) + client, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { log.WithError(err).Errorf("Failed to initialize im client to %v before monitoring", im.Name) return diff --git a/controller/replica_controller.go b/controller/replica_controller.go index 47241dc390..8ed954971b 100644 --- a/controller/replica_controller.go +++ b/controller/replica_controller.go @@ -357,7 +357,7 @@ func (rc *ReplicaController) CreateInstance(obj interface{}) (*longhorn.Instance return nil, err } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return nil, err } @@ -562,7 +562,7 @@ func (rc *ReplicaController) DeleteInstance(obj interface{}) (err error) { } }() - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return err } @@ -628,7 +628,7 @@ func (rc *ReplicaController) GetInstance(obj interface{}) (*longhorn.InstancePro } } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return nil, err } @@ -659,7 +659,7 @@ func (rc *ReplicaController) LogInstance(ctx context.Context, obj interface{}) ( return nil, nil, err } - c, err := engineapi.NewInstanceManagerClient(im) + c, err := engineapi.NewInstanceManagerClient(im, false) if err != nil { return nil, nil, err } diff --git a/engineapi/instance_manager.go b/engineapi/instance_manager.go index 1ed9358fe9..20e7aa0674 100644 --- a/engineapi/instance_manager.go +++ b/engineapi/instance_manager.go @@ -92,11 +92,24 @@ func CheckInstanceManagerProxySupport(im *longhorn.InstanceManager) error { return nil } -// NewInstanceManagerClient creates a new instance manager client -func NewInstanceManagerClient(im *longhorn.InstanceManager) (*InstanceManagerClient, error) { - // Do not check the major version here. Since IM cannot get the major version without using this client to call VersionGet(). - if im.Status.CurrentState != longhorn.InstanceManagerStateRunning || im.Status.IP == "" { - return nil, fmt.Errorf("invalid Instance Manager %v, state: %v, IP: %v", im.Name, im.Status.CurrentState, im.Status.IP) +// NewInstanceManagerClient creates a new instance manager client. Usually, we only want to attempt to communicate with +// an instance manager in state running. However, sometimes it makes sense to make a best effort attempt to communicate +// with an instance manager in state unknown. It should be safe to do so, since Kubernetes should not reassign the pod's +// IP address until it is at least terminating, which puts the instance manager in state error. However, there is an +// increased chance of failure. +func NewInstanceManagerClient(im *longhorn.InstanceManager, allowUnknown bool) (*InstanceManagerClient, error) { + // Do not check the major version here since IM cannot get the major version without using this client to call + // VersionGet(). + + if im.Status.IP == "" { + return nil, fmt.Errorf("invalid instance manager %v, state %v, IP %v", im.Name, im.Status.CurrentState, im.Status.IP) + } + if im.Status.CurrentState == longhorn.InstanceManagerStateRunning { + // Nothing to do. + } else if im.Status.CurrentState == longhorn.InstanceManagerStateUnknown && allowUnknown { + logrus.Warnf("Communicating with instance manager %v, state %v, IP %v", im.Name, im.Status.CurrentState, im.Status.IP) + } else { + return nil, fmt.Errorf("invalid instance manager %v, state %v, IP %v", im.Name, im.Status.CurrentState, im.Status.IP) } // TODO: Initialize the following gRPC clients are similar. This can be simplified via factory method.