From 92e59eebf9c9758444746075199906b16d145e50 Mon Sep 17 00:00:00 2001
From: Ido Heyvi
Date: Tue, 3 Dec 2024 14:36:35 +0200
Subject: [PATCH] Changing upgrade package structure as a placeholder for supporting both inbox (legacy) and requestor (maintenance OP) upgrade modes

Signed-off-by: Ido Heyvi
---
 pkg/upgrade/{ => common}/consts.go | 2 +-
 pkg/upgrade/{ => common}/cordon_manager.go | 2 +-
 .../{ => common}/cordon_manager_test.go | 7 +-
 pkg/upgrade/{ => common}/drain_manager.go | 2 +-
 .../{ => common}/drain_manager_test.go | 28 +-
 .../node_upgrade_state_provider.go | 2 +-
 .../node_upgrade_state_provider_test.go | 19 +-
 pkg/upgrade/{ => common}/pod_manager.go | 2 +-
 pkg/upgrade/{ => common}/pod_manager_test.go | 90 +--
 .../{ => common}/safe_driver_load_manager.go | 2 +-
 .../safe_driver_load_manager_test.go | 13 +-
 .../upgrade_common.go} | 289 ++-------
 pkg/upgrade/{ => common}/upgrade_suit_test.go | 102 +--
 pkg/upgrade/{ => common}/util.go | 2 +-
 .../{ => common}/validation_manager.go | 2 +-
 .../{ => common}/validation_manager_test.go | 45 +-
 pkg/upgrade/inbox/upgrade_inbox.go | 77 +++
 .../{ => manager}/mocks/CordonManager.go | 0
 .../{ => manager}/mocks/DrainManager.go | 6 +-
 .../mocks/NodeUpgradeStateProvider.go | 0
 pkg/upgrade/{ => manager}/mocks/PodManager.go | 18 +-
 .../{ => manager}/mocks/ValidationManager.go | 0
 pkg/upgrade/manager/upgrade_state.go | 235 +++++++
 .../{ => manager}/upgrade_state_test.go | 597 +++++++++---------
 pkg/upgrade/manager/upgrade_suit_test.go | 419 ++++++++++++
 pkg/upgrade/requestor/upgrade_requestor.go | 77 +++
 26 files changed, 1285 insertions(+), 753 deletions(-)
 rename pkg/upgrade/{ => common}/consts.go (99%)
 rename pkg/upgrade/{ => common}/cordon_manager.go (99%)
 rename pkg/upgrade/{ => common}/cordon_manager_test.go (88%)
 rename pkg/upgrade/{ => common}/drain_manager.go (99%)
 rename pkg/upgrade/{ => common}/drain_manager_test.go (72%)
 rename pkg/upgrade/{ => common}/node_upgrade_state_provider.go (99%)
 rename pkg/upgrade/{ => common}/node_upgrade_state_provider_test.go (75%)
 rename pkg/upgrade/{ => common}/pod_manager.go (99%)
 rename pkg/upgrade/{ => common}/pod_manager_test.go (78%)
 rename pkg/upgrade/{ => common}/safe_driver_load_manager.go (99%)
 rename pkg/upgrade/{ => common}/safe_driver_load_manager_test.go (86%)
 rename pkg/upgrade/{upgrade_state.go => common/upgrade_common.go} (73%)
 rename pkg/upgrade/{ => common}/upgrade_suit_test.go (80%)
 rename pkg/upgrade/{ => common}/util.go (99%)
 rename pkg/upgrade/{ => common}/validation_manager.go (99%)
 rename pkg/upgrade/{ => common}/validation_manager_test.go (70%)
 create mode 100644 pkg/upgrade/inbox/upgrade_inbox.go
 rename pkg/upgrade/{ => manager}/mocks/CordonManager.go (100%)
 rename pkg/upgrade/{ => manager}/mocks/DrainManager.go (84%)
 rename pkg/upgrade/{ => manager}/mocks/NodeUpgradeStateProvider.go (100%)
 rename pkg/upgrade/{ => manager}/mocks/PodManager.go (86%)
 rename pkg/upgrade/{ => manager}/mocks/ValidationManager.go (100%)
 create mode 100644 pkg/upgrade/manager/upgrade_state.go
 rename pkg/upgrade/{ => manager}/upgrade_state_test.go (59%)
 create mode 100644 pkg/upgrade/manager/upgrade_suit_test.go
 create mode 100644 pkg/upgrade/requestor/upgrade_requestor.go

diff --git a/pkg/upgrade/consts.go b/pkg/upgrade/common/consts.go
similarity index 99%
rename from pkg/upgrade/consts.go
rename to pkg/upgrade/common/consts.go
index 3b7866af..ece826c8 100644
--- a/pkg/upgrade/consts.go
+++ b/pkg/upgrade/common/consts.go
@@ -14,7 +14,7 @@
 See the License for the specific language governing permissions and
limitations under the License. */ -package upgrade +package common const ( // UpgradeStateLabelKeyFmt is the format of the node label key indicating driver upgrade states diff --git a/pkg/upgrade/cordon_manager.go b/pkg/upgrade/common/cordon_manager.go similarity index 99% rename from pkg/upgrade/cordon_manager.go rename to pkg/upgrade/common/cordon_manager.go index b07791e3..93c18626 100644 --- a/pkg/upgrade/cordon_manager.go +++ b/pkg/upgrade/common/cordon_manager.go @@ -11,7 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" diff --git a/pkg/upgrade/cordon_manager_test.go b/pkg/upgrade/common/cordon_manager_test.go similarity index 88% rename from pkg/upgrade/cordon_manager_test.go rename to pkg/upgrade/common/cordon_manager_test.go index f175a3a3..cbcf8788 100644 --- a/pkg/upgrade/cordon_manager_test.go +++ b/pkg/upgrade/common/cordon_manager_test.go @@ -14,15 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade_test +package common_test import ( "context" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" ) var _ = Describe("CordonManager tests", func() { @@ -30,7 +29,7 @@ var _ = Describe("CordonManager tests", func() { ctx := context.TODO() node := createNode("test-node") - cordonManager := upgrade.NewCordonManager(k8sInterface, log) + cordonManager := common.NewCordonManager(k8sInterface, log) err := cordonManager.Cordon(ctx, node) Expect(err).To(Succeed()) Expect(node.Spec.Unschedulable).To(BeTrue()) diff --git a/pkg/upgrade/drain_manager.go b/pkg/upgrade/common/drain_manager.go similarity index 99% rename from pkg/upgrade/drain_manager.go rename to pkg/upgrade/common/drain_manager.go index 4e0a70ce..31667302 100644 --- a/pkg/upgrade/drain_manager.go +++ b/pkg/upgrade/common/drain_manager.go @@ -11,7 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" diff --git a/pkg/upgrade/drain_manager_test.go b/pkg/upgrade/common/drain_manager_test.go similarity index 72% rename from pkg/upgrade/drain_manager_test.go rename to pkg/upgrade/common/drain_manager_test.go index 8a82caa6..5420cd53 100644 --- a/pkg/upgrade/drain_manager_test.go +++ b/pkg/upgrade/common/drain_manager_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package upgrade_test +package common_test import ( "context" @@ -26,7 +26,7 @@ import ( "k8s.io/apimachinery/pkg/types" v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" ) var _ = Describe("DrainManager tests", func() { @@ -35,7 +35,7 @@ var _ = Describe("DrainManager tests", func() { node := createNode("node") - drainManager := upgrade.NewDrainManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) + drainManager := common.NewDrainManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) drainSpec := &v1alpha1.DrainSpec{ Enable: true, Force: false, @@ -44,7 +44,7 @@ var _ = Describe("DrainManager tests", func() { DeleteEmptyDir: true, } nodeArray := []*corev1.Node{node} - err := drainManager.ScheduleNodesDrain(ctx, &upgrade.DrainConfiguration{Nodes: nodeArray, Spec: drainSpec}) + err := drainManager.ScheduleNodesDrain(ctx, &common.DrainConfiguration{Nodes: nodeArray, Spec: drainSpec}) Expect(err).To(Succeed()) time.Sleep(time.Second) @@ -61,7 +61,7 @@ var _ = Describe("DrainManager tests", func() { node2 := createNode("node2") node3 := createNode("node3") - drainManager := upgrade.NewDrainManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) + drainManager := common.NewDrainManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) drainSpec := &v1alpha1.DrainSpec{ Enable: true, Force: false, @@ -70,7 +70,7 @@ var _ = Describe("DrainManager tests", func() { DeleteEmptyDir: true, } nodeArray := []*corev1.Node{node1, node2, node3} - err := drainManager.ScheduleNodesDrain(ctx, &upgrade.DrainConfiguration{Nodes: nodeArray, Spec: drainSpec}) + err := drainManager.ScheduleNodesDrain(ctx, &common.DrainConfiguration{Nodes: nodeArray, Spec: drainSpec}) Expect(err).To(Succeed()) time.Sleep(time.Second) @@ -93,7 +93,7 @@ var _ = Describe("DrainManager tests", func() { It("DrainManager should not fail on empty node list", func() { ctx := context.TODO() - drainManager := upgrade.NewDrainManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) + drainManager := common.NewDrainManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) drainSpec := &v1alpha1.DrainSpec{ Enable: true, Force: false, @@ -101,7 +101,7 @@ var _ = Describe("DrainManager tests", func() { TimeoutSecond: 1, DeleteEmptyDir: true, } - err := drainManager.ScheduleNodesDrain(ctx, &upgrade.DrainConfiguration{Nodes: nil, Spec: drainSpec}) + err := drainManager.ScheduleNodesDrain(ctx, &common.DrainConfiguration{Nodes: nil, Spec: drainSpec}) Expect(err).To(Succeed()) time.Sleep(time.Second) @@ -111,10 +111,10 @@ var _ = Describe("DrainManager tests", func() { node := createNode("node") - drainManager := upgrade.NewDrainManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) + drainManager := common.NewDrainManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) nodeArray := []*corev1.Node{node} - err := drainManager.ScheduleNodesDrain(ctx, &upgrade.DrainConfiguration{Nodes: nodeArray, Spec: nil}) + err := drainManager.ScheduleNodesDrain(ctx, &common.DrainConfiguration{Nodes: nodeArray, 
Spec: nil}) Expect(err).ToNot(Succeed()) time.Sleep(time.Second) @@ -129,10 +129,10 @@ var _ = Describe("DrainManager tests", func() { node := createNode("node") - drainManager := upgrade.NewDrainManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) + drainManager := common.NewDrainManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) nodeArray := []*corev1.Node{node} - err := drainManager.ScheduleNodesDrain(ctx, &upgrade.DrainConfiguration{Nodes: nodeArray, Spec: &v1alpha1.DrainSpec{}}) + err := drainManager.ScheduleNodesDrain(ctx, &common.DrainConfiguration{Nodes: nodeArray, Spec: &v1alpha1.DrainSpec{}}) Expect(err).To(Succeed()) time.Sleep(time.Second) @@ -147,11 +147,11 @@ var _ = Describe("DrainManager tests", func() { node := createNode("node") - drainManager := upgrade.NewDrainManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) + drainManager := common.NewDrainManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, eventRecorder) nodeArray := []*corev1.Node{node} err := drainManager.ScheduleNodesDrain( - ctx, &upgrade.DrainConfiguration{Nodes: nodeArray, Spec: &v1alpha1.DrainSpec{Enable: false}}) + ctx, &common.DrainConfiguration{Nodes: nodeArray, Spec: &v1alpha1.DrainSpec{Enable: false}}) Expect(err).To(Succeed()) time.Sleep(time.Second) diff --git a/pkg/upgrade/node_upgrade_state_provider.go b/pkg/upgrade/common/node_upgrade_state_provider.go similarity index 99% rename from pkg/upgrade/node_upgrade_state_provider.go rename to pkg/upgrade/common/node_upgrade_state_provider.go index e6e1760b..b497c799 100644 --- a/pkg/upgrade/node_upgrade_state_provider.go +++ b/pkg/upgrade/common/node_upgrade_state_provider.go @@ -11,7 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" diff --git a/pkg/upgrade/node_upgrade_state_provider_test.go b/pkg/upgrade/common/node_upgrade_state_provider_test.go similarity index 75% rename from pkg/upgrade/node_upgrade_state_provider_test.go rename to pkg/upgrade/common/node_upgrade_state_provider_test.go index b51a7d11..54ccb69a 100644 --- a/pkg/upgrade/node_upgrade_state_provider_test.go +++ b/pkg/upgrade/common/node_upgrade_state_provider_test.go @@ -14,17 +14,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade_test +package common_test import ( "context" "fmt" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" - - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" ) var _ = Describe("NodeUpgradeStateProvider tests", func() { @@ -38,19 +37,19 @@ var _ = Describe("NodeUpgradeStateProvider tests", func() { node = createNode(fmt.Sprintf("node-%s", id)) }) It("NodeUpgradeStateProvider should change node upgrade state and retrieve the latest node object", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStateUpgradeRequired) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStateUpgradeRequired) Expect(err).To(Succeed()) node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateUpgradeRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateUpgradeRequired)) }) It("NodeUpgradeStateProvider should change node upgrade annotation and retrieve the latest node object", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - key := upgrade.GetUpgradeInitialStateAnnotationKey() + key := common.GetUpgradeInitialStateAnnotationKey() err := provider.ChangeNodeUpgradeAnnotation(ctx, node, key, "true") Expect(err).To(Succeed()) @@ -59,9 +58,9 @@ var _ = Describe("NodeUpgradeStateProvider tests", func() { Expect(node.Annotations[key]).To(Equal("true")) }) It("NodeUpgradeStateProvider should delete node upgrade annotation and retrieve the latest node object", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - key := upgrade.GetUpgradeInitialStateAnnotationKey() + key := common.GetUpgradeInitialStateAnnotationKey() err := provider.ChangeNodeUpgradeAnnotation(ctx, node, key, "null") Expect(err).To(Succeed()) diff --git a/pkg/upgrade/pod_manager.go b/pkg/upgrade/common/pod_manager.go similarity index 99% rename from pkg/upgrade/pod_manager.go rename to pkg/upgrade/common/pod_manager.go index f704ede4..1a08f20e 100644 --- a/pkg/upgrade/pod_manager.go +++ b/pkg/upgrade/common/pod_manager.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" diff --git a/pkg/upgrade/pod_manager_test.go b/pkg/upgrade/common/pod_manager_test.go similarity index 78% rename from pkg/upgrade/pod_manager_test.go rename to pkg/upgrade/common/pod_manager_test.go index 8cb31b03..9a3f499b 100644 --- a/pkg/upgrade/pod_manager_test.go +++ b/pkg/upgrade/common/pod_manager_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package upgrade_test +package common_test import ( "context" @@ -31,13 +31,13 @@ import ( "k8s.io/apimachinery/pkg/types" v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" ) var _ = Describe("PodManager", func() { var node *corev1.Node var namespace *corev1.Namespace - var podManagerConfig upgrade.PodManagerConfig + var podManagerConfig common.PodManagerConfig var ctx context.Context var id string @@ -51,7 +51,7 @@ var _ = Describe("PodManager", func() { node = createNode(fmt.Sprintf("node-%s", id)) namespace = createNamespace(fmt.Sprintf("namespace-%s", id)) // default PodManagerConfig - podManagerConfig = upgrade.PodManagerConfig{ + podManagerConfig = common.PodManagerConfig{ WaitForCompletionSpec: &v1alpha1.WaitForCompletionSpec{ PodSelector: "", TimeoutSecond: 0, @@ -80,7 +80,7 @@ var _ = Describe("PodManager", func() { Expect(err).To(Succeed()) Expect(podList.Items).To(HaveLen(4)) - manager := upgrade.NewPodManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, nil, eventRecorder) + manager := common.NewPodManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, nil, eventRecorder) err = manager.SchedulePodsRestart(ctx, restartPods) Expect(err).To(Succeed()) @@ -102,7 +102,7 @@ var _ = Describe("PodManager", func() { Expect(err).To(Succeed()) Expect(podList.Items).To(HaveLen(0)) - manager := upgrade.NewPodManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, nil, eventRecorder) + manager := common.NewPodManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, nil, eventRecorder) err = manager.SchedulePodsRestart(ctx, []*corev1.Pod{deletedPod}) Expect(err).To(HaveOccurred()) }) @@ -112,7 +112,7 @@ var _ = Describe("PodManager", func() { Expect(err).To(Succeed()) Expect(podList.Items).To(HaveLen(0)) - manager := upgrade.NewPodManager(k8sInterface, upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, nil, eventRecorder) + manager := common.NewPodManager(k8sInterface, common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log, nil, eventRecorder) err = manager.SchedulePodsRestart(ctx, []*corev1.Pod{}) Expect(err).To(Succeed()) }) @@ -121,8 +121,8 @@ var _ = Describe("PodManager", func() { Describe("ScheduleCheckOnPodCompletion", func() { It("should change the state of the node only after job completion", func() { // initialize upgrade state of the node - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStateWaitForJobsRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStateWaitForJobsRequired) Expect(err).To(Succeed()) // create pod to be running on testnode @@ -141,21 +141,21 @@ var _ = Describe("PodManager", func() { Expect(podList.Items).NotTo(BeEmpty()) podManagerConfig.WaitForCompletionSpec.PodSelector = "app=my-app" - manager := upgrade.NewPodManager(k8sInterface, provider, log, nil, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, nil, eventRecorder) err = manager.ScheduleCheckOnPodCompletion(ctx, &podManagerConfig) Expect(err).To(Succeed()) // verify upgrade state is changed to new state on workload pod completion node, err = 
provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStatePodDeletionRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStatePodDeletionRequired)) // verify annotation which tracks start time is not added. Expect(isWaitForCompletionAnnotationPresent(node)).To(Equal(false)) }) It("should not change the state of the node if workload pod is running", func() { // initialize upgrade state of the node - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStateWaitForJobsRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStateWaitForJobsRequired) Expect(err).To(Succeed()) // create pod to be running on testnode @@ -169,21 +169,21 @@ var _ = Describe("PodManager", func() { Expect(podList.Items).NotTo(BeEmpty()) podManagerConfig.WaitForCompletionSpec.PodSelector = "app=my-app" - manager := upgrade.NewPodManager(k8sInterface, provider, log, nil, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, nil, eventRecorder) err = manager.ScheduleCheckOnPodCompletion(ctx, &podManagerConfig) Expect(err).To(Succeed()) // verify upgrade state is unchanged with workload pod running node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateWaitForJobsRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateWaitForJobsRequired)) // verify annotation is added to track the start time. Expect(isWaitForCompletionAnnotationPresent(node)).To(Equal(false)) }) It("should change the state of the node if workload pod is running and timeout is reached", func() { // initialize upgrade state of the node - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStateWaitForJobsRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStateWaitForJobsRequired) Expect(err).To(Succeed()) // create pod to be running on testnode @@ -198,20 +198,20 @@ var _ = Describe("PodManager", func() { podManagerConfig.WaitForCompletionSpec.PodSelector = "app=my-app" podManagerConfig.WaitForCompletionSpec.TimeoutSecond = 30 - manager := upgrade.NewPodManager(k8sInterface, provider, log, nil, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, nil, eventRecorder) err = manager.ScheduleCheckOnPodCompletion(ctx, &podManagerConfig) Expect(err).To(Succeed()) // verify upgrade state is unchanged with workload pod running node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateWaitForJobsRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateWaitForJobsRequired)) // verify annotation is added track the start time. 
Expect(isWaitForCompletionAnnotationPresent(node)).To(Equal(true)) startTime := strconv.FormatInt(time.Now().Unix()-35, 10) - provider.ChangeNodeUpgradeAnnotation(ctx, node, upgrade.GetWaitForPodCompletionStartTimeAnnotationKey(), startTime) + provider.ChangeNodeUpgradeAnnotation(ctx, node, common.GetWaitForPodCompletionStartTimeAnnotationKey(), startTime) podManagerConfig.Nodes = []*corev1.Node{node} @@ -221,7 +221,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state is unchanged with workload pod running node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStatePodDeletionRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStatePodDeletionRequired)) // verify annotation is removed to track the start time. Expect(isWaitForCompletionAnnotationPresent(node)).To(Equal(false)) }) @@ -245,12 +245,12 @@ var _ = Describe("PodManager", func() { } // initialize upgrade state of the node - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStatePodDeletionRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStatePodDeletionRequired) Expect(err).To(Succeed()) podManagerConfig.DeletionSpec.Force = true - manager := upgrade.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) err = manager.SchedulePodEviction(ctx, &podManagerConfig) Expect(err).To(Succeed()) @@ -265,7 +265,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStatePodRestartRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStatePodRestartRequired)) }) It("should fail to delete all standalone gpu pods without force,"+ @@ -275,11 +275,11 @@ var _ = Describe("PodManager", func() { NewPod(fmt.Sprintf("gpu-pod2-%s", id), namespace.Name, node.Name).WithResource("nvidia.com/mig-1g.5gb", "1").Create(), } - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStatePodDeletionRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStatePodDeletionRequired) Expect(err).To(Succeed()) - manager := upgrade.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) podManagerConfig.DrainEnabled = false err = manager.SchedulePodEviction(ctx, &podManagerConfig) // Note: SchedulePodEviction() will not return an error if issues were encountered @@ -298,7 +298,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state is set to UpgradeStateFailed node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateFailed)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateFailed)) }) It("should fail to delete all standalone gpu pods without force,"+ @@ -308,11 
+308,11 @@ var _ = Describe("PodManager", func() { NewPod(fmt.Sprintf("gpu-pod2-%s", id), namespace.Name, node.Name).WithResource("nvidia.com/mig-1g.5gb", "1").Create(), } - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStatePodDeletionRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStatePodDeletionRequired) Expect(err).To(Succeed()) - manager := upgrade.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) podManagerConfig.DrainEnabled = true err = manager.SchedulePodEviction(ctx, &podManagerConfig) // Note: SchedulePodEviction() will not return an error if issues were encountered @@ -330,7 +330,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state is set to UpgradeStateDrainRequired node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateDrainRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateDrainRequired)) }) It("should delete all standalone gpu pods using emptyDir when force=true and deleteEmptyDir=true"+ @@ -343,13 +343,13 @@ var _ = Describe("PodManager", func() { gpuPods = append(gpuPods, NewPod("test-gpu-pod", namespace.Name, node.Name).WithResource("nvidia.com/gpu", "1").WithEmptyDir().Create()) // initialize upgrade state of the node - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStatePodDeletionRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStatePodDeletionRequired) Expect(err).To(Succeed()) podManagerConfig.DeletionSpec.Force = true podManagerConfig.DeletionSpec.DeleteEmptyDir = true - manager := upgrade.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) err = manager.SchedulePodEviction(ctx, &podManagerConfig) Expect(err).To(Succeed()) @@ -364,7 +364,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStatePodRestartRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStatePodRestartRequired)) }) It("should fail to delete all standalone gpu pods with emptyDir when force=true and deleteEmptyDir=false,"+ @@ -373,13 +373,13 @@ var _ = Describe("PodManager", func() { NewPod(fmt.Sprintf("gpu-pod1-%s", id), namespace.Name, node.Name).WithResource("nvidia.com/gpu", "1").WithEmptyDir().Create(), } - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStatePodDeletionRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStatePodDeletionRequired) Expect(err).To(Succeed()) podManagerConfig.DeletionSpec.Force = true podManagerConfig.DrainEnabled = false - manager := 
upgrade.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) err = manager.SchedulePodEviction(ctx, &podManagerConfig) // Note: SchedulePodEviction() will not return an error if issues were encountered // when deleting pods on a node. The node will be transitioned to the UpgradeFailed @@ -397,7 +397,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state is set to UpgradeStateFailed node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateFailed)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateFailed)) }) It("should fail to delete all standalone gpu pods with emptyDir when force=true and deleteEmptyDir=false,"+ @@ -406,13 +406,13 @@ var _ = Describe("PodManager", func() { NewPod(fmt.Sprintf("gpu-pod1-%s", id), namespace.Name, node.Name).WithResource("nvidia.com/gpu", "1").WithEmptyDir().Create(), } - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStatePodDeletionRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStatePodDeletionRequired) Expect(err).To(Succeed()) podManagerConfig.DeletionSpec.Force = true podManagerConfig.DrainEnabled = true - manager := upgrade.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) + manager := common.NewPodManager(k8sInterface, provider, log, gpuPodSpecFilter, eventRecorder) err = manager.SchedulePodEviction(ctx, &podManagerConfig) // Note: SchedulePodEviction() will not return an error if issues were encountered // when deleting pods on a node. @@ -429,7 +429,7 @@ var _ = Describe("PodManager", func() { // verify upgrade state is set to UpgradeStateDrainRequired node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateDrainRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateDrainRequired)) }) }) }) diff --git a/pkg/upgrade/safe_driver_load_manager.go b/pkg/upgrade/common/safe_driver_load_manager.go similarity index 99% rename from pkg/upgrade/safe_driver_load_manager.go rename to pkg/upgrade/common/safe_driver_load_manager.go index 46cd6ee2..f1cf77a0 100644 --- a/pkg/upgrade/safe_driver_load_manager.go +++ b/pkg/upgrade/common/safe_driver_load_manager.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" diff --git a/pkg/upgrade/safe_driver_load_manager_test.go b/pkg/upgrade/common/safe_driver_load_manager_test.go similarity index 86% rename from pkg/upgrade/safe_driver_load_manager_test.go rename to pkg/upgrade/common/safe_driver_load_manager_test.go index eeb45650..7444bad9 100644 --- a/pkg/upgrade/safe_driver_load_manager_test.go +++ b/pkg/upgrade/common/safe_driver_load_manager_test.go @@ -14,20 +14,19 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade_test +package common_test import ( "context" "fmt" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" corev1 "k8s.io/api/core/v1" - - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" ) var _ = Describe("SafeDriverLoadManager", func() { @@ -35,7 +34,7 @@ var _ = Describe("SafeDriverLoadManager", func() { node *corev1.Node ctx context.Context id string - mgr upgrade.SafeDriverLoadManager + mgr common.SafeDriverLoadManager ) BeforeEach(func() { ctx = context.Background() @@ -43,10 +42,10 @@ var _ = Describe("SafeDriverLoadManager", func() { id = randSeq(5) // create k8s objects node = createNode(fmt.Sprintf("node-%s", id)) - mgr = upgrade.NewSafeDriverLoadManager(upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log) + mgr = common.NewSafeDriverLoadManager(common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder), log) }) It("IsWaitingForSafeDriverLoad", func() { - annotationKey := upgrade.GetUpgradeDriverWaitForSafeLoadAnnotationKey() + annotationKey := common.GetUpgradeDriverWaitForSafeLoadAnnotationKey() Expect(k8sClient.Patch( ctx, node, client.RawPatch(types.StrategicMergePatchType, []byte(fmt.Sprintf(`{"metadata":{"annotations":{%q: "true"}}}`, @@ -61,7 +60,7 @@ var _ = Describe("SafeDriverLoadManager", func() { Expect(mgr.IsWaitingForSafeDriverLoad(ctx, node)).To(BeFalse()) }) It("UnblockLoading", func() { - annotationKey := upgrade.GetUpgradeDriverWaitForSafeLoadAnnotationKey() + annotationKey := common.GetUpgradeDriverWaitForSafeLoadAnnotationKey() Expect(k8sClient.Patch( ctx, node, client.RawPatch(types.StrategicMergePatchType, []byte(fmt.Sprintf(`{"metadata":{"annotations":{%q: "true"}}}`, diff --git a/pkg/upgrade/upgrade_state.go b/pkg/upgrade/common/upgrade_common.go similarity index 73% rename from pkg/upgrade/upgrade_state.go rename to pkg/upgrade/common/upgrade_common.go index d6577946..d79cb501 100644 --- a/pkg/upgrade/upgrade_state.go +++ b/pkg/upgrade/common/upgrade_common.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" @@ -24,7 +24,6 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -61,17 +60,7 @@ func NewClusterUpgradeState() ClusterUpgradeState { return ClusterUpgradeState{NodeStates: make(map[string][]*NodeUpgradeState)} } -// ClusterUpgradeStateManager is an interface for performing cluster upgrades of driver containers -// -//nolint:interfacebloat -type ClusterUpgradeStateManager interface { - // ApplyState receives a complete cluster upgrade state and, based on upgrade policy, processes each node's state. - // Based on the current state of the node, it is calculated if the node can be moved to the next state right now - // or whether any actions need to be scheduled for the node to move to the next state. - // The function is stateless and idempotent. If the error was returned before all nodes' states were processed, - // ApplyState would be called again and complete the processing - all the decisions are based on the input data. - ApplyState(ctx context.Context, - currentState *ClusterUpgradeState, upgradePolicy *v1alpha1.DriverUpgradePolicySpec) (err error) +type CommonUpgradeStateManager interface { // BuildState builds a point-in-time snapshot of the driver upgrade state in the cluster. 
BuildState(ctx context.Context, namespace string, driverLabels map[string]string) (*ClusterUpgradeState, error) // GetTotalManagedNodes returns the total count of nodes managed for driver upgrades @@ -89,19 +78,17 @@ type ClusterUpgradeStateManager interface { GetUpgradesPending(ctx context.Context, currentState *ClusterUpgradeState) int // WithPodDeletionEnabled provides an option to enable the optional 'pod-deletion' // state and pass a custom PodDeletionFilter to use - WithPodDeletionEnabled(filter PodDeletionFilter) ClusterUpgradeStateManager + WithPodDeletionEnabled(filter PodDeletionFilter) CommonUpgradeStateManager // WithValidationEnabled provides an option to enable the optional 'validation' state // and pass a podSelector to specify which pods are performing the validation - WithValidationEnabled(podSelector string) ClusterUpgradeStateManager + WithValidationEnabled(podSelector string) CommonUpgradeStateManager // IsPodDeletionEnabled returns true if 'pod-deletion' state is enabled IsPodDeletionEnabled() bool // IsValidationEnabled returns true if 'validation' state is enabled IsValidationEnabled() bool } -// ClusterUpgradeStateManagerImpl serves as a state machine for the ClusterUpgradeState -// It processes each node and based on its state schedules the required jobs to change their state to the next one -type ClusterUpgradeStateManagerImpl struct { +type CommonUpgradeManagerImpl struct { Log logr.Logger K8sClient client.Client K8sInterface kubernetes.Interface @@ -119,23 +106,23 @@ type ClusterUpgradeStateManagerImpl struct { validationStateEnabled bool } -// NewClusterUpgradeStateManager creates a new instance of ClusterUpgradeStateManagerImpl -func NewClusterUpgradeStateManager( +// NewCommonUpgradeStateManager creates a new instance of CommonUpgradeManagerImpl +func NewCommonUpgradeStateManager( log logr.Logger, k8sConfig *rest.Config, - eventRecorder record.EventRecorder) (ClusterUpgradeStateManager, error) { + eventRecorder record.EventRecorder) (*CommonUpgradeManagerImpl, error) { k8sClient, err := client.New(k8sConfig, client.Options{Scheme: scheme.Scheme}) if err != nil { - return nil, fmt.Errorf("error creating k8s client: %v", err) + return &CommonUpgradeManagerImpl{}, fmt.Errorf("error creating k8s client: %v", err) } k8sInterface, err := kubernetes.NewForConfig(k8sConfig) if err != nil { - return nil, fmt.Errorf("error creating k8s interface: %v", err) + return &CommonUpgradeManagerImpl{}, fmt.Errorf("error creating k8s interface: %v", err) } nodeUpgradeStateProvider := NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - manager := &ClusterUpgradeStateManagerImpl{ + commonUpgrade := CommonUpgradeManagerImpl{ Log: log, K8sClient: k8sClient, K8sInterface: k8sInterface, @@ -147,12 +134,13 @@ func NewClusterUpgradeStateManager( ValidationManager: NewValidationManager(k8sInterface, log, eventRecorder, nodeUpgradeStateProvider, ""), SafeDriverLoadManager: NewSafeDriverLoadManager(nodeUpgradeStateProvider, log), } - return manager, nil + + return &commonUpgrade, nil } // WithPodDeletionEnabled provides an option to enable the optional 'pod-deletion' state and pass a custom // PodDeletionFilter to use -func (m *ClusterUpgradeStateManagerImpl) WithPodDeletionEnabled(filter PodDeletionFilter) ClusterUpgradeStateManager { +func (m *CommonUpgradeManagerImpl) WithPodDeletionEnabled(filter PodDeletionFilter) CommonUpgradeStateManager { if filter == nil { m.Log.V(consts.LogLevelWarning).Info("Cannot enable PodDeletion state as PodDeletionFilter is nil") return m @@ 
-164,7 +152,7 @@ func (m *ClusterUpgradeStateManagerImpl) WithPodDeletionEnabled(filter PodDeleti // WithValidationEnabled provides an option to enable the optional 'validation' state and pass a podSelector to specify // which pods are performing the validation -func (m *ClusterUpgradeStateManagerImpl) WithValidationEnabled(podSelector string) ClusterUpgradeStateManager { +func (m *CommonUpgradeManagerImpl) WithValidationEnabled(podSelector string) CommonUpgradeStateManager { if podSelector == "" { m.Log.V(consts.LogLevelWarning).Info("Cannot enable Validation state as podSelector is empty") return m @@ -176,12 +164,12 @@ func (m *ClusterUpgradeStateManagerImpl) WithValidationEnabled(podSelector strin } // IsPodDeletionEnabled returns true if 'pod-deletion' state is enabled -func (m *ClusterUpgradeStateManagerImpl) IsPodDeletionEnabled() bool { +func (m *CommonUpgradeManagerImpl) IsPodDeletionEnabled() bool { return m.podDeletionStateEnabled } // IsValidationEnabled returns true if 'validation' state is enabled -func (m *ClusterUpgradeStateManagerImpl) IsValidationEnabled() bool { +func (m *CommonUpgradeManagerImpl) IsValidationEnabled() bool { return m.validationStateEnabled } @@ -189,13 +177,13 @@ func (m *ClusterUpgradeStateManagerImpl) IsValidationEnabled() bool { // TODO: Drop ctx as it's not used // //nolint:revive -func (m *ClusterUpgradeStateManagerImpl) GetCurrentUnavailableNodes(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetCurrentUnavailableNodes(ctx context.Context, currentState *ClusterUpgradeState) int { unavailableNodes := 0 for _, nodeUpgradeStateList := range currentState.NodeStates { for _, nodeUpgradeState := range nodeUpgradeStateList { // check if the node is cordoned - if m.isNodeUnschedulable(nodeUpgradeState.Node) { + if m.IsNodeUnschedulable(nodeUpgradeState.Node) { m.Log.V(consts.LogLevelDebug).Info("Node is cordoned", "node", nodeUpgradeState.Node.Name) unavailableNodes++ continue @@ -211,7 +199,7 @@ func (m *ClusterUpgradeStateManagerImpl) GetCurrentUnavailableNodes(ctx context. } // BuildState builds a point-in-time snapshot of the driver upgrade state in the cluster. 
-func (m *ClusterUpgradeStateManagerImpl) BuildState(ctx context.Context, namespace string, +func (m *CommonUpgradeManagerImpl) BuildState(ctx context.Context, namespace string, driverLabels map[string]string) (*ClusterUpgradeState, error) { m.Log.V(consts.LogLevelInfo).Info("Building state") @@ -280,7 +268,7 @@ func (m *ClusterUpgradeStateManagerImpl) BuildState(ctx context.Context, namespa // buildNodeUpgradeState creates a mapping between a node, // the driver POD running on them and the daemon set, controlling this pod -func (m *ClusterUpgradeStateManagerImpl) buildNodeUpgradeState( +func (m *CommonUpgradeManagerImpl) buildNodeUpgradeState( ctx context.Context, pod *corev1.Pod, ds *appsv1.DaemonSet) (*NodeUpgradeState, error) { node, err := m.NodeUpgradeStateProvider.GetNode(ctx, pod.Spec.NodeName) if err != nil { @@ -295,7 +283,7 @@ func (m *ClusterUpgradeStateManagerImpl) buildNodeUpgradeState( } // getDriverDaemonSets retrieves DaemonSets with given labels and returns UID->DaemonSet map -func (m *ClusterUpgradeStateManagerImpl) getDriverDaemonSets(ctx context.Context, namespace string, +func (m *CommonUpgradeManagerImpl) getDriverDaemonSets(ctx context.Context, namespace string, labels map[string]string) (map[types.UID]*appsv1.DaemonSet, error) { // Get list of driver pods daemonSetList := &appsv1.DaemonSetList{} @@ -317,7 +305,7 @@ func (m *ClusterUpgradeStateManagerImpl) getDriverDaemonSets(ctx context.Context } // getPodsOwnedbyDs returns a list of the pods owned by the specified DaemonSet -func (m *ClusterUpgradeStateManagerImpl) getPodsOwnedbyDs(ds *appsv1.DaemonSet, pods []corev1.Pod) []corev1.Pod { +func (m *CommonUpgradeManagerImpl) getPodsOwnedbyDs(ds *appsv1.DaemonSet, pods []corev1.Pod) []corev1.Pod { dsPodList := []corev1.Pod{} for i := range pods { pod := &pods[i] @@ -338,7 +326,7 @@ func (m *ClusterUpgradeStateManagerImpl) getPodsOwnedbyDs(ds *appsv1.DaemonSet, } // getOrphanedPods returns a list of the pods not owned by any DaemonSet -func (m *ClusterUpgradeStateManagerImpl) getOrphanedPods(pods []corev1.Pod) []corev1.Pod { +func (m *CommonUpgradeManagerImpl) getOrphanedPods(pods []corev1.Pod) []corev1.Pod { podList := []corev1.Pod{} for i := range pods { pod := &pods[i] @@ -354,138 +342,9 @@ func isOrphanedPod(pod *corev1.Pod) bool { return pod.OwnerReferences == nil || len(pod.OwnerReferences) < 1 } -// ApplyState receives a complete cluster upgrade state and, based on upgrade policy, processes each node's state. -// Based on the current state of the node, it is calculated if the node can be moved to the next state right now -// or whether any actions need to be scheduled for the node to move to the next state. -// The function is stateless and idempotent. If the error was returned before all nodes' states were processed, -// ApplyState would be called again and complete the processing - all the decisions are based on the input data. 
-// -//nolint:funlen -func (m *ClusterUpgradeStateManagerImpl) ApplyState(ctx context.Context, - currentState *ClusterUpgradeState, upgradePolicy *v1alpha1.DriverUpgradePolicySpec) (err error) { - m.Log.V(consts.LogLevelInfo).Info("State Manager, got state update") - - if currentState == nil { - return fmt.Errorf("currentState should not be empty") - } - - if upgradePolicy == nil || !upgradePolicy.AutoUpgrade { - m.Log.V(consts.LogLevelInfo).Info("Driver auto upgrade is disabled, skipping") - return nil - } - - m.Log.V(consts.LogLevelInfo).Info("Node states:", - "Unknown", len(currentState.NodeStates[UpgradeStateUnknown]), - UpgradeStateDone, len(currentState.NodeStates[UpgradeStateDone]), - UpgradeStateUpgradeRequired, len(currentState.NodeStates[UpgradeStateUpgradeRequired]), - UpgradeStateCordonRequired, len(currentState.NodeStates[UpgradeStateCordonRequired]), - UpgradeStateWaitForJobsRequired, len(currentState.NodeStates[UpgradeStateWaitForJobsRequired]), - UpgradeStatePodDeletionRequired, len(currentState.NodeStates[UpgradeStatePodDeletionRequired]), - UpgradeStateFailed, len(currentState.NodeStates[UpgradeStateFailed]), - UpgradeStateDrainRequired, len(currentState.NodeStates[UpgradeStateDrainRequired]), - UpgradeStatePodRestartRequired, len(currentState.NodeStates[UpgradeStatePodRestartRequired]), - UpgradeStateValidationRequired, len(currentState.NodeStates[UpgradeStateValidationRequired]), - UpgradeStateUncordonRequired, len(currentState.NodeStates[UpgradeStateUncordonRequired])) - - totalNodes := m.GetTotalManagedNodes(ctx, currentState) - upgradesInProgress := m.GetUpgradesInProgress(ctx, currentState) - currentUnavailableNodes := m.GetCurrentUnavailableNodes(ctx, currentState) - maxUnavailable := totalNodes - - if upgradePolicy.MaxUnavailable != nil { - maxUnavailable, err = intstr.GetScaledValueFromIntOrPercent(upgradePolicy.MaxUnavailable, totalNodes, true) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to compute maxUnavailable from the current total nodes") - return err - } - } - - upgradesAvailable := m.GetUpgradesAvailable(ctx, currentState, upgradePolicy.MaxParallelUpgrades, maxUnavailable) - - m.Log.V(consts.LogLevelInfo).Info("Upgrades in progress", - "currently in progress", upgradesInProgress, - "max parallel upgrades", upgradePolicy.MaxParallelUpgrades, - "upgrade slots available", upgradesAvailable, - "currently unavailable nodes", currentUnavailableNodes, - "total number of nodes", totalNodes, - "maximum nodes that can be unavailable", maxUnavailable) - - // Determine the object to log this event - // m.EventRecorder.Eventf(m.Namespace, v1.EventTypeNormal, GetEventReason(), - // "InProgress: %d, MaxParallelUpgrades: %d, UpgradeSlotsAvailable: %s", upgradesInProgress, - // upgradePolicy.MaxParallelUpgrades, upgradesAvailable) - - // First, check if unknown or ready nodes need to be upgraded - err = m.ProcessDoneOrUnknownNodes(ctx, currentState, UpgradeStateUnknown) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to process nodes", "state", UpgradeStateUnknown) - return err - } - err = m.ProcessDoneOrUnknownNodes(ctx, currentState, UpgradeStateDone) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to process nodes", "state", UpgradeStateDone) - return err - } - // Start upgrade process for upgradesAvailable number of nodes - err = m.ProcessUpgradeRequiredNodes(ctx, currentState, upgradesAvailable) - if err != nil { - m.Log.V(consts.LogLevelError).Error( - err, "Failed to process nodes", "state", 
UpgradeStateUpgradeRequired) - return err - } - - err = m.ProcessCordonRequiredNodes(ctx, currentState) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to cordon nodes") - return err - } - - err = m.ProcessWaitForJobsRequiredNodes(ctx, currentState, upgradePolicy.WaitForCompletion) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to waiting for required jobs to complete") - return err - } - - drainEnabled := upgradePolicy.DrainSpec != nil && upgradePolicy.DrainSpec.Enable - err = m.ProcessPodDeletionRequiredNodes(ctx, currentState, upgradePolicy.PodDeletion, drainEnabled) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to delete pods") - return err - } - - // Schedule nodes for drain - err = m.ProcessDrainNodes(ctx, currentState, upgradePolicy.DrainSpec) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to schedule nodes drain") - return err - } - err = m.ProcessPodRestartNodes(ctx, currentState) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to schedule pods restart") - return err - } - err = m.ProcessUpgradeFailedNodes(ctx, currentState) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to process nodes in 'upgrade-failed' state") - return err - } - err = m.ProcessValidationRequiredNodes(ctx, currentState) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to validate driver upgrade") - return err - } - err = m.ProcessUncordonRequiredNodes(ctx, currentState) - if err != nil { - m.Log.V(consts.LogLevelError).Error(err, "Failed to uncordon nodes") - return err - } - m.Log.V(consts.LogLevelInfo).Info("State Manager, finished processing") - return nil -} - // ProcessDoneOrUnknownNodes iterates over UpgradeStateDone or UpgradeStateUnknown nodes and determines // whether each specific node should be in UpgradeStateUpgradeRequired or UpgradeStateDone state. -func (m *ClusterUpgradeStateManagerImpl) ProcessDoneOrUnknownNodes( +func (m *CommonUpgradeManagerImpl) ProcessDoneOrUnknownNodes( ctx context.Context, currentClusterState *ClusterUpgradeState, nodeStateName string) error { m.Log.V(consts.LogLevelInfo).Info("ProcessDoneOrUnknownNodes") @@ -495,7 +354,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessDoneOrUnknownNodes( m.Log.V(consts.LogLevelError).Error(err, "Failed to get daemonset template/pod revision hash") return err } - isUpgradeRequested := m.isUpgradeRequested(nodeState.Node) + isUpgradeRequested := m.IsUpgradeRequested(nodeState.Node) isWaitingForSafeDriverLoad, err := m.SafeDriverLoadManager.IsWaitingForSafeDriverLoad(ctx, nodeState.Node) if err != nil { m.Log.V(consts.LogLevelError).Error( @@ -509,7 +368,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessDoneOrUnknownNodes( if (!isPodSynced && !isOrphaned) || isWaitingForSafeDriverLoad || isUpgradeRequested { // If node requires upgrade and is Unschedulable, track this in an // annotation and leave node in Unschedulable state when upgrade completes. - if isNodeUnschedulable(nodeState.Node) { + if IsNodeUnschedulable(nodeState.Node) { annotationKey := GetUpgradeInitialStateAnnotationKey() annotationValue := trueString m.Log.V(consts.LogLevelInfo).Info( @@ -555,7 +414,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessDoneOrUnknownNodes( // bool: True if Pod is in sync with DaemonSet. 
(For Orphanded Pods, always false) // bool: True if the Pod is Orphaned // error: In case of error retrivieng the Revision Hashes -func (m *ClusterUpgradeStateManagerImpl) podInSyncWithDS(ctx context.Context, +func (m *CommonUpgradeManagerImpl) podInSyncWithDS(ctx context.Context, nodeState *NodeUpgradeState) (bool, bool, error) { if nodeState.IsOrphanedPod() { return false, true, nil @@ -578,61 +437,13 @@ func (m *ClusterUpgradeStateManagerImpl) podInSyncWithDS(ctx context.Context, } // isUpgradeRequested returns true if node is labeled to request an upgrade -func (m *ClusterUpgradeStateManagerImpl) isUpgradeRequested(node *corev1.Node) bool { +func (m *CommonUpgradeManagerImpl) IsUpgradeRequested(node *corev1.Node) bool { return node.Annotations[GetUpgradeRequestedAnnotationKey()] == "true" } -// ProcessUpgradeRequiredNodes processes UpgradeStateUpgradeRequired nodes and moves them to UpgradeStateCordonRequired -// until the limit on max parallel upgrades is reached. -func (m *ClusterUpgradeStateManagerImpl) ProcessUpgradeRequiredNodes( - ctx context.Context, currentClusterState *ClusterUpgradeState, upgradesAvailable int) error { - m.Log.V(consts.LogLevelInfo).Info("ProcessUpgradeRequiredNodes") - for _, nodeState := range currentClusterState.NodeStates[UpgradeStateUpgradeRequired] { - if m.isUpgradeRequested(nodeState.Node) { - // Make sure to remove the upgrade-requested annotation - err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeAnnotation(ctx, nodeState.Node, - GetUpgradeRequestedAnnotationKey(), "null") - if err != nil { - m.Log.V(consts.LogLevelError).Error( - err, "Failed to delete node upgrade-requested annotation") - return err - } - } - if m.skipNodeUpgrade(nodeState.Node) { - m.Log.V(consts.LogLevelInfo).Info("Node is marked for skipping upgrades", "node", nodeState.Node.Name) - continue - } - - if upgradesAvailable <= 0 { - // when no new node upgrades are available, progess with manually cordoned nodes - if m.isNodeUnschedulable(nodeState.Node) { - m.Log.V(consts.LogLevelDebug).Info("Node is already cordoned, progressing for driver upgrade", - "node", nodeState.Node.Name) - } else { - m.Log.V(consts.LogLevelDebug).Info("Node upgrade limit reached, pausing further upgrades", - "node", nodeState.Node.Name) - continue - } - } - - err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeState(ctx, nodeState.Node, UpgradeStateCordonRequired) - if err == nil { - upgradesAvailable-- - m.Log.V(consts.LogLevelInfo).Info("Node waiting for cordon", - "node", nodeState.Node.Name) - } else { - m.Log.V(consts.LogLevelError).Error( - err, "Failed to change node upgrade state", "state", UpgradeStateCordonRequired) - return err - } - } - - return nil -} - // ProcessCordonRequiredNodes processes UpgradeStateCordonRequired nodes, // cordons them and moves them to UpgradeStateWaitForJobsRequired state -func (m *ClusterUpgradeStateManagerImpl) ProcessCordonRequiredNodes( +func (m *CommonUpgradeManagerImpl) ProcessCordonRequiredNodes( ctx context.Context, currentClusterState *ClusterUpgradeState) error { m.Log.V(consts.LogLevelInfo).Info("ProcessCordonRequiredNodes") @@ -655,7 +466,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessCordonRequiredNodes( // ProcessWaitForJobsRequiredNodes processes UpgradeStateWaitForJobsRequired nodes, // waits for completion of jobs and moves them to UpgradeStatePodDeletionRequired state. 
-func (m *ClusterUpgradeStateManagerImpl) ProcessWaitForJobsRequiredNodes( +func (m *CommonUpgradeManagerImpl) ProcessWaitForJobsRequiredNodes( ctx context.Context, currentClusterState *ClusterUpgradeState, waitForCompletionSpec *v1alpha1.WaitForCompletionSpec) error { m.Log.V(consts.LogLevelInfo).Info("ProcessWaitForJobsRequiredNodes") @@ -695,7 +506,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessWaitForJobsRequiredNodes( // ProcessPodDeletionRequiredNodes processes UpgradeStatePodDeletionRequired nodes, // deletes select pods on a node, and moves the nodes to UpgradeStateDrainRequiredRequired state. // Pods selected for deletion are determined via PodManager.PodDeletion -func (m *ClusterUpgradeStateManagerImpl) ProcessPodDeletionRequiredNodes( +func (m *CommonUpgradeManagerImpl) ProcessPodDeletionRequiredNodes( ctx context.Context, currentClusterState *ClusterUpgradeState, podDeletionSpec *v1alpha1.PodDeletionSpec, drainEnabled bool) error { m.Log.V(consts.LogLevelInfo).Info("ProcessPodDeletionRequiredNodes") @@ -728,7 +539,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessPodDeletionRequiredNodes( // ProcessDrainNodes schedules UpgradeStateDrainRequired nodes for drain. // If drain is disabled by upgrade policy, moves the nodes straight to UpgradeStatePodRestartRequired state. -func (m *ClusterUpgradeStateManagerImpl) ProcessDrainNodes( +func (m *CommonUpgradeManagerImpl) ProcessDrainNodes( ctx context.Context, currentClusterState *ClusterUpgradeState, drainSpec *v1alpha1.DrainSpec) error { m.Log.V(consts.LogLevelInfo).Info("ProcessDrainNodes") if drainSpec == nil || !drainSpec.Enable { @@ -761,7 +572,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessDrainNodes( // ProcessPodRestartNodes processes UpgradeStatePodRestartRequirednodes and schedules driver pod restart for them. // If the pod has already been restarted and is in Ready state - moves the node to UpgradeStateUncordonRequired state. -func (m *ClusterUpgradeStateManagerImpl) ProcessPodRestartNodes( +func (m *CommonUpgradeManagerImpl) ProcessPodRestartNodes( ctx context.Context, currentClusterState *ClusterUpgradeState) error { m.Log.V(consts.LogLevelInfo).Info("ProcessPodRestartNodes") @@ -832,7 +643,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessPodRestartNodes( // ProcessUpgradeFailedNodes processes UpgradeStateFailed nodes and checks whether the driver pod on the node // has been successfully restarted. If the pod is in Ready state - moves the node to UpgradeStateUncordonRequired state. 
-func (m *ClusterUpgradeStateManagerImpl) ProcessUpgradeFailedNodes( +func (m *CommonUpgradeManagerImpl) ProcessUpgradeFailedNodes( ctx context.Context, currentClusterState *ClusterUpgradeState) error { m.Log.V(consts.LogLevelInfo).Info("ProcessUpgradeFailedNodes") @@ -877,7 +688,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessUpgradeFailedNodes( } // ProcessValidationRequiredNodes processes UpgradeStateValidationRequired nodes -func (m *ClusterUpgradeStateManagerImpl) ProcessValidationRequiredNodes( +func (m *CommonUpgradeManagerImpl) ProcessValidationRequiredNodes( ctx context.Context, currentClusterState *ClusterUpgradeState) error { m.Log.V(consts.LogLevelInfo).Info("ProcessValidationRequiredNodes") @@ -912,7 +723,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessValidationRequiredNodes( // ProcessUncordonRequiredNodes processes UpgradeStateUncordonRequired nodes, // uncordons them and moves them to UpgradeStateDone state -func (m *ClusterUpgradeStateManagerImpl) ProcessUncordonRequiredNodes( +func (m *CommonUpgradeManagerImpl) ProcessUncordonRequiredNodes( ctx context.Context, currentClusterState *ClusterUpgradeState) error { m.Log.V(consts.LogLevelInfo).Info("ProcessUncordonRequiredNodes") @@ -933,7 +744,7 @@ func (m *ClusterUpgradeStateManagerImpl) ProcessUncordonRequiredNodes( return nil } -func (m *ClusterUpgradeStateManagerImpl) isDriverPodInSync(ctx context.Context, +func (m *CommonUpgradeManagerImpl) isDriverPodInSync(ctx context.Context, nodeState *NodeUpgradeState) (bool, error) { isPodSynced, isOrphaned, err := m.podInSyncWithDS(ctx, nodeState) if err != nil { @@ -963,7 +774,7 @@ func (m *ClusterUpgradeStateManagerImpl) isDriverPodInSync(ctx context.Context, return false, nil } -func (m *ClusterUpgradeStateManagerImpl) isDriverPodFailing(pod *corev1.Pod) bool { +func (m *CommonUpgradeManagerImpl) isDriverPodFailing(pod *corev1.Pod) bool { for _, status := range pod.Status.InitContainerStatuses { if !status.Ready && status.RestartCount > 10 { return true @@ -978,12 +789,12 @@ func (m *ClusterUpgradeStateManagerImpl) isDriverPodFailing(pod *corev1.Pod) boo } // isNodeUnschedulable returns true if the node is cordoned -func (m *ClusterUpgradeStateManagerImpl) isNodeUnschedulable(node *corev1.Node) bool { +func (m *CommonUpgradeManagerImpl) IsNodeUnschedulable(node *corev1.Node) bool { return node.Spec.Unschedulable } // isNodeConditionReady returns true if the node condition is ready -func (m *ClusterUpgradeStateManagerImpl) isNodeConditionReady(node *corev1.Node) bool { +func (m *CommonUpgradeManagerImpl) isNodeConditionReady(node *corev1.Node) bool { for _, condition := range node.Status.Conditions { if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue { return false @@ -993,14 +804,14 @@ func (m *ClusterUpgradeStateManagerImpl) isNodeConditionReady(node *corev1.Node) } // skipNodeUpgrade returns true if node is labeled to skip driver upgrades -func (m *ClusterUpgradeStateManagerImpl) skipNodeUpgrade(node *corev1.Node) bool { +func (m *CommonUpgradeManagerImpl) SkipNodeUpgrade(node *corev1.Node) bool { return node.Labels[GetUpgradeSkipNodeLabelKey()] == trueString } // updateNodeToUncordonOrDoneState skips moving the node to the UncordonRequired state if the node // was Unschedulable at the beginning of the upgrade so that the node remains in the same state as // when the upgrade started. In addition, the annotation tracking this information is removed. 
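Note for reviewers: SkipNodeUpgrade and IsNodeUnschedulable are also exported here as the per-node guards both upgrade modes share. A brief sketch of how a mode-specific loop might use them, assuming the label helpers shown in this patch; the shouldHoldUpgrade helper is illustrative only.

    package example

    import (
        corev1 "k8s.io/api/core/v1"

        "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common"
    )

    // shouldHoldUpgrade reports whether a node must be left alone (skip label set) or can be
    // fast-tracked because an administrator already cordoned it.
    func shouldHoldUpgrade(m *common.CommonUpgradeManagerImpl, node *corev1.Node) (skip, alreadyCordoned bool) {
        // nodes labeled with the skip-upgrade key are never touched
        skip = m.SkipNodeUpgrade(node)
        // manually cordoned nodes may proceed even when the parallel-upgrade budget is exhausted
        alreadyCordoned = m.IsNodeUnschedulable(node)
        return skip, alreadyCordoned
    }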
-func (m *ClusterUpgradeStateManagerImpl) updateNodeToUncordonOrDoneState(ctx context.Context, node *corev1.Node) error { +func (m *CommonUpgradeManagerImpl) updateNodeToUncordonOrDoneState(ctx context.Context, node *corev1.Node) error { newUpgradeState := UpgradeStateUncordonRequired annotationKey := GetUpgradeInitialStateAnnotationKey() if _, ok := node.Annotations[annotationKey]; ok { @@ -1027,7 +838,7 @@ func (m *ClusterUpgradeStateManagerImpl) updateNodeToUncordonOrDoneState(ctx con return nil } -func isNodeUnschedulable(node *corev1.Node) bool { +func IsNodeUnschedulable(node *corev1.Node) bool { return node.Spec.Unschedulable } @@ -1035,7 +846,7 @@ func isNodeUnschedulable(node *corev1.Node) bool { // TODO: Drop ctx as it's not used // //nolint:revive -func (m *ClusterUpgradeStateManagerImpl) GetTotalManagedNodes(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetTotalManagedNodes(ctx context.Context, currentState *ClusterUpgradeState) int { totalNodes := len(currentState.NodeStates[UpgradeStateUnknown]) + len(currentState.NodeStates[UpgradeStateDone]) + @@ -1053,7 +864,7 @@ func (m *ClusterUpgradeStateManagerImpl) GetTotalManagedNodes(ctx context.Contex } // GetUpgradesInProgress returns count of nodes on which upgrade is in progress -func (m *ClusterUpgradeStateManagerImpl) GetUpgradesInProgress(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetUpgradesInProgress(ctx context.Context, currentState *ClusterUpgradeState) int { totalNodes := m.GetTotalManagedNodes(ctx, currentState) return totalNodes - (len(currentState.NodeStates[UpgradeStateUnknown]) + @@ -1065,13 +876,13 @@ func (m *ClusterUpgradeStateManagerImpl) GetUpgradesInProgress(ctx context.Conte // TODO: Drop ctx as it's not used // //nolint:revive -func (m *ClusterUpgradeStateManagerImpl) GetUpgradesDone(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetUpgradesDone(ctx context.Context, currentState *ClusterUpgradeState) int { return len(currentState.NodeStates[UpgradeStateDone]) } // GetUpgradesAvailable returns count of nodes on which upgrade can be done -func (m *ClusterUpgradeStateManagerImpl) GetUpgradesAvailable(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetUpgradesAvailable(ctx context.Context, currentState *ClusterUpgradeState, maxParallelUpgrades int, maxUnavailable int) int { upgradesInProgress := m.GetUpgradesInProgress(ctx, currentState) totalNodes := m.GetTotalManagedNodes(ctx, currentState) @@ -1105,7 +916,7 @@ func (m *ClusterUpgradeStateManagerImpl) GetUpgradesAvailable(ctx context.Contex // TODO: Drop ctx as it's not used // //nolint:revive -func (m *ClusterUpgradeStateManagerImpl) GetUpgradesFailed(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetUpgradesFailed(ctx context.Context, currentState *ClusterUpgradeState) int { return len(currentState.NodeStates[UpgradeStateFailed]) } @@ -1114,7 +925,7 @@ func (m *ClusterUpgradeStateManagerImpl) GetUpgradesFailed(ctx context.Context, // TODO: Drop ctx as it's not used // //nolint:revive -func (m *ClusterUpgradeStateManagerImpl) GetUpgradesPending(ctx context.Context, +func (m *CommonUpgradeManagerImpl) GetUpgradesPending(ctx context.Context, currentState *ClusterUpgradeState) int { return len(currentState.NodeStates[UpgradeStateUpgradeRequired]) } diff --git a/pkg/upgrade/upgrade_suit_test.go b/pkg/upgrade/common/upgrade_suit_test.go similarity index 80% rename from pkg/upgrade/upgrade_suit_test.go rename to pkg/upgrade/common/upgrade_suit_test.go index d2b75c2d..a080fb61 100644 --- 
a/pkg/upgrade/upgrade_suit_test.go +++ b/pkg/upgrade/common/upgrade_suit_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade_test +package common_test import ( "context" @@ -26,9 +26,7 @@ import ( . "github.com/onsi/gomega" "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" - batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -42,8 +40,8 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/mocks" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/manager/mocks" // +kubebuilder:scaffold:imports ) @@ -94,13 +92,13 @@ var _ = BeforeSuite(func() { log = ctrl.Log.WithName("upgradeSuitTest") // set driver name to be managed by the upgrade-manager - upgrade.SetDriverName("gpu") + common.SetDriverName("gpu") nodeUpgradeStateProvider = mocks.NodeUpgradeStateProvider{} nodeUpgradeStateProvider. On("ChangeNodeUpgradeState", mock.Anything, mock.Anything, mock.Anything). Return(func(ctx context.Context, node *corev1.Node, newNodeState string) error { - node.Labels[upgrade.GetUpgradeStateLabelKey()] = newNodeState + node.Labels[common.GetUpgradeStateLabelKey()] = newNodeState return nil }) nodeUpgradeStateProvider. @@ -145,7 +143,7 @@ var _ = BeforeSuite(func() { On("GetPodControllerRevisionHash", mock.Anything, mock.Anything). Return( func(ctx context.Context, pod *corev1.Pod) string { - return pod.Labels[upgrade.PodControllerRevisionHashLabelKey] + return pod.Labels[common.PodControllerRevisionHashLabelKey] }, func(ctx context.Context, pod *corev1.Pod) error { return nil @@ -218,7 +216,7 @@ func (n Node) WithUpgradeState(state string) Node { if n.Labels == nil { n.Labels = make(map[string]string) } - n.Labels[upgrade.GetUpgradeStateLabelKey()] = state + n.Labels[common.GetUpgradeStateLabelKey()] = state return n } @@ -379,79 +377,12 @@ func createNamespace(name string) *corev1.Namespace { return namespace } -func createPod(name, namespace string, labels map[string]string, nodeName string) *corev1.Pod { - gracePeriodSeconds := int64(0) - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Labels: labels, - }, - Spec: corev1.PodSpec{ - TerminationGracePeriodSeconds: &gracePeriodSeconds, - NodeName: nodeName, - Containers: []corev1.Container{ - { - Name: "test-container", - Image: "test-image", - }, - }, - }, - } - err := k8sClient.Create(context.TODO(), pod) - Expect(err).NotTo(HaveOccurred()) - createdObjects = append(createdObjects, pod) - return pod -} - func updatePodStatus(pod *corev1.Pod) error { err := k8sClient.Status().Update(context.TODO(), pod) Expect(err).NotTo(HaveOccurred()) return err } -func updatePod(pod *corev1.Pod) error { - err := k8sClient.Update(context.TODO(), pod) - Expect(err).NotTo(HaveOccurred()) - return err -} - -func createJob(name string, namespace string, labels map[string]string) *batchv1.Job { - var backOffLimit int32 = 0 - manualSelector := true - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Labels: labels, - }, - Spec: batchv1.JobSpec{ - ManualSelector: &manualSelector, - Selector: &metav1.LabelSelector{MatchLabels: 
labels}, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: labels, - }, - Spec: corev1.PodSpec{ - Containers: []v1.Container{ - { - Name: name, - Image: "test-image", - Command: []string{"test-command"}, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - }, - }, - BackoffLimit: &backOffLimit, - }, - } - err := k8sClient.Create(context.TODO(), job) - Expect(err).NotTo(HaveOccurred()) - createdObjects = append(createdObjects, job) - return job -} - func createNode(name string) *corev1.Node { node := &corev1.Node{} node.Name = name @@ -469,32 +400,17 @@ func getNode(name string) *corev1.Node { return node } -func updateNode(node *corev1.Node) error { - err := k8sClient.Update(context.TODO(), node) - Expect(err).NotTo(HaveOccurred()) - return err -} - func deleteObj(obj client.Object) { Expect(k8sClient.Delete(context.TODO(), obj)).To(BeNil()) } -func getNodeUpgradeState(node *corev1.Node) string { - return node.Labels[upgrade.GetUpgradeStateLabelKey()] -} - -func isUnschedulableAnnotationPresent(node *corev1.Node) bool { - _, ok := node.Annotations[upgrade.GetUpgradeInitialStateAnnotationKey()] - return ok -} - func isWaitForCompletionAnnotationPresent(node *corev1.Node) bool { - _, ok := node.Annotations[upgrade.GetWaitForPodCompletionStartTimeAnnotationKey()] + _, ok := node.Annotations[common.GetWaitForPodCompletionStartTimeAnnotationKey()] return ok } func isValidationAnnotationPresent(node *corev1.Node) bool { - _, ok := node.Annotations[upgrade.GetValidationStartTimeAnnotationKey()] + _, ok := node.Annotations[common.GetValidationStartTimeAnnotationKey()] return ok } diff --git a/pkg/upgrade/util.go b/pkg/upgrade/common/util.go similarity index 99% rename from pkg/upgrade/util.go rename to pkg/upgrade/common/util.go index ee9a049f..c3737c1b 100644 --- a/pkg/upgrade/util.go +++ b/pkg/upgrade/common/util.go @@ -11,7 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "fmt" diff --git a/pkg/upgrade/validation_manager.go b/pkg/upgrade/common/validation_manager.go similarity index 99% rename from pkg/upgrade/validation_manager.go rename to pkg/upgrade/common/validation_manager.go index f3ec7027..77efc0fa 100644 --- a/pkg/upgrade/validation_manager.go +++ b/pkg/upgrade/common/validation_manager.go @@ -11,7 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade +package common import ( "context" diff --git a/pkg/upgrade/validation_manager_test.go b/pkg/upgrade/common/validation_manager_test.go similarity index 70% rename from pkg/upgrade/validation_manager_test.go rename to pkg/upgrade/common/validation_manager_test.go index d05dd6a6..91e89b6d 100644 --- a/pkg/upgrade/validation_manager_test.go +++ b/pkg/upgrade/common/validation_manager_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package upgrade_test +package common_test import ( "context" @@ -22,11 +22,10 @@ import ( "strconv" "time" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" - - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" ) var _ = Describe("ValidationManager", func() { @@ -43,16 +42,16 @@ var _ = Describe("ValidationManager", func() { }) It("should return no error if podSelector is empty", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "") + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(true)) }) It("Validate() should return false when no validation pods are running", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validation") + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validation") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(false)) @@ -60,11 +59,11 @@ var _ = Describe("ValidationManager", func() { }) It("Validate() should return true if validation pod is Running and Ready", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) _ = NewPod("pod", namespace.Name, node.Name). WithLabels(map[string]string{"app": "validator"}). Create() - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(true)) @@ -72,14 +71,14 @@ var _ = Describe("ValidationManager", func() { }) It("Validate() should return false if validation pod is Running but not Ready", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) pod := NewPod("pod", namespace.Name, node.Name). WithLabels(map[string]string{"app": "validator"}). Create() pod.Status.ContainerStatuses[0].Ready = false _ = updatePodStatus(pod) - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(false)) @@ -87,14 +86,14 @@ var _ = Describe("ValidationManager", func() { }) It("Validate() should return false if validation pod is not Running", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) pod := NewPod("pod", namespace.Name, node.Name). WithLabels(map[string]string{"app": "validator"}). 
Create() pod.Status.Phase = "Terminating" _ = updatePodStatus(pod) - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(false)) @@ -102,8 +101,8 @@ var _ = Describe("ValidationManager", func() { }) It("Validate() should mark node as UpgradeFailed when validation does not complete before timeout", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStateValidationRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStateValidationRequired) Expect(err).To(Succeed()) pod := NewPod("pod", namespace.Name, node.Name). @@ -112,19 +111,19 @@ var _ = Describe("ValidationManager", func() { pod.Status.ContainerStatuses[0].Ready = false _ = updatePodStatus(pod) - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(false)) node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateValidationRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateValidationRequired)) Expect(isValidationAnnotationPresent(node)).To(Equal(true)) startTime := strconv.FormatInt(time.Now().Unix()-605, 10) - provider.ChangeNodeUpgradeAnnotation(ctx, node, upgrade.GetValidationStartTimeAnnotationKey(), startTime) + provider.ChangeNodeUpgradeAnnotation(ctx, node, common.GetValidationStartTimeAnnotationKey(), startTime) validationDone, err = validationManager.Validate(ctx, node) Expect(err).To(Succeed()) @@ -132,13 +131,13 @@ var _ = Describe("ValidationManager", func() { node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateFailed)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateFailed)) Expect(isValidationAnnotationPresent(node)).To(Equal(false)) }) It("Validate() should remove annotation when validation completes before timeout", func() { - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) - err := provider.ChangeNodeUpgradeState(ctx, node, upgrade.UpgradeStateValidationRequired) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + err := provider.ChangeNodeUpgradeState(ctx, node, common.UpgradeStateValidationRequired) Expect(err).To(Succeed()) pod := NewPod("pod", namespace.Name, node.Name). 
@@ -147,14 +146,14 @@ var _ = Describe("ValidationManager", func() { pod.Status.ContainerStatuses[0].Ready = false _ = updatePodStatus(pod) - validationManager := upgrade.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") + validationManager := common.NewValidationManager(k8sInterface, log, eventRecorder, provider, "app=validator") validationDone, err := validationManager.Validate(ctx, node) Expect(err).To(Succeed()) Expect(validationDone).To(Equal(false)) node, err = provider.GetNode(ctx, node.Name) Expect(err).To(Succeed()) - Expect(node.Labels[upgrade.GetUpgradeStateLabelKey()]).To(Equal(upgrade.UpgradeStateValidationRequired)) + Expect(node.Labels[common.GetUpgradeStateLabelKey()]).To(Equal(common.UpgradeStateValidationRequired)) Expect(isValidationAnnotationPresent(node)).To(Equal(true)) diff --git a/pkg/upgrade/inbox/upgrade_inbox.go b/pkg/upgrade/inbox/upgrade_inbox.go new file mode 100644 index 00000000..33844ab4 --- /dev/null +++ b/pkg/upgrade/inbox/upgrade_inbox.go @@ -0,0 +1,77 @@ +/* +Copyright 2022 NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package inbox + +import ( + "context" + + "github.com/NVIDIA/k8s-operator-libs/pkg/consts" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" +) + +// InboxUpgradeManagerImpl contains concrete implementations for distinct inbox upgrade mode +type InboxUpgradeManagerImpl struct { + *common.CommonUpgradeManagerImpl +} + +// ProcessUpgradeRequiredNodes processes UpgradeStateUpgradeRequired nodes and moves them to UpgradeStateCordonRequired +// until the limit on max parallel upgrades is reached. 
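Note for reviewers: a rough sketch of how this inbox implementation is intended to be wired to the shared manager, matching the constructors added later in this patch under pkg/upgrade/manager. It assumes common.NewCommonUpgradeStateManager returns (*common.CommonUpgradeManagerImpl, error), as its use in the manager package suggests; the newInboxManager function name is illustrative only.

    package example

    import (
        "github.com/go-logr/logr"
        "k8s.io/client-go/rest"
        "k8s.io/client-go/tools/record"

        "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common"
        "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/inbox"
    )

    // newInboxManager builds the shared implementation once and embeds it in the inbox-mode manager.
    func newInboxManager(log logr.Logger, cfg *rest.Config, rec record.EventRecorder) (*inbox.InboxUpgradeManagerImpl, error) {
        commonImpl, err := common.NewCommonUpgradeStateManager(log, cfg, rec)
        if err != nil {
            return nil, err
        }
        return &inbox.InboxUpgradeManagerImpl{CommonUpgradeManagerImpl: commonImpl}, nil
    }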
+func (m *InboxUpgradeManagerImpl) ProcessUpgradeRequiredNodes( + ctx context.Context, currentClusterState *common.ClusterUpgradeState, upgradesAvailable int) error { + m.Log.V(consts.LogLevelInfo).Info("ProcessUpgradeRequiredNodes") + for _, nodeState := range currentClusterState.NodeStates[common.UpgradeStateUpgradeRequired] { + if m.IsUpgradeRequested(nodeState.Node) { + // Make sure to remove the upgrade-requested annotation + err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeAnnotation(ctx, nodeState.Node, + common.GetUpgradeRequestedAnnotationKey(), "null") + if err != nil { + m.Log.V(consts.LogLevelError).Error( + err, "Failed to delete node upgrade-requested annotation") + return err + } + } + if m.SkipNodeUpgrade(nodeState.Node) { + m.Log.V(consts.LogLevelInfo).Info("Node is marked for skipping upgrades", "node", nodeState.Node.Name) + continue + } + + if upgradesAvailable <= 0 { + // when no new node upgrades are available, progress with manually cordoned nodes + if m.IsNodeUnschedulable(nodeState.Node) { + m.Log.V(consts.LogLevelDebug).Info("Node is already cordoned, progressing for driver upgrade", + "node", nodeState.Node.Name) + } else { + m.Log.V(consts.LogLevelDebug).Info("Node upgrade limit reached, pausing further upgrades", + "node", nodeState.Node.Name) + continue + } + } + + err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeState(ctx, nodeState.Node, common.UpgradeStateCordonRequired) + if err == nil { + upgradesAvailable-- + m.Log.V(consts.LogLevelInfo).Info("Node waiting for cordon", + "node", nodeState.Node.Name) + } else { + m.Log.V(consts.LogLevelError).Error( + err, "Failed to change node upgrade state", "state", common.UpgradeStateCordonRequired) + return err + } + } + + return nil +} diff --git a/pkg/upgrade/mocks/CordonManager.go b/pkg/upgrade/manager/mocks/CordonManager.go similarity index 100% rename from pkg/upgrade/mocks/CordonManager.go rename to pkg/upgrade/manager/mocks/CordonManager.go diff --git a/pkg/upgrade/mocks/DrainManager.go b/pkg/upgrade/manager/mocks/DrainManager.go similarity index 84% rename from pkg/upgrade/mocks/DrainManager.go rename to pkg/upgrade/manager/mocks/DrainManager.go index b62afde1..3ba26841 100644 --- a/pkg/upgrade/mocks/DrainManager.go +++ b/pkg/upgrade/manager/mocks/DrainManager.go @@ -17,7 +17,7 @@ package mocks import ( context "context" - upgrade "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" mock "github.com/stretchr/testify/mock" ) @@ -27,11 +27,11 @@ type DrainManager struct { } // ScheduleNodesDrain provides a mock function with given fields: ctx, drainConfig -func (_m *DrainManager) ScheduleNodesDrain(ctx context.Context, drainConfig *upgrade.DrainConfiguration) error { +func (_m *DrainManager) ScheduleNodesDrain(ctx context.Context, drainConfig *common.DrainConfiguration) error { ret := _m.Called(ctx, drainConfig) var r0 error - if rf, ok := ret.Get(0).(func(context.Context, *upgrade.DrainConfiguration) error); ok { + if rf, ok := ret.Get(0).(func(context.Context, *common.DrainConfiguration) error); ok { r0 = rf(ctx, drainConfig) } else { r0 = ret.Error(0) diff --git a/pkg/upgrade/mocks/NodeUpgradeStateProvider.go b/pkg/upgrade/manager/mocks/NodeUpgradeStateProvider.go similarity index 100% rename from pkg/upgrade/mocks/NodeUpgradeStateProvider.go rename to pkg/upgrade/manager/mocks/NodeUpgradeStateProvider.go diff --git a/pkg/upgrade/mocks/PodManager.go b/pkg/upgrade/manager/mocks/PodManager.go similarity index 86% rename from 
pkg/upgrade/mocks/PodManager.go rename to pkg/upgrade/manager/mocks/PodManager.go index d333a8f4..bcfe0f5b 100644 --- a/pkg/upgrade/mocks/PodManager.go +++ b/pkg/upgrade/manager/mocks/PodManager.go @@ -22,7 +22,7 @@ import ( mock "github.com/stretchr/testify/mock" - upgrade "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" + common "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" v1 "k8s.io/api/apps/v1" ) @@ -75,15 +75,15 @@ func (_m *PodManager) GetPodControllerRevisionHash(ctx context.Context, pod *cor } // GetPodDeletionFilter provides a mock function with given fields: -func (_m *PodManager) GetPodDeletionFilter() upgrade.PodDeletionFilter { +func (_m *PodManager) GetPodDeletionFilter() common.PodDeletionFilter { ret := _m.Called() - var r0 upgrade.PodDeletionFilter - if rf, ok := ret.Get(0).(func() upgrade.PodDeletionFilter); ok { + var r0 common.PodDeletionFilter + if rf, ok := ret.Get(0).(func() common.PodDeletionFilter); ok { r0 = rf() } else { if ret.Get(0) != nil { - r0 = ret.Get(0).(upgrade.PodDeletionFilter) + r0 = ret.Get(0).(common.PodDeletionFilter) } } @@ -91,11 +91,11 @@ func (_m *PodManager) GetPodDeletionFilter() upgrade.PodDeletionFilter { } // ScheduleCheckOnPodCompletion provides a mock function with given fields: ctx, config -func (_m *PodManager) ScheduleCheckOnPodCompletion(ctx context.Context, config *upgrade.PodManagerConfig) error { +func (_m *PodManager) ScheduleCheckOnPodCompletion(ctx context.Context, config *common.PodManagerConfig) error { ret := _m.Called(ctx, config) var r0 error - if rf, ok := ret.Get(0).(func(context.Context, *upgrade.PodManagerConfig) error); ok { + if rf, ok := ret.Get(0).(func(context.Context, *common.PodManagerConfig) error); ok { r0 = rf(ctx, config) } else { r0 = ret.Error(0) @@ -105,11 +105,11 @@ func (_m *PodManager) ScheduleCheckOnPodCompletion(ctx context.Context, config * } // SchedulePodEviction provides a mock function with given fields: ctx, config -func (_m *PodManager) SchedulePodEviction(ctx context.Context, config *upgrade.PodManagerConfig) error { +func (_m *PodManager) SchedulePodEviction(ctx context.Context, config *common.PodManagerConfig) error { ret := _m.Called(ctx, config) var r0 error - if rf, ok := ret.Get(0).(func(context.Context, *upgrade.PodManagerConfig) error); ok { + if rf, ok := ret.Get(0).(func(context.Context, *common.PodManagerConfig) error); ok { r0 = rf(ctx, config) } else { r0 = ret.Error(0) diff --git a/pkg/upgrade/mocks/ValidationManager.go b/pkg/upgrade/manager/mocks/ValidationManager.go similarity index 100% rename from pkg/upgrade/mocks/ValidationManager.go rename to pkg/upgrade/manager/mocks/ValidationManager.go diff --git a/pkg/upgrade/manager/upgrade_state.go b/pkg/upgrade/manager/upgrade_state.go new file mode 100644 index 00000000..095b795f --- /dev/null +++ b/pkg/upgrade/manager/upgrade_state.go @@ -0,0 +1,235 @@ +/* +Copyright 2022 NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package upgrade + +import ( + "context" + "fmt" + + "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" + "github.com/NVIDIA/k8s-operator-libs/pkg/consts" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/inbox" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/requestor" + "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" +) + +// ExtendedUpgradeStateManager interface decouples the ApplyState implementation from the common package, +// since it references the inbox and requestor (maintenance OP) packages +type ExtendedUpgradeStateManager interface { + // ApplyState receives a complete cluster upgrade state and, based on upgrade policy, processes each node's state. + // Based on the current state of the node, it is calculated if the node can be moved to the next state right now + // or whether any actions need to be scheduled for the node to move to the next state. + // The function is stateless and idempotent. If the error was returned before all nodes' states were processed, + // ApplyState would be called again and complete the processing - all the decisions are based on the input data. + ApplyState(ctx context.Context, + currentState *common.ClusterUpgradeState, upgradePolicy *v1alpha1.DriverUpgradePolicySpec) (err error) +} + +// ProcessNodeStateManager interface is used for abstracting both upgrade modes: inbox, requestor (e.g. maintenance OP) +// Similar node states are used in both modes, while changes are introduced within ApplyState Process methods to +// support both modes' logic +type ProcessNodeStateManager interface { + ProcessUpgradeRequiredNodes(ctx context.Context, currentClusterState *common.ClusterUpgradeState, upgradesAvailable int) error +} + +// ClusterUpgradeStateManager is an interface for performing cluster upgrades of driver containers + +//nolint:interfacebloat +type ClusterUpgradeStateManager interface { + ExtendedUpgradeStateManager + common.CommonUpgradeStateManager +} + +// ClusterUpgradeStateManagerImpl serves as a state machine for the ClusterUpgradeState +// It processes each node and based on its state schedules the required jobs to change their state to the next one +type ClusterUpgradeStateManagerImpl struct { + *common.CommonUpgradeManagerImpl + inbox ProcessNodeStateManager + requestor ProcessNodeStateManager +} + +// NewRequestorUpgradeManagerImpl creates a new instance of RequestorUpgradeManagerImpl +func NewRequestorUpgradeManagerImpl( + common *common.CommonUpgradeManagerImpl) (ProcessNodeStateManager, error) { + + manager := &requestor.RequestorUpgradeManagerImpl{ + CommonUpgradeManagerImpl: common, + } + return manager, nil +} + +// NewInboxUpgradeManagerImpl creates a new instance of InboxUpgradeManagerImpl +func NewInboxUpgradeManagerImpl( + common *common.CommonUpgradeManagerImpl) (ProcessNodeStateManager, error) { + + manager := &inbox.InboxUpgradeManagerImpl{ + CommonUpgradeManagerImpl: common, + } + return manager, nil +} + +// NewClusterUpgradeStateManager creates a new instance of ClusterUpgradeStateManagerImpl +func NewClusterUpgradeStateManager( + log logr.Logger, + k8sConfig *rest.Config, + eventRecorder record.EventRecorder) (ClusterUpgradeStateManager, error) { + + common, _ := common.NewCommonUpgradeStateManager(log, k8sConfig, eventRecorder) + request, _ := NewRequestorUpgradeManagerImpl(common) + inbox, _ := NewInboxUpgradeManagerImpl(common) + + manager := 
&ClusterUpgradeStateManagerImpl{ + CommonUpgradeManagerImpl: common, + requestor: request, + inbox: inbox, + } + + return manager, nil +} + +// ApplyState receives a complete cluster upgrade state and, based on upgrade policy, processes each node's state. +// Based on the current state of the node, it is calculated if the node can be moved to the next state right now +// or whether any actions need to be scheduled for the node to move to the next state. +// The function is stateless and idempotent. If the error was returned before all nodes' states were processed, +// ApplyState would be called again and complete the processing - all the decisions are based on the input data. +// +//nolint:funlen +func (m *ClusterUpgradeStateManagerImpl) ApplyState(ctx context.Context, + currentState *common.ClusterUpgradeState, upgradePolicy *v1alpha1.DriverUpgradePolicySpec) (err error) { + m.Log.V(consts.LogLevelInfo).Info("State Manager, got state update") + + if currentState == nil { + return fmt.Errorf("currentState should not be empty") + } + + if upgradePolicy == nil || !upgradePolicy.AutoUpgrade { + m.Log.V(consts.LogLevelInfo).Info("Driver auto upgrade is disabled, skipping") + return nil + } + + m.Log.V(consts.LogLevelInfo).Info("Node states:", + "Unknown", len(currentState.NodeStates[common.UpgradeStateUnknown]), + common.UpgradeStateDone, len(currentState.NodeStates[common.UpgradeStateDone]), + common.UpgradeStateUpgradeRequired, len(currentState.NodeStates[common.UpgradeStateUpgradeRequired]), + common.UpgradeStateCordonRequired, len(currentState.NodeStates[common.UpgradeStateCordonRequired]), + common.UpgradeStateWaitForJobsRequired, len(currentState.NodeStates[common.UpgradeStateWaitForJobsRequired]), + common.UpgradeStatePodDeletionRequired, len(currentState.NodeStates[common.UpgradeStatePodDeletionRequired]), + common.UpgradeStateFailed, len(currentState.NodeStates[common.UpgradeStateFailed]), + common.UpgradeStateDrainRequired, len(currentState.NodeStates[common.UpgradeStateDrainRequired]), + common.UpgradeStatePodRestartRequired, len(currentState.NodeStates[common.UpgradeStatePodRestartRequired]), + common.UpgradeStateValidationRequired, len(currentState.NodeStates[common.UpgradeStateValidationRequired]), + common.UpgradeStateUncordonRequired, len(currentState.NodeStates[common.UpgradeStateUncordonRequired])) + + totalNodes := m.GetTotalManagedNodes(ctx, currentState) + upgradesInProgress := m.GetUpgradesInProgress(ctx, currentState) + currentUnavailableNodes := m.GetCurrentUnavailableNodes(ctx, currentState) + maxUnavailable := totalNodes + + if upgradePolicy.MaxUnavailable != nil { + maxUnavailable, err = intstr.GetScaledValueFromIntOrPercent(upgradePolicy.MaxUnavailable, totalNodes, true) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to compute maxUnavailable from the current total nodes") + return err + } + } + + upgradesAvailable := m.GetUpgradesAvailable(ctx, currentState, upgradePolicy.MaxParallelUpgrades, maxUnavailable) + + m.Log.V(consts.LogLevelInfo).Info("Upgrades in progress", + "currently in progress", upgradesInProgress, + "max parallel upgrades", upgradePolicy.MaxParallelUpgrades, + "upgrade slots available", upgradesAvailable, + "currently unavailable nodes", currentUnavailableNodes, + "total number of nodes", totalNodes, + "maximum nodes that can be unavailable", maxUnavailable) + + // Determine the object to log this event + // m.EventRecorder.Eventf(m.Namespace, v1.EventTypeNormal, GetEventReason(), + // "InProgress: %d, MaxParallelUpgrades: 
%d, UpgradeSlotsAvailable: %s", upgradesInProgress, + // upgradePolicy.MaxParallelUpgrades, upgradesAvailable) + + // First, check if unknown or ready nodes need to be upgraded + err = m.ProcessDoneOrUnknownNodes(ctx, currentState, common.UpgradeStateUnknown) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to process nodes", "state", common.UpgradeStateUnknown) + return err + } + err = m.ProcessDoneOrUnknownNodes(ctx, currentState, common.UpgradeStateDone) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to process nodes", "state", common.UpgradeStateDone) + return err + } + // Start upgrade process for upgradesAvailable number of nodes + err = m.inbox.ProcessUpgradeRequiredNodes(ctx, currentState, upgradesAvailable) + if err != nil { + m.Log.V(consts.LogLevelError).Error( + err, "Failed to process nodes", "state", common.UpgradeStateUpgradeRequired) + return err + } + + err = m.ProcessCordonRequiredNodes(ctx, currentState) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to cordon nodes") + return err + } + + err = m.ProcessWaitForJobsRequiredNodes(ctx, currentState, upgradePolicy.WaitForCompletion) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed while waiting for required jobs to complete") + return err + } + + drainEnabled := upgradePolicy.DrainSpec != nil && upgradePolicy.DrainSpec.Enable + err = m.ProcessPodDeletionRequiredNodes(ctx, currentState, upgradePolicy.PodDeletion, drainEnabled) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to delete pods") + return err + } + + // Schedule nodes for drain + err = m.ProcessDrainNodes(ctx, currentState, upgradePolicy.DrainSpec) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to schedule nodes drain") + return err + } + err = m.ProcessPodRestartNodes(ctx, currentState) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to schedule pods restart") + return err + } + err = m.ProcessUpgradeFailedNodes(ctx, currentState) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to process nodes in 'upgrade-failed' state") + return err + } + err = m.ProcessValidationRequiredNodes(ctx, currentState) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to validate driver upgrade") + return err + } + err = m.ProcessUncordonRequiredNodes(ctx, currentState) + if err != nil { + m.Log.V(consts.LogLevelError).Error(err, "Failed to uncordon nodes") + return err + } + m.Log.V(consts.LogLevelInfo).Info("State Manager, finished processing") + return nil +} diff --git a/pkg/upgrade/upgrade_state_test.go b/pkg/upgrade/manager/upgrade_state_test.go similarity index 59% rename from pkg/upgrade/upgrade_state_test.go rename to pkg/upgrade/manager/upgrade_state_test.go index b73a1e70..b28976ae 100644 --- a/pkg/upgrade/upgrade_state_test.go +++ b/pkg/upgrade/manager/upgrade_state_test.go @@ -31,8 +31,9 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" - "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/mocks" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" + upgrade "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/manager" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/manager/mocks" ) var _ = Describe("UpgradeStateManager tests", func() { @@ -47,6 +48,7 @@ var _ = Describe("UpgradeStateManager tests", func() { var err error stateManagerInterface, err := 
upgrade.NewClusterUpgradeStateManager(log, k8sConfig, eventRecorder) Expect(err).NotTo(HaveOccurred()) + stateManager, _ = stateManagerInterface.(*upgrade.ClusterUpgradeStateManagerImpl) stateManager.NodeUpgradeStateProvider = &nodeUpgradeStateProvider stateManager.DrainManager = &drainManager @@ -134,72 +136,72 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, nil, &v1alpha1.DriverUpgradePolicySpec{})).ToNot(Succeed()) }) It("UpgradeStateManager should not fail on nil upgradePolicy", func() { - Expect(stateManager.ApplyState(ctx, &upgrade.ClusterUpgradeState{}, nil)).To(Succeed()) + Expect(stateManager.ApplyState(ctx, &common.ClusterUpgradeState{}, nil)).To(Succeed()) }) It("UpgradeStateManager should move up-to-date nodes to Done and outdated nodes to UpgradeRequired states", func() { daemonSet := &appsv1.DaemonSet{ObjectMeta: v1.ObjectMeta{}} upToDatePod := &corev1.Pod{ - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} outdatedPod := &corev1.Pod{ - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-outadated"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-outadated"}}} UnknownToDoneNode := nodeWithUpgradeState("") UnknownToUpgradeRequiredNode := nodeWithUpgradeState("") - DoneToDoneNode := nodeWithUpgradeState(upgrade.UpgradeStateDone) - DoneToUpgradeRequiredNode := nodeWithUpgradeState(upgrade.UpgradeStateDone) + DoneToDoneNode := nodeWithUpgradeState(common.UpgradeStateDone) + DoneToUpgradeRequiredNode := nodeWithUpgradeState(common.UpgradeStateDone) - clusterState := upgrade.NewClusterUpgradeState() - unknownNodes := []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + unknownNodes := []*common.NodeUpgradeState{ {Node: UnknownToDoneNode, DriverPod: upToDatePod, DriverDaemonSet: daemonSet}, {Node: UnknownToUpgradeRequiredNode, DriverPod: outdatedPod, DriverDaemonSet: daemonSet}, } - doneNodes := []*upgrade.NodeUpgradeState{ + doneNodes := []*common.NodeUpgradeState{ {Node: DoneToDoneNode, DriverPod: upToDatePod, DriverDaemonSet: daemonSet}, {Node: DoneToUpgradeRequiredNode, DriverPod: outdatedPod, DriverDaemonSet: daemonSet}, } clusterState.NodeStates[""] = unknownNodes - clusterState.NodeStates[upgrade.UpgradeStateDone] = doneNodes + clusterState.NodeStates[common.UpgradeStateDone] = doneNodes Expect(stateManager.ApplyState(ctx, &clusterState, &v1alpha1.DriverUpgradePolicySpec{AutoUpgrade: true})).To(Succeed()) - Expect(getNodeUpgradeState(UnknownToDoneNode)).To(Equal(upgrade.UpgradeStateDone)) - Expect(getNodeUpgradeState(UnknownToUpgradeRequiredNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) - Expect(getNodeUpgradeState(DoneToDoneNode)).To(Equal(upgrade.UpgradeStateDone)) - Expect(getNodeUpgradeState(DoneToUpgradeRequiredNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(UnknownToDoneNode)).To(Equal(common.UpgradeStateDone)) + Expect(getNodeUpgradeState(UnknownToUpgradeRequiredNode)).To(Equal(common.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(DoneToDoneNode)).To(Equal(common.UpgradeStateDone)) + Expect(getNodeUpgradeState(DoneToUpgradeRequiredNode)).To(Equal(common.UpgradeStateUpgradeRequired)) }) It("UpgradeStateManager should move outdated nodes to 
UpgradeRequired state and annotate node if unschedulable", func() { ctx := context.TODO() daemonSet := &appsv1.DaemonSet{ObjectMeta: v1.ObjectMeta{}} upToDatePod := &corev1.Pod{ - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} outdatedPod := &corev1.Pod{ - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} UnknownToDoneNode := NewNode(fmt.Sprintf("node1-%s", id)).Create() UnknownToUpgradeRequiredNode := NewNode(fmt.Sprintf("node2-%s", id)).Unschedulable(true).Create() - DoneToDoneNode := NewNode(fmt.Sprintf("node3-%s", id)).WithUpgradeState(upgrade.UpgradeStateDone).Create() - DoneToUpgradeRequiredNode := NewNode(fmt.Sprintf("node4-%s", id)).WithUpgradeState(upgrade.UpgradeStateDone).Unschedulable(true).Create() + DoneToDoneNode := NewNode(fmt.Sprintf("node3-%s", id)).WithUpgradeState(common.UpgradeStateDone).Create() + DoneToUpgradeRequiredNode := NewNode(fmt.Sprintf("node4-%s", id)).WithUpgradeState(common.UpgradeStateDone).Unschedulable(true).Create() - clusterState := upgrade.NewClusterUpgradeState() - unknownNodes := []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + unknownNodes := []*common.NodeUpgradeState{ {Node: UnknownToDoneNode, DriverPod: upToDatePod, DriverDaemonSet: daemonSet}, {Node: UnknownToUpgradeRequiredNode, DriverPod: outdatedPod, DriverDaemonSet: daemonSet}, } - doneNodes := []*upgrade.NodeUpgradeState{ + doneNodes := []*common.NodeUpgradeState{ {Node: DoneToDoneNode, DriverPod: upToDatePod, DriverDaemonSet: daemonSet}, {Node: DoneToUpgradeRequiredNode, DriverPod: outdatedPod, DriverDaemonSet: daemonSet}, } clusterState.NodeStates[""] = unknownNodes - clusterState.NodeStates[upgrade.UpgradeStateDone] = doneNodes + clusterState.NodeStates[common.UpgradeStateDone] = doneNodes - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, &v1alpha1.DriverUpgradePolicySpec{AutoUpgrade: true})).To(Succeed()) - Expect(getNodeUpgradeState(UnknownToDoneNode)).To(Equal(upgrade.UpgradeStateDone)) - Expect(getNodeUpgradeState(UnknownToUpgradeRequiredNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) - Expect(getNodeUpgradeState(DoneToDoneNode)).To(Equal(upgrade.UpgradeStateDone)) - Expect(getNodeUpgradeState(DoneToUpgradeRequiredNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(UnknownToDoneNode)).To(Equal(common.UpgradeStateDone)) + Expect(getNodeUpgradeState(UnknownToUpgradeRequiredNode)).To(Equal(common.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(DoneToDoneNode)).To(Equal(common.UpgradeStateDone)) + Expect(getNodeUpgradeState(DoneToUpgradeRequiredNode)).To(Equal(common.UpgradeStateUpgradeRequired)) Expect(isUnschedulableAnnotationPresent(UnknownToUpgradeRequiredNode)). 
To(Equal(true)) @@ -215,36 +217,36 @@ var _ = Describe("UpgradeStateManager tests", func() { "to UpgradeRequired state", func() { ctx := context.TODO() - safeLoadAnnotationKey := upgrade.GetUpgradeDriverWaitForSafeLoadAnnotationKey() + safeLoadAnnotationKey := common.GetUpgradeDriverWaitForSafeLoadAnnotationKey() daemonSet := &appsv1.DaemonSet{ObjectMeta: v1.ObjectMeta{}} upToDatePod := &corev1.Pod{ - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} waitForSafeLoadNode := NewNode(fmt.Sprintf("node1-%s", id)). WithAnnotations(map[string]string{safeLoadAnnotationKey: "true"}). Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateDone] = []*upgrade.NodeUpgradeState{{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateDone] = []*common.NodeUpgradeState{{ Node: waitForSafeLoadNode, DriverPod: upToDatePod, DriverDaemonSet: daemonSet, }} - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, &v1alpha1.DriverUpgradePolicySpec{AutoUpgrade: true})).To(Succeed()) - Expect(getNodeUpgradeState(waitForSafeLoadNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(waitForSafeLoadNode)).To(Equal(common.UpgradeStateUpgradeRequired)) }) It("UpgradeStateManager should schedule upgrade on all nodes if maxParallel upgrades is set to 0", func() { - clusterState := upgrade.NewClusterUpgradeState() - nodeStates := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, + clusterState := common.NewClusterUpgradeState() + nodeStates := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = nodeStates + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = nodeStates policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -255,26 +257,26 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) stateCount := make(map[string]int) for i := range nodeStates { - state := getNodeUpgradeState(clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired][i].Node) + state := getNodeUpgradeState(clusterState.NodeStates[common.UpgradeStateUpgradeRequired][i].Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(0)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired]).To(Equal(len(nodeStates))) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(0)) + 
Expect(stateCount[common.UpgradeStateCordonRequired]).To(Equal(len(nodeStates))) }) It("UpgradeStateManager should start upgrade on limited amount of nodes "+ "if maxParallel upgrades is less than node count", func() { const maxParallelUpgrades = 3 - clusterState := upgrade.NewClusterUpgradeState() - nodeStates := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, + clusterState := common.NewClusterUpgradeState() + nodeStates := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = nodeStates + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = nodeStates policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -287,24 +289,24 @@ var _ = Describe("UpgradeStateManager tests", func() { state := getNodeUpgradeState(nodeStates[i].Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(2)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired]).To(Equal(maxParallelUpgrades)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(2)) + Expect(stateCount[common.UpgradeStateCordonRequired]).To(Equal(maxParallelUpgrades)) }) It("UpgradeStateManager should start additional upgrades if maxParallelUpgrades limit is not reached", func() { const maxParallelUpgrades = 4 - upgradeRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, + upgradeRequiredNodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, } - cordonRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, + cordonRequiredNodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, } - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = upgradeRequiredNodes - clusterState.NodeStates[upgrade.UpgradeStateCordonRequired] = cordonRequiredNodes + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = upgradeRequiredNodes + clusterState.NodeStates[common.UpgradeStateCordonRequired] = cordonRequiredNodes policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -320,21 +322,21 @@ var _ = Describe("UpgradeStateManager tests", func() { state := getNodeUpgradeState(state.Node) stateCount[state]++ } - 
Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(1)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired] + - stateCount[upgrade.UpgradeStateWaitForJobsRequired]).To(Equal(4)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(1)) + Expect(stateCount[common.UpgradeStateCordonRequired] + + stateCount[common.UpgradeStateWaitForJobsRequired]).To(Equal(4)) }) It("UpgradeStateManager should schedule upgrade all nodes if maxParallel upgrades is set to 0 and maxUnavailable is set to 100%", func() { - clusterState := upgrade.NewClusterUpgradeState() - nodeStates := []*upgrade.NodeUpgradeState{ - {Node: NewNode("node1").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node2").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node3").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node4").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, - {Node: NewNode("node5").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, + clusterState := common.NewClusterUpgradeState() + nodeStates := []*common.NodeUpgradeState{ + {Node: NewNode("node1").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node2").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node3").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node4").WithUpgradeState(common.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, + {Node: NewNode("node5").WithUpgradeState(common.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = nodeStates + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = nodeStates policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -347,23 +349,23 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) stateCount := make(map[string]int) for i := range nodeStates { - state := getNodeUpgradeState(clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired][i].Node) + state := getNodeUpgradeState(clusterState.NodeStates[common.UpgradeStateUpgradeRequired][i].Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(0)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired]).To(Equal(len(nodeStates))) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(0)) + Expect(stateCount[common.UpgradeStateCordonRequired]).To(Equal(len(nodeStates))) }) It("UpgradeStateManager should schedule upgrade based on maxUnavailable constraint if maxParallel upgrades is set to 0 and maxUnavailable is set to 50%", func() { - clusterState := upgrade.NewClusterUpgradeState() - nodeStates := []*upgrade.NodeUpgradeState{ - {Node: NewNode("node1").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node2").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node3").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node4").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, - {Node: NewNode("node5").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, + clusterState := common.NewClusterUpgradeState() + nodeStates := []*common.NodeUpgradeState{ + {Node: NewNode("node1").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: 
NewNode("node2").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node3").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node4").WithUpgradeState(common.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, + {Node: NewNode("node5").WithUpgradeState(common.UpgradeStateUpgradeRequired).Unschedulable(true).Node}, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = nodeStates + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = nodeStates policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -375,38 +377,38 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) stateCount := make(map[string]int) for i := range nodeStates { - state := getNodeUpgradeState(clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired][i].Node) + state := getNodeUpgradeState(clusterState.NodeStates[common.UpgradeStateUpgradeRequired][i].Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(2)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired]).To(Equal(3)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(2)) + Expect(stateCount[common.UpgradeStateCordonRequired]).To(Equal(3)) }) It("UpgradeStateManager should schedule upgrade based on 50% maxUnavailable, with some unavailable nodes already upgraded", func() { - clusterState := upgrade.NewClusterUpgradeState() + clusterState := common.NewClusterUpgradeState() daemonSet := &appsv1.DaemonSet{ObjectMeta: v1.ObjectMeta{}} upToDatePod := &corev1.Pod{ Status: corev1.PodStatus{Phase: "Running"}, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} - upgradeRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: NewNode("node1").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node2").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, - {Node: NewNode("node3").WithUpgradeState(upgrade.UpgradeStateUpgradeRequired).Node}, + upgradeRequiredNodes := []*common.NodeUpgradeState{ + {Node: NewNode("node1").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node2").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, + {Node: NewNode("node3").WithUpgradeState(common.UpgradeStateUpgradeRequired).Node}, } - upgradeDoneNodes := []*upgrade.NodeUpgradeState{ + upgradeDoneNodes := []*common.NodeUpgradeState{ { - Node: NewNode("node4").WithUpgradeState(upgrade.UpgradeStateDone).Unschedulable(true).Node, + Node: NewNode("node4").WithUpgradeState(common.UpgradeStateDone).Unschedulable(true).Node, DriverPod: upToDatePod, DriverDaemonSet: daemonSet, }, { - Node: NewNode("node5").WithUpgradeState(upgrade.UpgradeStateDone).Unschedulable(true).Node, + Node: NewNode("node5").WithUpgradeState(common.UpgradeStateDone).Unschedulable(true).Node, DriverPod: upToDatePod, DriverDaemonSet: daemonSet, }, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = upgradeRequiredNodes - clusterState.NodeStates[upgrade.UpgradeStateDone] = upgradeDoneNodes + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = upgradeRequiredNodes + clusterState.NodeStates[common.UpgradeStateDone] = upgradeDoneNodes policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -426,7 +428,7 @@ var _ = 
Describe("UpgradeStateManager tests", func() { On("GetPodControllerRevisionHash", mock.Anything, mock.Anything). Return( func(ctx context.Context, pod *corev1.Pod) string { - return pod.Labels[upgrade.PodControllerRevisionHashLabelKey] + return pod.Labels[common.PodControllerRevisionHashLabelKey] }, func(ctx context.Context, pod *corev1.Pod) error { return nil @@ -440,34 +442,34 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) stateCount := make(map[string]int) for i := range upgradeRequiredNodes { - state := getNodeUpgradeState(clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired][i].Node) + state := getNodeUpgradeState(clusterState.NodeStates[common.UpgradeStateUpgradeRequired][i].Node) stateCount[state]++ } for i := range upgradeDoneNodes { - state := getNodeUpgradeState(clusterState.NodeStates[upgrade.UpgradeStateDone][i].Node) + state := getNodeUpgradeState(clusterState.NodeStates[common.UpgradeStateDone][i].Node) stateCount[state]++ } // check if already upgraded node states are not changed - Expect(stateCount[upgrade.UpgradeStateDone]).To(Equal(2)) + Expect(stateCount[common.UpgradeStateDone]).To(Equal(2)) // expect only single node to move to next state as upgradesUnavailble = maxUnavailable(3) - currentlyUnavailable(2) - Expect(stateCount[upgrade.UpgradeStateCordonRequired]).To(Equal(1)) + Expect(stateCount[common.UpgradeStateCordonRequired]).To(Equal(1)) // remaining nodes to be in same original state - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(2)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(2)) }) It("UpgradeStateManager should start upgrade on limited amount of nodes "+ "if maxParallel upgrades and maxUnavailable are less than node count", func() { const maxParallelUpgrades = 3 - clusterState := upgrade.NewClusterUpgradeState() - nodeStates := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, + clusterState := common.NewClusterUpgradeState() + nodeStates := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = nodeStates + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = nodeStates policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -481,25 +483,25 @@ var _ = Describe("UpgradeStateManager tests", func() { state := getNodeUpgradeState(nodeStates[i].Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(3)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(3)) // only maxUnavailable nodes should progress to next state - Expect(stateCount[upgrade.UpgradeStateCordonRequired]).To(Equal(2)) + Expect(stateCount[common.UpgradeStateCordonRequired]).To(Equal(2)) }) It("UpgradeStateManager should start additional upgrades if maxParallelUpgrades and maxUnavailable limits are not 
reached", func() { const maxParallelUpgrades = 4 - upgradeRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, + upgradeRequiredNodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, } - cordonRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, + cordonRequiredNodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, } - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = upgradeRequiredNodes - clusterState.NodeStates[upgrade.UpgradeStateCordonRequired] = cordonRequiredNodes + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = upgradeRequiredNodes + clusterState.NodeStates[common.UpgradeStateCordonRequired] = cordonRequiredNodes policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -516,25 +518,25 @@ var _ = Describe("UpgradeStateManager tests", func() { state := getNodeUpgradeState(state.Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(1)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired] + - stateCount[upgrade.UpgradeStateWaitForJobsRequired]).To(Equal(4)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(1)) + Expect(stateCount[common.UpgradeStateCordonRequired] + + stateCount[common.UpgradeStateWaitForJobsRequired]).To(Equal(4)) }) It("UpgradeStateManager should start additional upgrades if maxParallelUpgrades and maxUnavailable limits are not reached", func() { const maxParallelUpgrades = 4 - upgradeRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired)}, + upgradeRequiredNodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateUpgradeRequired)}, } - cordonRequiredNodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateCordonRequired)}, + cordonRequiredNodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateCordonRequired)}, } - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = upgradeRequiredNodes - clusterState.NodeStates[upgrade.UpgradeStateCordonRequired] = cordonRequiredNodes + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = upgradeRequiredNodes + clusterState.NodeStates[common.UpgradeStateCordonRequired] = cordonRequiredNodes policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ 
-551,18 +553,18 @@ var _ = Describe("UpgradeStateManager tests", func() { state := getNodeUpgradeState(state.Node) stateCount[state]++ } - Expect(stateCount[upgrade.UpgradeStateUpgradeRequired]).To(Equal(1)) - Expect(stateCount[upgrade.UpgradeStateCordonRequired] + - stateCount[upgrade.UpgradeStateWaitForJobsRequired]).To(Equal(4)) + Expect(stateCount[common.UpgradeStateUpgradeRequired]).To(Equal(1)) + Expect(stateCount[common.UpgradeStateCordonRequired] + + stateCount[common.UpgradeStateWaitForJobsRequired]).To(Equal(4)) }) It("UpgradeStateManager should skip pod deletion if no filter is provided to PodManager at contruction", func() { ctx := context.TODO() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateWaitForJobsRequired] = []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateWaitForJobsRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateWaitForJobsRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateWaitForJobsRequired)}, + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateWaitForJobsRequired] = []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateWaitForJobsRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateWaitForJobsRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateWaitForJobsRequired)}, } policyWithNoDrainSpec := &v1alpha1.DriverUpgradePolicySpec{ @@ -570,18 +572,18 @@ var _ = Describe("UpgradeStateManager tests", func() { } Expect(stateManager.ApplyState(ctx, &clusterState, policyWithNoDrainSpec)).To(Succeed()) - for _, state := range clusterState.NodeStates[upgrade.UpgradeStateWaitForJobsRequired] { - Expect(getNodeUpgradeState(state.Node)).To(Equal(upgrade.UpgradeStateDrainRequired)) + for _, state := range clusterState.NodeStates[common.UpgradeStateWaitForJobsRequired] { + Expect(getNodeUpgradeState(state.Node)).To(Equal(common.UpgradeStateDrainRequired)) } }) It("UpgradeStateManager should not skip pod deletion if a filter is provided to PodManager at contruction", func() { ctx := context.TODO() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateWaitForJobsRequired] = []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateWaitForJobsRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateWaitForJobsRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateWaitForJobsRequired)}, + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateWaitForJobsRequired] = []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateWaitForJobsRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateWaitForJobsRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateWaitForJobsRequired)}, } policyWithNoDrainSpec := &v1alpha1.DriverUpgradePolicySpec{ @@ -589,26 +591,26 @@ var _ = Describe("UpgradeStateManager tests", func() { } filter := func(pod corev1.Pod) bool { return false } - stateManager = stateManager.WithPodDeletionEnabled(filter).(*upgrade.ClusterUpgradeStateManagerImpl) - Expect(stateManager.IsPodDeletionEnabled()).To(Equal(true)) + commonStateManager := stateManager.CommonUpgradeManagerImpl.WithPodDeletionEnabled(filter) + Expect(commonStateManager.IsPodDeletionEnabled()).To(Equal(true)) Expect(stateManager.ApplyState(ctx, &clusterState, policyWithNoDrainSpec)).To(Succeed()) - for _, state := range clusterState.NodeStates[upgrade.UpgradeStateWaitForJobsRequired] { 
- Expect(getNodeUpgradeState(state.Node)).To(Equal(upgrade.UpgradeStatePodDeletionRequired)) + for _, state := range clusterState.NodeStates[common.UpgradeStateWaitForJobsRequired] { + Expect(getNodeUpgradeState(state.Node)).To(Equal(common.UpgradeStatePodDeletionRequired)) } }) It("UpgradeStateManager should not attempt to delete pods if pod deletion is disabled", func() { ctx := context.TODO() - clusterState := upgrade.NewClusterUpgradeState() - nodes := []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStatePodDeletionRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStatePodDeletionRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStatePodDeletionRequired)}, + clusterState := common.NewClusterUpgradeState() + nodes := []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStatePodDeletionRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStatePodDeletionRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStatePodDeletionRequired)}, } - clusterState.NodeStates[upgrade.UpgradeStatePodDeletionRequired] = nodes + clusterState.NodeStates[common.UpgradeStatePodDeletionRequired] = nodes policyWithNoPodDeletionSpec := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, @@ -619,7 +621,7 @@ var _ = Describe("UpgradeStateManager tests", func() { podManagerMock := mocks.PodManager{} podManagerMock. On("SchedulePodEviction", mock.Anything, mock.Anything). - Return(func(ctx context.Context, config *upgrade.PodManagerConfig) error { + Return(func(ctx context.Context, config *common.PodManagerConfig) error { podEvictionCalled = true return nil }). @@ -633,15 +635,15 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, &clusterState, policyWithNoPodDeletionSpec)).To(Succeed()) for _, state := range nodes { - Expect(getNodeUpgradeState(state.Node)).To(Equal(upgrade.UpgradeStateDrainRequired)) + Expect(getNodeUpgradeState(state.Node)).To(Equal(common.UpgradeStateDrainRequired)) } }) It("UpgradeStateManager should skip drain if it's disabled by policy", func() { - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateDrainRequired] = []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateDrainRequired] = []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, } policyWithNoDrainSpec := &v1alpha1.DriverUpgradePolicySpec{ @@ -656,26 +658,26 @@ var _ = Describe("UpgradeStateManager tests", func() { } Expect(stateManager.ApplyState(ctx, &clusterState, policyWithNoDrainSpec)).To(Succeed()) - for _, state := range clusterState.NodeStates[upgrade.UpgradeStateDrainRequired] { - Expect(getNodeUpgradeState(state.Node)).To(Equal(upgrade.UpgradeStatePodRestartRequired)) + for _, state := range clusterState.NodeStates[common.UpgradeStateDrainRequired] { + Expect(getNodeUpgradeState(state.Node)).To(Equal(common.UpgradeStatePodRestartRequired)) } - clusterState.NodeStates[upgrade.UpgradeStateDrainRequired] = []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: 
nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, + clusterState.NodeStates[common.UpgradeStateDrainRequired] = []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, } Expect(stateManager.ApplyState(ctx, &clusterState, policyWithDisabledDrain)).To(Succeed()) - for _, state := range clusterState.NodeStates[upgrade.UpgradeStateDrainRequired] { - Expect(getNodeUpgradeState(state.Node)).To(Equal(upgrade.UpgradeStatePodRestartRequired)) + for _, state := range clusterState.NodeStates[common.UpgradeStateDrainRequired] { + Expect(getNodeUpgradeState(state.Node)).To(Equal(common.UpgradeStatePodRestartRequired)) } }) It("UpgradeStateManager should schedule drain for UpgradeStateDrainRequired nodes and pass drain config", func() { - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateDrainRequired] = []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateDrainRequired] = []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, } policy := v1alpha1.DriverUpgradePolicySpec{ @@ -691,7 +693,7 @@ var _ = Describe("UpgradeStateManager tests", func() { drainManagerMock := mocks.DrainManager{} drainManagerMock. On("ScheduleNodesDrain", mock.Anything, mock.Anything). - Return(func(ctx context.Context, config *upgrade.DrainConfiguration) error { + Return(func(ctx context.Context, config *common.DrainConfiguration) error { Expect(config.Spec).To(Equal(&expectedDrainSpec)) Expect(config.Nodes).To(HaveLen(3)) return nil @@ -705,11 +707,11 @@ var _ = Describe("UpgradeStateManager tests", func() { Expect(stateManager.ApplyState(ctx, &clusterState, &policy)).To(Succeed()) }) It("UpgradeStateManager should fail if drain manager returns an error", func() { - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateDrainRequired] = []*upgrade.NodeUpgradeState{ - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, - {Node: nodeWithUpgradeState(upgrade.UpgradeStateDrainRequired)}, + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateDrainRequired] = []*common.NodeUpgradeState{ + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, + {Node: nodeWithUpgradeState(common.UpgradeStateDrainRequired)}, } policy := &v1alpha1.DriverUpgradePolicySpec{ @@ -722,7 +724,7 @@ var _ = Describe("UpgradeStateManager tests", func() { drainManagerMock := mocks.DrainManager{} drainManagerMock. On("ScheduleNodesDrain", mock.Anything, mock.Anything). 
- Return(func(ctx context.Context, config *upgrade.DrainConfiguration) error { + Return(func(ctx context.Context, config *common.DrainConfiguration) error { return errors.New("drain failed") }) stateManager.DrainManager = &drainManagerMock @@ -733,29 +735,29 @@ var _ = Describe("UpgradeStateManager tests", func() { daemonSet := &appsv1.DaemonSet{ObjectMeta: v1.ObjectMeta{}} upToDatePod := &corev1.Pod{ Status: corev1.PodStatus{Phase: "Running"}, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} outdatedRunningPod := &corev1.Pod{ Status: corev1.PodStatus{Phase: "Running"}, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} outdatedTerminatingPod := &corev1.Pod{ - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} now := v1.Now() outdatedTerminatingPod.ObjectMeta.DeletionTimestamp = &now - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ { - Node: nodeWithUpgradeState(upgrade.UpgradeStatePodRestartRequired), + Node: nodeWithUpgradeState(common.UpgradeStatePodRestartRequired), DriverPod: upToDatePod, DriverDaemonSet: daemonSet, }, { - Node: nodeWithUpgradeState(upgrade.UpgradeStatePodRestartRequired), + Node: nodeWithUpgradeState(common.UpgradeStatePodRestartRequired), DriverPod: outdatedRunningPod, DriverDaemonSet: daemonSet, }, { - Node: nodeWithUpgradeState(upgrade.UpgradeStatePodRestartRequired), + Node: nodeWithUpgradeState(common.UpgradeStatePodRestartRequired), DriverPod: outdatedTerminatingPod, DriverDaemonSet: daemonSet, }, @@ -777,7 +779,7 @@ var _ = Describe("UpgradeStateManager tests", func() { On("GetPodControllerRevisionHash", mock.Anything, mock.Anything). Return( func(ctx context.Context, pod *corev1.Pod) string { - return pod.Labels[upgrade.PodControllerRevisionHashLabelKey] + return pod.Labels[common.PodControllerRevisionHashLabelKey] }, func(ctx context.Context, pod *corev1.Pod) error { return nil @@ -792,20 +794,20 @@ var _ = Describe("UpgradeStateManager tests", func() { }) It("UpgradeStateManager should unblock loading of the driver instead of restarting the Pod when node "+ "is waiting for safe driver loading", func() { - safeLoadAnnotation := upgrade.GetUpgradeDriverWaitForSafeLoadAnnotationKey() + safeLoadAnnotation := common.GetUpgradeDriverWaitForSafeLoadAnnotationKey() daemonSet := &appsv1.DaemonSet{ObjectMeta: v1.ObjectMeta{}} upToDatePod := &corev1.Pod{ Status: corev1.PodStatus{Phase: "Running"}, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} waitForSafeLoadNode := NewNode(fmt.Sprintf("node1-%s", id)). - WithUpgradeState(upgrade.UpgradeStatePodRestartRequired). 
+ WithUpgradeState(common.UpgradeStatePodRestartRequired). WithAnnotations(map[string]string{safeLoadAnnotation: "true"}). Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ { Node: waitForSafeLoadNode, DriverPod: upToDatePod, @@ -816,7 +818,7 @@ var _ = Describe("UpgradeStateManager tests", func() { policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, } - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) @@ -832,19 +834,19 @@ var _ = Describe("UpgradeStateManager tests", func() { Phase: "Running", ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, }, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} - podRestartNode := NewNode("pod-restart-node").WithUpgradeState(upgrade.UpgradeStatePodRestartRequired).Create() - upgradeFailedNode := NewNode("upgrade-failed-node").WithUpgradeState(upgrade.UpgradeStateFailed).Create() + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + podRestartNode := NewNode("pod-restart-node").WithUpgradeState(common.UpgradeStatePodRestartRequired).Create() + upgradeFailedNode := NewNode("upgrade-failed-node").WithUpgradeState(common.UpgradeStateFailed).Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ { Node: podRestartNode, DriverPod: pod, DriverDaemonSet: daemonSet, }, } - clusterState.NodeStates[upgrade.UpgradeStateFailed] = []*upgrade.NodeUpgradeState{ + clusterState.NodeStates[common.UpgradeStateFailed] = []*common.NodeUpgradeState{ { Node: upgradeFailedNode, DriverPod: pod, @@ -857,8 +859,8 @@ var _ = Describe("UpgradeStateManager tests", func() { } Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(podRestartNode)).To(Equal(upgrade.UpgradeStateUncordonRequired)) - Expect(getNodeUpgradeState(upgradeFailedNode)).To(Equal(upgrade.UpgradeStateUncordonRequired)) + Expect(getNodeUpgradeState(podRestartNode)).To(Equal(common.UpgradeStateUncordonRequired)) + Expect(getNodeUpgradeState(upgradeFailedNode)).To(Equal(common.UpgradeStateUncordonRequired)) }) It("UpgradeStateManager should move pod to UpgradeDone state "+ "if it's in PodRestart or UpgradeFailed, driver pod is up-to-date and ready, and node was initially Unschedulable", func() { @@ -870,27 +872,27 @@ var _ = Describe("UpgradeStateManager tests", func() { Phase: "Running", ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, }, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}} podRestartNode := NewNode("pod-restart-node-unschedulable"). - WithUpgradeState(upgrade.UpgradeStatePodRestartRequired). 
- WithAnnotations(map[string]string{upgrade.GetUpgradeInitialStateAnnotationKey(): "true"}). + WithUpgradeState(common.UpgradeStatePodRestartRequired). + WithAnnotations(map[string]string{common.GetUpgradeInitialStateAnnotationKey(): "true"}). Unschedulable(true). Create() upgradeFailedNode := NewNode("upgrade-failed-node-unschedulable"). - WithUpgradeState(upgrade.UpgradeStateFailed). - WithAnnotations(map[string]string{upgrade.GetUpgradeInitialStateAnnotationKey(): "true"}). + WithUpgradeState(common.UpgradeStateFailed). + WithAnnotations(map[string]string{common.GetUpgradeInitialStateAnnotationKey(): "true"}). Unschedulable(true). Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ { Node: podRestartNode, DriverPod: pod, DriverDaemonSet: daemonSet, }, } - clusterState.NodeStates[upgrade.UpgradeStateFailed] = []*upgrade.NodeUpgradeState{ + clusterState.NodeStates[common.UpgradeStateFailed] = []*common.NodeUpgradeState{ { Node: upgradeFailedNode, DriverPod: pod, @@ -902,12 +904,12 @@ var _ = Describe("UpgradeStateManager tests", func() { AutoUpgrade: true, } - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(podRestartNode)).To(Equal(upgrade.UpgradeStateDone)) - Expect(getNodeUpgradeState(upgradeFailedNode)).To(Equal(upgrade.UpgradeStateDone)) + Expect(getNodeUpgradeState(podRestartNode)).To(Equal(common.UpgradeStateDone)) + Expect(getNodeUpgradeState(upgradeFailedNode)).To(Equal(common.UpgradeStateDone)) // unschedulable annotation should be removed Expect(isUnschedulableAnnotationPresent(podRestartNode)).To(Equal(false)) Expect(isUnschedulableAnnotationPresent(upgradeFailedNode)).To(Equal(false)) @@ -921,7 +923,7 @@ var _ = Describe("UpgradeStateManager tests", func() { Phase: "Running", ContainerStatuses: []corev1.ContainerStatus{{Ready: false, RestartCount: 0}}, }, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}, + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}, } // pod2, initCtr finished, mainCtr not Ready w/ no repeated restarts pod2 := pod1.DeepCopy() @@ -936,12 +938,12 @@ var _ = Describe("UpgradeStateManager tests", func() { nodes := make([]*corev1.Node, 4) for i := 0; i < len(nodes); i++ { nodes[i] = NewNode(fmt.Sprintf("node%d-%s", i, id)). - WithUpgradeState(upgrade.UpgradeStatePodRestartRequired). + WithUpgradeState(common.UpgradeStatePodRestartRequired). 
Create() } - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ {Node: nodes[0], DriverPod: pod1, DriverDaemonSet: daemonSet}, {Node: nodes[1], DriverPod: pod2, DriverDaemonSet: daemonSet}, {Node: nodes[2], DriverPod: pod3, DriverDaemonSet: daemonSet}, @@ -953,10 +955,10 @@ var _ = Describe("UpgradeStateManager tests", func() { } Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(nodes[0])).To(Equal(upgrade.UpgradeStatePodRestartRequired)) - Expect(getNodeUpgradeState(nodes[1])).To(Equal(upgrade.UpgradeStatePodRestartRequired)) - Expect(getNodeUpgradeState(nodes[2])).To(Equal(upgrade.UpgradeStateFailed)) - Expect(getNodeUpgradeState(nodes[3])).To(Equal(upgrade.UpgradeStateFailed)) + Expect(getNodeUpgradeState(nodes[0])).To(Equal(common.UpgradeStatePodRestartRequired)) + Expect(getNodeUpgradeState(nodes[1])).To(Equal(common.UpgradeStatePodRestartRequired)) + Expect(getNodeUpgradeState(nodes[2])).To(Equal(common.UpgradeStateFailed)) + Expect(getNodeUpgradeState(nodes[3])).To(Equal(common.UpgradeStateFailed)) }) It("UpgradeStateManager should move pod to UpgradeValidationRequired state "+ "if it's in PodRestart, driver pod is up-to-date and ready, and validation is enabled", func() { @@ -968,14 +970,14 @@ var _ = Describe("UpgradeStateManager tests", func() { Phase: "Running", ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, }, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-12345"}}, + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-12345"}}, } podRestartNode := NewNode(fmt.Sprintf("node1-%s", id)). - WithUpgradeState(upgrade.UpgradeStatePodRestartRequired). + WithUpgradeState(common.UpgradeStatePodRestartRequired). 
Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ { Node: podRestartNode, DriverPod: pod, @@ -985,22 +987,21 @@ var _ = Describe("UpgradeStateManager tests", func() { policy := &v1alpha1.DriverUpgradePolicySpec{ AutoUpgrade: true, } - - stateManager = stateManager.WithValidationEnabled("app=validator").(*upgrade.ClusterUpgradeStateManagerImpl) - Expect(stateManager.IsValidationEnabled()).To(Equal(true)) + commonStateManager := stateManager.CommonUpgradeManagerImpl.WithValidationEnabled("app=validator") + Expect(commonStateManager.IsValidationEnabled()).To(Equal(true)) // do not mock NodeUpgradeStateProvider as it is used during ProcessUpgradeValidationRequiredNodes() - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(podRestartNode)).To(Equal(upgrade.UpgradeStateValidationRequired)) + Expect(getNodeUpgradeState(podRestartNode)).To(Equal(common.UpgradeStateValidationRequired)) }) It("UpgradeStateManager should move pod to UpgradeUncordonRequired state "+ "if it's in ValidationRequired and validation has completed", func() { ctx := context.TODO() node := NewNode(fmt.Sprintf("node1-%s", id)). - WithUpgradeState(upgrade.UpgradeStateValidationRequired). + WithUpgradeState(common.UpgradeStateValidationRequired). Create() namespace := createNamespace(fmt.Sprintf("namespace-%s", id)) @@ -1008,8 +1009,8 @@ var _ = Describe("UpgradeStateManager tests", func() { WithLabels(map[string]string{"app": "validator"}). Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateValidationRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateValidationRequired] = []*common.NodeUpgradeState{ { Node: node, DriverPod: &corev1.Pod{}, @@ -1021,22 +1022,22 @@ var _ = Describe("UpgradeStateManager tests", func() { AutoUpgrade: true, } - stateManager = stateManager.WithValidationEnabled("app=validator").(*upgrade.ClusterUpgradeStateManagerImpl) - Expect(stateManager.IsValidationEnabled()).To(Equal(true)) + commonStateManager := stateManager.CommonUpgradeManagerImpl.WithValidationEnabled("app=validator") + Expect(commonStateManager.IsValidationEnabled()).To(Equal(true)) // do not mock NodeUpgradeStateProvider as it is used during ProcessUpgradeValidationRequiredNodes() - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(node)).To(Equal(upgrade.UpgradeStateUncordonRequired)) + Expect(getNodeUpgradeState(node)).To(Equal(common.UpgradeStateUncordonRequired)) }) It("UpgradeStateManager should move pod to UpgradeDone state"+ "if it's in ValidationRequired, validation has completed, and node was initially Unschedulable", func() { ctx := context.TODO() node := NewNode(fmt.Sprintf("node1-%s", id)). 
- WithUpgradeState(upgrade.UpgradeStateValidationRequired). - WithAnnotations(map[string]string{upgrade.GetUpgradeInitialStateAnnotationKey(): "true"}). + WithUpgradeState(common.UpgradeStateValidationRequired). + WithAnnotations(map[string]string{common.GetUpgradeInitialStateAnnotationKey(): "true"}). Unschedulable(true). Create() @@ -1045,8 +1046,8 @@ var _ = Describe("UpgradeStateManager tests", func() { WithLabels(map[string]string{"app": "validator"}). Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateValidationRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateValidationRequired] = []*common.NodeUpgradeState{ { Node: node, DriverPod: &corev1.Pod{}, @@ -1058,22 +1059,22 @@ var _ = Describe("UpgradeStateManager tests", func() { AutoUpgrade: true, } - stateManager = stateManager.WithValidationEnabled("app=validator").(*upgrade.ClusterUpgradeStateManagerImpl) - Expect(stateManager.IsValidationEnabled()).To(Equal(true)) + commonStateManager := stateManager.CommonUpgradeManagerImpl.WithValidationEnabled("app=validator") + Expect(commonStateManager.IsValidationEnabled()).To(Equal(true)) // do not mock NodeUpgradeStateProvider as it is used during ProcessUpgradeValidationRequiredNodes() - provider := upgrade.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) + provider := common.NewNodeUpgradeStateProvider(k8sClient, log, eventRecorder) stateManager.NodeUpgradeStateProvider = provider Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(node)).To(Equal(upgrade.UpgradeStateDone)) + Expect(getNodeUpgradeState(node)).To(Equal(common.UpgradeStateDone)) // unschedulable annotation should be removed Expect(isUnschedulableAnnotationPresent(node)).To(Equal(false)) }) It("UpgradeStateManager should uncordon UncordonRequired pod and finish upgrade", func() { - node := nodeWithUpgradeState(upgrade.UpgradeStateUncordonRequired) + node := nodeWithUpgradeState(common.UpgradeStateUncordonRequired) - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateUncordonRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateUncordonRequired] = []*common.NodeUpgradeState{ { Node: node, }, @@ -1093,13 +1094,13 @@ var _ = Describe("UpgradeStateManager tests", func() { stateManager.CordonManager = &cordonManagerMock Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(node)).To(Equal(upgrade.UpgradeStateDone)) + Expect(getNodeUpgradeState(node)).To(Equal(common.UpgradeStateDone)) }) It("UpgradeStateManager should fail if cordonManager fails", func() { - node := nodeWithUpgradeState(upgrade.UpgradeStateUncordonRequired) + node := nodeWithUpgradeState(common.UpgradeStateUncordonRequired) - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateUncordonRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateUncordonRequired] = []*common.NodeUpgradeState{ { Node: node, }, @@ -1118,76 +1119,76 @@ var _ = Describe("UpgradeStateManager tests", func() { stateManager.CordonManager = &cordonManagerMock Expect(stateManager.ApplyState(ctx, &clusterState, policy)).ToNot(Succeed()) - 
Expect(getNodeUpgradeState(node)).ToNot(Equal(upgrade.UpgradeStateDone)) + Expect(getNodeUpgradeState(node)).ToNot(Equal(common.UpgradeStateDone)) }) }) It("UpgradeStateManager should not move outdated node to UpgradeRequired states with orphaned pod", func() { orphanedPod := &corev1.Pod{} UnknownToUpgradeDoneNode := nodeWithUpgradeState("") - DoneToUpgradeDoneNode := nodeWithUpgradeState(upgrade.UpgradeStateDone) + DoneToUpgradeDoneNode := nodeWithUpgradeState(common.UpgradeStateDone) - clusterState := upgrade.NewClusterUpgradeState() - unknownNodes := []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + unknownNodes := []*common.NodeUpgradeState{ {Node: UnknownToUpgradeDoneNode, DriverPod: orphanedPod, DriverDaemonSet: nil}, } - doneNodes := []*upgrade.NodeUpgradeState{ + doneNodes := []*common.NodeUpgradeState{ {Node: DoneToUpgradeDoneNode, DriverPod: orphanedPod, DriverDaemonSet: nil}, } clusterState.NodeStates[""] = unknownNodes - clusterState.NodeStates[upgrade.UpgradeStateDone] = doneNodes + clusterState.NodeStates[common.UpgradeStateDone] = doneNodes Expect(stateManager.ApplyState(ctx, &clusterState, &v1alpha1.DriverUpgradePolicySpec{AutoUpgrade: true})).To(Succeed()) - Expect(getNodeUpgradeState(UnknownToUpgradeDoneNode)).To(Equal(upgrade.UpgradeStateDone)) - Expect(getNodeUpgradeState(DoneToUpgradeDoneNode)).To(Equal(upgrade.UpgradeStateDone)) + Expect(getNodeUpgradeState(UnknownToUpgradeDoneNode)).To(Equal(common.UpgradeStateDone)) + Expect(getNodeUpgradeState(DoneToUpgradeDoneNode)).To(Equal(common.UpgradeStateDone)) }) It("UpgradeStateManager should move outdated node to UpgradeRequired states with orphaned pod if upgrade-requested", func() { orphanedPod := &corev1.Pod{} UnknownToUpgradeRequiredNode := nodeWithUpgradeState("") - UnknownToUpgradeRequiredNode.Annotations[upgrade.GetUpgradeRequestedAnnotationKey()] = "true" - DoneToUpgradeRequiredNode := nodeWithUpgradeState(upgrade.UpgradeStateDone) - DoneToUpgradeRequiredNode.Annotations[upgrade.GetUpgradeRequestedAnnotationKey()] = "true" + UnknownToUpgradeRequiredNode.Annotations[common.GetUpgradeRequestedAnnotationKey()] = "true" + DoneToUpgradeRequiredNode := nodeWithUpgradeState(common.UpgradeStateDone) + DoneToUpgradeRequiredNode.Annotations[common.GetUpgradeRequestedAnnotationKey()] = "true" - clusterState := upgrade.NewClusterUpgradeState() - unknownNodes := []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + unknownNodes := []*common.NodeUpgradeState{ {Node: UnknownToUpgradeRequiredNode, DriverPod: orphanedPod, DriverDaemonSet: nil}, } - doneNodes := []*upgrade.NodeUpgradeState{ + doneNodes := []*common.NodeUpgradeState{ {Node: DoneToUpgradeRequiredNode, DriverPod: orphanedPod, DriverDaemonSet: nil}, } clusterState.NodeStates[""] = unknownNodes - clusterState.NodeStates[upgrade.UpgradeStateDone] = doneNodes + clusterState.NodeStates[common.UpgradeStateDone] = doneNodes Expect(stateManager.ApplyState(ctx, &clusterState, &v1alpha1.DriverUpgradePolicySpec{AutoUpgrade: true})).To(Succeed()) - Expect(getNodeUpgradeState(UnknownToUpgradeRequiredNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) - Expect(getNodeUpgradeState(DoneToUpgradeRequiredNode)).To(Equal(upgrade.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(UnknownToUpgradeRequiredNode)).To(Equal(common.UpgradeStateUpgradeRequired)) + Expect(getNodeUpgradeState(DoneToUpgradeRequiredNode)).To(Equal(common.UpgradeStateUpgradeRequired)) }) It("UpgradeStateManager should move upgrade required node to 
CordonRequired states with orphaned pod and remove upgrade-requested annotation", func() { orphanedPod := &corev1.Pod{} - UpgradeRequiredToCordonNodes := nodeWithUpgradeState(upgrade.UpgradeStateUpgradeRequired) - UpgradeRequiredToCordonNodes.Annotations[upgrade.GetUpgradeRequestedAnnotationKey()] = "true" + UpgradeRequiredToCordonNodes := nodeWithUpgradeState(common.UpgradeStateUpgradeRequired) + UpgradeRequiredToCordonNodes.Annotations[common.GetUpgradeRequestedAnnotationKey()] = "true" - clusterState := upgrade.NewClusterUpgradeState() - upgradeRequiredNodes := []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + upgradeRequiredNodes := []*common.NodeUpgradeState{ {Node: UpgradeRequiredToCordonNodes, DriverPod: orphanedPod, DriverDaemonSet: nil}, } - clusterState.NodeStates[upgrade.UpgradeStateUpgradeRequired] = upgradeRequiredNodes + clusterState.NodeStates[common.UpgradeStateUpgradeRequired] = upgradeRequiredNodes Expect(stateManager.ApplyState(ctx, &clusterState, &v1alpha1.DriverUpgradePolicySpec{AutoUpgrade: true})).To(Succeed()) - Expect(getNodeUpgradeState(UpgradeRequiredToCordonNodes)).To(Equal(upgrade.UpgradeStateCordonRequired)) - Expect(UpgradeRequiredToCordonNodes.Annotations[upgrade.GetUpgradeRequestedAnnotationKey()]).To(Equal("")) + Expect(getNodeUpgradeState(UpgradeRequiredToCordonNodes)).To(Equal(common.UpgradeStateCordonRequired)) + Expect(UpgradeRequiredToCordonNodes.Annotations[common.GetUpgradeRequestedAnnotationKey()]).To(Equal("")) }) It("UpgradeStateManager should restart pod if it is Orphaned", func() { orphanedPod := &corev1.Pod{ Status: corev1.PodStatus{Phase: "Running"}, - ObjectMeta: v1.ObjectMeta{Labels: map[string]string{upgrade.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} + ObjectMeta: v1.ObjectMeta{Labels: map[string]string{common.PodControllerRevisionHashLabelKey: "test-hash-outdated"}}} - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStatePodRestartRequired] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStatePodRestartRequired] = []*common.NodeUpgradeState{ { - Node: nodeWithUpgradeState(upgrade.UpgradeStatePodRestartRequired), + Node: nodeWithUpgradeState(common.UpgradeStatePodRestartRequired), DriverPod: orphanedPod, DriverDaemonSet: nil, }, @@ -1218,10 +1219,10 @@ var _ = Describe("UpgradeStateManager tests", func() { ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, }, } - upgradeFailedNode := NewNode("upgrade-failed-node").WithUpgradeState(upgrade.UpgradeStateFailed).Create() + upgradeFailedNode := NewNode("upgrade-failed-node").WithUpgradeState(common.UpgradeStateFailed).Create() - clusterState := upgrade.NewClusterUpgradeState() - clusterState.NodeStates[upgrade.UpgradeStateFailed] = []*upgrade.NodeUpgradeState{ + clusterState := common.NewClusterUpgradeState() + clusterState.NodeStates[common.UpgradeStateFailed] = []*common.NodeUpgradeState{ { Node: upgradeFailedNode, DriverPod: pod, @@ -1234,7 +1235,7 @@ var _ = Describe("UpgradeStateManager tests", func() { } Expect(stateManager.ApplyState(ctx, &clusterState, policy)).To(Succeed()) - Expect(getNodeUpgradeState(upgradeFailedNode)).To(Equal(upgrade.UpgradeStateFailed)) + Expect(getNodeUpgradeState(upgradeFailedNode)).To(Equal(common.UpgradeStateFailed)) }) }) @@ -1242,7 +1243,7 @@ var _ = Describe("UpgradeStateManager tests", func() { func nodeWithUpgradeState(state string) *corev1.Node { return &corev1.Node{ ObjectMeta: 
v1.ObjectMeta{ - Labels: map[string]string{upgrade.GetUpgradeStateLabelKey(): state}, + Labels: map[string]string{common.GetUpgradeStateLabelKey(): state}, Annotations: map[string]string{}, }, } diff --git a/pkg/upgrade/manager/upgrade_suit_test.go b/pkg/upgrade/manager/upgrade_suit_test.go new file mode 100644 index 00000000..52140535 --- /dev/null +++ b/pkg/upgrade/manager/upgrade_suit_test.go @@ -0,0 +1,419 @@ +/* +Copyright 2022 NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package upgrade_test + +import ( + "context" + "math/rand" + "testing" + + "github.com/go-logr/logr" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/stretchr/testify/mock" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/manager/mocks" + // +kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var k8sConfig *rest.Config +var k8sClient client.Client +var k8sInterface kubernetes.Interface +var testEnv *envtest.Environment +var log logr.Logger +var nodeUpgradeStateProvider mocks.NodeUpgradeStateProvider +var drainManager mocks.DrainManager +var podManager mocks.PodManager +var cordonManager mocks.CordonManager +var validationManager mocks.ValidationManager +var eventRecorder = record.NewFakeRecorder(100) + +var createdObjects []client.Object + +func TestAPIs(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecs(t, "Controller Suite") +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{} + + var err error + k8sConfig, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(k8sConfig).NotTo(BeNil()) + + // +kubebuilder:scaffold:scheme + + k8sClient, err = client.New(k8sConfig, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + k8sInterface, err = kubernetes.NewForConfig(k8sConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sInterface).NotTo(BeNil()) + + log = ctrl.Log.WithName("upgradeSuitTest") + + // set driver name to be managed by the upgrade-manager + common.SetDriverName("gpu") + + nodeUpgradeStateProvider = mocks.NodeUpgradeStateProvider{} + nodeUpgradeStateProvider. + On("ChangeNodeUpgradeState", mock.Anything, mock.Anything, mock.Anything). 
+ Return(func(ctx context.Context, node *corev1.Node, newNodeState string) error { + node.Labels[common.GetUpgradeStateLabelKey()] = newNodeState + return nil + }) + nodeUpgradeStateProvider. + On("ChangeNodeUpgradeAnnotation", mock.Anything, mock.Anything, mock.Anything, mock.Anything). + Return(func(ctx context.Context, node *corev1.Node, key string, value string) error { + if value == "null" { + delete(node.Annotations, key) + } else { + node.Annotations[key] = value + } + return nil + }) + nodeUpgradeStateProvider. + On("GetNode", mock.Anything, mock.Anything). + Return( + func(ctx context.Context, nodeName string) *corev1.Node { + return getNode(nodeName) + }, + func(ctx context.Context, nodeName string) error { + return nil + }, + ) + + drainManager = mocks.DrainManager{} + drainManager. + On("ScheduleNodesDrain", mock.Anything, mock.Anything). + Return(nil) + podManager = mocks.PodManager{} + podManager. + On("SchedulePodsRestart", mock.Anything, mock.Anything). + Return(nil) + podManager. + On("ScheduleCheckOnPodCompletion", mock.Anything, mock.Anything). + Return(nil) + podManager. + On("SchedulePodEviction", mock.Anything, mock.Anything). + Return(nil) + podManager. + On("GetPodDeletionFilter"). + Return(nil) + podManager. + On("GetPodControllerRevisionHash", mock.Anything, mock.Anything). + Return( + func(ctx context.Context, pod *corev1.Pod) string { + return pod.Labels[common.PodControllerRevisionHashLabelKey] + }, + func(ctx context.Context, pod *corev1.Pod) error { + return nil + }, + ) + podManager. + On("GetDaemonsetControllerRevisionHash", mock.Anything, mock.Anything, mock.Anything). + Return("test-hash-12345", nil) + cordonManager = mocks.CordonManager{} + cordonManager. + On("Cordon", mock.Anything, mock.Anything, mock.Anything). + Return(nil) + cordonManager. + On("Uncordon", mock.Anything, mock.Anything, mock.Anything). + Return(nil) + validationManager = mocks.ValidationManager{} + validationManager. + On("Validate", mock.Anything, mock.Anything). 
+ Return(true, nil) +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = BeforeEach(func() { + createdObjects = nil +}) + +var _ = AfterEach(func() { + for i := range createdObjects { + r := createdObjects[i] + key := client.ObjectKeyFromObject(r) + err := k8sClient.Get(context.TODO(), key, r) + if err == nil { + Expect(k8sClient.Delete(context.TODO(), r)).To(Succeed()) + } + // drain events from FakeRecorder + for len(eventRecorder.Events) > 0 { + <-eventRecorder.Events + } + _, isNamespace := r.(*corev1.Namespace) + if !isNamespace { + Eventually(func() error { + return k8sClient.Get(context.TODO(), key, r) + }).Should(HaveOccurred()) + } + } +}) + +type Node struct { + *corev1.Node +} + +func NewNode(name string) Node { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{"dummy-key": "dummy-value"}, + Annotations: map[string]string{"dummy-key": "dummy-value"}, + }, + } + Expect(node.Labels).NotTo(BeNil()) + return Node{node} +} + +func (n Node) WithUpgradeState(state string) Node { + if n.Labels == nil { + n.Labels = make(map[string]string) + } + n.Labels[common.GetUpgradeStateLabelKey()] = state + return n +} + +func (n Node) WithLabels(l map[string]string) Node { + n.Labels = l + return n +} + +func (n Node) WithAnnotations(a map[string]string) Node { + n.Annotations = a + return n +} + +func (n Node) Unschedulable(b bool) Node { + n.Spec.Unschedulable = b + return n +} + +func (n Node) Create() *corev1.Node { + node := n.Node + err := k8sClient.Create(context.TODO(), node) + Expect(err).NotTo(HaveOccurred()) + createdObjects = append(createdObjects, node) + return node +} + +type DaemonSet struct { + *appsv1.DaemonSet + + desiredNumberScheduled int32 +} + +func NewDaemonSet(name, namespace string, selector map[string]string) DaemonSet { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: selector}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: selector, + }, + Spec: corev1.PodSpec{ + // fill in some required fields in the pod spec + Containers: []corev1.Container{ + {Name: "foo", Image: "foo"}, + }, + }, + }, + }, + } + return DaemonSet{ds, 0} +} + +func (d DaemonSet) WithLabels(labels map[string]string) DaemonSet { + d.ObjectMeta.Labels = labels + return d +} + +func (d DaemonSet) WithDesiredNumberScheduled(num int32) DaemonSet { + d.desiredNumberScheduled = num + return d +} + +func (d DaemonSet) Create() *appsv1.DaemonSet { + ds := d.DaemonSet + err := k8sClient.Create(context.TODO(), ds) + Expect(err).NotTo(HaveOccurred()) + + // update the DaemonSet status with the desired number of scheduled pods + ds.Status.DesiredNumberScheduled = d.desiredNumberScheduled + err = k8sClient.Status().Update(context.TODO(), ds) + Expect(err).NotTo(HaveOccurred()) + createdObjects = append(createdObjects, ds) + return ds +} + +type Pod struct { + *corev1.Pod +} + +func NewPod(name, namespace, nodeName string) Pod { + gracePeriodSeconds := int64(0) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: corev1.PodSpec{ + TerminationGracePeriodSeconds: &gracePeriodSeconds, + NodeName: nodeName, + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "test-image", + }, + }, + }, + } + + return Pod{pod} +} + +func (p Pod) WithLabels(labels 
map[string]string) Pod { + p.ObjectMeta.Labels = labels + return p +} + +func (p Pod) WithEmptyDir() Pod { + p.Spec.Volumes = []corev1.Volume{ + { + Name: "volume", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + } + return p +} + +func (p Pod) WithResource(name, quantity string) Pod { + resourceQuantity, err := resource.ParseQuantity(quantity) + Expect(err).NotTo(HaveOccurred()) + p.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceName(name): resourceQuantity, + }, + } + return p +} + +func (p Pod) WithOwnerReference(ownerRef metav1.OwnerReference) Pod { + p.OwnerReferences = append(p.OwnerReferences, ownerRef) + return p +} + +func (p Pod) Create() *corev1.Pod { + pod := p.Pod + err := k8sClient.Create(context.TODO(), pod) + Expect(err).NotTo(HaveOccurred()) + + // set Pod in Running state and mark Container as Ready + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{Ready: true}} + err = k8sClient.Status().Update(context.TODO(), pod) + Expect(err).NotTo(HaveOccurred()) + createdObjects = append(createdObjects, pod) + return pod +} + +func createNamespace(name string) *corev1.Namespace { + namespace := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + err := k8sClient.Create(context.TODO(), namespace) + Expect(err).NotTo(HaveOccurred()) + createdObjects = append(createdObjects, namespace) + return namespace +} + +func updatePodStatus(pod *corev1.Pod) error { + err := k8sClient.Status().Update(context.TODO(), pod) + Expect(err).NotTo(HaveOccurred()) + return err +} + +func createNode(name string) *corev1.Node { + node := &corev1.Node{} + node.Name = name + err := k8sClient.Create(context.TODO(), node) + Expect(err).NotTo(HaveOccurred()) + createdObjects = append(createdObjects, node) + return node +} + +func getNode(name string) *corev1.Node { + node := &corev1.Node{} + err := k8sClient.Get(context.TODO(), types.NamespacedName{Name: name}, node) + Expect(err).NotTo(HaveOccurred()) + Expect(node).NotTo(BeNil()) + return node +} + +func getNodeUpgradeState(node *corev1.Node) string { + return node.Labels[common.GetUpgradeStateLabelKey()] +} + +func isUnschedulableAnnotationPresent(node *corev1.Node) bool { + _, ok := node.Annotations[common.GetUpgradeInitialStateAnnotationKey()] + return ok +} + +func randSeq(n int) string { + letters := []rune("abcdefghijklmnopqrstuvwxyz") + b := make([]rune, n) + for i := range b { + b[i] = letters[rand.Intn(len(letters))] + } + return string(b) +} diff --git a/pkg/upgrade/requestor/upgrade_requestor.go b/pkg/upgrade/requestor/upgrade_requestor.go new file mode 100644 index 00000000..983581fc --- /dev/null +++ b/pkg/upgrade/requestor/upgrade_requestor.go @@ -0,0 +1,77 @@ +/* +Copyright 2022 NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package requestor + +import ( + "context" + + "github.com/NVIDIA/k8s-operator-libs/pkg/consts" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common" +) + +// RequestorUpgradeManagerImpl contains the concrete implementation for the requestor (e.g. maintenance OP) upgrade mode +type RequestorUpgradeManagerImpl struct { + *common.CommonUpgradeManagerImpl +} + +// ProcessUpgradeRequiredNodes processes UpgradeStateUpgradeRequired nodes and moves them to UpgradeStateCordonRequired +// until the limit on max parallel upgrades is reached. +func (m *RequestorUpgradeManagerImpl) ProcessUpgradeRequiredNodes( + ctx context.Context, currentClusterState *common.ClusterUpgradeState, upgradesAvailable int) error { + m.Log.V(consts.LogLevelInfo).Info("ProcessUpgradeRequiredNodes") + for _, nodeState := range currentClusterState.NodeStates[common.UpgradeStateUpgradeRequired] { + if m.IsUpgradeRequested(nodeState.Node) { + // Make sure to remove the upgrade-requested annotation + err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeAnnotation(ctx, nodeState.Node, + common.GetUpgradeRequestedAnnotationKey(), "null") + if err != nil { + m.Log.V(consts.LogLevelError).Error( + err, "Failed to delete node upgrade-requested annotation") + return err + } + } + if m.SkipNodeUpgrade(nodeState.Node) { + m.Log.V(consts.LogLevelInfo).Info("Node is marked for skipping upgrades", "node", nodeState.Node.Name) + continue + } + + if upgradesAvailable <= 0 { + // when no new node upgrades are available, progress with manually cordoned nodes + if m.IsNodeUnschedulable(nodeState.Node) { + m.Log.V(consts.LogLevelDebug).Info("Node is already cordoned, progressing for driver upgrade", + "node", nodeState.Node.Name) + } else { + m.Log.V(consts.LogLevelDebug).Info("Node upgrade limit reached, pausing further upgrades", + "node", nodeState.Node.Name) + continue + } + } + + err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeState(ctx, nodeState.Node, common.UpgradeStateCordonRequired) + if err == nil { + upgradesAvailable-- + m.Log.V(consts.LogLevelInfo).Info("Node waiting for cordon", + "node", nodeState.Node.Name) + } else { + m.Log.V(consts.LogLevelError).Error( + err, "Failed to change node upgrade state", "state", common.UpgradeStateCordonRequired) + return err + } + } + + return nil +}
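
Note on the scheduling arithmetic exercised by the maxParallelUpgrades/maxUnavailable tests above: the number of nodes allowed to move from UpgradeRequired to CordonRequired is bounded by both limits, minus nodes that are already unavailable. Below is a minimal, self-contained Go sketch of that budget calculation; it is not the library's actual implementation, availableUpgrades and its parameters are hypothetical, and the real manager additionally lets already-cordoned nodes proceed without consuming the budget, as ProcessUpgradeRequiredNodes above shows.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/intstr"
)

// availableUpgrades (hypothetical helper) caps new upgrades by maxParallel
// (0 means "no explicit limit") and by maxUnavailable, subtracting nodes that
// are already unavailable (for example, manually cordoned nodes).
func availableUpgrades(totalNodes, inProgress, currentlyUnavailable, maxParallel int,
	maxUnavailable *intstr.IntOrString) (int, error) {
	budget := totalNodes
	if maxParallel > 0 {
		budget = maxParallel - inProgress
	}
	if maxUnavailable != nil {
		// Scaled value of an int-or-percent against the node count, rounded up.
		limit, err := intstr.GetScaledValueFromIntOrPercent(maxUnavailable, totalNodes, true)
		if err != nil {
			return 0, err
		}
		if slots := limit - currentlyUnavailable; slots < budget {
			budget = slots
		}
	}
	if budget < 0 {
		budget = 0
	}
	return budget, nil
}

func main() {
	// Mirrors the "50% maxUnavailable with two nodes already unavailable" case:
	// limit = ceil(0.5 * 5) = 3, minus 2 unavailable nodes => budget of 1.
	maxUnavailable := intstr.FromString("50%")
	budget, _ := availableUpgrades(5, 0, 2, 0, &maxUnavailable)
	fmt.Println(budget) // 1
}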
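
RequestorUpgradeManagerImpl reuses the shared behaviour by embedding *common.CommonUpgradeManagerImpl and overriding only ProcessUpgradeRequiredNodes. The following is a minimal, self-contained sketch of that embedding pattern; all names in it are illustrative and not taken from this library.

package main

import "fmt"

// Base stands in for the shared (common) manager implementation.
type Base struct{}

func (b *Base) ProcessUpgradeRequiredNodes() { fmt.Println("shared upgrade-required handling") }
func (b *Base) ProcessDrainRequiredNodes()   { fmt.Println("shared drain handling") }

// Requestor embeds *Base: it overrides one step and inherits the rest
// through Go's method promotion.
type Requestor struct {
	*Base
}

func (r *Requestor) ProcessUpgradeRequiredNodes() {
	fmt.Println("requestor-specific upgrade-required handling")
}

func main() {
	r := &Requestor{Base: &Base{}}
	r.ProcessUpgradeRequiredNodes() // overridden by Requestor
	r.ProcessDrainRequiredNodes()   // promoted from Base
}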