From 4245aa3a239691af3a852604e90b046b57257b09 Mon Sep 17 00:00:00 2001 From: Yury Kulazhenkov Date: Mon, 28 Oct 2024 17:01:15 +0200 Subject: [PATCH] feat: Update controller logic to handle stale SriovNetworkNodeState CRs with delay - Changed the logic in the sriov-network-operator controller to handle stale SriovNetworkNodeState CRs (those with no matching Nodes with daemon). - Introduced a delay (30 minutes by default) before removing stale state CRs to manage scenarios where the user temporarily removes the daemon from the node but does not want to lose the state stored in the SriovNetworkNodeState. - Added the `STALE_NODE_STATE_CLEANUP_DELAY` environment variable to configure the required delay in minutes (default is 30 minutes). --- api/v1/helper.go | 41 ++++++++++++++ .../sriovnetworknodepolicy_controller.go | 51 ++++++++++++++++-- .../sriovnetworknodepolicy_controller_test.go | 54 +++++++++++++++++++ .../templates/operator.yaml | 2 + .../sriov-network-operator-chart/values.yaml | 3 ++ pkg/consts/constants.go | 2 + 6 files changed, 148 insertions(+), 5 deletions(-) diff --git a/api/v1/helper.go b/api/v1/helper.go index bfdfbc4731..d9dd033f83 100644 --- a/api/v1/helper.go +++ b/api/v1/helper.go @@ -12,6 +12,7 @@ import ( "sort" "strconv" "strings" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -1004,3 +1005,43 @@ func GenerateBridgeName(iface *InterfaceExt) string { func NeedToUpdateBridges(bridgeSpec, bridgeStatus *Bridges) bool { return !reflect.DeepEqual(bridgeSpec, bridgeStatus) } + +// SetKeepUntilTime sets an annotation to hold the "keep until time" for the node’s state. +// The "keep until time" specifies the earliest time at which the state object can be removed +// if the daemon's pod is not found on the node. +func (s *SriovNetworkNodeState) SetKeepUntilTime(t time.Time) { + ts := t.Format(time.RFC3339) + annotations := s.GetAnnotations() + if annotations == nil { + annotations = map[string]string{} + } + annotations[consts.NodeStateKeepUntilAnnotation] = ts + s.SetAnnotations(annotations) +} + +// GetKeepUntilTime returns the value that is stored in the "keep until time" annotation. +// The "keep until time" specifies the earliest time at which the state object can be removed +// if the daemon's pod is not found on the node. +// Return zero time instant if annotaion is not found on the object or if it has a wrong format. +func (s *SriovNetworkNodeState) GetKeepUntilTime() time.Time { + t, err := time.Parse(time.RFC3339, s.GetAnnotations()[consts.NodeStateKeepUntilAnnotation]) + if err != nil { + return time.Time{} + } + return t +} + +// ResetKeepUntilTime removes "keep until time" annotation from the state object. +// The "keep until time" specifies the earliest time at which the state object can be removed +// if the daemon's pod is not found on the node. +// Returns true if the value was removed, false otherwise. +func (s *SriovNetworkNodeState) ResetKeepUntilTime() bool { + annotations := s.GetAnnotations() + _, exist := annotations[consts.NodeStateKeepUntilAnnotation] + if !exist { + return false + } + delete(annotations, consts.NodeStateKeepUntilAnnotation) + s.SetAnnotations(annotations) + return true +} diff --git a/controllers/sriovnetworknodepolicy_controller.go b/controllers/sriovnetworknodepolicy_controller.go index be46880b7c..cbb4a712ce 100644 --- a/controllers/sriovnetworknodepolicy_controller.go +++ b/controllers/sriovnetworknodepolicy_controller.go @@ -20,8 +20,10 @@ import ( "context" "encoding/json" "fmt" + "os" "reflect" "sort" + "strconv" "strings" "time" @@ -296,10 +298,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con } } if !found { - logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name) - err := r.Delete(ctx, &ns, &client.DeleteOptions{}) - if err != nil { - logger.Error(err, "Fail to Delete", "SriovNetworkNodeState CR:", ns.GetName()) + if err := r.handleStaleNodeState(ctx, &ns); err != nil { return err } } @@ -308,6 +307,41 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con return nil } +// handleStaleNodeState handles stale SriovNetworkNodeState CR (the CR which no longer have a corresponding node with the daemon). +// If the CR has the "keep until time" annotation, indicating the earliest time the state object can be removed, +// this function will compare it to the current time to determine if deletion is permissible and do deletion if allowed. +// If the annotation is absent, the function will create one with a timestamp in future, using either the default or a configured offset. +func (r *SriovNetworkNodePolicyReconciler) handleStaleNodeState(ctx context.Context, ns *sriovnetworkv1.SriovNetworkNodeState) error { + logger := log.Log.WithName("handleStaleNodeState") + now := time.Now().UTC() + keepUntilTime := ns.GetKeepUntilTime() + if keepUntilTime.IsZero() { + // keep until time annotation is not set, set a new value with default or configured offset and update the object + delayMinutes, err := strconv.Atoi(os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY")) + if err != nil || delayMinutes <= 0 { + delayMinutes = 30 // keep objects for 30 minutes by default + } + keepUntilTime = now.Add(time.Minute * time.Duration(delayMinutes)) + logger.V(2).Info("SriovNetworkNodeState has no matching node, configure cleanup delay for the state object", + "nodeStateName", ns.Name, "delay", delayMinutes, "keepUntilTime", keepUntilTime.String()) + ns.SetKeepUntilTime(keepUntilTime) + if err := r.Update(ctx, ns); err != nil { + logger.Error(err, "Fail to update SriovNetworkNodeState CR", "name", ns.GetName()) + return err + } + return nil + } + if now.Before(keepUntilTime) { + return nil + } + logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name) + if err := r.Delete(ctx, ns, &client.DeleteOptions{}); err != nil { + logger.Error(err, "Fail to delete SriovNetworkNodeState CR", "name", ns.GetName()) + return err + } + return nil +} + func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context.Context, dc *sriovnetworkv1.SriovOperatorConfig, npl *sriovnetworkv1.SriovNetworkNodePolicyList, @@ -333,9 +367,16 @@ func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context return fmt.Errorf("failed to get SriovNetworkNodeState: %v", err) } } else { + keepUntilAnnotationUpdated := found.ResetKeepUntilTime() + if len(found.Status.Interfaces) == 0 { logger.Info("SriovNetworkNodeState Status Interfaces are empty. Skip update of policies in spec", "namespace", ns.Namespace, "name", ns.Name) + if keepUntilAnnotationUpdated { + if err := r.Update(ctx, found); err != nil { + return fmt.Errorf("couldn't update SriovNetworkNodeState: %v", err) + } + } return nil } @@ -378,7 +419,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context // Note(adrianc): we check same ownerReferences since SriovNetworkNodeState // was owned by a default SriovNetworkNodePolicy. if we encounter a descripancy // we need to update. - if reflect.DeepEqual(newVersion.OwnerReferences, found.OwnerReferences) && + if !keepUntilAnnotationUpdated && reflect.DeepEqual(newVersion.OwnerReferences, found.OwnerReferences) && equality.Semantic.DeepEqual(newVersion.Spec, found.Spec) { logger.V(1).Info("SriovNetworkNodeState did not change, not updating") return nil diff --git a/controllers/sriovnetworknodepolicy_controller_test.go b/controllers/sriovnetworknodepolicy_controller_test.go index a116efe87f..756593ef4a 100644 --- a/controllers/sriovnetworknodepolicy_controller_test.go +++ b/controllers/sriovnetworknodepolicy_controller_test.go @@ -3,18 +3,25 @@ package controllers import ( "context" "encoding/json" + "os" "testing" + "time" "github.com/google/go-cmp/cmp" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "sigs.k8s.io/controller-runtime/pkg/client/fake" dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" v1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" @@ -126,3 +133,50 @@ func TestRenderDevicePluginConfigData(t *testing.T) { }) } } + +var _ = Describe("SriovNetworkNodePolicyReconciler", Ordered, func() { + Context("handleStaleNodeState", func() { + var ( + ctx context.Context + r *SriovNetworkNodePolicyReconciler + nodeState *sriovnetworkv1.SriovNetworkNodeState + ) + + BeforeEach(func() { + ctx = context.Background() + scheme := runtime.NewScheme() + utilruntime.Must(sriovnetworkv1.AddToScheme(scheme)) + nodeState = &sriovnetworkv1.SriovNetworkNodeState{ObjectMeta: metav1.ObjectMeta{Name: "node1"}} + r = &SriovNetworkNodePolicyReconciler{Client: fake.NewClientBuilder().WithObjects(nodeState).Build()} + }) + It("should set default delay", func() { + nodeState := nodeState.DeepCopy() + Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred()) + Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred()) + Expect(time.Now().UTC().Before(nodeState.GetKeepUntilTime())).To(BeTrue()) + }) + It("should remove if wait time expired", func() { + nodeState := nodeState.DeepCopy() + nodeState.SetKeepUntilTime(time.Now().UTC().Add(-time.Minute)) + Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred()) + Expect(errors.IsNotFound(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState))).To(BeTrue()) + }) + It("should keep existing wait time if already set", func() { + nodeState := nodeState.DeepCopy() + nodeState.SetKeepUntilTime(time.Now().UTC().Add(time.Minute)) + testTime := nodeState.GetKeepUntilTime() + r.Update(ctx, nodeState) + Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred()) + Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred()) + Expect(nodeState.GetKeepUntilTime()).To(Equal(testTime)) + }) + It("non default dealy", func() { + DeferCleanup(os.Setenv, "STALE_NODE_STATE_CLEANUP_DELAY", os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY")) + os.Setenv("STALE_NODE_STATE_CLEANUP_DELAY", "60") + nodeState := nodeState.DeepCopy() + Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred()) + Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred()) + Expect(time.Until(nodeState.GetKeepUntilTime()) > 30*time.Minute).To(BeTrue()) + }) + }) +}) diff --git a/deployment/sriov-network-operator-chart/templates/operator.yaml b/deployment/sriov-network-operator-chart/templates/operator.yaml index 0e89d1959e..61b27659d2 100644 --- a/deployment/sriov-network-operator-chart/templates/operator.yaml +++ b/deployment/sriov-network-operator-chart/templates/operator.yaml @@ -112,6 +112,8 @@ spec: value: {{ .Values.operator.cniBinPath }} - name: CLUSTER_TYPE value: {{ .Values.operator.clusterType }} + - name: STALE_NODE_STATE_CLEANUP_DELAY + value: "{{ .Values.operator.staleNodeStateCleanupDelay }}" {{- if .Values.operator.admissionControllers.enabled }} - name: ADMISSION_CONTROLLERS_CERTIFICATES_OPERATOR_SECRET_NAME value: {{ .Values.operator.admissionControllers.certificates.secretNames.operator }} diff --git a/deployment/sriov-network-operator-chart/values.yaml b/deployment/sriov-network-operator-chart/values.yaml index c70d6e323c..c351c7eb7c 100644 --- a/deployment/sriov-network-operator-chart/values.yaml +++ b/deployment/sriov-network-operator-chart/values.yaml @@ -27,6 +27,9 @@ operator: resourcePrefix: "openshift.io" cniBinPath: "/opt/cni/bin" clusterType: "kubernetes" + # minimal amount of time (in minutes) the operator will wait before removing + # stale SriovNetworkNodeState objects (objects that doesn't match node with the daemon) + staleNodeStateCleanupDelay: "30" metricsExporter: port: "9110" certificates: diff --git a/pkg/consts/constants.go b/pkg/consts/constants.go index f3c0761112..aea94ed6d6 100644 --- a/pkg/consts/constants.go +++ b/pkg/consts/constants.go @@ -86,6 +86,8 @@ const ( MCPPauseAnnotationState = "sriovnetwork.openshift.io/state" MCPPauseAnnotationTime = "sriovnetwork.openshift.io/time" + NodeStateKeepUntilAnnotation = "sriovnetwork.openshift.io/keep-state-until" + CheckpointFileName = "sno-initial-node-state.json" Unknown = "Unknown"