Skip to content

Commit

Permalink
feat: Update controller logic to handle stale SriovNetworkNodeState C…
Browse files Browse the repository at this point in the history
…Rs with delay

- Changed the logic in the sriov-network-operator controller to handle stale SriovNetworkNodeState CRs (those with no matching Nodes with daemon).
- Introduced a delay (30 minutes by default) before removing stale state CRs to manage scenarios where the user temporarily removes the daemon from the node but does not want to lose the state stored in the SriovNetworkNodeState.
- Added the `STALE_NODE_STATE_CLEANUP_DELAY` environment variable to configure the required delay in minutes (default is 30 minutes).
  • Loading branch information
ykulazhenkov committed Oct 28, 2024
1 parent 68b6c02 commit 4245aa3
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 5 deletions.
41 changes: 41 additions & 0 deletions api/v1/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"sort"
"strconv"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -1004,3 +1005,43 @@ func GenerateBridgeName(iface *InterfaceExt) string {
func NeedToUpdateBridges(bridgeSpec, bridgeStatus *Bridges) bool {
return !reflect.DeepEqual(bridgeSpec, bridgeStatus)
}

// SetKeepUntilTime sets an annotation to hold the "keep until time" for the node’s state.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
func (s *SriovNetworkNodeState) SetKeepUntilTime(t time.Time) {
ts := t.Format(time.RFC3339)
annotations := s.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}
annotations[consts.NodeStateKeepUntilAnnotation] = ts
s.SetAnnotations(annotations)
}

// GetKeepUntilTime returns the value that is stored in the "keep until time" annotation.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
// Return zero time instant if annotaion is not found on the object or if it has a wrong format.
func (s *SriovNetworkNodeState) GetKeepUntilTime() time.Time {
t, err := time.Parse(time.RFC3339, s.GetAnnotations()[consts.NodeStateKeepUntilAnnotation])
if err != nil {
return time.Time{}
}
return t
}

// ResetKeepUntilTime removes "keep until time" annotation from the state object.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
// Returns true if the value was removed, false otherwise.
func (s *SriovNetworkNodeState) ResetKeepUntilTime() bool {
annotations := s.GetAnnotations()
_, exist := annotations[consts.NodeStateKeepUntilAnnotation]
if !exist {
return false
}
delete(annotations, consts.NodeStateKeepUntilAnnotation)
s.SetAnnotations(annotations)
return true
}
51 changes: 46 additions & 5 deletions controllers/sriovnetworknodepolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ import (
"context"
"encoding/json"
"fmt"
"os"
"reflect"
"sort"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -296,10 +298,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
}
}
if !found {
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
err := r.Delete(ctx, &ns, &client.DeleteOptions{})
if err != nil {
logger.Error(err, "Fail to Delete", "SriovNetworkNodeState CR:", ns.GetName())
if err := r.handleStaleNodeState(ctx, &ns); err != nil {
return err
}
}
Expand All @@ -308,6 +307,41 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
return nil
}

// handleStaleNodeState handles stale SriovNetworkNodeState CR (the CR which no longer have a corresponding node with the daemon).
// If the CR has the "keep until time" annotation, indicating the earliest time the state object can be removed,
// this function will compare it to the current time to determine if deletion is permissible and do deletion if allowed.
// If the annotation is absent, the function will create one with a timestamp in future, using either the default or a configured offset.
func (r *SriovNetworkNodePolicyReconciler) handleStaleNodeState(ctx context.Context, ns *sriovnetworkv1.SriovNetworkNodeState) error {
logger := log.Log.WithName("handleStaleNodeState")
now := time.Now().UTC()
keepUntilTime := ns.GetKeepUntilTime()
if keepUntilTime.IsZero() {
// keep until time annotation is not set, set a new value with default or configured offset and update the object
delayMinutes, err := strconv.Atoi(os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY"))
if err != nil || delayMinutes <= 0 {
delayMinutes = 30 // keep objects for 30 minutes by default
}
keepUntilTime = now.Add(time.Minute * time.Duration(delayMinutes))
logger.V(2).Info("SriovNetworkNodeState has no matching node, configure cleanup delay for the state object",
"nodeStateName", ns.Name, "delay", delayMinutes, "keepUntilTime", keepUntilTime.String())
ns.SetKeepUntilTime(keepUntilTime)
if err := r.Update(ctx, ns); err != nil {
logger.Error(err, "Fail to update SriovNetworkNodeState CR", "name", ns.GetName())
return err
}
return nil
}
if now.Before(keepUntilTime) {
return nil
}
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
if err := r.Delete(ctx, ns, &client.DeleteOptions{}); err != nil {
logger.Error(err, "Fail to delete SriovNetworkNodeState CR", "name", ns.GetName())
return err
}
return nil
}

func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context.Context,
dc *sriovnetworkv1.SriovOperatorConfig,
npl *sriovnetworkv1.SriovNetworkNodePolicyList,
Expand All @@ -333,9 +367,16 @@ func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context
return fmt.Errorf("failed to get SriovNetworkNodeState: %v", err)
}
} else {
keepUntilAnnotationUpdated := found.ResetKeepUntilTime()

if len(found.Status.Interfaces) == 0 {
logger.Info("SriovNetworkNodeState Status Interfaces are empty. Skip update of policies in spec",
"namespace", ns.Namespace, "name", ns.Name)
if keepUntilAnnotationUpdated {
if err := r.Update(ctx, found); err != nil {
return fmt.Errorf("couldn't update SriovNetworkNodeState: %v", err)
}
}
return nil
}

Expand Down Expand Up @@ -378,7 +419,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context
// Note(adrianc): we check same ownerReferences since SriovNetworkNodeState
// was owned by a default SriovNetworkNodePolicy. if we encounter a descripancy
// we need to update.
if reflect.DeepEqual(newVersion.OwnerReferences, found.OwnerReferences) &&
if !keepUntilAnnotationUpdated && reflect.DeepEqual(newVersion.OwnerReferences, found.OwnerReferences) &&
equality.Semantic.DeepEqual(newVersion.Spec, found.Spec) {
logger.V(1).Info("SriovNetworkNodeState did not change, not updating")
return nil
Expand Down
54 changes: 54 additions & 0 deletions controllers/sriovnetworknodepolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,25 @@ package controllers
import (
"context"
"encoding/json"
"os"
"testing"
"time"

"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"

"sigs.k8s.io/controller-runtime/pkg/client/fake"

dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
v1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
Expand Down Expand Up @@ -126,3 +133,50 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
})
}
}

var _ = Describe("SriovNetworkNodePolicyReconciler", Ordered, func() {
Context("handleStaleNodeState", func() {
var (
ctx context.Context
r *SriovNetworkNodePolicyReconciler
nodeState *sriovnetworkv1.SriovNetworkNodeState
)

BeforeEach(func() {
ctx = context.Background()
scheme := runtime.NewScheme()
utilruntime.Must(sriovnetworkv1.AddToScheme(scheme))
nodeState = &sriovnetworkv1.SriovNetworkNodeState{ObjectMeta: metav1.ObjectMeta{Name: "node1"}}
r = &SriovNetworkNodePolicyReconciler{Client: fake.NewClientBuilder().WithObjects(nodeState).Build()}
})
It("should set default delay", func() {
nodeState := nodeState.DeepCopy()
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(time.Now().UTC().Before(nodeState.GetKeepUntilTime())).To(BeTrue())
})
It("should remove if wait time expired", func() {
nodeState := nodeState.DeepCopy()
nodeState.SetKeepUntilTime(time.Now().UTC().Add(-time.Minute))
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(errors.IsNotFound(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState))).To(BeTrue())
})
It("should keep existing wait time if already set", func() {
nodeState := nodeState.DeepCopy()
nodeState.SetKeepUntilTime(time.Now().UTC().Add(time.Minute))
testTime := nodeState.GetKeepUntilTime()
r.Update(ctx, nodeState)
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(nodeState.GetKeepUntilTime()).To(Equal(testTime))
})
It("non default dealy", func() {
DeferCleanup(os.Setenv, "STALE_NODE_STATE_CLEANUP_DELAY", os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY"))
os.Setenv("STALE_NODE_STATE_CLEANUP_DELAY", "60")
nodeState := nodeState.DeepCopy()
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(time.Until(nodeState.GetKeepUntilTime()) > 30*time.Minute).To(BeTrue())
})
})
})
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ spec:
value: {{ .Values.operator.cniBinPath }}
- name: CLUSTER_TYPE
value: {{ .Values.operator.clusterType }}
- name: STALE_NODE_STATE_CLEANUP_DELAY
value: "{{ .Values.operator.staleNodeStateCleanupDelay }}"
{{- if .Values.operator.admissionControllers.enabled }}
- name: ADMISSION_CONTROLLERS_CERTIFICATES_OPERATOR_SECRET_NAME
value: {{ .Values.operator.admissionControllers.certificates.secretNames.operator }}
Expand Down
3 changes: 3 additions & 0 deletions deployment/sriov-network-operator-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ operator:
resourcePrefix: "openshift.io"
cniBinPath: "/opt/cni/bin"
clusterType: "kubernetes"
# minimal amount of time (in minutes) the operator will wait before removing
# stale SriovNetworkNodeState objects (objects that doesn't match node with the daemon)
staleNodeStateCleanupDelay: "30"
metricsExporter:
port: "9110"
certificates:
Expand Down
2 changes: 2 additions & 0 deletions pkg/consts/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ const (
MCPPauseAnnotationState = "sriovnetwork.openshift.io/state"
MCPPauseAnnotationTime = "sriovnetwork.openshift.io/time"

NodeStateKeepUntilAnnotation = "sriovnetwork.openshift.io/keep-state-until"

CheckpointFileName = "sno-initial-node-state.json"
Unknown = "Unknown"

Expand Down

0 comments on commit 4245aa3

Please sign in to comment.