From 53939cd8212a4b1fb78e8685b3c8e86eef9fb8b3 Mon Sep 17 00:00:00 2001 From: sushrk Date: Fri, 22 Nov 2024 01:19:19 +0000 Subject: [PATCH] skip leaked ENIs cleanup on unmanaged nodes --- pkg/node/manager/manager.go | 18 +++++++++--------- pkg/provider/branch/provider.go | 7 ++----- pkg/provider/branch/trunk/trunk.go | 6 ++---- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pkg/node/manager/manager.go b/pkg/node/manager/manager.go index b6539877..d7bb2835 100644 --- a/pkg/node/manager/manager.go +++ b/pkg/node/manager/manager.go @@ -123,28 +123,28 @@ func NewNodeManager(logger logr.Logger, resourceManager resource.ResourceManager } func (m *manager) CheckNodeForLeakedENIs(nodeName string) { - managedNode, found := m.GetNode(nodeName) - if !found { - m.Log.Info("Node manager couldn't find the node for reconciliation cleanup", "NodeName", nodeName) + cachedNode, found := m.GetNode(nodeName) + if !found || !cachedNode.IsManaged() { + m.Log.V(1).Info("node not found or not managed by controller, skip reconciliation", "nodeName", nodeName) return } // Only start a goroutine when need to - if time.Now().After(managedNode.GetNextReconciliationTime()) { + if time.Now().After(cachedNode.GetNextReconciliationTime()) { go func() { if resourceProvider, found := m.resourceManager.GetResourceProvider(config.ResourceNamePodENI); found { foundLeakedENI := resourceProvider.ReconcileNode(nodeName) if foundLeakedENI { - managedNode.SetReconciliationInterval(node.NodeInitialCleanupInterval) + cachedNode.SetReconciliationInterval(node.NodeInitialCleanupInterval) } else { - interval := wait.Jitter(managedNode.GetReconciliationInterval(), 5) + interval := wait.Jitter(cachedNode.GetReconciliationInterval(), 5) if interval > node.MaxNodeReconciliationInterval { interval = node.MaxNodeReconciliationInterval } - managedNode.SetReconciliationInterval(interval) + cachedNode.SetReconciliationInterval(interval) } - managedNode.SetNextReconciliationTime(time.Now().Add(managedNode.GetReconciliationInterval())) - m.Log.Info("reconciled cleanup node for leaking branch interfaces", "NodeName", nodeName, "NextInterval", managedNode.GetReconciliationInterval(), "NextReconciliationTime", managedNode.GetNextReconciliationTime()) + cachedNode.SetNextReconciliationTime(time.Now().Add(cachedNode.GetReconciliationInterval())) + m.Log.Info("reconciled node to cleanup leaked branch ENIs", "NodeName", nodeName, "NextInterval", cachedNode.GetReconciliationInterval(), "NextReconciliationTime", cachedNode.GetNextReconciliationTime()) } else { // no SGP provider enabled return diff --git a/pkg/provider/branch/provider.go b/pkg/provider/branch/provider.go index 4028300b..c79780b1 100644 --- a/pkg/provider/branch/provider.go +++ b/pkg/provider/branch/provider.go @@ -276,20 +276,17 @@ func (b *branchENIProvider) ReconcileNode(nodeName string) bool { log := b.log.WithValues("node", nodeName) if !isPresent { // return true to set the node next clean up asap since we don't know why trunk is missing - log.Info("no trunk ENI is pointing to the given node", "nodeName", nodeName) + log.V(1).Info("trunk ENI not found, requeue node", "nodeName", nodeName) return true } podList, err := b.apiWrapper.PodAPI.ListPods(nodeName) if err != nil { // return true to set the node next cleanup asap since the LIST call may fail for other reasons // we should assume that there are leaked resources need to be cleaned up - log.Error(err, "failed fo list pod") + log.Error(err, "failed to list pods, requeue node", "nodeName", nodeName) return true } foundLeakedENI := trunkENI.Reconcile(podList.Items) - - log.Info("completed reconcile node cleanup on branch ENIs", "nodeName", nodeName) - return foundLeakedENI } diff --git a/pkg/provider/branch/trunk/trunk.go b/pkg/provider/branch/trunk/trunk.go index 71de6991..82d93b7e 100644 --- a/pkg/provider/branch/trunk/trunk.go +++ b/pkg/provider/branch/trunk/trunk.go @@ -368,9 +368,7 @@ func (t *trunkENI) Reconcile(pods []v1.Pod) bool { t.deleteQueue = append(t.deleteQueue, eni) } delete(t.uidToBranchENIMap, uid) - - t.log.Info("trunk controller found leaked branch ENI. the controller pushed leaked ENI to delete queue and deleted pod that doesn't exist anymore", "pod uid", uid, - "eni", branchENIs) + t.log.Info("leaked eni pushed to delete queue, deleted non-existing pod", "pod uid", uid, "eni", branchENIs) } } @@ -505,7 +503,7 @@ func (t *trunkENI) PushBranchENIsToCoolDownQueue(UID string) { branchENIs, isPresent := t.uidToBranchENIMap[UID] if !isPresent { t.log.Info("couldn't find Branch ENI in cache, it could have been released if pod"+ - "succeeded/failed before being deleted", "UID", UID, "BranchENIs", branchENIs) + "succeeded/failed before being deleted", "UID", UID) trunkENIOperationsErrCount.WithLabelValues("get_branch_from_cache").Inc() return }