Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Centralized leaked ENI cleanup- refactor periodic cleanup & add node termination cleaner #505

Open
wants to merge 1 commit into
base: eni-cleanup
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
corecontroller "github.com/aws/amazon-vpc-resource-controller-k8s/controllers/core"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/api"
ec2API "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api"
eniCleaner "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api/cleanup"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/condition"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
Expand Down Expand Up @@ -362,12 +363,17 @@ func main() {
os.Exit(1)
}

if err := (&ec2API.ENICleaner{
EC2Wrapper: ec2Wrapper,
cleaner := &eniCleaner.ClusterENICleaner{
ClusterName: clusterName,
Log: ctrl.Log.WithName("eni cleaner"),
VPCID: vpcID,
}).SetupWithManager(ctx, mgr, healthzHandler); err != nil {
}
cleaner.ENICleaner = &eniCleaner.ENICleaner{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is cluster ENICleaner for cluster wide 30 minutes cleanup? I am trying to understand the pattern here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is right. ClusterENICleaner is used in the periodic cleanup routine.

In the next PR, I will add the NodeTerminationCleaner which is used in the finalizer routine.

EC2Wrapper: ec2Wrapper,
Manager: cleaner,
VpcId: vpcID,
Log: ctrl.Log.WithName("eniCleaner").WithName("cluster"),
}

if err := cleaner.SetupWithManager(ctx, mgr, healthzHandler); err != nil {
setupLog.Error(err, "unable to start eni cleaner")
os.Exit(1)
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

221 changes: 221 additions & 0 deletions pkg/aws/ec2/api/cleanup/eni_cleanup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package cleanup

import (
"context"
"fmt"
"strings"
"time"

"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/utils"

ec2Errors "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/errors"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/go-logr/logr"
kerrors "k8s.io/apimachinery/pkg/util/errors"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
)

// NetworkInterfaceManager interface allows to define the ENI filters and checks if ENI should be deleted for different callers like in the periodic cleanup routine or
// during node termination
type NetworkInterfaceManager interface {
GetENITagFilters() []*ec2.Filter
ShouldDeleteENI(eniID *string) bool
UpdateAvailableENIsIfNeeded(eniMap *map[string]struct{})
UpdateCleanupMetrics(vpcrcAvailableCount int, vpccniAvailableCount int, leakedENICount int)
}

type ENICleaner struct {
EC2Wrapper api.EC2Wrapper
Manager NetworkInterfaceManager
VpcId string
Log logr.Logger
}

// common filters for describing network interfaces
var CommonNetworkInterfaceFilters = []*ec2.Filter{
{
Name: aws.String("status"),
Values: []*string{aws.String(ec2.NetworkInterfaceStatusAvailable)},
},
{
Name: aws.String("tag:" + config.NetworkInterfaceOwnerTagKey),
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
config.NetworkInterfaceOwnerVPCCNITagValue}),
},
}

// ClusterENICleaner periodically deletes leaked network interfaces(provisioned by the controller or VPC-CNI) in the cluster
type ClusterENICleaner struct {
ClusterName string
shutdown bool
ctx context.Context
availableENIs map[string]struct{}
*ENICleaner
}

func (e *ClusterENICleaner) SetupWithManager(ctx context.Context, mgr ctrl.Manager, healthzHandler *rcHealthz.HealthzHandler) error {
e.ctx = ctx
e.availableENIs = make(map[string]struct{})
healthzHandler.AddControllersHealthCheckers(
map[string]healthz.Checker{
"health-interface-cleaner": rcHealthz.SimplePing("interface cleanup", e.Log),
},
)

return mgr.Add(e)
}

// StartENICleaner starts the ENI Cleaner routine that cleans up dangling ENIs created by the controller
func (e *ClusterENICleaner) Start(ctx context.Context) error {
e.Log.Info("starting eni clean up routine")

// Start routine to listen for shut down signal, on receiving the signal it set shutdown to true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't think we need this comments which just describes code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, this was just copied over from the existing eni_cleanup file.

go func() {
<-ctx.Done()
e.shutdown = true
}()
// Perform ENI cleanup after fixed time intervals till shut down variable is set to true on receiving the shutdown
// signal
for !e.shutdown {
Copy link
Contributor

@yash97 yash97 Jan 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can still result in race condition.
Sequence for loop check can happen first and then shutdown gets sets to true.
It would be better if we use for select pattern here.

e.DeleteLeakedResources()
time.Sleep(config.ENICleanUpInterval)
}

return nil
}

// DeleteLeakedResources describes all the network interfaces in available status that are created by the controller or VPC-CNI
// This is called by periodically by ClusterENICleaner which deletes available ENIs cluster-wide, and by the NodeTermination cleaner on node termination
// The available ENIs are deleted if ShouldDeleteENI is true, defined in the respective cleaners
// The function also updates metrics for the periodic cleanup routine and the node termination cleanup
func (e *ENICleaner) DeleteLeakedResources() error {
var errors []error
availableENIs := make(map[string]struct{})
vpcrcAvailableCount := 0
vpccniAvailableCount := 0
leakedENICount := 0

filters := CommonNetworkInterfaceFilters
// Append the VPC-ID deep filter for the paginated call
filters = append(filters, []*ec2.Filter{
{
Name: aws.String("vpc-id"),
Values: []*string{aws.String(e.VpcId)},
},
}...)
// get cleaner specific filters
filters = append(filters, e.Manager.GetENITagFilters()...)
describeNetworkInterfaceIp := &ec2.DescribeNetworkInterfacesInput{
Filters: filters,
}
for {
describeNetworkInterfaceOp, err := e.EC2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfaceIp)
if err != nil {
e.Log.Error(err, "failed to describe network interfaces, cleanup will be retried in next cycle")
return err
}
for _, nwInterface := range describeNetworkInterfaceOp.NetworkInterfaces {
if e.Manager.ShouldDeleteENI(nwInterface.NetworkInterfaceId) {
tagMap := utils.GetTagKeyValueMap(nwInterface.TagSet)
if val, ok := tagMap[config.NetworkInterfaceOwnerTagKey]; ok {
// Increment promethues metrics for number of leaked ENIs cleaned up
switch val {
case config.NetworkInterfaceOwnerTagValue:
vpcrcAvailableCount += 1
case config.NetworkInterfaceOwnerVPCCNITagValue:
vpccniAvailableCount += 1
default:
// We should not hit this case as we only filter for relevant tag values, log error and continue if unexpected ENIs found
e.Log.Error(fmt.Errorf("found available ENI not created by VPC-CNI/VPC-RC"), "eniID", *nwInterface.NetworkInterfaceId)
continue
}
}
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
NetworkInterfaceId: nwInterface.NetworkInterfaceId,
})
if err != nil {
if !strings.Contains(err.Error(), ec2Errors.NotFoundInterfaceID) { // ignore InvalidNetworkInterfaceID.NotFound error
// append err and continue, we will retry deletion in the next period/reconcile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hopefully we move to comparing error type in sdkgo v2

leakedENICount += 1
errors = append(errors, fmt.Errorf("failed to delete leaked network interface %v:%v", *nwInterface.NetworkInterfaceId, err))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will count eni that are being re tried too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you expand on this? In each periodic cleanup routine, this is the number of leaked ENIs that failed to be deleted.

If these ENIs are still in leaked state, then yes, deletion will be re-tried on the next cleanup. And if it still fails deletion, it will be counted as leaked ENI which is the intended behavior. Do you have any concerns on this?

Copy link
Contributor

@yash97 yash97 Jan 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok I see we set metric instead of addition after every run. We should be good then. It will just count leaked eni for particular run only.

e.Log.Error(err, "failed to delete the leaked network interface",
"id", *nwInterface.NetworkInterfaceId)
}
continue
}
e.Log.Info("deleted leaked ENI successfully", "eni id", nwInterface.NetworkInterfaceId)
} else {
// Seeing the ENI for the first time, add it to the new list of available network interfaces
availableENIs[*nwInterface.NetworkInterfaceId] = struct{}{}
e.Log.Info("adding eni to to the map of available ENIs, will be removed if present in "+
"next run too", "id", *nwInterface.NetworkInterfaceId)
}
}

if describeNetworkInterfaceOp.NextToken == nil {
break
}
describeNetworkInterfaceIp.NextToken = describeNetworkInterfaceOp.NextToken
}
e.Manager.UpdateCleanupMetrics(vpcrcAvailableCount, vpccniAvailableCount, leakedENICount)
e.Manager.UpdateAvailableENIsIfNeeded(&availableENIs)
return kerrors.NewAggregate(errors)
}

func (e *ClusterENICleaner) GetENITagFilters() []*ec2.Filter {
clusterNameTagKey := fmt.Sprintf(config.ClusterNameTagKeyFormat, e.ClusterName)
return []*ec2.Filter{
{
Name: aws.String("tag:" + clusterNameTagKey),
Values: []*string{aws.String(config.ClusterNameTagValue)},
},
}
}

// ShouldDeleteENI returns true if the ENI should be deleted.
func (e *ClusterENICleaner) ShouldDeleteENI(eniID *string) bool {
if _, exists := e.availableENIs[*eniID]; exists {
return true
}
return false
}

// Set the available ENIs to the list of ENIs seen in the current cycle
// This adds ENIs that should not be deleted in the current cleanup cycle to the internal cache so it can be deleted in next cycle
// This prevents the clean up routine to remove ENIs that are created by another routines and are yet not attached to
// an instance or associated with a trunk interface in the periodic cleanup routine

// Example
// 1st cycle, Describe Available NetworkInterface Result - Interface 1, Interface 2, Interface 3
// 2nd cycle, Describe Available NetworkInterface Result - Interface 2, Interface 3
// In the second cycle we can conclude that Interface 2 and 3 are leaked because they have been sitting for the time
// interval between cycle 1 and 2 and hence can be safely deleted. And we can also conclude that Interface 1 was
// created but not attached at the the time when 1st cycle ran and hence it should not be deleted.
func (e *ClusterENICleaner) UpdateAvailableENIsIfNeeded(eniMap *map[string]struct{}) {
e.availableENIs = *eniMap
}

// Update cluster cleanup metrics for the current cleanup cycle
func (e *ClusterENICleaner) UpdateCleanupMetrics(vpcrcAvailableCount int, vpccniAvailableCount int, leakedENICount int) {
api.VpcRcAvailableClusterENICnt.Set(float64(vpcrcAvailableCount))
api.VpcCniAvailableClusterENICnt.Set(float64(vpccniAvailableCount))
api.LeakedENIClusterCleanupCnt.Set(float64(leakedENICount))
}
Loading
Loading