Skip to content

Commit

Permalink
Add options to NodeGroupAutoscalingOptions from machineDeployment
Browse files Browse the repository at this point in the history
… annotations (#257)

* Add node group specific options to NodeGroupAutoscalingOptions from machineDeployment annotations

* Updated func and variable names

* Added unit tests

* Fix unit tests after rebase
  • Loading branch information
aaronfern authored Dec 13, 2023
1 parent 5289808 commit 6005261
Show file tree
Hide file tree
Showing 3 changed files with 210 additions and 4 deletions.
64 changes: 61 additions & 3 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ package mcm
import (
"context"
"fmt"
"strconv"
"strings"
"time"

apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels"
Expand All @@ -34,7 +38,6 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"strings"
)

const (
Expand All @@ -44,6 +47,21 @@ const (
// GPULabel is the label added to nodes with GPU resource.
// TODO: Align on a GPU Label for Gardener.
GPULabel = "gardener.cloud/accelerator"

// ScaleDownUtilizationThresholdAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUtilizationThreshold
ScaleDownUtilizationThresholdAnnotation = "autoscaler.gardener.cloud/scale-down-utilization-threshold"

// ScaleDownGpuUtilizationThresholdAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownGpuUtilizationThreshold
ScaleDownGpuUtilizationThresholdAnnotation = "autoscaler.gardener.cloud/scale-down-gpu-utilization-threshold"

// ScaleDownUnneededTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUnneededTime
ScaleDownUnneededTimeAnnotation = "autoscaler.gardener.cloud/scale-down-unneeded-time"

// ScaleDownUnreadyTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUnreadyTime
ScaleDownUnreadyTimeAnnotation = "autoscaler.gardener.cloud/scale-down-unready-time"

// MaxNodeProvisionTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.MaxNodeProvisionTime
MaxNodeProvisionTimeAnnotation = "autoscaler.gardener.cloud/max-node-provision-time"
)

// MCMCloudProvider implements the cloud provider interface for machine-controller-manager
Expand Down Expand Up @@ -439,9 +457,49 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e
// GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
// NodeGroup. Returning a nil will result in using default options.
// Implementation optional.
//
// The result starts as a copy of defaults. Each of the five supported fields is
// then overridden by its annotation on the MachineDeployment (the *Annotation
// constants of this package) when that annotation is present and parses
// successfully. A missing or unparsable annotation leaves the default for that
// field in place. An error is returned only when the annotations cannot be
// fetched; in that case the returned options are nil.
func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
	mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name)
	if err != nil {
		return nil, err
	}

	// defaults is received by value, so options is a private copy. Starting
	// from it (rather than from an empty struct) keeps every field that has no
	// annotation counterpart at its default instead of silently zeroing it.
	options := defaults
	options.ScaleDownUtilizationThreshold = floatAnnotationOrDefault(mcdAnnotations, ScaleDownUtilizationThresholdAnnotation, defaults.ScaleDownUtilizationThreshold)
	options.ScaleDownGpuUtilizationThreshold = floatAnnotationOrDefault(mcdAnnotations, ScaleDownGpuUtilizationThresholdAnnotation, defaults.ScaleDownGpuUtilizationThreshold)
	options.ScaleDownUnneededTime = durationAnnotationOrDefault(mcdAnnotations, ScaleDownUnneededTimeAnnotation, defaults.ScaleDownUnneededTime)
	options.ScaleDownUnreadyTime = durationAnnotationOrDefault(mcdAnnotations, ScaleDownUnreadyTimeAnnotation, defaults.ScaleDownUnreadyTime)
	options.MaxNodeProvisionTime = durationAnnotationOrDefault(mcdAnnotations, MaxNodeProvisionTimeAnnotation, defaults.MaxNodeProvisionTime)

	return &options, nil
}

// floatAnnotationOrDefault returns annotations[key] parsed as a float64, or
// fallback when the key is absent or its value is not accepted by
// strconv.ParseFloat. A nil map is treated as having no annotations.
func floatAnnotationOrDefault(annotations map[string]string, key string, fallback float64) float64 {
	raw, ok := annotations[key]
	if !ok {
		return fallback
	}
	parsed, err := strconv.ParseFloat(raw, 64)
	if err != nil {
		// Best effort: a malformed annotation must not break autoscaling.
		return fallback
	}
	return parsed
}

// durationAnnotationOrDefault returns annotations[key] parsed as a
// time.Duration (for example "5m"), or fallback when the key is absent or its
// value is not accepted by time.ParseDuration. A nil map is treated as having
// no annotations.
func durationAnnotationOrDefault(annotations map[string]string, key string, fallback time.Duration) time.Duration {
	raw, ok := annotations[key]
	if !ok {
		return fallback
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil {
		// Best effort: a malformed annotation must not break autoscaling.
		return fallback
	}
	return parsed
}

// TemplateNodeInfo returns a node template for this node group.
Expand Down
140 changes: 139 additions & 1 deletion cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@ import (
"context"
"errors"
"fmt"
v1 "k8s.io/api/apps/v1"
"math"
"strings"
"testing"
"time"

v1 "k8s.io/api/apps/v1"

machinecodes "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
customfake "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mcm/fakeclient"
"k8s.io/autoscaler/cluster-autoscaler/config"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -580,3 +583,138 @@ func TestNodes(t *testing.T) {
})
}
}

// TestGetOptions is a table-driven test for MachineDeployment.GetOptions.
// Every case calls GetOptions with the same default options and checks either
// the returned error (options must then be nil) or each of the five option
// fields individually.
func TestGetOptions(t *testing.T) {
	type expect struct {
		ngOptions *config.NodeGroupAutoscalingOptions
		err       error
	}
	type data struct {
		name   string
		setup  setup
		expect expect
	}

	// Defaults handed to GetOptions in every case. The struct is passed by
	// value, so sharing it between the parallel sub-tests is safe.
	defaultOptions := config.NodeGroupAutoscalingOptions{
		ScaleDownUtilizationThreshold:    0.5,
		ScaleDownGpuUtilizationThreshold: 0.5,
		ScaleDownUnneededTime:            1 * time.Minute,
		ScaleDownUnreadyTime:             1 * time.Minute,
		MaxNodeProvisionTime:             1 * time.Minute,
	}

	table := []data{
		{
			name: "should throw error if machinedeployment cannot be found",
			setup: setup{
				nodeGroups: []string{nodeGroup1},
			},
			expect: expect{
				err: fmt.Errorf("unable to fetch MachineDeployment object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"),
			},
		},
		{
			name: "should return default nodegroupautoscalingoptions if none are provided",
			setup: setup{
				machineDeployments: newMachineDeployments(1, 2, nil, nil, nil),
				nodeGroups:         []string{nodeGroup1},
			},
			expect: expect{
				ngOptions: &config.NodeGroupAutoscalingOptions{
					ScaleDownUtilizationThreshold:    0.5,
					ScaleDownGpuUtilizationThreshold: 0.5,
					ScaleDownUnneededTime:            1 * time.Minute,
					ScaleDownUnreadyTime:             1 * time.Minute,
					MaxNodeProvisionTime:             1 * time.Minute,
				},
				err: nil,
			},
		},
		{
			name: "should return nodegroupautoscalingoptions with values from mcd if all annotations are present",
			setup: setup{
				machineDeployments: newMachineDeployments(
					1,
					2,
					nil,
					map[string]string{
						ScaleDownUtilizationThresholdAnnotation:    "0.7",
						ScaleDownGpuUtilizationThresholdAnnotation: "0.7",
						ScaleDownUnneededTimeAnnotation:            "5m",
						ScaleDownUnreadyTimeAnnotation:             "5m",
						MaxNodeProvisionTimeAnnotation:             "5m",
					},
					nil,
				),
				nodeGroups: []string{nodeGroup1},
			},
			expect: expect{
				ngOptions: &config.NodeGroupAutoscalingOptions{
					ScaleDownUtilizationThreshold:    0.7,
					ScaleDownGpuUtilizationThreshold: 0.7,
					ScaleDownUnneededTime:            5 * time.Minute,
					ScaleDownUnreadyTime:             5 * time.Minute,
					MaxNodeProvisionTime:             5 * time.Minute,
				},
				err: nil,
			},
		},
		{
			name: "should return nodegroupautoscalingoptions with annotations values from mcd and remaining defaults",
			setup: setup{
				machineDeployments: newMachineDeployments(
					1,
					2,
					nil,
					map[string]string{
						ScaleDownUtilizationThresholdAnnotation: "0.7",
						ScaleDownUnneededTimeAnnotation:         "5m",
						MaxNodeProvisionTimeAnnotation:          "2m",
					},
					nil,
				),
				nodeGroups: []string{nodeGroup1},
			},
			expect: expect{
				ngOptions: &config.NodeGroupAutoscalingOptions{
					ScaleDownUtilizationThreshold:    0.7,
					ScaleDownGpuUtilizationThreshold: 0.5,
					ScaleDownUnneededTime:            5 * time.Minute,
					ScaleDownUnreadyTime:             1 * time.Minute,
					MaxNodeProvisionTime:             2 * time.Minute,
				},
				err: nil,
			},
		},
	}

	for _, tc := range table {
		tc := tc // per-iteration copy so the parallel closure does not share the loop variable
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			g := NewWithT(t)

			stop := make(chan struct{})
			defer close(stop)

			controlMachineObjects, targetCoreObjects, _ := setupEnv(&tc.setup)
			m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, nil, controlMachineObjects, targetCoreObjects, nil)
			defer trackers.Stop()
			waitForCacheSync(t, stop, hasSyncedCacheFns)

			md, buildErr := buildMachineDeploymentFromSpec(tc.setup.nodeGroups[0], m)
			g.Expect(buildErr).To(BeNil())

			options, optErr := md.GetOptions(defaultOptions)

			// Error case: the exact error is expected and no options are returned.
			if tc.expect.err != nil {
				g.Expect(optErr).To(Equal(tc.expect.err))
				g.Expect(options).To(BeNil())
				return
			}

			// Success case: compare field by field.
			want := tc.expect.ngOptions
			g.Expect(optErr).To(BeNil())
			g.Expect(*options).To(HaveField("ScaleDownUtilizationThreshold", want.ScaleDownUtilizationThreshold))
			g.Expect(*options).To(HaveField("ScaleDownGpuUtilizationThreshold", want.ScaleDownGpuUtilizationThreshold))
			g.Expect(*options).To(HaveField("ScaleDownUnneededTime", want.ScaleDownUnneededTime))
			g.Expect(*options).To(HaveField("ScaleDownUnreadyTime", want.ScaleDownUnreadyTime))
			g.Expect(*options).To(HaveField("MaxNodeProvisionTime", want.MaxNodeProvisionTime))
		})
	}
}
10 changes: 10 additions & 0 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,16 @@ func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error {
return nil
}

// GetMachineDeploymentAnnotations looks up the machine deployment called
// machineDeploymentName in the manager's namespace through the machine
// deployment lister and returns its annotations.
// The returned map is the object's own Annotations field, not a copy.
// If the lookup fails, a nil map and an error naming the machine deployment
// are returned.
func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName string) (map[string]string, error) {
	namespacedLister := m.machineDeploymentLister.MachineDeployments(m.namespace)

	deployment, lookupErr := namespacedLister.Get(machineDeploymentName)
	if lookupErr != nil {
		return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", machineDeploymentName, lookupErr)
	}

	return deployment.Annotations, nil
}

// GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment
// If no node present then it forms the nodeTemplate using the one present in machineClass
func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *MachineDeployment) (*nodeTemplate, error) {
Expand Down

0 comments on commit 6005261

Please sign in to comment.