Skip to content

Commit

Permalink
Balance node grps during scale from zero (#114)
Browse files Browse the repository at this point in the history
  • Loading branch information
himanshu-kun authored Mar 22, 2022
1 parent 418db31 commit c6f0379
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 38 deletions.
114 changes: 94 additions & 20 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,12 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/selection"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/client-go/discovery"
coreinformers "k8s.io/client-go/informers"
Expand Down Expand Up @@ -109,10 +111,12 @@ type McmManager struct {
}

type instanceType struct {
InstanceType string
VCPU resource.Quantity
Memory resource.Quantity
GPU resource.Quantity
InstanceType string
VCPU resource.Quantity
Memory resource.Quantity
GPU resource.Quantity
EphemeralStorage resource.Quantity
PodCount resource.Quantity
}

type nodeTemplate struct {
Expand Down Expand Up @@ -611,7 +615,8 @@ func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error {
return nil
}

// GetMachineDeploymentNodeTemplate returns the NodeTemplate which belongs to the MachineDeployment.
// GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment
// If no node present then it forms the nodeTemplate using the one present in machineClass
func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *MachineDeployment) (*nodeTemplate, error) {

md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machinedeployment.Name)
Expand All @@ -620,14 +625,23 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
}

var (
region string
zone string
instance instanceType

workerPool = getWorkerPoolForMachineDeploy(md)
list = []string{workerPool}
selector = labels.NewSelector()
req, _ = labels.NewRequirement(nodegroupset.LabelWorkerPool, selection.Equals, list)
region string
zone string
instance instanceType
machineClass = md.Spec.Template.Spec.Class
nodeTemplateSpec = md.Spec.Template.Spec.NodeTemplateSpec
)

selector = selector.Add(*req)
nodes, err := m.nodeLister.List(selector)
if err != nil {
return nil, fmt.Errorf("error fetching node object for worker pool %s, Error: %v", workerPool, err)
}

switch machineClass.Kind {
case kindMachineClass:
mc, err := m.machineClassLister.MachineClasses(m.namespace).Get(machineClass.Name)
Expand All @@ -639,16 +653,33 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine

err := validateNodeTemplate(nodeTemplateAttributes)
if err != nil {
return nil, fmt.Errorf("NodeTemplate validation error in MachineClass %s : %s", mc.Name, err)
return nil, fmt.Errorf("nodeTemplate validation error in MachineClass %s : %s", mc.Name, err)
}

klog.V(1).Infof("Generating node template using nodeTemplate from MachineClass %s", machineClass.Name)
instance = instanceType{
InstanceType: nodeTemplateAttributes.InstanceType,
VCPU: nodeTemplateAttributes.Capacity["cpu"],
Memory: nodeTemplateAttributes.Capacity["memory"],
GPU: nodeTemplateAttributes.Capacity["gpu"],
filteredNodes := filterOutNodes(nodes, nodeTemplateAttributes.InstanceType)

if len(filteredNodes) > 0 {
klog.V(1).Infof("Nodes already existing in the worker pool %s", workerPool)
baseNode := filteredNodes[0]
klog.V(1).Infof("Worker pool node used to form template is %s and its capacity is cpu: %s, memory:%s", baseNode.Name, baseNode.Status.Capacity.Cpu().String(), baseNode.Status.Capacity.Memory().String())
instance = instanceType{
VCPU: baseNode.Status.Capacity[apiv1.ResourceCPU],
Memory: baseNode.Status.Capacity[apiv1.ResourceMemory],
GPU: baseNode.Status.Capacity[gpu.ResourceNvidiaGPU],
EphemeralStorage: baseNode.Status.Capacity[apiv1.ResourceEphemeralStorage],
PodCount: baseNode.Status.Capacity[apiv1.ResourcePods],
}
} else {
klog.V(1).Infof("Generating node template only using nodeTemplate from MachineClass %s: template resources-> cpu: %s,memory: %s", machineClass.Name, nodeTemplateAttributes.Capacity.Cpu().String(), nodeTemplateAttributes.Capacity.Memory().String())
instance = instanceType{
VCPU: nodeTemplateAttributes.Capacity[apiv1.ResourceCPU],
Memory: nodeTemplateAttributes.Capacity[apiv1.ResourceMemory],
GPU: nodeTemplateAttributes.Capacity["gpu"],
// Number of pods per node depends on the CNI used and the maxPods kubelet config; the default is often 110
PodCount: resource.MustParse("110"),
}
}
instance.InstanceType = nodeTemplateAttributes.InstanceType
region = nodeTemplateAttributes.Region
zone = nodeTemplateAttributes.Zone
break
Expand All @@ -671,6 +702,8 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
VCPU: awsInstance.VCPU,
Memory: awsInstance.Memory,
GPU: awsInstance.GPU,
// Number of pods per node depends on the CNI used and the maxPods kubelet config; the default is often 110
PodCount: resource.MustParse("110"),
}
region = providerSpec.Region
zone = getZoneValueFromMCLabels(mc.Labels)
Expand All @@ -689,6 +722,8 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
VCPU: azureInstance.VCPU,
Memory: azureInstance.Memory,
GPU: azureInstance.GPU,
// Number of pods per node depends on the CNI used and the maxPods kubelet config; the default is often 110
PodCount: resource.MustParse("110"),
}
region = providerSpec.Location
if providerSpec.Properties.Zone != nil {
Expand Down Expand Up @@ -722,6 +757,40 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
return nodeTmpl, nil
}

// filterOutNodes returns the subset of nodes that both report a non-nil
// capacity and carry an instance-type label equal to instanceType.
func filterOutNodes(nodes []*v1.Node, instanceType string) []*v1.Node {
	var matched []*v1.Node
	for _, candidate := range nodes {
		if candidate.Status.Capacity == nil {
			continue
		}
		if getInstanceTypeForNode(candidate) != instanceType {
			continue
		}
		matched = append(matched, candidate)
	}

	return matched
}

// getInstanceTypeForNode returns the instance type recorded in the node's
// labels, preferring the stable label over the deprecated beta one.
// It returns the empty string when neither label is present.
func getInstanceTypeForNode(node *v1.Node) string {
	// Indexing a nil map is well-defined in Go and yields the zero value,
	// so no explicit node.Labels != nil guard is needed.
	if val, ok := node.Labels[apiv1.LabelInstanceTypeStable]; ok {
		return val
	}
	// Fall back to the deprecated label; "" if absent.
	return node.Labels[apiv1.LabelInstanceType]
}

// getWorkerPoolForMachineDeploy returns the worker pool name stored in the
// MachineDeployment's node template labels, or "" when the label is absent.
func getWorkerPoolForMachineDeploy(md *v1alpha1.MachineDeployment) string {
	// Indexing a nil map yields the zero value, so the explicit nil check
	// on the labels map is unnecessary; a missing label returns "".
	return md.Spec.Template.Spec.NodeTemplateSpec.Labels[nodegroupset.LabelWorkerPool]
}

func getZoneValueFromMCLabels(labels map[string]string) string {
var zone string

Expand All @@ -730,7 +799,7 @@ func getZoneValueFromMCLabels(labels map[string]string) string {
// Prefer zone value from the new label
zone = value
} else if value, exists := labels[apiv1.LabelZoneFailureDomain]; exists {
// Fallback to zone value from deprecated label if new lable value doesn't exist
// Fallback to zone value from deprecated label if new label value doesn't exist
zone = value
}
}
Expand All @@ -752,11 +821,16 @@ func (m *McmManager) buildNodeFromTemplate(name string, template *nodeTemplate)
Capacity: apiv1.ResourceList{},
}

// Numbers pods per node will depends on the CNI used and the maxPods kubelet config, default is often 100
node.Status.Capacity[apiv1.ResourcePods] = *resource.NewQuantity(100, resource.DecimalSI)
node.Status.Capacity[apiv1.ResourcePods] = template.InstanceType.PodCount
node.Status.Capacity[apiv1.ResourceCPU] = template.InstanceType.VCPU
node.Status.Capacity[gpu.ResourceNvidiaGPU] = template.InstanceType.GPU
if template.InstanceType.GPU.Cmp(resource.MustParse("0")) != 0 {
node.Status.Capacity[gpu.ResourceNvidiaGPU] = template.InstanceType.GPU
}
node.Status.Capacity[apiv1.ResourceMemory] = template.InstanceType.Memory
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = template.InstanceType.EphemeralStorage
// added most common hugepages sizes. This will help to consider the template node while finding similar node groups
node.Status.Capacity["hugepages-1Gi"] = *resource.NewQuantity(0, resource.DecimalSI)
node.Status.Capacity["hugepages-2Mi"] = *resource.NewQuantity(0, resource.DecimalSI)

node.Status.Allocatable = node.Status.Capacity

Expand Down
43 changes: 40 additions & 3 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
)
Expand All @@ -36,9 +37,10 @@ func TestBuildGenericLabels(t *testing.T) {

labels := buildGenericLabels(&nodeTemplate{
InstanceType: &instanceType{
InstanceType: instanceTypeC4Large,
VCPU: resource.MustParse("2"),
Memory: resource.MustParse("3840Mi"),
InstanceType: instanceTypeC4Large,
VCPU: resource.MustParse("2"),
Memory: resource.MustParse("3840Mi"),
EphemeralStorage: resource.MustParse("50378260Ki"),
},
Region: regionUSEast1,
Zone: zoneUSEast1a,
Expand Down Expand Up @@ -97,3 +99,38 @@ func TestGenerationOfCorrectZoneValueFromMCLabel(t *testing.T) {
})
assert.Equal(t, resultingZone, "")
}

// TestFilterNodes verifies that filterOutNodes keeps only nodes that have a
// capacity set and whose instance-type label matches the requested type.
func TestFilterNodes(t *testing.T) {
	// Node with capacity but no instance-type label: must be filtered out.
	nodeWithoutLabel := &apiv1.Node{
		Status: apiv1.NodeStatus{
			Capacity: apiv1.ResourceList{
				"cpu":    resource.MustParse("2"),
				"memory": resource.MustParse("64Gi"),
			},
		},
	}
	// Node with capacity and a matching stable instance-type label: must be kept.
	nodeWithLabel := &apiv1.Node{
		ObjectMeta: v1.ObjectMeta{
			Labels: map[string]string{
				apiv1.LabelInstanceTypeStable: "test-instance-type",
			},
		},
		Status: apiv1.NodeStatus{
			Capacity: apiv1.ResourceList{
				"cpu":    resource.MustParse("2"),
				"memory": resource.MustParse("64Gi"),
			},
		},
	}
	// Node with neither capacity nor labels: must be filtered out.
	nodeEmpty := &apiv1.Node{}

	filteredNodes := filterOutNodes(
		[]*apiv1.Node{nodeWithoutLabel, nodeWithLabel, nodeEmpty},
		"test-instance-type",
	)

	assert.EqualValues(t, len(filteredNodes), 1)
	assert.Equal(t, filteredNodes, []*apiv1.Node{nodeWithLabel})
}
21 changes: 15 additions & 6 deletions cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,21 @@ import (
const (
// MaxAllocatableDifferenceRatio describes how Node.Status.Allocatable can differ between
// groups in the same NodeGroupSet
MaxAllocatableDifferenceRatio = 0.05
// changed to 50% (or any other very high percentage) as gardener node groups have a similar node-group label, so
// comparison won't be done with any other node group's node; done to help balancing during scale from zero
MaxAllocatableDifferenceRatio = 0.5
// MaxFreeDifferenceRatio describes how free resources (allocatable - daemon and system pods)
// can differ between groups in the same NodeGroupSet
MaxFreeDifferenceRatio = 0.05
// changed to 50% (or any other very high percentage) as gardener node groups have a similar node-group label, so
// comparison won't be done with any other node group's node; done to help balancing during scale from zero
MaxFreeDifferenceRatio = 0.5
// MaxCapacityMemoryDifferenceRatio describes how Node.Status.Capacity.Memory can differ between
// groups in the same NodeGroupSet
MaxCapacityMemoryDifferenceRatio = 0.015
// changed to 50% (or any other very high percentage) as gardener node groups have a similar node-group label, so
// comparison won't be done with any other node group's node; done to help balancing during scale from zero
MaxCapacityMemoryDifferenceRatio = 0.5
// LabelWorkerKubernetesVersion is a constant for a label that indicates the kubernetes version of the kubelet on the node
LabelWorkerKubernetesVersion = "worker.gardener.cloud/kubernetes-version"
// LabelWorkerPool is a constant for a label that indicates the worker pool the node belongs to
LabelWorkerPool = "worker.gardener.cloud/pool"
// LabelWorkerPoolDeprecated is a deprecated constant for a label that indicates the worker pool the node belongs to
Expand All @@ -58,9 +66,10 @@ var BasicIgnoredLabels = map[string]bool{
"beta.kubernetes.io/fluentd-ds-ready": true, // this is an internal label used to determine whether fluentd should be installed as a daemon set. Used for migration 1.8 to 1.9.
"kops.k8s.io/instancegroup": true, // this is a label used by kops to identify "instance group" names. Its value is variable, defeating check of similar node groups

// Ignore gardener specific labels.
LabelWorkerPool: true,
LabelWorkerPoolDeprecated: true,
// Ignore gardener specific labels except worker pool labels
LabelWorkerPool: false,
LabelWorkerPoolDeprecated: false,
LabelWorkerKubernetesVersion: true,
// Ignore CSI specific labels.
LabelTopologyEBSCSIAWS: true,
LabelTopologyGKE: true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ func TestNodesSimilarVariousRequirements(t *testing.T) {
n3.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewMilliQuantity(999, resource.DecimalSI)
checkNodesSimilar(t, n1, n3, comparator, true)

// Same CPU capacity, significantly different allocatable
// Same CPU capacity, significantly different allocatable (more than 50%)
n4 := BuildTestNode("node4", 1000, 2000)
n4.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewMilliQuantity(500, resource.DecimalSI)
n4.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewMilliQuantity(400, resource.DecimalSI)
checkNodesSimilar(t, n1, n4, comparator, false)

// One with GPU, one without
Expand All @@ -81,7 +81,7 @@ func TestNodesSimilarVariousRequirementsAndPods(t *testing.T) {

// Different allocatable, but same free
n2 := BuildTestNode("node2", 1000, 2000)
n2.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewMilliQuantity(500, resource.DecimalSI)
n2.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewMilliQuantity(400, resource.DecimalSI)
n2.Status.Allocatable[apiv1.ResourceMemory] = *resource.NewQuantity(1000, resource.DecimalSI)
checkNodesSimilarWithPods(t, n1, n2, []*apiv1.Pod{p1}, []*apiv1.Pod{}, comparator, false)

Expand Down Expand Up @@ -129,9 +129,9 @@ func TestNodesSimilarVariousLargeMemoryRequirementsM5XLarge(t *testing.T) {
n2 := BuildTestNode("node2", 1000, q2.Value())
checkNodesSimilar(t, n1, n2, comparator, true)

// Different memory capacity exceeds tolerance
// Value of q1 * 1.02
q3 := resource.MustParse("16438475Ki")
// Different memory capacity exceeds tolerance (50%)
// Value of q1 * 2.05
q3 := resource.MustParse("33038111Ki")
n3 := BuildTestNode("node3", 1000, q3.Value())
checkNodesSimilar(t, n1, n3, comparator, false)
}
Expand All @@ -151,9 +151,9 @@ func TestNodesSimilarVariousLargeMemoryRequirementsM516XLarge(t *testing.T) {
n2 := BuildTestNode("node2", 1000, q2.Value())
checkNodesSimilar(t, n1, n2, comparator, true)

// Different memory capacity exceeds tolerance
// Value of q1 * 1.02
q3 := resource.MustParse("265169453Ki")
// Different memory capacity exceeds tolerance (50%)
// Value of q1 * 2.05
q3 := resource.MustParse("532938606Ki")
n3 := BuildTestNode("node3", 1000, q3.Value())
checkNodesSimilar(t, n1, n3, comparator, false)
}
Expand Down

0 comments on commit c6f0379

Please sign in to comment.