Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support extended resources and ephemeral-storage for scale-from-zero specified in MachineClass NodeTemplate #334

Open
wants to merge 3 commits into
base: machine-controller-manager-provider
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 48 additions & 21 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ import (
"fmt"
v1appslister "k8s.io/client-go/listers/apps/v1"
"k8s.io/utils/pointer"
"maps"
"math/rand"
"net/http"
"os"
"slices"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -111,6 +113,12 @@ var (
machineGVR = schema.GroupVersionResource{Group: machineGroup, Version: machineVersion, Resource: "machines"}
machineSetGVR = schema.GroupVersionResource{Group: machineGroup, Version: machineVersion, Resource: "machinesets"}
machineDeploymentGVR = schema.GroupVersionResource{Group: machineGroup, Version: machineVersion, Resource: "machinedeployments"}

// ErrInvalidNodeTemplate is a sentinel error that indicates that the nodeTemplate is invalid.
ErrInvalidNodeTemplate = errors.New("invalid node template")
coreResourceNames = []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory, "gpu"}
extraResourceNames = []v1.ResourceName{gpu.ResourceNvidiaGPU, v1.ResourcePods, v1.ResourceEphemeralStorage}
knownResourceNames = slices.Concat(coreResourceNames, extraResourceNames)
)

// McmManager manages the client communication for MachineDeployments.
Expand All @@ -130,12 +138,13 @@ type McmManager struct {
}

// instanceType captures the capacity characteristics of a machine-class
// instance type, used to construct a template node for scale-from-zero.
type instanceType struct {
	InstanceType     string
	VCPU             resource.Quantity
	Memory           resource.Quantity
	GPU              resource.Quantity
	EphemeralStorage resource.Quantity
	PodCount         resource.Quantity
	// ExtendedResources holds custom (non-core) resources — e.g. vendor
	// device plugins — beyond the well-known resource names.
	ExtendedResources apiv1.ResourceList
}

type nodeTemplate struct {
Expand Down Expand Up @@ -689,25 +698,23 @@ func generateInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceSt
func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error {
var allErrs []error

capacityAttributes := []v1.ResourceName{"cpu", "gpu", "memory"}

for _, attribute := range capacityAttributes {
for _, attribute := range coreResourceNames {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI - With this GPU is not considered a core resource.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this was stupid oversight. I have gpuResourceNames but it makes more correct to have gpu inside coreReourceNames. To be honest in the real world Node we seem to be using only nvidia GPU's labels.

if _, ok := nodeTemplateAttributes.Capacity[attribute]; !ok {
errMessage := fmt.Errorf("CPU, GPU and memory fields are mandatory")
errMessage := fmt.Errorf("the core resource fields %q are mandatory: %w", coreResourceNames, ErrInvalidNodeTemplate)
klog.Warning(errMessage)
allErrs = append(allErrs, errMessage)
break
}
}

if nodeTemplateAttributes.Region == "" || nodeTemplateAttributes.InstanceType == "" || nodeTemplateAttributes.Zone == "" {
errMessage := fmt.Errorf("InstanceType, Region and Zone attributes are mandatory")
errMessage := fmt.Errorf("InstanceType, Region and Zone attributes are mandatory: %w", ErrInvalidNodeTemplate)
klog.Warning(errMessage)
allErrs = append(allErrs, errMessage)
}

if allErrs != nil {
return fmt.Errorf("%s", allErrs)
return errors.Join(allErrs...)
}

return nil
Expand Down Expand Up @@ -771,21 +778,26 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
klog.V(1).Infof("Nodes already existing in the worker pool %s", workerPool)
baseNode := filteredNodes[0]
klog.V(1).Infof("Worker pool node used to form template is %s and its capacity is cpu: %s, memory:%s", baseNode.Name, baseNode.Status.Capacity.Cpu().String(), baseNode.Status.Capacity.Memory().String())
extendedResources := filterExtendedResources(baseNode.Status.Capacity)
instance = instanceType{
VCPU: baseNode.Status.Capacity[apiv1.ResourceCPU],
Memory: baseNode.Status.Capacity[apiv1.ResourceMemory],
GPU: baseNode.Status.Capacity[gpu.ResourceNvidiaGPU],
EphemeralStorage: baseNode.Status.Capacity[apiv1.ResourceEphemeralStorage],
PodCount: baseNode.Status.Capacity[apiv1.ResourcePods],
VCPU: baseNode.Status.Capacity[apiv1.ResourceCPU],
Memory: baseNode.Status.Capacity[apiv1.ResourceMemory],
GPU: baseNode.Status.Capacity[gpu.ResourceNvidiaGPU],
EphemeralStorage: baseNode.Status.Capacity[apiv1.ResourceEphemeralStorage],
PodCount: baseNode.Status.Capacity[apiv1.ResourcePods],
ExtendedResources: extendedResources,
}
} else {
klog.V(1).Infof("Generating node template only using nodeTemplate from MachineClass %s: template resources-> cpu: %s,memory: %s", machineClass.Name, nodeTemplateAttributes.Capacity.Cpu().String(), nodeTemplateAttributes.Capacity.Memory().String())
extendedResources := filterExtendedResources(nodeTemplateAttributes.Capacity)
instance = instanceType{
VCPU: nodeTemplateAttributes.Capacity[apiv1.ResourceCPU],
Memory: nodeTemplateAttributes.Capacity[apiv1.ResourceMemory],
GPU: nodeTemplateAttributes.Capacity["gpu"],
VCPU: nodeTemplateAttributes.Capacity[apiv1.ResourceCPU],
Memory: nodeTemplateAttributes.Capacity[apiv1.ResourceMemory],
GPU: nodeTemplateAttributes.Capacity["gpu"],
EphemeralStorage: nodeTemplateAttributes.Capacity[apiv1.ResourceEphemeralStorage],
// The number of pods per node depends on the CNI used and the maxPods kubelet config; the default is often 110
PodCount: resource.MustParse("110"),
PodCount: resource.MustParse("110"),
ExtendedResources: extendedResources,
}
}
instance.InstanceType = nodeTemplateAttributes.InstanceType
Expand Down Expand Up @@ -964,6 +976,12 @@ func (m *McmManager) buildNodeFromTemplate(name string, template *nodeTemplate)
node.Status.Capacity["hugepages-1Gi"] = *resource.NewQuantity(0, resource.DecimalSI)
node.Status.Capacity["hugepages-2Mi"] = *resource.NewQuantity(0, resource.DecimalSI)

// populate extended resources from nodeTemplate
if len(template.InstanceType.ExtendedResources) > 0 {
klog.V(2).Infof("Copying extended resources %v to template node.Status.Capacity", template.InstanceType.ExtendedResources)
maps.Copy(node.Status.Capacity, template.InstanceType.ExtendedResources)
}

node.Status.Allocatable = node.Status.Capacity

// NodeLabels
Expand Down Expand Up @@ -1011,3 +1029,12 @@ func isMachineFailedOrTerminating(machine *v1alpha1.Machine) bool {
}
return false
}

// filterExtendedResources returns a deep copy of allResources from which all
// well-known resource names (knownResourceNames) have been removed, leaving
// only the extended (custom) resources.
func filterExtendedResources(allResources v1.ResourceList) v1.ResourceList {
	extended := allResources.DeepCopy()
	for _, known := range knownResourceNames {
		delete(extended, known)
	}
	return extended
}
110 changes: 110 additions & 0 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ limitations under the License.
package mcm

import (
"errors"
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/utils/ptr"
"maps"
"math/rand/v2"
"testing"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -134,3 +140,107 @@ func TestFilterNodes(t *testing.T) {
assert.EqualValues(t, len(filteredNodes), 1)
assert.Equal(t, filteredNodes, []*apiv1.Node{node2})
}

// TestValidateNodeTemplate verifies that validateNodeTemplate rejects a
// NodeTemplate with missing mandatory capacity/region/zone attributes and
// that the returned error wraps ErrInvalidNodeTemplate.
func TestValidateNodeTemplate(t *testing.T) {
	sampleType := createSampleInstanceType("m5.large", "sap.com/mana", resource.MustParse("300"))
	template := v1alpha1.NodeTemplate{
		InstanceType: sampleType.InstanceType,
		Capacity:     make(apiv1.ResourceList),
	}

	// No capacity, region or zone: validation must fail with the sentinel error.
	err := validateNodeTemplate(&template)
	assert.NotNil(t, err)
	assert.True(t, errors.Is(err, ErrInvalidNodeTemplate))

	// Region and zone alone are not enough — core capacity is still missing.
	template.Region = "europe-west1"
	template.Zone = template.Region + "-b"
	err = validateNodeTemplate(&template)
	assert.NotNil(t, err)
	assert.True(t, errors.Is(err, ErrInvalidNodeTemplate))

	if err != nil {
		t.Logf("error %s", err)
	}
}

// TestBuildNodeFromTemplate checks that buildNodeFromTemplate produces a node
// carrying the template labels, the mandatory cpu/memory capacity, and — when
// the instance type declares GPUs — the nvidia GPU capacity with the expected
// quantity.
func TestBuildNodeFromTemplate(t *testing.T) {
	m := &McmManager{}
	namePrefix := "bingo"
	m5LargeType := createSampleInstanceType("m5.large", "sap.com/mana", resource.MustParse("300"))
	labels := map[string]string{
		"weapon": "light-saber",
	}
	nt := nodeTemplate{
		InstanceType: m5LargeType,
		Architecture: ptr.To("amd64"),
		Labels:       labels,
	}
	nt.Region = "europe-west1"
	nt.Zone = nt.Region + "-b"
	node, err := m.buildNodeFromTemplate(namePrefix, &nt)
	assert.Nil(t, err)
	if err != nil {
		t.Logf("error %s", err)
	}
	assert.True(t, isSubset(labels, node.Labels), "labels should be a subset of node.Labels")
	for _, k := range []apiv1.ResourceName{apiv1.ResourceMemory, apiv1.ResourceCPU} {
		assert.Contains(t, node.Status.Capacity, k, "node.Status.Capacity should contain the mandatory resource named: %s", k)
	}

	// test with gpu resource
	gpuQuantity := resource.MustParse("4")
	nt.InstanceType.GPU = gpuQuantity
	node, err = m.buildNodeFromTemplate(namePrefix, &nt)
	assert.Nil(t, err)
	if err != nil {
		t.Logf("error %s", err)
	}
	for _, k := range []apiv1.ResourceName{apiv1.ResourceMemory, apiv1.ResourceCPU, gpu.ResourceNvidiaGPU} {
		assert.Contains(t, node.Status.Capacity, k, "node.Status.Capacity should contain the mandatory resource named: %s", k)
	}
	actualGpuQuantity, hasGpuResource := node.Status.Capacity[gpu.ResourceNvidiaGPU]
	assert.True(t, hasGpuResource, "node.Status.Capacity should have a gpu resource named %q", gpu.ResourceNvidiaGPU)
	if hasGpuResource {
		// Fixed: the failure message previously referenced gpu.ResourceDirectX,
		// which does not match the nvidia GPU resource being asserted here.
		assert.Equal(t, gpuQuantity, actualGpuQuantity, "node.Status.Capacity should have gpu resource named %q with value %s instead of %s", gpu.ResourceNvidiaGPU, gpuQuantity, actualGpuQuantity)
	}
}

// TestFilterExtendedResources ensures that filterExtendedResources strips all
// well-known resource names and retains only the custom (extended) resources.
func TestFilterExtendedResources(t *testing.T) {
	known := make(apiv1.ResourceList)
	for _, name := range knownResourceNames {
		known[name] = *resource.NewQuantity(rand.Int64(), resource.DecimalSI)
	}

	custom := apiv1.ResourceList{
		"resource.com/dongle": resource.MustParse("50"),
		"quantum.com/memory":  resource.MustParse("100Gi"),
	}

	combined := known.DeepCopy()
	maps.Copy(combined, custom)

	got := filterExtendedResources(combined)
	t.Logf("TestFilterExtendedResources obtained: %+v", got)
	assert.Equal(t, custom, got)
}

// createSampleInstanceType builds an instanceType based on the named AWS
// instance type, attaching a single extended resource with the given name
// and quantity.
func createSampleInstanceType(instanceTypeName string, customResourceName apiv1.ResourceName, customResourceQuantity resource.Quantity) *instanceType {
	base := AWSInstanceTypes[instanceTypeName]
	return &instanceType{
		InstanceType: base.InstanceType,
		VCPU:         base.VCPU,
		Memory:       base.Memory,
		GPU:          base.GPU,
		ExtendedResources: apiv1.ResourceList{
			customResourceName: customResourceQuantity,
		},
	}
}

// isSubset reports whether every key/value pair in map1 is also present,
// with an equal value, in map2.
func isSubset[K comparable, V comparable](map1, map2 map[K]V) bool {
	for key, want := range map1 {
		got, ok := map2[key]
		if !ok || got != want {
			return false
		}
	}
	return true
}
Loading