Skip to content

Commit

Permalink
slo-controller: fix mid resource calculate formula (#2291)
Browse files Browse the repository at this point in the history
Signed-off-by: lijunxin <[email protected]>
  • Loading branch information
lijunxin559 authored Dec 19, 2024
1 parent 0615332 commit 9bb39a2
Show file tree
Hide file tree
Showing 4 changed files with 196 additions and 32 deletions.
27 changes: 22 additions & 5 deletions pkg/slo-controller/noderesource/plugins/midresource/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,14 +159,14 @@ func (p *Plugin) getUnallocated(nodeName string, podList *corev1.PodList, nodeCa

func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *corev1.Node, podList *corev1.PodList,
resourceMetrics *framework.ResourceMetrics) []framework.ResourceItem {
// Allocatable[Mid]' := min(Reclaimable[Mid], NodeAllocatable * thresholdRatio) + Unallocated[Mid] * midUnallocatedRatio
// Allocatable[Mid]' := min(Reclaimable[Mid], NodeAllocatable * thresholdRatio, NodeUnused) + Unallocated[Mid] * midUnallocatedRatio
// Unallocated[Mid] = max(NodeCapacity - NodeReserved - Allocated[Prod], 0)

var allocatableMilliCPU, allocatableMemory int64
prodReclaimableCPU, prodReclaimableMemory := resource.NewQuantity(0, resource.DecimalSI), resource.NewQuantity(0, resource.BinarySI)
prodReclaimableMetic := resourceMetrics.NodeMetric.Status.ProdReclaimableMetric
prodReclaimableMetric := resourceMetrics.NodeMetric.Status.ProdReclaimableMetric

if prodReclaimableMetic == nil || prodReclaimableMetic.Resource.ResourceList == nil {
if prodReclaimableMetric == nil || prodReclaimableMetric.Resource.ResourceList == nil {
klog.V(4).Infof("no valid prod reclaimable, so use default zero value")
allocatableMilliCPU = 0
allocatableMemory = 0
Expand All @@ -185,7 +185,7 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor
nodeCapacity := resutil.GetNodeCapacity(node)

systemUsed := resutil.GetResourceListForCPUAndMemory(nodeMetric.Status.NodeMetric.SystemUsage.ResourceList)
// resource usage of host applications with prod priority will be count as host system usage since they consumes the
// resource usage of host applications with prod priority will be count as host system usage since they consume the
// node reserved resource.
systemUsed = quotav1.Add(systemUsed, hostAppHPUsed)

Expand All @@ -198,8 +198,14 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor

unallocated := p.getUnallocated(node.Name, podList, nodeCapacity, nodeReserved)

nodeUnused, err := getNodeUnused(node, nodeMetric)
if err != nil {
// failed to get a valid node usage, so conservatively assume that no resource is left;
// this keeps the mid-resource calculation strict
nodeUnused = corev1.ResourceList{}
}
cpuInMilliCores, memory, cpuMsg, memMsg := resutil.CalculateMidResourceByPolicy(strategy, nodeCapacity,
unallocated, allocatableMilliCPU, allocatableMemory, prodReclaimableCPU, prodReclaimableMemory, node.Name)
unallocated, nodeUnused, allocatableMilliCPU, allocatableMemory, prodReclaimableCPU, prodReclaimableMemory, node.Name)

metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.MidCPU), metrics.UnitInteger, float64(cpuInMilliCores.MilliValue())/1000)
metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.MidMemory), metrics.UnitByte, float64(memory.Value()))
Expand All @@ -219,3 +225,14 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor
},
}
}

// getNodeUnused returns the node resources that are currently not in use,
// computed as nodeCapacity - nodeUsed.
//
// The node usage is validated with resutil.IsValidNodeUsage before nodeMetrics
// is dereferenced, so a nil metric or a nil Status.NodeMetric results in an
// error rather than a nil-pointer panic. On error the returned list is nil.
func getNodeUnused(node *corev1.Node, nodeMetrics *slov1alpha1.NodeMetric) (corev1.ResourceList, error) {
	// Validate first: reading Status.NodeMetric.NodeUsage is only safe once
	// the metric is known to be complete.
	if isValid, msg := resutil.IsValidNodeUsage(nodeMetrics); !isValid {
		return nil, fmt.Errorf("invalid node usage: %v", msg)
	}

	nodeCapacity := resutil.GetNodeCapacity(node)
	nodeUsed := nodeMetrics.Status.NodeMetric.NodeUsage.ResourceList
	return quotav1.Subtract(nodeCapacity, nodeUsed), nil
}
150 changes: 138 additions & 12 deletions pkg/slo-controller/noderesource/plugins/midresource/plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -422,12 +422,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:10000) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:10000, NodeUnused:80000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(10000, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:15 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:15) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:15 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:15, NodeUnused:165) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(15, 9),
},
},
Expand Down Expand Up @@ -502,11 +502,11 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:18000 = min(nodeCapacity:100000 * thresholdRatio:0.1, ProdReclaimable:15000) + Unallocated:80000 * midUnallocatedRatio:0.1",
Message: "midAllocatable[CPU(milli-core)]:18000 = min(nodeCapacity:100000 * thresholdRatio:0.1, ProdReclaimable:15000, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0.1",
Quantity: resource.NewQuantity(18000, resource.DecimalSI)},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:46 = min(nodeCapacity:210 * thresholdRatio:0.2, ProdReclaimable:30) + Unallocated:160 * midUnallocatedRatio:0.1",
Message: "midAllocatable[Memory(GB)]:46 = min(nodeCapacity:210 * thresholdRatio:0.2, ProdReclaimable:30, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0.1",
Quantity: resource.NewScaledQuantity(46, 9),
},
},
Expand Down Expand Up @@ -571,17 +571,143 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
wantErr: false,
},
{
name: "calculate correctly where node metrics is invalid",
args: args{
strategy: &configuration.ColocationStrategy{
Enable: pointer.Bool(true),
DegradeTimeMinutes: pointer.Int64(10),
},
node: testNode,
podList: &corev1.PodList{
Items: []corev1.Pod{
*testProdLSPod,
*testBatchBEPod,
},
},
metrics: &framework.ResourceMetrics{
NodeMetric: &slov1alpha1.NodeMetric{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: slov1alpha1.NodeMetricStatus{
UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)},
NodeMetric: &slov1alpha1.NodeMetricInfo{},
PodsMetric: []*slov1alpha1.PodMetricInfo{},
ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{
Resource: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("20"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
},
},
},
},
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:20000, NodeUnused:0) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:20, NodeUnused:0) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
wantErr: false,
},
{
name: "calculate correctly where the prod reclaimable exceeds the node free resource",
args: args{
strategy: &configuration.ColocationStrategy{
Enable: pointer.Bool(true),
DegradeTimeMinutes: pointer.Int64(10),
},
node: testNode,
podList: &corev1.PodList{
Items: []corev1.Pod{
*testProdLSPod,
*testBatchBEPod,
},
},
metrics: &framework.ResourceMetrics{
NodeMetric: &slov1alpha1.NodeMetric{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: slov1alpha1.NodeMetricStatus{
UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)},
NodeMetric: &slov1alpha1.NodeMetricInfo{
NodeUsage: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("90"),
corev1.ResourceMemory: resource.MustParse("200G"),
},
},
},
PodsMetric: []*slov1alpha1.PodMetricInfo{
{
Name: testProdLSPod.Name,
Namespace: testProdLSPod.Namespace,
PodUsage: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("10"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
{
Name: testBatchBEPod.Name,
Namespace: testBatchBEPod.Namespace,
PodUsage: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("15"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
},
ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{
Resource: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("20"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
},
},
},
},
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:20000, NodeUnused:10000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(10000, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:10 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:20, NodeUnused:10) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(10, 9),
},
},
wantErr: false,
},
{
name: "including product host application usage",
args: args{
Expand Down Expand Up @@ -653,12 +779,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:75000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:75000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:155 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:155 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
Expand Down Expand Up @@ -735,12 +861,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
Expand Down Expand Up @@ -817,12 +943,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
Expand Down
31 changes: 26 additions & 5 deletions pkg/slo-controller/noderesource/plugins/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,16 @@ func CalculateBatchResourceByPolicy(strategy *configuration.ColocationStrategy,
return batchAllocatable, cpuMsg, memMsg
}

func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, nodeCapacity, unallocated corev1.ResourceList, allocatableMilliCPU, allocatableMemory int64,
func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, nodeCapacity, unallocated, nodeUnused corev1.ResourceList, allocatableMilliCPU, allocatableMemory int64,
prodReclaimableCPU, prodReclaimableMemory *resource.Quantity, nodeName string) (*resource.Quantity, *resource.Quantity, string, string) {
defaultStrategy := sloconfig.DefaultColocationStrategy()
cpuThresholdRatio := getPercentFromStrategy(strategy, &defaultStrategy, MidCPUThreshold)
if maxMilliCPU := float64(nodeCapacity.Cpu().MilliValue()) * cpuThresholdRatio; allocatableMilliCPU > int64(maxMilliCPU) {
allocatableMilliCPU = int64(maxMilliCPU)
}
if allocatableMilliCPU > nodeUnused.Cpu().MilliValue() {
allocatableMilliCPU = nodeUnused.Cpu().MilliValue()
}
if allocatableMilliCPU < 0 {
klog.V(5).Infof("mid allocatable milli cpu of node %s is %v less than zero, set to zero",
nodeName, allocatableMilliCPU)
Expand All @@ -120,6 +123,9 @@ func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, no
if maxMemory := float64(nodeCapacity.Memory().Value()) * memThresholdRatio; allocatableMemory > int64(maxMemory) {
allocatableMemory = int64(maxMemory)
}
if allocatableMemory > nodeUnused.Memory().Value() {
allocatableMemory = nodeUnused.Memory().Value()
}
if allocatableMemory < 0 {
klog.V(5).Infof("mid allocatable memory of node %s is %v less than zero, set to zero",
nodeName, allocatableMemory)
Expand All @@ -136,14 +142,14 @@ func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, no
cpuInMilliCores.Add(*adjustedUnallocatedMilliCPU)
memory.Add(*adjustedUnallocatedMemory)

cpuMsg := fmt.Sprintf("midAllocatable[CPU(milli-core)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v",
cpuMsg := fmt.Sprintf("midAllocatable[CPU(milli-core)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v, NodeUnused:%v) + Unallocated:%v * midUnallocatedRatio:%v",
cpuInMilliCores.Value(), nodeCapacity.Cpu().MilliValue(),
cpuThresholdRatio, prodReclaimableCPU.MilliValue(),
cpuThresholdRatio, prodReclaimableCPU.MilliValue(), nodeUnused.Cpu().MilliValue(),
unallocatedMilliCPU.Value(), midUnallocatedRatio)

memMsg := fmt.Sprintf("midAllocatable[Memory(GB)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v",
memMsg := fmt.Sprintf("midAllocatable[Memory(GB)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v, NodeUnused:%v) + Unallocated:%v * midUnallocatedRatio:%v",
memory.ScaledValue(resource.Giga), nodeCapacity.Memory().ScaledValue(resource.Giga),
memThresholdRatio, prodReclaimableMemory.ScaledValue(resource.Giga),
memThresholdRatio, prodReclaimableMemory.ScaledValue(resource.Giga), nodeUnused.Memory().ScaledValue(resource.Giga),
unallocatedMemory.ScaledValue(resource.Giga), midUnallocatedRatio)

return cpuInMilliCores, memory, cpuMsg, memMsg
Expand Down Expand Up @@ -453,3 +459,18 @@ func getPercentFromStrategy(strategy, defaultStrategy *configuration.ColocationS
return 0
}
}

// IsValidNodeUsage reports whether nodeMetric carries a usable node usage.
// It returns false together with a short reason when the metric, its
// Status.NodeMetric, or the NodeUsage resource list is nil, or when the
// resource list has no entry for CPU or for memory. On success it returns
// true and an empty reason.
func IsValidNodeUsage(nodeMetric *slov1alpha1.NodeMetric) (bool, string) {
	if nodeMetric == nil {
		return false, "node metric is incomplete"
	}
	info := nodeMetric.Status.NodeMetric
	if info == nil || info.NodeUsage.ResourceList == nil {
		return false, "node metric is incomplete"
	}

	usage := info.NodeUsage.ResourceList
	// CPU is checked before memory so the reported reason matches the
	// first missing resource.
	if _, found := usage[corev1.ResourceCPU]; !found {
		return false, "cpu usage is missing"
	}
	if _, found := usage[corev1.ResourceMemory]; !found {
		return false, "memory usage is missing"
	}
	return true, ""
}
Loading

0 comments on commit 9bb39a2

Please sign in to comment.