Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

slo-controller: fix mid resource calculate formula #2291

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions pkg/slo-controller/noderesource/plugins/midresource/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,14 +159,14 @@ func (p *Plugin) getUnallocated(nodeName string, podList *corev1.PodList, nodeCa

func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *corev1.Node, podList *corev1.PodList,
resourceMetrics *framework.ResourceMetrics) []framework.ResourceItem {
// Allocatable[Mid]' := min(Reclaimable[Mid], NodeAllocatable * thresholdRatio) + Unallocated[Mid] * midUnallocatedRatio
// Allocatable[Mid]' := min(Reclaimable[Mid], NodeAllocatable * thresholdRatio, NodeUnused) + Unallocated[Mid] * midUnallocatedRatio
// Unallocated[Mid] = max(NodeCapacity - NodeReserved - Allocated[Prod], 0)

var allocatableMilliCPU, allocatableMemory int64
prodReclaimableCPU, prodReclaimableMemory := resource.NewQuantity(0, resource.DecimalSI), resource.NewQuantity(0, resource.BinarySI)
prodReclaimableMetic := resourceMetrics.NodeMetric.Status.ProdReclaimableMetric
prodReclaimableMetric := resourceMetrics.NodeMetric.Status.ProdReclaimableMetric

if prodReclaimableMetic == nil || prodReclaimableMetic.Resource.ResourceList == nil {
if prodReclaimableMetric == nil || prodReclaimableMetric.Resource.ResourceList == nil {
klog.V(4).Infof("no valid prod reclaimable, so use default zero value")
allocatableMilliCPU = 0
allocatableMemory = 0
Expand All @@ -185,7 +185,7 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor
nodeCapacity := resutil.GetNodeCapacity(node)

systemUsed := resutil.GetResourceListForCPUAndMemory(nodeMetric.Status.NodeMetric.SystemUsage.ResourceList)
// resource usage of host applications with prod priority will be count as host system usage since they consumes the
// resource usage of host applications with prod priority will be count as host system usage since they consume the
// node reserved resource.
systemUsed = quotav1.Add(systemUsed, hostAppHPUsed)

Expand All @@ -198,8 +198,14 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor

unallocated := p.getUnallocated(node.Name, podList, nodeCapacity, nodeReserved)

nodeUnused, err := getNodeUnused(node, nodeMetric)
if err != nil {
// failed to get a valid node usage, so conservatively assume that there is no unused resource left
// to keep the mid-resource calculation relatively strict
nodeUnused = corev1.ResourceList{}
}
cpuInMilliCores, memory, cpuMsg, memMsg := resutil.CalculateMidResourceByPolicy(strategy, nodeCapacity,
unallocated, allocatableMilliCPU, allocatableMemory, prodReclaimableCPU, prodReclaimableMemory, node.Name)
unallocated, nodeUnused, allocatableMilliCPU, allocatableMemory, prodReclaimableCPU, prodReclaimableMemory, node.Name)

metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.MidCPU), metrics.UnitInteger, float64(cpuInMilliCores.MilliValue())/1000)
metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.MidMemory), metrics.UnitByte, float64(memory.Value()))
Expand All @@ -219,3 +225,14 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor
},
}
}

func getNodeUnused(node *corev1.Node, nodeMetrics *slov1alpha1.NodeMetric) (corev1.ResourceList, error) {
// nodeCapacity - nodeUsed
nodeCapacity := resutil.GetNodeCapacity(node)
nodeUsed := nodeMetrics.Status.NodeMetric.NodeUsage.ResourceList
lijunxin559 marked this conversation as resolved.
Show resolved Hide resolved
if isValid, mes := resutil.IsValidNodeUsage(nodeMetrics); isValid {
return quotav1.Subtract(nodeCapacity, nodeUsed), nil
} else {
return nil, fmt.Errorf("invalid node usage: %v", mes)
}
}
150 changes: 138 additions & 12 deletions pkg/slo-controller/noderesource/plugins/midresource/plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -422,12 +422,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:10000) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:10000, NodeUnused:80000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(10000, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:15 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:15) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:15 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:15, NodeUnused:165) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(15, 9),
},
},
Expand Down Expand Up @@ -502,11 +502,11 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:18000 = min(nodeCapacity:100000 * thresholdRatio:0.1, ProdReclaimable:15000) + Unallocated:80000 * midUnallocatedRatio:0.1",
Message: "midAllocatable[CPU(milli-core)]:18000 = min(nodeCapacity:100000 * thresholdRatio:0.1, ProdReclaimable:15000, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0.1",
Quantity: resource.NewQuantity(18000, resource.DecimalSI)},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:46 = min(nodeCapacity:210 * thresholdRatio:0.2, ProdReclaimable:30) + Unallocated:160 * midUnallocatedRatio:0.1",
Message: "midAllocatable[Memory(GB)]:46 = min(nodeCapacity:210 * thresholdRatio:0.2, ProdReclaimable:30, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0.1",
Quantity: resource.NewScaledQuantity(46, 9),
},
},
Expand Down Expand Up @@ -571,17 +571,143 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
wantErr: false,
},
{
name: "calculate correctly where node metrics is invalid",
args: args{
strategy: &configuration.ColocationStrategy{
Enable: pointer.Bool(true),
DegradeTimeMinutes: pointer.Int64(10),
},
node: testNode,
podList: &corev1.PodList{
Items: []corev1.Pod{
*testProdLSPod,
*testBatchBEPod,
},
},
metrics: &framework.ResourceMetrics{
NodeMetric: &slov1alpha1.NodeMetric{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: slov1alpha1.NodeMetricStatus{
UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)},
NodeMetric: &slov1alpha1.NodeMetricInfo{},
PodsMetric: []*slov1alpha1.PodMetricInfo{},
ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{
Resource: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("20"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
},
},
},
},
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:20000, NodeUnused:0) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:20, NodeUnused:0) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
wantErr: false,
},
{
name: "calculate correctly where the prod reclaimable exceeds the node free resource",
args: args{
strategy: &configuration.ColocationStrategy{
Enable: pointer.Bool(true),
DegradeTimeMinutes: pointer.Int64(10),
},
node: testNode,
podList: &corev1.PodList{
Items: []corev1.Pod{
*testProdLSPod,
*testBatchBEPod,
},
},
metrics: &framework.ResourceMetrics{
NodeMetric: &slov1alpha1.NodeMetric{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: slov1alpha1.NodeMetricStatus{
UpdateTime: &metav1.Time{Time: time.Now().Add(-20 * time.Second)},
NodeMetric: &slov1alpha1.NodeMetricInfo{
NodeUsage: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("90"),
corev1.ResourceMemory: resource.MustParse("200G"),
},
},
},
PodsMetric: []*slov1alpha1.PodMetricInfo{
{
Name: testProdLSPod.Name,
Namespace: testProdLSPod.Namespace,
PodUsage: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("10"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
{
Name: testBatchBEPod.Name,
Namespace: testBatchBEPod.Namespace,
PodUsage: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("15"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
},
ProdReclaimableMetric: &slov1alpha1.ReclaimableMetric{
Resource: slov1alpha1.ResourceMap{
ResourceList: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("20"),
corev1.ResourceMemory: resource.MustParse("20G"),
},
},
},
},
},
},
},
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:10000 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:20000, NodeUnused:10000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(10000, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:10 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:20, NodeUnused:10) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(10, 9),
},
},
wantErr: false,
},
{
name: "including product host application usage",
args: args{
Expand Down Expand Up @@ -653,12 +779,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:75000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:75000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:155 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:155 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
Expand Down Expand Up @@ -735,12 +861,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
Expand Down Expand Up @@ -817,12 +943,12 @@ func TestPluginCalculate(t *testing.T) {
want: []framework.ResourceItem{
{
Name: extension.MidCPU,
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:80000 * midUnallocatedRatio:0",
Message: "midAllocatable[CPU(milli-core)]:0 = min(nodeCapacity:100000 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:70000) + Unallocated:80000 * midUnallocatedRatio:0",
Quantity: resource.NewQuantity(0, resource.DecimalSI),
},
{
Name: extension.MidMemory,
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0) + Unallocated:160 * midUnallocatedRatio:0",
Message: "midAllocatable[Memory(GB)]:0 = min(nodeCapacity:210 * thresholdRatio:1, ProdReclaimable:0, NodeUnused:160) + Unallocated:160 * midUnallocatedRatio:0",
Quantity: resource.NewScaledQuantity(0, 0),
},
},
Expand Down
31 changes: 26 additions & 5 deletions pkg/slo-controller/noderesource/plugins/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,16 @@ func CalculateBatchResourceByPolicy(strategy *configuration.ColocationStrategy,
return batchAllocatable, cpuMsg, memMsg
}

func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, nodeCapacity, unallocated corev1.ResourceList, allocatableMilliCPU, allocatableMemory int64,
func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, nodeCapacity, unallocated, nodeUnused corev1.ResourceList, allocatableMilliCPU, allocatableMemory int64,
prodReclaimableCPU, prodReclaimableMemory *resource.Quantity, nodeName string) (*resource.Quantity, *resource.Quantity, string, string) {
defaultStrategy := sloconfig.DefaultColocationStrategy()
cpuThresholdRatio := getPercentFromStrategy(strategy, &defaultStrategy, MidCPUThreshold)
if maxMilliCPU := float64(nodeCapacity.Cpu().MilliValue()) * cpuThresholdRatio; allocatableMilliCPU > int64(maxMilliCPU) {
allocatableMilliCPU = int64(maxMilliCPU)
}
if allocatableMilliCPU > nodeUnused.Cpu().MilliValue() {
allocatableMilliCPU = nodeUnused.Cpu().MilliValue()
}
if allocatableMilliCPU < 0 {
klog.V(5).Infof("mid allocatable milli cpu of node %s is %v less than zero, set to zero",
nodeName, allocatableMilliCPU)
Expand All @@ -120,6 +123,9 @@ func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, no
if maxMemory := float64(nodeCapacity.Memory().Value()) * memThresholdRatio; allocatableMemory > int64(maxMemory) {
allocatableMemory = int64(maxMemory)
}
if allocatableMemory > nodeUnused.Memory().Value() {
allocatableMemory = nodeUnused.Memory().Value()
}
if allocatableMemory < 0 {
klog.V(5).Infof("mid allocatable memory of node %s is %v less than zero, set to zero",
nodeName, allocatableMemory)
Expand All @@ -136,14 +142,14 @@ func CalculateMidResourceByPolicy(strategy *configuration.ColocationStrategy, no
cpuInMilliCores.Add(*adjustedUnallocatedMilliCPU)
memory.Add(*adjustedUnallocatedMemory)

cpuMsg := fmt.Sprintf("midAllocatable[CPU(milli-core)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v",
cpuMsg := fmt.Sprintf("midAllocatable[CPU(milli-core)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v, NodeUnused:%v) + Unallocated:%v * midUnallocatedRatio:%v",
cpuInMilliCores.Value(), nodeCapacity.Cpu().MilliValue(),
cpuThresholdRatio, prodReclaimableCPU.MilliValue(),
cpuThresholdRatio, prodReclaimableCPU.MilliValue(), nodeUnused.Cpu().MilliValue(),
unallocatedMilliCPU.Value(), midUnallocatedRatio)

memMsg := fmt.Sprintf("midAllocatable[Memory(GB)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v) + Unallocated:%v * midUnallocatedRatio:%v",
memMsg := fmt.Sprintf("midAllocatable[Memory(GB)]:%v = min(nodeCapacity:%v * thresholdRatio:%v, ProdReclaimable:%v, NodeUnused:%v) + Unallocated:%v * midUnallocatedRatio:%v",
memory.ScaledValue(resource.Giga), nodeCapacity.Memory().ScaledValue(resource.Giga),
memThresholdRatio, prodReclaimableMemory.ScaledValue(resource.Giga),
memThresholdRatio, prodReclaimableMemory.ScaledValue(resource.Giga), nodeUnused.Memory().ScaledValue(resource.Giga),
unallocatedMemory.ScaledValue(resource.Giga), midUnallocatedRatio)

return cpuInMilliCores, memory, cpuMsg, memMsg
Expand Down Expand Up @@ -453,3 +459,18 @@ func getPercentFromStrategy(strategy, defaultStrategy *configuration.ColocationS
return 0
}
}

// IsValidNodeUsage reports whether nodeMetric carries a node usage that
// contains both a cpu and a memory entry. When it does not, the second return
// value names what is missing; it is empty when the usage is valid.
func IsValidNodeUsage(nodeMetric *slov1alpha1.NodeMetric) (bool, string) {
	if nodeMetric == nil || nodeMetric.Status.NodeMetric == nil {
		return false, "node metric is incomplete"
	}
	usage := nodeMetric.Status.NodeMetric.NodeUsage.ResourceList
	if usage == nil {
		return false, "node metric is incomplete"
	}

	// Required entries, checked in order: cpu first, then memory.
	required := []struct {
		resourceName corev1.ResourceName
		reason       string
	}{
		{corev1.ResourceCPU, "cpu usage is missing"},
		{corev1.ResourceMemory, "memory usage is missing"},
	}
	for _, r := range required {
		if _, found := usage[r.resourceName]; !found {
			return false, r.reason
		}
	}
	return true, ""
}
Loading
Loading