Skip to content

Commit

Permalink
[YUNIKORN-2030] headroom check for reserved allocations (#793)
Browse files Browse the repository at this point in the history
During the allocation cycle for reservations we fall back to trying all
reservations on all nodes for an application. During this fallback a
headroom check was missing which caused queue update failures to be
logged.
The allocation did not succeed but it caused scheduling delays and log
spew.

Closes: #793

Signed-off-by: Wilfred Spiegelenburg <[email protected]>
(cherry picked from commit 908d1cb)
  • Loading branch information
Yongjun Zhang authored and wilfred-s committed Feb 26, 2024
1 parent d24a25a commit 381bc9a
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 9 deletions.
23 changes: 14 additions & 9 deletions pkg/scheduler/objects/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -1223,6 +1223,12 @@ func (sa *Application) tryPlaceholderAllocate(nodeIterator func() NodeIterator,
return allocResult
}

// check ask against both user headRoom and queue headRoom
func (sa *Application) checkHeadRooms(ask *AllocationAsk, userHeadroom *resources.Resource, headRoom *resources.Resource) bool {
// check if this fits in the users' headroom first, if that fits check the queues' headroom
return userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) && headRoom.FitInMaxUndef(ask.GetAllocatedResource())
}

// Try a reserved allocation of an outstanding reservation
func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, nodeIterator func() NodeIterator) *Allocation {
sa.Lock()
Expand Down Expand Up @@ -1250,13 +1256,8 @@ func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, nodeIte
alloc := newUnreservedAllocation(reserve.nodeID, unreserveAsk)
return alloc
}
// check if this fits in the users' headroom first, if that fits check the queues' headroom
if !userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) {
continue
}

// check if this fits in the queue's headroom
if !headRoom.FitInMaxUndef(ask.GetAllocatedResource()) {
if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
continue
}

Expand All @@ -1280,12 +1281,16 @@ func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, nodeIte
// lets try this on all other nodes
for _, reserve := range sa.reservations {
// Other nodes cannot be tried if the ask has a required node
if reserve.ask.GetRequiredNode() != "" {
ask := reserve.ask
if ask.GetRequiredNode() != "" {
continue
}
iterator := nodeIterator()
if iterator != nil {
alloc := sa.tryNodesNoReserve(reserve.ask, iterator, reserve.nodeID)
if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
continue
}
alloc := sa.tryNodesNoReserve(ask, iterator, reserve.nodeID)
// have a candidate return it, including the node that was reserved
if alloc != nil {
return alloc
Expand Down Expand Up @@ -1490,7 +1495,7 @@ func (sa *Application) tryNode(node *Node, ask *AllocationAsk) *Allocation {
alloc := NewAllocation(node.NodeID, ask)
if node.AddAllocation(alloc) {
if err := sa.queue.IncAllocatedResource(alloc.GetAllocatedResource(), false); err != nil {
log.Log(log.SchedApplication).Warn("queue update failed unexpectedly",
log.Log(log.SchedApplication).DPanic("queue update failed unexpectedly",
zap.Error(err))
// revert the node update
node.RemoveAllocation(alloc.GetAllocationID())
Expand Down
33 changes: 33 additions & 0 deletions pkg/scheduler/objects/application_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2601,6 +2601,39 @@ func TestGetRateLimitedAppLog(t *testing.T) {
assert.Check(t, l != nil)
}

func TestTryAllocateWithReservedHeadRoomChecking(t *testing.T) {
defer func() {
if r := recover(); r != nil {
t.Fatalf("reserved headroom test regression: %v", r)
}
}()

res, err := resources.NewResourceFromConf(map[string]string{"memory": "2G"})
assert.NilError(t, err, "failed to create basic resource")
var headRoom *resources.Resource
headRoom, err = resources.NewResourceFromConf(map[string]string{"memory": "1G"})
assert.NilError(t, err, "failed to create basic resource")

app := newApplication(appID1, "default", "root")
ask := newAllocationAsk(aKey, appID1, res)
var queue *Queue
queue, err = createRootQueue(map[string]string{"memory": "1G"})
assert.NilError(t, err, "queue create failed")
app.queue = queue
err = app.AddAllocationAsk(ask)
assert.NilError(t, err, "ask should have been added to app")

node1 := newNodeRes(nodeID1, res)
node2 := newNodeRes(nodeID2, res)
// reserve that works
err = app.Reserve(node1, ask)
assert.NilError(t, err, "reservation should not have failed")

iter := getNodeIteratorFn(node1, node2)
alloc := app.tryReservedAllocate(headRoom, iter)
assert.Assert(t, alloc == nil, "Alloc is expected to be nil due to insufficient headroom")
}

func TestUpdateRunnableStatus(t *testing.T) {
app := newApplication(appID0, "default", "root.unknown")
assert.Assert(t, app.runnableInQueue)
Expand Down
1 change: 1 addition & 0 deletions pkg/scheduler/objects/utilities_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ const (
aKey = "alloc-1"
aAllocationID = "alloc-allocationid-1"
nodeID1 = "node-1"
nodeID2 = "node-2"
instType1 = "itype-1"
)

Expand Down

0 comments on commit 381bc9a

Please sign in to comment.