Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YUNIKORN-2897] Health checker reports foreign allocation as orphan #977

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/scheduler/health_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ func checkAppAllocations(app *objects.Application, nodes objects.NodeCollection)

func checkNodeAllocations(node *objects.Node, partitionContext *PartitionContext) []*objects.Allocation {
orphanAllocationsOnNode := make([]*objects.Allocation, 0)
for _, alloc := range node.GetAllAllocations() {
for _, alloc := range node.GetYunikornAllocations() {
if app := partitionContext.getApplication(alloc.GetApplicationID()); app != nil {
if !app.IsAllocationAssignedToApp(alloc) {
orphanAllocationsOnNode = append(orphanAllocationsOnNode, alloc)
Expand Down
9 changes: 9 additions & 0 deletions pkg/scheduler/health_checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,15 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app should not be successful")
part.removeApplication("appID")
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app still fails after removing the app")

// check that foreign allocation does not interfere with health check
falloc := newForeignAllocation("foreign-1", "node")
node.AddAllocation(falloc)
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "Foreign allocation was detected as orphan")
}

func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
Expand Down
13 changes: 0 additions & 13 deletions pkg/scheduler/objects/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,19 +230,6 @@ func (sn *Node) GetAllocation(allocationKey string) *Allocation {
return sn.allocations[allocationKey]
}

// Get a copy of the allocations on this node
func (sn *Node) GetAllAllocations() []*Allocation {
sn.RLock()
defer sn.RUnlock()

arr := make([]*Allocation, 0)
for _, v := range sn.allocations {
arr = append(arr, v)
}

return arr
}

// GetYunikornAllocations returns a copy of Yunikorn allocations on this node
func (sn *Node) GetYunikornAllocations() []*Allocation {
sn.RLock()
Expand Down
23 changes: 0 additions & 23 deletions pkg/scheduler/objects/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,29 +688,6 @@ func TestGetAllocation(t *testing.T) {
}
}

func TestGetAllAllocations(t *testing.T) {
node := newNode("node-123", map[string]resources.Quantity{"first": 100, "second": 200})
if !resources.IsZero(node.GetAllocatedResource()) {
t.Fatal("Failed to initialize resource")
}

// nothing allocated get an empty list
allocs := node.GetAllAllocations()
if allocs == nil || len(allocs) != 0 {
t.Fatalf("allocation length should be 0 on new node")
}
alloc1 := newAllocation(appID1, nodeID1, nil)
alloc2 := newAllocation(appID1, nodeID1, nil)

// allocate
node.AddAllocation(alloc1)
node.AddAllocation(alloc2)
assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length mismatch")
// This should not happen in real code just making sure the code does do what is expected
node.AddAllocation(alloc2)
assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length mismatch")
}

func TestSchedulingState(t *testing.T) {
node := newNode("node-123", nil)
if !node.IsSchedulable() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/objects/required_node_preemptor.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func NewRequiredNodePreemptor(node *Node, requiredAsk *Allocation) *PreemptionCo
func (p *PreemptionContext) filterAllocations() {
p.Lock()
defer p.Unlock()
for _, allocation := range p.node.GetAllAllocations() {
for _, allocation := range p.node.GetYunikornAllocations() {
// skip daemon set pods and higher priority allocation
if allocation.GetRequiredNode() != "" || allocation.GetPriority() > p.requiredAsk.GetPriority() {
continue
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,7 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object
released := make([]*objects.Allocation, 0)
confirmed := make([]*objects.Allocation, 0)
// walk over all allocations still registered for this node
for _, alloc := range node.GetAllAllocations() {
for _, alloc := range node.GetYunikornAllocations() {
allocationKey := alloc.GetAllocationKey()
// since we are not locking the node and or application we could have had an update while processing
// note that we do not return the allocation if the app or allocation is not found and assume that it
Expand Down
24 changes: 12 additions & 12 deletions pkg/scheduler/partition_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ func TestRemoveNodeWithAllocations(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
allocated := node.GetAllAllocations()
allocated := node.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly")
assertLimits(t, getTestUserGroup(), appRes)

Expand Down Expand Up @@ -316,7 +316,7 @@ func TestRemoveNodeWithPlaceholders(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1 expected 1 got: %v", allocated)
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
Expand Down Expand Up @@ -432,7 +432,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")

Expand Down Expand Up @@ -561,7 +561,7 @@ func TestPlaceholderDataWithNodeRemoval(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")

Expand Down Expand Up @@ -648,7 +648,7 @@ func TestPlaceholderDataWithRemoval(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")

Expand Down Expand Up @@ -733,7 +733,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")
Expand All @@ -753,7 +753,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 1, false)
alloc.SetRelease(ph)
node2.AddAllocation(alloc)
allocated = node2.GetAllAllocations()
allocated = node2.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node2")
assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), appRes), "allocation not added correctly to node2 (resource count)")
assertLimits(t, getTestUserGroup(), appRes)
Expand All @@ -768,7 +768,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
// remove the node with the placeholder
released, confirmed := partition.removeNode(nodeID1)
assert.Equal(t, 1, partition.GetTotalNodeCount(), "node list was not updated, node was not removed")
assert.Equal(t, 1, len(node2.GetAllAllocations()), "remaining node should have allocation")
assert.Equal(t, 1, len(node2.GetYunikornAllocations()), "remaining node should have allocation")
assert.Equal(t, 1, len(released), "node removal did not release correct allocation")
assert.Equal(t, 1, len(confirmed), "node removal did not confirm correct allocation")
assert.Equal(t, ph.GetAllocationKey(), released[0].GetAllocationKey(), "allocationKey returned by release not the same as the placeholder")
Expand Down Expand Up @@ -807,7 +807,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
Expand All @@ -827,7 +827,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 1, false)
alloc.SetRelease(ph)
node2.AddAllocation(alloc)
allocated = node2.GetAllAllocations()
allocated = node2.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node2")
assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), appRes), "allocation not added correctly to node2 (resource count)")
assertLimits(t, getTestUserGroup(), appRes)
Expand Down Expand Up @@ -4729,7 +4729,7 @@ func TestForeignAllocation(t *testing.T) {
assert.NilError(t, err)
assert.Equal(t, 1, len(partition.foreignAllocs))
assert.Equal(t, nodeID1, partition.foreignAllocs[foreignAlloc1])
assert.Equal(t, 1, len(node1.GetAllAllocations()))
assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
assert.Assert(t, node1.GetAllocation(foreignAlloc1) != nil)

// remove allocation
Expand All @@ -4739,6 +4739,6 @@ func TestForeignAllocation(t *testing.T) {
assert.Assert(t, released == nil)
assert.Assert(t, confirmed == nil)
assert.Equal(t, 0, len(partition.foreignAllocs))
assert.Equal(t, 0, len(node1.GetAllAllocations()))
assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
assert.Assert(t, node1.GetAllocation(foreignAlloc1) == nil)
}
4 changes: 2 additions & 2 deletions pkg/scheduler/tests/recovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ func TestSchedulerRecovery(t *testing.T) {
// verify nodes
assert.Equal(t, 2, part.GetTotalNodeCount(), "incorrect recovered node count")

assert.Equal(t, len(node1Allocations), len(part.GetNode("node-1:1234").GetAllAllocations()), "allocations on node-1 not as expected")
assert.Equal(t, len(node2Allocations), len(part.GetNode("node-2:1234").GetAllAllocations()), "allocations on node-1 not as expected")
assert.Equal(t, len(node1Allocations), len(part.GetNode("node-1:1234").GetYunikornAllocations()), "allocations on node-1 not as expected")
assert.Equal(t, len(node2Allocations), len(part.GetNode("node-2:1234").GetYunikornAllocations()), "allocations on node-1 not as expected")

node1AllocatedMemory := part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory]
node2AllocatedMemory := part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory]
Expand Down
Loading