From fad90121d148b80eeadea4d4b89b33455d63f334 Mon Sep 17 00:00:00 2001 From: Hanyu Zhao Date: Tue, 7 Apr 2020 10:51:38 +0800 Subject: [PATCH] refine chain search (#13) * fix bug in early stop chain * keep searching the chains until placement within suggested nodes * refine logging * refactor h.Schedule() which was too long * fix virtual cell's healthiness * refine suggested nodes related logic * resolve comments * readme --- README.md | 2 +- pkg/algorithm/cell.go | 1 + pkg/algorithm/hived_algorithm.go | 338 ++++++++++++++++---------- pkg/algorithm/hived_algorithm_test.go | 22 +- pkg/algorithm/intra_vc_scheduler.go | 19 +- pkg/algorithm/utils.go | 26 +- pkg/api/types.go | 4 +- 7 files changed, 241 insertions(+), 171 deletions(-) diff --git a/README.md b/README.md index e759532..5b9dd7b 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ HiveD supports multiple job **priorities**. Higher-priority jobs can **[preempt] 4. Optimized Resource Fragmentation and Less Starvation 5. [Priorities](example/feature/README.md#Guaranteed-Job), [Overuse with Low Priority](example/feature/README.md#Opportunistic-Job), and [Inter-](example/feature/README.md#Inter-VC-Preemption)/[Intra-VC Preemption](example/feature/README.md#Intra-VC-Preemption) 6. [Job (Full/Partial) Gang Scheduling/Preemption](example/feature/README.md#Gang-Scheduling) -7. Fault-Tolerance, [Hardware Failure-Awareness](example/feature/README.md#Bad-Hardware-Awareness), [Work-Preserving Reconfiguration](example/feature/README.md#Work-Preserving-Reconfiguration) +7. Fault-Tolerance, [Bad Hardware Awareness](example/feature/README.md#Bad-Hardware-Awareness), [Work-Preserving Reconfiguration](example/feature/README.md#Work-Preserving-Reconfiguration) 8. [Leverage K8S Default Scheduler](example/feature/README.md#Leverage-K8S-Default-Scheduler) ## Prerequisite diff --git a/pkg/algorithm/cell.go b/pkg/algorithm/cell.go index 2863f0b..b1994dc 100644 --- a/pkg/algorithm/cell.go +++ b/pkg/algorithm/cell.go @@ -394,6 +394,7 @@ func (c *VirtualCell) SetPhysicalCell(cell *PhysicalCell) { if cell == nil { c.apiStatus.PhysicalCell = nil c.apiStatus.CellHealthiness = api.CellHealthy + c.apiStatus.CellState = api.CellState(cellFree) } else { pcs := &api.PhysicalCellStatus{} // shallow copy the status, clear the pointers to avoid reference diff --git a/pkg/algorithm/hived_algorithm.go b/pkg/algorithm/hived_algorithm.go index f98bd78..753b5ee 100644 --- a/pkg/algorithm/hived_algorithm.go +++ b/pkg/algorithm/hived_algorithm.go @@ -166,90 +166,31 @@ func (h *HivedAlgorithm) Schedule( groupPhysicalPlacement groupPhysicalPlacement // GPU number -> a set of pods -> a set of GPUs of each pod groupVirtualPlacement groupVirtualPlacement // GPU number -> a set of pods -> a set of GPUs of each pod preemptionVictims map[string]common.Set // node -> pods - nodesNotInSuggested common.Set // nodes in physical placement that are not within suggested nodes - podIndex int32 // index of current pod among those of the same GPU number in the group, 0 by default + waitReason string + podIndex int32 // index of current pod among those of the same GPU number in the group, 0 by default ) if g := h.affinityGroups[s.AffinityGroup.Name]; g != nil { - // state of an existing group can be either Allocated or Preempting - if g.state == groupAllocated { - klog.Infof("[%v]: Pod affinity group is already allocated: %v", internal.Key(pod), s.AffinityGroup.Name) - groupPhysicalPlacement = g.physicalGpuPlacement - groupVirtualPlacement = g.virtualGpuPlacement - if podIndex = 
getNewPodIndex(g.allocatedPods[s.GpuNumber]); podIndex == -1 { - panic(internal.NewBadRequestError(fmt.Sprintf( - "Requesting more pods than the configured number for %v GPUs (%v pods) in affinity group %v", - s.GpuNumber, g.totalPodNums[s.GpuNumber], s.AffinityGroup.Name))) - } - } else { // groupPreempting - klog.Infof("[%v]: Pod affinity group is preempting others: %v", internal.Key(pod), s.AffinityGroup.Name) - nodesNotInSuggested = collectNodesNotSuggested(g.physicalGpuPlacement, suggestedNodeSet) - if phase == internal.PreemptingPhase && !nodesNotInSuggested.IsEmpty() { - // If we find a preempting group's placement is not fully within suggested nodes, we should cancel - // the preemption so as to reschedule it to other places. We should do this only in Preempting phase - // because the suggested nodes in Filtering phase does not consider preemption. - klog.Infof("[%v]: Canceling affinity group %v's preemption because its placement is no longer "+ - "fully within Preempting-phase suggested nodes (non-suggested nodes: %v)", - internal.Key(pod), g.name, nodesNotInSuggested) - h.deletePreemptingAffinityGroup(g, pod) - } else { - groupPhysicalPlacement = g.physicalGpuPlacement - groupVirtualPlacement = g.virtualGpuPlacement - if preemptionVictims, _ = collectPreemptionVictims(groupPhysicalPlacement); len(preemptionVictims) == 0 { - klog.Infof("Preemption victims have been cleaned up for the preemptor affinity group %v", g.name) - } - g.preemptingPods[pod.UID] = pod - } - } + groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, podIndex = + h.schedulePodFromExistingGroup(g, s, suggestedNodeSet, phase, pod) } + // we need to re-evaluate the existence of the group here (instead of an "else") because it is + // possible that the group was a preempting group and deleted in h.schedulePodFromExistingGroup if h.affinityGroups[s.AffinityGroup.Name] == nil { - klog.Infof("[%v]: Scheduling new affinity group %v", internal.Key(pod), s.AffinityGroup.Name) - groupPhysicalPlacement, groupVirtualPlacement = h.scheduleNewAffinityGroup(pod, s, suggestedNodeSet) - var overlappingPreemptors common.Set - preemptionVictims, overlappingPreemptors = collectPreemptionVictims(groupPhysicalPlacement) - nodesNotInSuggested = collectNodesNotSuggested(groupPhysicalPlacement, suggestedNodeSet) - // we allow a new preemption only when in Preempting phase - // and the placement is fully within suggested nodes - if phase == internal.PreemptingPhase && nodesNotInSuggested.IsEmpty() { - for preemptor := range overlappingPreemptors.Items() { - klog.Infof("[%v]: Canceling affinity group %v's preemption because it is "+ - "further preempted by a higher-priority affinity group %v", - internal.Key(pod), preemptor.(*AlgoAffinityGroup).name, s.AffinityGroup.Name) - h.deletePreemptingAffinityGroup(preemptor.(*AlgoAffinityGroup), pod) - } - } - if len(preemptionVictims) != 0 { - if phase == internal.PreemptingPhase && nodesNotInSuggested.IsEmpty() { - h.createPreemptingAffinityGroup(s, groupPhysicalPlacement, groupVirtualPlacement, pod) - } else if phase == internal.FilteringPhase { - klog.Infof("[%v]: Found preemption victims %v, but we do not allow preemption in Filtering "+ - "phase because K8s won't call preempt, creating preemption state here is misleading", - internal.Key(pod), victimsToString(preemptionVictims)) - groupPhysicalPlacement = nil - groupVirtualPlacement = nil - preemptionVictims = nil - } else { - klog.Infof("[%v]: Found preemption victims %v, but we do not allow this preemption "+ - "because the 
placement is not fully within Preempting-phase suggested nodes "+ - "(non-suggested nodes: %v)", - internal.Key(pod), victimsToString(preemptionVictims), nodesNotInSuggested) - preemptionVictims = nil - } - } + groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, waitReason = + h.schedulePodFromNewGroup(s, suggestedNodeSet, phase, pod) } return generatePodScheduleResult( groupPhysicalPlacement, groupVirtualPlacement, - CellPriority(s.Priority), preemptionVictims, - nodesNotInSuggested, + waitReason, h.cellTypes, s.GpuNumber, podIndex, h.affinityGroups[s.AffinityGroup.Name], s.AffinityGroup.Name, suggestedNodeSet, - s.VirtualCluster, pod) } @@ -666,13 +607,125 @@ func (h *HivedAlgorithm) updateVCDoomedBadCells(c CellChain, l CellLevel) { } } +// schedulePodFromExistingGroup schedules a pod from an allocated or preempting affinity group. +// If it is from an allocated group, we will schedule the pod to the corresponding placement. +// If it is from a preempting group, we will continue its preemption, or schedule it when the preemption is done. +func (h *HivedAlgorithm) schedulePodFromExistingGroup( + g *AlgoAffinityGroup, + s *api.PodSchedulingSpec, + suggestedNodes common.Set, + phase internal.SchedulingPhase, + pod *core.Pod) ( + groupPhysicalPlacement groupPhysicalPlacement, + groupVirtualPlacement groupVirtualPlacement, + preemptionVictims map[string]common.Set, + podIndex int32) { + + nodesNotInSuggested := collectNodesNotSuggested(g.physicalGpuPlacement, suggestedNodes) + // state of an existing group can be either Allocated or Preempting + if g.state == groupAllocated { + klog.Infof("[%v]: Pod is from an affinity group that is already allocated: %v", + internal.Key(pod), s.AffinityGroup.Name) + groupPhysicalPlacement = g.physicalGpuPlacement + groupVirtualPlacement = g.virtualGpuPlacement + if !nodesNotInSuggested.IsEmpty() { + // for an allocated group, we always insist the previous scheduling decision + // even if some pods are now not within suggested nodes + klog.Warningf("Some nodes allocated to affinity group %v are no longer "+ + "within K8s suggested nodes: %v", g.name, nodesNotInSuggested) + } + if podIndex = getNewPodIndex(g.allocatedPods[s.GpuNumber]); podIndex == -1 { + panic(internal.NewBadRequestError(fmt.Sprintf( + "Requesting more pods than the configured number for %v GPUs (%v pods) in affinity group %v", + s.GpuNumber, g.totalPodNums[s.GpuNumber], s.AffinityGroup.Name))) + } + } else { // groupPreempting + klog.Infof("[%v]: Pod is from an affinity group that is preempting others: %v", + internal.Key(pod), s.AffinityGroup.Name) + if phase == internal.PreemptingPhase && !nodesNotInSuggested.IsEmpty() { + // If we find a preempting group's placement is not fully within suggested nodes, we should cancel + // the preemption so as to reschedule it to other places. We should do this only in Preempting phase + // because only suggested nodes of this phase consider preemption. 
+ klog.Infof("[%v]: Canceling affinity group %v's preemption because its placement is "+ + "no longer fully within Preempting-phase suggested nodes (non-suggested nodes: %v)", + internal.Key(pod), g.name, nodesNotInSuggested) + h.deletePreemptingAffinityGroup(g, pod) + } else { + groupPhysicalPlacement = g.physicalGpuPlacement + groupVirtualPlacement = g.virtualGpuPlacement + preemptionVictims, _ = collectPreemptionVictims(groupPhysicalPlacement) + if len(preemptionVictims) == 0 { + klog.Infof( + "Preemption victims have been cleaned up for the preemptor affinity group %v", g.name) + } + g.preemptingPods[pod.UID] = pod + } + } + return groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, podIndex +} + +// schedulePodFromNewGroup schedules a pod from a new affinity group, find placement for the group, +// and checks if the group needs preemption. +func (h *HivedAlgorithm) schedulePodFromNewGroup( + s *api.PodSchedulingSpec, + suggestedNodes common.Set, + phase internal.SchedulingPhase, + pod *core.Pod) ( + groupPhysicalPlacement groupPhysicalPlacement, + groupVirtualPlacement groupVirtualPlacement, + preemptionVictims map[string]common.Set, + waitReason string) { + + groupPhysicalPlacement, groupVirtualPlacement, nodesNotInSuggested := h.scheduleNewAffinityGroup( + pod, s, suggestedNodes) + if groupPhysicalPlacement == nil { + if nodesNotInSuggested.IsEmpty() { + waitReason = "insufficient capacity in physical cluster" + if CellPriority(s.Priority) >= minGuaranteedPriority { + waitReason = fmt.Sprintf("insufficient capacity in VC %v", s.VirtualCluster) + } + } else { + // for an unallocated group, + // we will keep it waiting if not all of its pods are scheduled to suggested nodes + waitReason = fmt.Sprintf("affinity group has to be scheduled to some nodes "+ + "not within K8s suggested nodes: %v", nodesNotInSuggested) + } + return nil, nil, nil, waitReason + } + preemptionVictims, overlappingPreemptors := collectPreemptionVictims(groupPhysicalPlacement) + // we allow a new preemption only when in Preempting phase + // and the placement is fully within suggested nodes + if phase == internal.PreemptingPhase { + // first cancel preemption of other groups whose resources overlap with the current group + for preemptor := range overlappingPreemptors.Items() { + klog.Infof("[%v]: Canceling affinity group %v's preemption because it is "+ + "further preempted by a higher-priority affinity group %v", + internal.Key(pod), preemptor.(*AlgoAffinityGroup).name, s.AffinityGroup.Name) + h.deletePreemptingAffinityGroup(preemptor.(*AlgoAffinityGroup), pod) + } + if len(preemptionVictims) != 0 { + // create preemption state to avoid resource contention among multiple preemptors + h.createPreemptingAffinityGroup(s, groupPhysicalPlacement, groupVirtualPlacement, pod) + } + } else if len(preemptionVictims) != 0 { + // here we won't create preemption state since we call preempt only in Preempting phase + klog.Infof("[%v]: Found preemption victims %v in non-Preempting phase, skipping it", + internal.Key(pod), victimsToString(preemptionVictims)) + } + return groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, waitReason +} + // scheduleNewAffinityGroup schedules each pod of a new affinity group to a set of GPUs // (in both the physical cluster and the VC). This is the entrance of a new scheduling attempt. 
func (h *HivedAlgorithm) scheduleNewAffinityGroup( pod *core.Pod, s *api.PodSchedulingSpec, - suggestedNodes common.Set) (physicalPlacement groupPhysicalPlacement, virtualPlacement groupVirtualPlacement) { + suggestedNodes common.Set) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + klog.Infof("[%v]: Scheduling new affinity group %v", internal.Key(pod), s.AffinityGroup.Name) priority := CellPriority(s.Priority) sr := schedulingRequest{ vc: s.VirtualCluster, @@ -687,72 +740,83 @@ func (h *HivedAlgorithm) scheduleNewAffinityGroup( } h.validateSchedulingRequest(sr, pod) if sr.reservationId != "" { - klog.Infof("Use reservation %v", s.ReservationId) - physicalPlacement, virtualPlacement = h.processSchedulingRequest(sr, suggestedNodes) + klog.Infof("Using reservation %v", s.ReservationId) + physicalPlacement, virtualPlacement, nodesNotInSuggested = h.processSchedulingRequest(sr, suggestedNodes) } else if s.GpuType != "" { - physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForGivenGpuType(sr, s.GpuType, pod, suggestedNodes) - } else { - physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForAnyGpuType(sr, suggestedNodes) - } - if physicalPlacement != nil { - klog.Infof("Succeeded in scheduling group %v", s.AffinityGroup.Name) + if _, ok := h.chains[s.GpuType]; !ok { + panic(internal.NewBadRequestError(fmt.Sprintf( + "[%v]: Pod requesting GPU type %v which the whole cluster does not have", + internal.Key(pod), s.GpuType))) + } + klog.Infof("Using specified GPU type %v", s.GpuType) + physicalPlacement, virtualPlacement, nodesNotInSuggested = h.scheduleAffinityGroupForGpuType( + sr, s.GpuType, pod, suggestedNodes, true) } else { - klog.Infof("Failed to schedule group %v", s.AffinityGroup.Name) + physicalPlacement, virtualPlacement, nodesNotInSuggested = h.scheduleAffinityGroupForAnyGpuType( + sr, pod, suggestedNodes) } - return physicalPlacement, virtualPlacement + return physicalPlacement, virtualPlacement, nodesNotInSuggested } -// scheduleAffinityGroupForGivenGpuType schedules an affinity group in a certain cell chain -// that matches the specified GPU type. -func (h *HivedAlgorithm) scheduleAffinityGroupForGivenGpuType( +// scheduleAffinityGroupForGpuType schedules an affinity group in a certain cell chain +// that matches the given GPU type. 
+func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType( sr schedulingRequest, gpuType string, pod *core.Pod, - suggestedNodes common.Set) (groupPhysicalPlacement, groupVirtualPlacement) { - - if chains := h.chains[gpuType]; chains == nil { - panic(internal.NewBadRequestError(fmt.Sprintf( - "[%v]: Pod requesting GPU type %v which the whole cluster does not have", - internal.Key(pod), gpuType))) - } else { - vcHasType := false - for _, chain := range chains { - if h.vcSchedulers[sr.vc].getNonReservedFullCellList()[chain] != nil { - vcHasType = true - sr.chain = chain - physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes) - if physicalPlacement != nil { - return physicalPlacement, virtualPlacement - } + suggestedNodes common.Set, + typeSpecified bool) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + + vcHasType := false + for _, chain := range h.chains[gpuType] { + if sr.priority < minGuaranteedPriority || + h.vcSchedulers[sr.vc].getNonReservedFreeCellList()[chain] != nil { + vcHasType = true + klog.Infof("Searching chain %v", chain) + sr.chain = chain + physicalPlacement, virtualPlacement, chainNodesNotInSuggested := + h.processSchedulingRequest(sr, suggestedNodes) + if physicalPlacement != nil { + return physicalPlacement, virtualPlacement, chainNodesNotInSuggested + } + if !chainNodesNotInSuggested.IsEmpty() { + nodesNotInSuggested = chainNodesNotInSuggested } - } - if sr.priority >= minGuaranteedPriority && !vcHasType { - panic(internal.NewBadRequestError(fmt.Sprintf( - "[%v]: Pod requesting GPU type %v which VC %v does not have", - internal.Key(pod), gpuType, sr.vc))) } } - return nil, nil + if typeSpecified && sr.priority >= minGuaranteedPriority && !vcHasType { + panic(internal.NewBadRequestError(fmt.Sprintf( + "[%v]: Pod requesting GPU type %v which VC %v does not have", + internal.Key(pod), gpuType, sr.vc))) + } + return nil, nil, nodesNotInSuggested } -// scheduleAffinityGroupForAnyGpuType schedules an affinity group in a certain cell chain, -// trying every possible GPU type (as the user does not specify a GPU type). +// scheduleAffinityGroupForAnyGpuType schedules an affinity group in every possible GPU type +// (when the user does not specify a GPU type). 
func (h *HivedAlgorithm) scheduleAffinityGroupForAnyGpuType( sr schedulingRequest, - suggestedNodes common.Set) (groupPhysicalPlacement, groupVirtualPlacement) { - - for _, chains := range h.chains { - for _, chain := range chains { - if h.vcSchedulers[sr.vc].getNonReservedFullCellList()[chain] != nil { - sr.chain = chain - physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes) - if physicalPlacement != nil { - return physicalPlacement, virtualPlacement - } - } + pod *core.Pod, + suggestedNodes common.Set) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + + for gpuType := range h.chains { + klog.Infof("Searching GPU type %v", gpuType) + physicalPlacement, virtualPlacement, typeNodesNotInSuggested := + h.scheduleAffinityGroupForGpuType(sr, gpuType, pod, suggestedNodes, false) + if physicalPlacement != nil { + return physicalPlacement, virtualPlacement, typeNodesNotInSuggested + } + if !typeNodesNotInSuggested.IsEmpty() { + nodesNotInSuggested = typeNodesNotInSuggested } } - return nil, nil + return nil, nil, nodesNotInSuggested } // validateSchedulingRequest checks the existence of VC and reservation ID, and the legality of priority. @@ -775,12 +839,34 @@ func (h *HivedAlgorithm) validateSchedulingRequest(sr schedulingRequest, pod *co // processSchedulingRequest feeds a request to a VC scheduler or the opportunistic scheduler depending on its priority. func (h *HivedAlgorithm) processSchedulingRequest( sr schedulingRequest, - suggestedNodes common.Set) (groupPhysicalPlacement, groupVirtualPlacement) { + suggestedNodes common.Set) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + str := fmt.Sprintf("chain %v", sr.chain) + if sr.reservationId != "" { + str = fmt.Sprintf("reservation %v", sr.reservationId) + } + klog.Infof("Processing scheduling request: %v, GPU numbers %v, priority %v", + str, common.ToJson(sr.affinityGroupPodNums), sr.priority) if sr.priority >= minGuaranteedPriority { - return h.scheduleGuaranteedAffinityGroup(sr, suggestedNodes) + physicalPlacement, virtualPlacement = h.scheduleGuaranteedAffinityGroup(sr, suggestedNodes) + } else { + physicalPlacement = h.scheduleOpportunisticAffinityGroup(sr, suggestedNodes) + } + if physicalPlacement == nil { + klog.Infof("Cannot find placement in %v", str) + return nil, nil, nodesNotInSuggested + } + nodesNotInSuggested = collectNodesNotSuggested(physicalPlacement, suggestedNodes) + if !nodesNotInSuggested.IsEmpty() { + klog.Infof("Found placement in %v NOT fully within suggested nodes, "+ + "placement: %v, non-suggested nodes: %v", str, physicalPlacement, nodesNotInSuggested) + return nil, nil, nodesNotInSuggested } else { - return h.scheduleOpportunisticAffinityGroup(sr, suggestedNodes), nil + klog.Infof("Found placement in %v fully within suggested nodes: %v", str, physicalPlacement) + return physicalPlacement, virtualPlacement, nodesNotInSuggested } } @@ -835,16 +921,8 @@ func (h *HivedAlgorithm) scheduleOpportunisticAffinityGroup( sr schedulingRequest, suggestedNodes common.Set) groupPhysicalPlacement { - physicalPlacement := h.opportunisticSchedulers[sr.chain].Schedule( + return h.opportunisticSchedulers[sr.chain].Schedule( sr.affinityGroupPodNums, opportunisticPriority, suggestedNodes) - if physicalPlacement == nil { - klog.Infof("Failed in scheduling in PC due to insufficient capacity for scheduling request: GPU numbers %v, priority %v, chain %v", - 
sr.affinityGroupPodNums, sr.priority, sr.chain) - } else { - klog.Infof("Succeeded in scheduling in PC for scheduling request: GPU numbers %v, priority %v, chain %v", - sr.affinityGroupPodNums, sr.priority, sr.chain) - } - return physicalPlacement } // createAllocatedAffinityGroup creates a new affinity group and allocate the resources. diff --git a/pkg/algorithm/hived_algorithm_test.go b/pkg/algorithm/hived_algorithm_test.go index 2ca0de4..b3a371c 100644 --- a/pkg/algorithm/hived_algorithm_test.go +++ b/pkg/algorithm/hived_algorithm_test.go @@ -424,6 +424,14 @@ var pss = map[types.UID]api.PodSchedulingSpec{ GpuType: "DGX2-V100", GpuNumber: 16, AffinityGroup: group26, + }, "pod36": { // will iterate the GPU types until find a placement within suggested nodes + VirtualCluster: "VC1", + Priority: -1, + LazyPreemptionEnable: true, + ReservationId: "", + GpuType: "", + GpuNumber: 1, + AffinityGroup: group1, }, } @@ -468,6 +476,7 @@ var expectedBindInfos = map[string]result{ "pod25": {node: "0.0.0.1", gpuIsolation: []int32{0, 1}}, "pod28": {node: "0.0.3.0", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}, "pod34": {node: "0.0.3.0", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}, + "pod36": {node: "0.0.1.0", gpuIsolation: []int32{0}}, } var expectedPreemptInfos = map[string]common.Set{ @@ -616,17 +625,21 @@ func testDeletePods(t *testing.T, h *HivedAlgorithm) { } func testSuggestedNodes(t *testing.T, h *HivedAlgorithm) { + pod := allPods["pod36"] + pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID]) + psr := h.Schedule(pod, []string{"0.0.1.0"}, internal.PreemptingPhase) + compareSchedulingResult(t, pod, psr) + var nodes []string for _, node := range allNodes { if node != "0.0.3.1" { nodes = append(nodes, node) } } - pod := allPods["pod27"] + pod = allPods["pod27"] pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID]) - psr := h.Schedule(pod, nodes, internal.PreemptingPhase) + psr = h.Schedule(pod, nodes, internal.PreemptingPhase) compareSchedulingResult(t, pod, psr) - nodes = append(nodes, "0.0.3.1") pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID]) // this time scheduling will succeed @@ -647,7 +660,8 @@ func testSuggestedNodes(t *testing.T, h *HivedAlgorithm) { // this time group will be preempting psr = h.Schedule(pod, nodes, internal.PreemptingPhase) if g := h.affinityGroups[pss[pod.UID].AffinityGroup.Name]; g == nil { - t.Errorf("Group %v should be preempting but does not exist", g.name) + t.Errorf("Group %v should be preempting but does not exist", + pss[pod.UID].AffinityGroup.Name) } else if g.state != groupPreempting { t.Errorf("Group %v should be in Preempting state but not", g.name) } diff --git a/pkg/algorithm/intra_vc_scheduler.go b/pkg/algorithm/intra_vc_scheduler.go index b9eb4bf..d8f4c29 100644 --- a/pkg/algorithm/intra_vc_scheduler.go +++ b/pkg/algorithm/intra_vc_scheduler.go @@ -88,26 +88,23 @@ func (s *defaultIntraVCScheduler) getReservedCellList() map[api.ReservationId]Ch return s.reservedCellList } -func (s *defaultIntraVCScheduler) schedule(sr schedulingRequest) groupVirtualPlacement { - var scheduler *topologyAwareScheduler - var str string +func (s *defaultIntraVCScheduler) schedule(sr schedulingRequest) (placement groupVirtualPlacement) { + scheduler := s.nonReservedSchedulers[sr.chain] + str := fmt.Sprintf("chain %v", sr.chain) if sr.reservationId != "" { scheduler = s.reservedSchedulers[sr.reservationId] str = 
fmt.Sprintf("reservation %v", sr.reservationId) - } else { - scheduler = s.nonReservedSchedulers[sr.chain] - str = fmt.Sprintf("chain %v", sr.chain) } - var placement groupVirtualPlacement + klog.Infof("Processing scheduling request in VC %v: %v, GPU numbers %v, priority %v", + sr.vc, str, common.ToJson(sr.affinityGroupPodNums), sr.priority) if scheduler != nil { placement = scheduler.Schedule(sr.affinityGroupPodNums, sr.priority, common.NewSet()) } if placement == nil { - klog.Infof("Insufficient capacity in VC %v for scheduling request: %v, GPU numbers %v, priority %v", - sr.vc, str, sr.affinityGroupPodNums, sr.priority) + klog.Infof("Cannot find placement in VC %v", sr.vc) } else { - klog.Infof("Succeeded in scheduling in VC %v for scheduling request: %v, GPU numbers %v, priority %v", - sr.vc, str, sr.affinityGroupPodNums, sr.priority) + klog.Infof("Found placement in VC %v: %v", + sr.vc, placement) } return placement } diff --git a/pkg/algorithm/utils.go b/pkg/algorithm/utils.go index 5e20b8b..1d8dfdd 100644 --- a/pkg/algorithm/utils.go +++ b/pkg/algorithm/utils.go @@ -37,16 +37,14 @@ import ( func generatePodScheduleResult( groupPhysicalPlacement groupPhysicalPlacement, groupVirtualPlacement groupVirtualPlacement, - priority CellPriority, preemptionVictims map[string]common.Set, - nodesNotInSuggested common.Set, + waitReason string, cellLevelToType map[CellChain]map[CellLevel]api.CellType, currentGpuNum int32, currentPodIndex int32, group *AlgoAffinityGroup, groupName string, suggestedNodes common.Set, - vc api.VirtualClusterName, pod *core.Pod) internal.PodScheduleResult { klog.V(4).Infof("[%v]: Got K8s suggested nodes: %v", internal.Key(pod), suggestedNodes) @@ -59,25 +57,6 @@ func generatePodScheduleResult( PodPreemptInfo: generatePodPreemptInfo(preemptionVictims, pod), } } - var waitReason string - if groupPhysicalPlacement == nil { - waitReason = "insufficient capacity in physical cluster" - if priority >= minGuaranteedPriority { - waitReason = fmt.Sprintf("insufficient capacity in VC %v", vc) - } - } else if !nodesNotInSuggested.IsEmpty() { - if group == nil || group.state == groupPreempting { - // for an unallocated group, we will keep it waiting if not all of its pods are scheduled to suggested nodes - waitReason = fmt.Sprintf( - "affinity group is decided to be scheduled to some nodes not within K8s suggested nodes: %v", - nodesNotInSuggested) - } else { - // for an existing group, we always insist the previous scheduling decision - // even if some pods are now not within suggested nodes - klog.Warningf("Some nodes used by affinity group %v are no longer within K8s suggested nodes: %v", - group.name, nodesNotInSuggested) - } - } if waitReason != "" { klog.Infof("[%v]: need to wait because %v", internal.Key(pod), waitReason) return internal.PodScheduleResult{PodWaitInfo: &internal.PodWaitInfo{Reason: waitReason}} @@ -100,7 +79,8 @@ func generatePodScheduleResult( // generatePodPreemptInfo writes the preemption victims into a PodPreemptInfo. 
func generatePodPreemptInfo(preemptionVictims map[string]common.Set, pod *core.Pod) *internal.PodPreemptInfo { - klog.Infof("[%v]: Preemption victim candidates: %v", internal.Key(pod), victimsToString(preemptionVictims)) + klog.Infof("[%v]: Preemption victim candidates: %v", + internal.Key(pod), victimsToString(preemptionVictims)) var ( nodesHavingVictims []string victimPods []*core.Pod diff --git a/pkg/api/types.go b/pkg/api/types.go index bf59334..8c261b3 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -192,8 +192,8 @@ type CellStatus struct { // (e.g., VC1/0/0 may represent VC1, preassigned cell 0, index 0 among its children) CellAddress CellAddress `json:"cellAddress"` // CellState and CellHealthiness are two orthogonal fields. - // That means, there are four possible combinations of them: a cell may be in - // (1) used and healthy, (2) used and bad, (3) free and healthy, and (4) free and bad. + // CellState represents whether the cell is being used (or acquired) by an affinity group. + // CellHealthiness represents whether the physical hardware is working normally. CellState CellState `json:"cellState"` CellHealthiness CellHealthiness `json:"cellHealthiness"` CellPriority int32 `json:"cellPriority"`
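
As a rough illustration of the refined chain search in this patch (not the scheduler's actual types or APIs), the sketch below distills the new loop in scheduleAffinityGroupForGpuType / scheduleAffinityGroupForAnyGpuType: instead of stopping at the first chain that has capacity, the search keeps iterating over chains until it finds a placement fully within K8s suggested nodes, and remembers the last set of non-suggested nodes so that it can be reported as the wait reason. All names here (tryChain, searchChains, the sample chain and node names) are hypothetical stand-ins.

package main

import "fmt"

// placement maps node name -> GPU indices allocated on that node; nil means no placement found.
type placement map[string][]int

// tryChain stands in for processSchedulingRequest on a single cell chain: it returns either
// a placement fully within suggested nodes, or nil plus the nodes that fell outside them.
func tryChain(chain string, suggested map[string]bool) (placement, []string) {
	// Hypothetical per-chain candidate placements, for illustration only.
	candidates := map[string]placement{
		"chainA": {"node-3": {0, 1}}, // node-3 is not in the suggested set
		"chainB": {"node-1": {0, 1}}, // node-1 is in the suggested set
	}
	p := candidates[chain]
	if p == nil {
		return nil, nil // no capacity in this chain
	}
	var notSuggested []string
	for node := range p {
		if !suggested[node] {
			notSuggested = append(notSuggested, node)
		}
	}
	if len(notSuggested) > 0 {
		return nil, notSuggested // reject this chain, but remember why
	}
	return p, nil
}

// searchChains mirrors the refined behavior: keep searching the chains until a placement
// fully within suggested nodes is found; otherwise report the last non-suggested nodes seen.
func searchChains(chains []string, suggested map[string]bool) (placement, []string) {
	var lastNotSuggested []string
	for _, chain := range chains {
		p, notSuggested := tryChain(chain, suggested)
		if p != nil {
			return p, nil
		}
		if len(notSuggested) > 0 {
			lastNotSuggested = notSuggested
		}
	}
	return nil, lastNotSuggested
}

func main() {
	suggested := map[string]bool{"node-1": true, "node-2": true}
	p, notSuggested := searchChains([]string{"chainA", "chainB"}, suggested)
	fmt.Println(p, notSuggested) // map[node-1:[0 1]] []
}

In this toy run, chainA has capacity but only on a non-suggested node, so the search does not stop there (the pre-patch behavior would have); it continues to chainB and returns a placement within suggested nodes, which matches the new test case (pod36) that iterates GPU types until a placement within suggested nodes is found.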