From fad90121d148b80eeadea4d4b89b33455d63f334 Mon Sep 17 00:00:00 2001 From: Hanyu Zhao Date: Tue, 7 Apr 2020 10:51:38 +0800 Subject: [PATCH] refine chain search (#13) * fix bug in early stop chain * keep searching the chains until placement within suggested nodes * refine logging * refactor h.Schedule() which was too long * fix virtual cell's healthiness * refine suggested nodes related logic * resolve comments * readme --- README.md | 2 +- pkg/algorithm/cell.go | 1 + pkg/algorithm/hived_algorithm.go | 338 ++++++++++++++++---------- pkg/algorithm/hived_algorithm_test.go | 22 +- pkg/algorithm/intra_vc_scheduler.go | 19 +- pkg/algorithm/utils.go | 26 +- pkg/api/types.go | 4 +- 7 files changed, 241 insertions(+), 171 deletions(-) diff --git a/README.md b/README.md index e759532..5b9dd7b 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ HiveD supports multiple job **priorities**. Higher-priority jobs can **[preempt] 4. Optimized Resource Fragmentation and Less Starvation 5. [Priorities](example/feature/README.md#Guaranteed-Job), [Overuse with Low Priority](example/feature/README.md#Opportunistic-Job), and [Inter-](example/feature/README.md#Inter-VC-Preemption)/[Intra-VC Preemption](example/feature/README.md#Intra-VC-Preemption) 6. [Job (Full/Partial) Gang Scheduling/Preemption](example/feature/README.md#Gang-Scheduling) -7. Fault-Tolerance, [Hardware Failure-Awareness](example/feature/README.md#Bad-Hardware-Awareness), [Work-Preserving Reconfiguration](example/feature/README.md#Work-Preserving-Reconfiguration) +7. Fault-Tolerance, [Bad Hardware Awareness](example/feature/README.md#Bad-Hardware-Awareness), [Work-Preserving Reconfiguration](example/feature/README.md#Work-Preserving-Reconfiguration) 8. [Leverage K8S Default Scheduler](example/feature/README.md#Leverage-K8S-Default-Scheduler) ## Prerequisite diff --git a/pkg/algorithm/cell.go b/pkg/algorithm/cell.go index 2863f0b..b1994dc 100644 --- a/pkg/algorithm/cell.go +++ b/pkg/algorithm/cell.go @@ -394,6 +394,7 @@ func (c *VirtualCell) SetPhysicalCell(cell *PhysicalCell) { if cell == nil { c.apiStatus.PhysicalCell = nil c.apiStatus.CellHealthiness = api.CellHealthy + c.apiStatus.CellState = api.CellState(cellFree) } else { pcs := &api.PhysicalCellStatus{} // shallow copy the status, clear the pointers to avoid reference diff --git a/pkg/algorithm/hived_algorithm.go b/pkg/algorithm/hived_algorithm.go index f98bd78..753b5ee 100644 --- a/pkg/algorithm/hived_algorithm.go +++ b/pkg/algorithm/hived_algorithm.go @@ -166,90 +166,31 @@ func (h *HivedAlgorithm) Schedule( groupPhysicalPlacement groupPhysicalPlacement // GPU number -> a set of pods -> a set of GPUs of each pod groupVirtualPlacement groupVirtualPlacement // GPU number -> a set of pods -> a set of GPUs of each pod preemptionVictims map[string]common.Set // node -> pods - nodesNotInSuggested common.Set // nodes in physical placement that are not within suggested nodes - podIndex int32 // index of current pod among those of the same GPU number in the group, 0 by default + waitReason string + podIndex int32 // index of current pod among those of the same GPU number in the group, 0 by default ) if g := h.affinityGroups[s.AffinityGroup.Name]; g != nil { - // state of an existing group can be either Allocated or Preempting - if g.state == groupAllocated { - klog.Infof("[%v]: Pod affinity group is already allocated: %v", internal.Key(pod), s.AffinityGroup.Name) - groupPhysicalPlacement = g.physicalGpuPlacement - groupVirtualPlacement = g.virtualGpuPlacement - if podIndex = 
getNewPodIndex(g.allocatedPods[s.GpuNumber]); podIndex == -1 { - panic(internal.NewBadRequestError(fmt.Sprintf( - "Requesting more pods than the configured number for %v GPUs (%v pods) in affinity group %v", - s.GpuNumber, g.totalPodNums[s.GpuNumber], s.AffinityGroup.Name))) - } - } else { // groupPreempting - klog.Infof("[%v]: Pod affinity group is preempting others: %v", internal.Key(pod), s.AffinityGroup.Name) - nodesNotInSuggested = collectNodesNotSuggested(g.physicalGpuPlacement, suggestedNodeSet) - if phase == internal.PreemptingPhase && !nodesNotInSuggested.IsEmpty() { - // If we find a preempting group's placement is not fully within suggested nodes, we should cancel - // the preemption so as to reschedule it to other places. We should do this only in Preempting phase - // because the suggested nodes in Filtering phase does not consider preemption. - klog.Infof("[%v]: Canceling affinity group %v's preemption because its placement is no longer "+ - "fully within Preempting-phase suggested nodes (non-suggested nodes: %v)", - internal.Key(pod), g.name, nodesNotInSuggested) - h.deletePreemptingAffinityGroup(g, pod) - } else { - groupPhysicalPlacement = g.physicalGpuPlacement - groupVirtualPlacement = g.virtualGpuPlacement - if preemptionVictims, _ = collectPreemptionVictims(groupPhysicalPlacement); len(preemptionVictims) == 0 { - klog.Infof("Preemption victims have been cleaned up for the preemptor affinity group %v", g.name) - } - g.preemptingPods[pod.UID] = pod - } - } + groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, podIndex = + h.schedulePodFromExistingGroup(g, s, suggestedNodeSet, phase, pod) } + // we need to re-evaluate the existence of the group here (instead of an "else") because it is + // possible that the group was a preempting group and deleted in h.schedulePodFromExistingGroup if h.affinityGroups[s.AffinityGroup.Name] == nil { - klog.Infof("[%v]: Scheduling new affinity group %v", internal.Key(pod), s.AffinityGroup.Name) - groupPhysicalPlacement, groupVirtualPlacement = h.scheduleNewAffinityGroup(pod, s, suggestedNodeSet) - var overlappingPreemptors common.Set - preemptionVictims, overlappingPreemptors = collectPreemptionVictims(groupPhysicalPlacement) - nodesNotInSuggested = collectNodesNotSuggested(groupPhysicalPlacement, suggestedNodeSet) - // we allow a new preemption only when in Preempting phase - // and the placement is fully within suggested nodes - if phase == internal.PreemptingPhase && nodesNotInSuggested.IsEmpty() { - for preemptor := range overlappingPreemptors.Items() { - klog.Infof("[%v]: Canceling affinity group %v's preemption because it is "+ - "further preempted by a higher-priority affinity group %v", - internal.Key(pod), preemptor.(*AlgoAffinityGroup).name, s.AffinityGroup.Name) - h.deletePreemptingAffinityGroup(preemptor.(*AlgoAffinityGroup), pod) - } - } - if len(preemptionVictims) != 0 { - if phase == internal.PreemptingPhase && nodesNotInSuggested.IsEmpty() { - h.createPreemptingAffinityGroup(s, groupPhysicalPlacement, groupVirtualPlacement, pod) - } else if phase == internal.FilteringPhase { - klog.Infof("[%v]: Found preemption victims %v, but we do not allow preemption in Filtering "+ - "phase because K8s won't call preempt, creating preemption state here is misleading", - internal.Key(pod), victimsToString(preemptionVictims)) - groupPhysicalPlacement = nil - groupVirtualPlacement = nil - preemptionVictims = nil - } else { - klog.Infof("[%v]: Found preemption victims %v, but we do not allow this preemption "+ - "because the 
placement is not fully within Preempting-phase suggested nodes "+ - "(non-suggested nodes: %v)", - internal.Key(pod), victimsToString(preemptionVictims), nodesNotInSuggested) - preemptionVictims = nil - } - } + groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, waitReason = + h.schedulePodFromNewGroup(s, suggestedNodeSet, phase, pod) } return generatePodScheduleResult( groupPhysicalPlacement, groupVirtualPlacement, - CellPriority(s.Priority), preemptionVictims, - nodesNotInSuggested, + waitReason, h.cellTypes, s.GpuNumber, podIndex, h.affinityGroups[s.AffinityGroup.Name], s.AffinityGroup.Name, suggestedNodeSet, - s.VirtualCluster, pod) } @@ -666,13 +607,125 @@ func (h *HivedAlgorithm) updateVCDoomedBadCells(c CellChain, l CellLevel) { } } +// schedulePodFromExistingGroup schedules a pod from an allocated or preempting affinity group. +// If it is from an allocated group, we will schedule the pod to the corresponding placement. +// If it is from a preempting group, we will continue its preemption, or schedule it when the preemption is done. +func (h *HivedAlgorithm) schedulePodFromExistingGroup( + g *AlgoAffinityGroup, + s *api.PodSchedulingSpec, + suggestedNodes common.Set, + phase internal.SchedulingPhase, + pod *core.Pod) ( + groupPhysicalPlacement groupPhysicalPlacement, + groupVirtualPlacement groupVirtualPlacement, + preemptionVictims map[string]common.Set, + podIndex int32) { + + nodesNotInSuggested := collectNodesNotSuggested(g.physicalGpuPlacement, suggestedNodes) + // state of an existing group can be either Allocated or Preempting + if g.state == groupAllocated { + klog.Infof("[%v]: Pod is from an affinity group that is already allocated: %v", + internal.Key(pod), s.AffinityGroup.Name) + groupPhysicalPlacement = g.physicalGpuPlacement + groupVirtualPlacement = g.virtualGpuPlacement + if !nodesNotInSuggested.IsEmpty() { + // for an allocated group, we always insist the previous scheduling decision + // even if some pods are now not within suggested nodes + klog.Warningf("Some nodes allocated to affinity group %v are no longer "+ + "within K8s suggested nodes: %v", g.name, nodesNotInSuggested) + } + if podIndex = getNewPodIndex(g.allocatedPods[s.GpuNumber]); podIndex == -1 { + panic(internal.NewBadRequestError(fmt.Sprintf( + "Requesting more pods than the configured number for %v GPUs (%v pods) in affinity group %v", + s.GpuNumber, g.totalPodNums[s.GpuNumber], s.AffinityGroup.Name))) + } + } else { // groupPreempting + klog.Infof("[%v]: Pod is from an affinity group that is preempting others: %v", + internal.Key(pod), s.AffinityGroup.Name) + if phase == internal.PreemptingPhase && !nodesNotInSuggested.IsEmpty() { + // If we find a preempting group's placement is not fully within suggested nodes, we should cancel + // the preemption so as to reschedule it to other places. We should do this only in Preempting phase + // because only suggested nodes of this phase consider preemption. 
+ klog.Infof("[%v]: Canceling affinity group %v's preemption because its placement is "+ + "no longer fully within Preempting-phase suggested nodes (non-suggested nodes: %v)", + internal.Key(pod), g.name, nodesNotInSuggested) + h.deletePreemptingAffinityGroup(g, pod) + } else { + groupPhysicalPlacement = g.physicalGpuPlacement + groupVirtualPlacement = g.virtualGpuPlacement + preemptionVictims, _ = collectPreemptionVictims(groupPhysicalPlacement) + if len(preemptionVictims) == 0 { + klog.Infof( + "Preemption victims have been cleaned up for the preemptor affinity group %v", g.name) + } + g.preemptingPods[pod.UID] = pod + } + } + return groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, podIndex +} + +// schedulePodFromNewGroup schedules a pod from a new affinity group, find placement for the group, +// and checks if the group needs preemption. +func (h *HivedAlgorithm) schedulePodFromNewGroup( + s *api.PodSchedulingSpec, + suggestedNodes common.Set, + phase internal.SchedulingPhase, + pod *core.Pod) ( + groupPhysicalPlacement groupPhysicalPlacement, + groupVirtualPlacement groupVirtualPlacement, + preemptionVictims map[string]common.Set, + waitReason string) { + + groupPhysicalPlacement, groupVirtualPlacement, nodesNotInSuggested := h.scheduleNewAffinityGroup( + pod, s, suggestedNodes) + if groupPhysicalPlacement == nil { + if nodesNotInSuggested.IsEmpty() { + waitReason = "insufficient capacity in physical cluster" + if CellPriority(s.Priority) >= minGuaranteedPriority { + waitReason = fmt.Sprintf("insufficient capacity in VC %v", s.VirtualCluster) + } + } else { + // for an unallocated group, + // we will keep it waiting if not all of its pods are scheduled to suggested nodes + waitReason = fmt.Sprintf("affinity group has to be scheduled to some nodes "+ + "not within K8s suggested nodes: %v", nodesNotInSuggested) + } + return nil, nil, nil, waitReason + } + preemptionVictims, overlappingPreemptors := collectPreemptionVictims(groupPhysicalPlacement) + // we allow a new preemption only when in Preempting phase + // and the placement is fully within suggested nodes + if phase == internal.PreemptingPhase { + // first cancel preemption of other groups whose resources overlap with the current group + for preemptor := range overlappingPreemptors.Items() { + klog.Infof("[%v]: Canceling affinity group %v's preemption because it is "+ + "further preempted by a higher-priority affinity group %v", + internal.Key(pod), preemptor.(*AlgoAffinityGroup).name, s.AffinityGroup.Name) + h.deletePreemptingAffinityGroup(preemptor.(*AlgoAffinityGroup), pod) + } + if len(preemptionVictims) != 0 { + // create preemption state to avoid resource contention among multiple preemptors + h.createPreemptingAffinityGroup(s, groupPhysicalPlacement, groupVirtualPlacement, pod) + } + } else if len(preemptionVictims) != 0 { + // here we won't create preemption state since we call preempt only in Preempting phase + klog.Infof("[%v]: Found preemption victims %v in non-Preempting phase, skipping it", + internal.Key(pod), victimsToString(preemptionVictims)) + } + return groupPhysicalPlacement, groupVirtualPlacement, preemptionVictims, waitReason +} + // scheduleNewAffinityGroup schedules each pod of a new affinity group to a set of GPUs // (in both the physical cluster and the VC). This is the entrance of a new scheduling attempt. 
func (h *HivedAlgorithm) scheduleNewAffinityGroup( pod *core.Pod, s *api.PodSchedulingSpec, - suggestedNodes common.Set) (physicalPlacement groupPhysicalPlacement, virtualPlacement groupVirtualPlacement) { + suggestedNodes common.Set) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + klog.Infof("[%v]: Scheduling new affinity group %v", internal.Key(pod), s.AffinityGroup.Name) priority := CellPriority(s.Priority) sr := schedulingRequest{ vc: s.VirtualCluster, @@ -687,72 +740,83 @@ func (h *HivedAlgorithm) scheduleNewAffinityGroup( } h.validateSchedulingRequest(sr, pod) if sr.reservationId != "" { - klog.Infof("Use reservation %v", s.ReservationId) - physicalPlacement, virtualPlacement = h.processSchedulingRequest(sr, suggestedNodes) + klog.Infof("Using reservation %v", s.ReservationId) + physicalPlacement, virtualPlacement, nodesNotInSuggested = h.processSchedulingRequest(sr, suggestedNodes) } else if s.GpuType != "" { - physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForGivenGpuType(sr, s.GpuType, pod, suggestedNodes) - } else { - physicalPlacement, virtualPlacement = h.scheduleAffinityGroupForAnyGpuType(sr, suggestedNodes) - } - if physicalPlacement != nil { - klog.Infof("Succeeded in scheduling group %v", s.AffinityGroup.Name) + if _, ok := h.chains[s.GpuType]; !ok { + panic(internal.NewBadRequestError(fmt.Sprintf( + "[%v]: Pod requesting GPU type %v which the whole cluster does not have", + internal.Key(pod), s.GpuType))) + } + klog.Infof("Using specified GPU type %v", s.GpuType) + physicalPlacement, virtualPlacement, nodesNotInSuggested = h.scheduleAffinityGroupForGpuType( + sr, s.GpuType, pod, suggestedNodes, true) } else { - klog.Infof("Failed to schedule group %v", s.AffinityGroup.Name) + physicalPlacement, virtualPlacement, nodesNotInSuggested = h.scheduleAffinityGroupForAnyGpuType( + sr, pod, suggestedNodes) } - return physicalPlacement, virtualPlacement + return physicalPlacement, virtualPlacement, nodesNotInSuggested } -// scheduleAffinityGroupForGivenGpuType schedules an affinity group in a certain cell chain -// that matches the specified GPU type. -func (h *HivedAlgorithm) scheduleAffinityGroupForGivenGpuType( +// scheduleAffinityGroupForGpuType schedules an affinity group in a certain cell chain +// that matches the given GPU type. 
+func (h *HivedAlgorithm) scheduleAffinityGroupForGpuType( sr schedulingRequest, gpuType string, pod *core.Pod, - suggestedNodes common.Set) (groupPhysicalPlacement, groupVirtualPlacement) { - - if chains := h.chains[gpuType]; chains == nil { - panic(internal.NewBadRequestError(fmt.Sprintf( - "[%v]: Pod requesting GPU type %v which the whole cluster does not have", - internal.Key(pod), gpuType))) - } else { - vcHasType := false - for _, chain := range chains { - if h.vcSchedulers[sr.vc].getNonReservedFullCellList()[chain] != nil { - vcHasType = true - sr.chain = chain - physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes) - if physicalPlacement != nil { - return physicalPlacement, virtualPlacement - } + suggestedNodes common.Set, + typeSpecified bool) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + + vcHasType := false + for _, chain := range h.chains[gpuType] { + if sr.priority < minGuaranteedPriority || + h.vcSchedulers[sr.vc].getNonReservedFreeCellList()[chain] != nil { + vcHasType = true + klog.Infof("Searching chain %v", chain) + sr.chain = chain + physicalPlacement, virtualPlacement, chainNodesNotInSuggested := + h.processSchedulingRequest(sr, suggestedNodes) + if physicalPlacement != nil { + return physicalPlacement, virtualPlacement, chainNodesNotInSuggested + } + if !chainNodesNotInSuggested.IsEmpty() { + nodesNotInSuggested = chainNodesNotInSuggested } - } - if sr.priority >= minGuaranteedPriority && !vcHasType { - panic(internal.NewBadRequestError(fmt.Sprintf( - "[%v]: Pod requesting GPU type %v which VC %v does not have", - internal.Key(pod), gpuType, sr.vc))) } } - return nil, nil + if typeSpecified && sr.priority >= minGuaranteedPriority && !vcHasType { + panic(internal.NewBadRequestError(fmt.Sprintf( + "[%v]: Pod requesting GPU type %v which VC %v does not have", + internal.Key(pod), gpuType, sr.vc))) + } + return nil, nil, nodesNotInSuggested } -// scheduleAffinityGroupForAnyGpuType schedules an affinity group in a certain cell chain, -// trying every possible GPU type (as the user does not specify a GPU type). +// scheduleAffinityGroupForAnyGpuType schedules an affinity group in every possible GPU type +// (when the user does not specify a GPU type). 
func (h *HivedAlgorithm) scheduleAffinityGroupForAnyGpuType( sr schedulingRequest, - suggestedNodes common.Set) (groupPhysicalPlacement, groupVirtualPlacement) { - - for _, chains := range h.chains { - for _, chain := range chains { - if h.vcSchedulers[sr.vc].getNonReservedFullCellList()[chain] != nil { - sr.chain = chain - physicalPlacement, virtualPlacement := h.processSchedulingRequest(sr, suggestedNodes) - if physicalPlacement != nil { - return physicalPlacement, virtualPlacement - } - } + pod *core.Pod, + suggestedNodes common.Set) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + + for gpuType := range h.chains { + klog.Infof("Searching GPU type %v", gpuType) + physicalPlacement, virtualPlacement, typeNodesNotInSuggested := + h.scheduleAffinityGroupForGpuType(sr, gpuType, pod, suggestedNodes, false) + if physicalPlacement != nil { + return physicalPlacement, virtualPlacement, typeNodesNotInSuggested + } + if !typeNodesNotInSuggested.IsEmpty() { + nodesNotInSuggested = typeNodesNotInSuggested } } - return nil, nil + return nil, nil, nodesNotInSuggested } // validateSchedulingRequest checks the existence of VC and reservation ID, and the legality of priority. @@ -775,12 +839,34 @@ func (h *HivedAlgorithm) validateSchedulingRequest(sr schedulingRequest, pod *co // processSchedulingRequest feeds a request to a VC scheduler or the opportunistic scheduler depending on its priority. func (h *HivedAlgorithm) processSchedulingRequest( sr schedulingRequest, - suggestedNodes common.Set) (groupPhysicalPlacement, groupVirtualPlacement) { + suggestedNodes common.Set) ( + physicalPlacement groupPhysicalPlacement, + virtualPlacement groupVirtualPlacement, + nodesNotInSuggested common.Set) { + str := fmt.Sprintf("chain %v", sr.chain) + if sr.reservationId != "" { + str = fmt.Sprintf("reservation %v", sr.reservationId) + } + klog.Infof("Processing scheduling request: %v, GPU numbers %v, priority %v", + str, common.ToJson(sr.affinityGroupPodNums), sr.priority) if sr.priority >= minGuaranteedPriority { - return h.scheduleGuaranteedAffinityGroup(sr, suggestedNodes) + physicalPlacement, virtualPlacement = h.scheduleGuaranteedAffinityGroup(sr, suggestedNodes) + } else { + physicalPlacement = h.scheduleOpportunisticAffinityGroup(sr, suggestedNodes) + } + if physicalPlacement == nil { + klog.Infof("Cannot find placement in %v", str) + return nil, nil, nodesNotInSuggested + } + nodesNotInSuggested = collectNodesNotSuggested(physicalPlacement, suggestedNodes) + if !nodesNotInSuggested.IsEmpty() { + klog.Infof("Found placement in %v NOT fully within suggested nodes, "+ + "placement: %v, non-suggested nodes: %v", str, physicalPlacement, nodesNotInSuggested) + return nil, nil, nodesNotInSuggested } else { - return h.scheduleOpportunisticAffinityGroup(sr, suggestedNodes), nil + klog.Infof("Found placement in %v fully within suggested nodes: %v", str, physicalPlacement) + return physicalPlacement, virtualPlacement, nodesNotInSuggested } } @@ -835,16 +921,8 @@ func (h *HivedAlgorithm) scheduleOpportunisticAffinityGroup( sr schedulingRequest, suggestedNodes common.Set) groupPhysicalPlacement { - physicalPlacement := h.opportunisticSchedulers[sr.chain].Schedule( + return h.opportunisticSchedulers[sr.chain].Schedule( sr.affinityGroupPodNums, opportunisticPriority, suggestedNodes) - if physicalPlacement == nil { - klog.Infof("Failed in scheduling in PC due to insufficient capacity for scheduling request: GPU numbers %v, priority %v, chain %v", - 
sr.affinityGroupPodNums, sr.priority, sr.chain) - } else { - klog.Infof("Succeeded in scheduling in PC for scheduling request: GPU numbers %v, priority %v, chain %v", - sr.affinityGroupPodNums, sr.priority, sr.chain) - } - return physicalPlacement } // createAllocatedAffinityGroup creates a new affinity group and allocate the resources. diff --git a/pkg/algorithm/hived_algorithm_test.go b/pkg/algorithm/hived_algorithm_test.go index 2ca0de4..b3a371c 100644 --- a/pkg/algorithm/hived_algorithm_test.go +++ b/pkg/algorithm/hived_algorithm_test.go @@ -424,6 +424,14 @@ var pss = map[types.UID]api.PodSchedulingSpec{ GpuType: "DGX2-V100", GpuNumber: 16, AffinityGroup: group26, + }, "pod36": { // will iterate the GPU types until find a placement within suggested nodes + VirtualCluster: "VC1", + Priority: -1, + LazyPreemptionEnable: true, + ReservationId: "", + GpuType: "", + GpuNumber: 1, + AffinityGroup: group1, }, } @@ -468,6 +476,7 @@ var expectedBindInfos = map[string]result{ "pod25": {node: "0.0.0.1", gpuIsolation: []int32{0, 1}}, "pod28": {node: "0.0.3.0", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}, "pod34": {node: "0.0.3.0", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}, + "pod36": {node: "0.0.1.0", gpuIsolation: []int32{0}}, } var expectedPreemptInfos = map[string]common.Set{ @@ -616,17 +625,21 @@ func testDeletePods(t *testing.T, h *HivedAlgorithm) { } func testSuggestedNodes(t *testing.T, h *HivedAlgorithm) { + pod := allPods["pod36"] + pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID]) + psr := h.Schedule(pod, []string{"0.0.1.0"}, internal.PreemptingPhase) + compareSchedulingResult(t, pod, psr) + var nodes []string for _, node := range allNodes { if node != "0.0.3.1" { nodes = append(nodes, node) } } - pod := allPods["pod27"] + pod = allPods["pod27"] pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID]) - psr := h.Schedule(pod, nodes, internal.PreemptingPhase) + psr = h.Schedule(pod, nodes, internal.PreemptingPhase) compareSchedulingResult(t, pod, psr) - nodes = append(nodes, "0.0.3.1") pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID]) // this time scheduling will succeed @@ -647,7 +660,8 @@ func testSuggestedNodes(t *testing.T, h *HivedAlgorithm) { // this time group will be preempting psr = h.Schedule(pod, nodes, internal.PreemptingPhase) if g := h.affinityGroups[pss[pod.UID].AffinityGroup.Name]; g == nil { - t.Errorf("Group %v should be preempting but does not exist", g.name) + t.Errorf("Group %v should be preempting but does not exist", + pss[pod.UID].AffinityGroup.Name) } else if g.state != groupPreempting { t.Errorf("Group %v should be in Preempting state but not", g.name) } diff --git a/pkg/algorithm/intra_vc_scheduler.go b/pkg/algorithm/intra_vc_scheduler.go index b9eb4bf..d8f4c29 100644 --- a/pkg/algorithm/intra_vc_scheduler.go +++ b/pkg/algorithm/intra_vc_scheduler.go @@ -88,26 +88,23 @@ func (s *defaultIntraVCScheduler) getReservedCellList() map[api.ReservationId]Ch return s.reservedCellList } -func (s *defaultIntraVCScheduler) schedule(sr schedulingRequest) groupVirtualPlacement { - var scheduler *topologyAwareScheduler - var str string +func (s *defaultIntraVCScheduler) schedule(sr schedulingRequest) (placement groupVirtualPlacement) { + scheduler := s.nonReservedSchedulers[sr.chain] + str := fmt.Sprintf("chain %v", sr.chain) if sr.reservationId != "" { scheduler = s.reservedSchedulers[sr.reservationId] str = 
fmt.Sprintf("reservation %v", sr.reservationId) - } else { - scheduler = s.nonReservedSchedulers[sr.chain] - str = fmt.Sprintf("chain %v", sr.chain) } - var placement groupVirtualPlacement + klog.Infof("Processing scheduling request in VC %v: %v, GPU numbers %v, priority %v", + sr.vc, str, common.ToJson(sr.affinityGroupPodNums), sr.priority) if scheduler != nil { placement = scheduler.Schedule(sr.affinityGroupPodNums, sr.priority, common.NewSet()) } if placement == nil { - klog.Infof("Insufficient capacity in VC %v for scheduling request: %v, GPU numbers %v, priority %v", - sr.vc, str, sr.affinityGroupPodNums, sr.priority) + klog.Infof("Cannot find placement in VC %v", sr.vc) } else { - klog.Infof("Succeeded in scheduling in VC %v for scheduling request: %v, GPU numbers %v, priority %v", - sr.vc, str, sr.affinityGroupPodNums, sr.priority) + klog.Infof("Found placement in VC %v: %v", + sr.vc, placement) } return placement } diff --git a/pkg/algorithm/utils.go b/pkg/algorithm/utils.go index 5e20b8b..1d8dfdd 100644 --- a/pkg/algorithm/utils.go +++ b/pkg/algorithm/utils.go @@ -37,16 +37,14 @@ import ( func generatePodScheduleResult( groupPhysicalPlacement groupPhysicalPlacement, groupVirtualPlacement groupVirtualPlacement, - priority CellPriority, preemptionVictims map[string]common.Set, - nodesNotInSuggested common.Set, + waitReason string, cellLevelToType map[CellChain]map[CellLevel]api.CellType, currentGpuNum int32, currentPodIndex int32, group *AlgoAffinityGroup, groupName string, suggestedNodes common.Set, - vc api.VirtualClusterName, pod *core.Pod) internal.PodScheduleResult { klog.V(4).Infof("[%v]: Got K8s suggested nodes: %v", internal.Key(pod), suggestedNodes) @@ -59,25 +57,6 @@ func generatePodScheduleResult( PodPreemptInfo: generatePodPreemptInfo(preemptionVictims, pod), } } - var waitReason string - if groupPhysicalPlacement == nil { - waitReason = "insufficient capacity in physical cluster" - if priority >= minGuaranteedPriority { - waitReason = fmt.Sprintf("insufficient capacity in VC %v", vc) - } - } else if !nodesNotInSuggested.IsEmpty() { - if group == nil || group.state == groupPreempting { - // for an unallocated group, we will keep it waiting if not all of its pods are scheduled to suggested nodes - waitReason = fmt.Sprintf( - "affinity group is decided to be scheduled to some nodes not within K8s suggested nodes: %v", - nodesNotInSuggested) - } else { - // for an existing group, we always insist the previous scheduling decision - // even if some pods are now not within suggested nodes - klog.Warningf("Some nodes used by affinity group %v are no longer within K8s suggested nodes: %v", - group.name, nodesNotInSuggested) - } - } if waitReason != "" { klog.Infof("[%v]: need to wait because %v", internal.Key(pod), waitReason) return internal.PodScheduleResult{PodWaitInfo: &internal.PodWaitInfo{Reason: waitReason}} @@ -100,7 +79,8 @@ func generatePodScheduleResult( // generatePodPreemptInfo writes the preemption victims into a PodPreemptInfo. 
func generatePodPreemptInfo(preemptionVictims map[string]common.Set, pod *core.Pod) *internal.PodPreemptInfo { - klog.Infof("[%v]: Preemption victim candidates: %v", internal.Key(pod), victimsToString(preemptionVictims)) + klog.Infof("[%v]: Preemption victim candidates: %v", + internal.Key(pod), victimsToString(preemptionVictims)) var ( nodesHavingVictims []string victimPods []*core.Pod diff --git a/pkg/api/types.go b/pkg/api/types.go index bf59334..8c261b3 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -192,8 +192,8 @@ type CellStatus struct { // (e.g., VC1/0/0 may represent VC1, preassigned cell 0, index 0 among its children) CellAddress CellAddress `json:"cellAddress"` // CellState and CellHealthiness are two orthogonal fields. - // That means, there are four possible combinations of them: a cell may be in - // (1) used and healthy, (2) used and bad, (3) free and healthy, and (4) free and bad. + // CellState represents whether the cell is being used (or acquired) by an affinity group. + // CellHealthiness represents whether the physical hardware is working normally. CellState CellState `json:"cellState"` CellHealthiness CellHealthiness `json:"cellHealthiness"` CellPriority int32 `json:"cellPriority"`
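
As a rough illustration of the refined chain search in this patch (not the scheduler's actual types or APIs), the sketch below distills the new loop in scheduleAffinityGroupForGpuType / scheduleAffinityGroupForAnyGpuType: instead of stopping at the first chain that has capacity, the search keeps iterating over chains until it finds a placement fully within K8s suggested nodes, and remembers the last set of non-suggested nodes so that it can be reported as the wait reason. All names here (tryChain, searchChains, the sample chain and node names) are hypothetical stand-ins.

package main

import "fmt"

// placement maps node name -> GPU indices allocated on that node; nil means no placement found.
type placement map[string][]int

// tryChain stands in for processSchedulingRequest on a single cell chain: it returns either
// a placement fully within suggested nodes, or nil plus the nodes that fell outside them.
func tryChain(chain string, suggested map[string]bool) (placement, []string) {
	// Hypothetical per-chain candidate placements, for illustration only.
	candidates := map[string]placement{
		"chainA": {"node-3": {0, 1}}, // node-3 is not in the suggested set
		"chainB": {"node-1": {0, 1}}, // node-1 is in the suggested set
	}
	p := candidates[chain]
	if p == nil {
		return nil, nil // no capacity in this chain
	}
	var notSuggested []string
	for node := range p {
		if !suggested[node] {
			notSuggested = append(notSuggested, node)
		}
	}
	if len(notSuggested) > 0 {
		return nil, notSuggested // reject this chain, but remember why
	}
	return p, nil
}

// searchChains mirrors the refined behavior: keep searching the chains until a placement
// fully within suggested nodes is found; otherwise report the last non-suggested nodes seen.
func searchChains(chains []string, suggested map[string]bool) (placement, []string) {
	var lastNotSuggested []string
	for _, chain := range chains {
		p, notSuggested := tryChain(chain, suggested)
		if p != nil {
			return p, nil
		}
		if len(notSuggested) > 0 {
			lastNotSuggested = notSuggested
		}
	}
	return nil, lastNotSuggested
}

func main() {
	suggested := map[string]bool{"node-1": true, "node-2": true}
	p, notSuggested := searchChains([]string{"chainA", "chainB"}, suggested)
	fmt.Println(p, notSuggested) // map[node-1:[0 1]] []
}

In this toy run, chainA has capacity but only on a non-suggested node, so the search does not stop there (the pre-patch behavior would have); it continues to chainB and returns a placement within suggested nodes, which matches the new test case (pod36) that iterates GPU types until a placement within suggested nodes is found.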