base for nvptx sched #1

Open · wants to merge 1 commit into ptx-base
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -31,6 +31,7 @@ set(NVPTXCodeGen_sources
  NVPTXPrologEpilogPass.cpp
  NVPTXRegisterInfo.cpp
  NVPTXReplaceImageHandles.cpp
  NVPTXSchedStrategy.cpp
  NVPTXSubtarget.cpp
  NVPTXTargetMachine.cpp
  NVPTXTargetTransformInfo.cpp
306 changes: 306 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXSchedStrategy.cpp
@@ -0,0 +1,306 @@
//===-- NVPTXSchedStrategy.cpp - NVPTX Scheduler Strategy ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing warp
/// occupancy on NVPTX hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in PTXScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// PTXScheduleDAGMILive::runSchedStages.

/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//
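
// Illustrative only: a rough sketch of the stage-driven flow described above.
// PTXScheduleDAGMILive and its stages are not part of this file, so the names
// and loop structure below are assumptions rather than the actual code:
//
//   void PTXScheduleDAGMILive::runSchedStages() {
//     for (auto &Stage : SchedStages)          // e.g. an occupancy-first pass,
//       for (auto &Region : RecordedRegions)   //   then ILP-relaxation passes
//         if (Stage.initRegion(Region))        // skip regions whose pressure
//           scheduleRegion(Region, Stage);     //   does not limit occupancy
//   }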

#include "NVPTXSchedStrategy.h"
#include "NVPTXSubtarget.h"
#include "NVPTXMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

NVPTXSchedStrategy::NVPTXSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void NVPTXSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const NVPTXSubtarget &ST = MF->getSubtarget<NVPTXSubtarget>();

  NVPTXMachineFunctionInfo &MFI = *MF->getInfo<NVPTXMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is warp limited or memory
  // bound and the relaxed occupancy feature is in use.
  // TargetOccupancy = MFI.getOccupancy();
  //   RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
}
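
// A minimal sketch (an assumption, not part of this patch) of how a target
// occupancy in warps per SM could translate into the per-thread register
// budget that the 'Critical' limits are meant to protect. The register-file
// size is illustrative; the real value depends on the SM architecture.
#if 0
static unsigned estimateRegBudgetPerThread(unsigned TargetOccupancy) {
  const unsigned RegsPerSM = 64 * 1024; // 32-bit registers per SM (example)
  const unsigned WarpSize = 32;         // threads per warp
  if (TargetOccupancy == 0)
    return RegsPerSM / WarpSize;        // avoid division by zero
  return RegsPerSM / (TargetOccupancy * WarpSize);
}
#endif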

void NVPTXSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                       bool AtTop,
                                       const RegPressureTracker &RPTracker,
                                       const NVPTXRegisterInfo *SRI,
                                       unsigned SGPRPressure,
                                       unsigned VGPRPressure) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom-up scheduling, the register pressure is cached
    // and can be retrieved by DAG->getPressureDiff(SU).
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[NVPTX::RegisterPressureSets::Int16Regs];
  unsigned NewVGPRPressure = Pressure[NVPTX::RegisterPressureSets::Float32Regs];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
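
  // For example (hypothetical numbers): with VGPRExcessLimit = 64 and a
  // current Float32Regs pressure of 50, ShouldTrackVGPRs is already true
  // because 50 + 16 >= 64, so only Float32Regs excess is reported for this
  // node even if Int16Regs pressure is also close to its own limit.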

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess =
        PressureChange(NVPTX::RegisterPressureSets::Float32Regs);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess =
        PressureChange(NVPTX::RegisterPressureSets::Int16Regs);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the warp occupancy for the execution unit. When register
  // pressure is 'CRITICAL', increasing SGPR or VGPR pressure has the same
  // cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(NVPTX::RegisterPressureSets::Int16Regs);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(NVPTX::RegisterPressureSets::Float32Regs);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}
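
// Note for readers of this patch: the RPDelta.Excess and RPDelta.CriticalMax
// values filled in by initCandidate() are consumed by the inherited
// GenericScheduler::tryCandidate(), which (among its other heuristics) prefers
// the candidate with the smaller excess-pressure increase and then the smaller
// critical-pressure increase, so nodes that stay under the limits win over
// nodes that push a pressure set past them.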

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void NVPTXSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                           const CandPolicy &ZonePolicy,
                                           const RegPressureTracker &RPTracker,
                                           SchedCandidate &Cand) {
  const NVPTXRegisterInfo *SRI = static_cast<const NVPTXRegisterInfo*>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[NVPTX::RegisterPressureSets::Int16Regs];
    VGPRPressure = Pressure[NVPTX::RegisterPressureSets::Float32Regs];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {
    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *NVPTXSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *NVPTXSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

#if 0
static ScheduleDAGInstrs *createNVPTXMachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, std::make_unique<NVPTXSchedStrategy>(C));
}

static MachineSchedRegistry NVPTXSchedRegistry("nvptx-base",
                                               "Run NVPTX's custom scheduler",
                                               createNVPTXMachineScheduler);
#endif
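
// Usage sketch (an assumption, relevant once the block above is enabled): the
// MachineSchedRegistry entry makes this strategy selectable by name through
// the common machine-scheduler option, e.g.
//
//   llc -march=nvptx64 -enable-misched -misched=nvptx-base kernel.ll
//
// assuming the pre-RA machine scheduler itself is enabled for the NVPTX
// subtarget.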