diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini
index 07e9a626..d1c88a18 100644
--- a/example/optsched-cfg/sched.ini
+++ b/example/optsched-cfg/sched.ini
@@ -8,12 +8,38 @@ USE_OPT_SCHED YES
 # Same options as use optimal scheduling.
 PRINT_SPILL_COUNTS YES
 
+# Print clustering information.
+# YES
+# NO
+PRINT_CLUSTER YES
+
 # Use two pass scheduling approach.
 # First pass minimizes RP and second pass tries to balance RP and ILP.
 # YES
 # NO
 USE_TWO_PASS NO
 
+# Sets a limit for occupancy in the second ILP pass. We will not go below this
+# occupancy when attempting rescheduling.
+# Valid values: 1-10 (whole integers)
+MIN_OCCUPANCY_FOR_RESCHEDULE 3
+
+# Sets the required schedule length improvement percentage for the second ILP
+# pass. If we do not meet this minimum improvement, we do not keep the
+# lower occupancy schedules.
+# Valid values: 1-100 (whole integers)
+MIN_ILP_IMPROVEMENT 10
+
+# Allow enumerator to try to cluster memory operations together in the second
+# pass.
+# YES
+# NO
+CLUSTER_MEMORY_OPS NO
+
+# The weight for clustering. This factor determines the importance of
+# trying to find clusters when enumerating.
+CLUSTER_WEIGHT 1000
+
 # These 3 flags control which schedulers will be used.
 # Each one can be individually toggled. The heuristic
 # list scheduler or ACO must be run before the
@@ -85,7 +111,8 @@ HEURISTIC LUC_CP_NID
 ENUM_HEURISTIC LUC_CP_NID
 
 # The heuristic used for the enumerator in the second pass of the two-pass scheduling approach.
-# Same valid values as HEURISTIC.
+# Same valid values as HEURISTIC with an additional heuristic:
+# Cluster: Favor instructions that are part of an active memory clustering group.
 SECOND_PASS_ENUM_HEURISTIC LUC_CP_NID
 
 # The spill cost function to be used. Valid values are:
diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h
index 8eb1499d..6180e344 100644
--- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h
+++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h
@@ -14,9 +14,12 @@ class OptSchedDDGWrapperBase {
 public:
   virtual ~OptSchedDDGWrapperBase() = default;
 
-  virtual void convertSUnits() = 0;
+  virtual void convertSUnits(bool IgnoreRealEdges,
+                             bool IgnoreArtificialEdges) = 0;
 
   virtual void convertRegFiles() = 0;
+
+  virtual int findPossibleClusters(bool IsLoad) = 0;
 };
 
 } // namespace opt_sched
diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h
index 27e3cbed..ef536b85 100644
--- a/include/opt-sched/Scheduler/bb_spill.h
+++ b/include/opt-sched/Scheduler/bb_spill.h
@@ -12,8 +12,11 @@ Last Update: Apr. 2011
 #include "opt-sched/Scheduler/OptSchedTarget.h"
 #include "opt-sched/Scheduler/defines.h"
 #include "opt-sched/Scheduler/sched_region.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include <map>
+#include <memory>
 #include <set>
 #include <vector>
 
@@ -32,6 +35,91 @@ class BBWithSpill : public SchedRegion {
   InstCount crntSpillCost_;
   InstCount optmlSpillCost_;
 
+  int CurrentClusterCost;
+
+  /// Used to calculate the dynamic lower bound for clustering.
+  llvm::SmallVector<int, 4> ClusterCount;
+  llvm::SmallVector<int, 4> ClusterInstrRemainderCount;
+  int ClusterGroupCount;
+
+  void computeAndPrintClustering(InstSchedule *Sched) override;
+
+  /// Print the current clusters found so far in the schedule.
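+  /// (The output is written to LLVM's dbgs() stream.)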
+  void printCurrentClustering() override;
+
+  void initForClustering();
+
+  /// Calculate the lower bound cost for memory operations clustering and
+  /// return the lower bound cost. Does not take into account the clustering
+  /// weight.
+  int calculateClusterStaticLB();
+
+  /// Helper function for clustering to save the state of the current cluster.
+  void saveCluster(SchedInstruction *inst);
+
+  /// Helper function for clustering to start a new cluster.
+  void initCluster(SchedInstruction *inst);
+
+  /// Reset the active cluster to 0 (none).
+  void resetActiveCluster(SchedInstruction *inst);
+
+  /// Helper function to restore the previous cluster.
+  void restorePreviousCluster(SchedInstruction *inst);
+
+  bool isClusterFinished();
+
+  int calculateClusterDLB();
+
+  /// Current cluster size
+  unsigned int CurrentClusterSize;
+
+  /// The minimum number of cluster blocks possible.
+  int MinClusterBlocks;
+
+  /// The minimum number of cluster blocks + the optimistic expected cluster
+  /// blocks remaining.
+  int DynamicClusterLowerBound;
+
+  /// Current active cluster group.
+  int ClusterActiveGroup;
+
+  int StartCycle;
+
+  /// Data struct to contain information about the previous clusters
+  struct PastClusters {
+    /// The cluster group
+    int ClusterGroup;
+    /// Size of the cluster when it was ended by an instruction not in the
+    /// cluster
+    int ClusterSize;
+
+    /// Instruction number that ended this cluster. Used to check if we should
+    /// restore the cluster state when backtracking.
+    int InstNum;
+
+    int Start;
+
+    /// Contains the actual names of the instructions in the cluster. Only used
+    /// for printing and debugging purposes.
+    std::unique_ptr<llvm::SmallVector<llvm::StringRef, 4>> InstrList;
+
+    /// Constructor for this struct
+    PastClusters(int Cluster, int Size, int Instructions, int CycleStart)
+        : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions),
+          Start(CycleStart) {}
+  };
+
+  /// Vector containing the (n-1) past clusters
+  llvm::SmallVector<std::unique_ptr<PastClusters>, 4> PastClustersList;
+
+  /// Contains the actual names of the instructions in the current cluster.
+  /// Only used for printing and debugging purposes.
+  std::unique_ptr<llvm::SmallVector<llvm::StringRef, 4>> InstrList;
+
+  /// Pointer to the last cluster. This is kept out of the vector to avoid
+  /// having to fetch it every time we compare the current instruction
+  /// number to the one that ended the cluster.
+  std::unique_ptr<PastClusters> LastCluster;
 
   // The target machine
   const OptSchedTarget *OST;
@@ -103,7 +191,8 @@ class BBWithSpill : public SchedRegion {
   void InitForCostCmputtn_();
   InstCount CmputDynmcCost_();
 
-  void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts);
+  void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts,
+                                 int Start);
   void UpdateSpillInfoForUnSchdul_(SchedInstruction *inst);
   void SetupPhysRegs_();
   void CmputCrntSpillCost_();
diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h
index 3ef48cab..5b021145 100644
--- a/include/opt-sched/Scheduler/data_dep.h
+++ b/include/opt-sched/Scheduler/data_dep.h
@@ -13,6 +13,7 @@ Last Update: Mar. 2011
 #include "opt-sched/Scheduler/buffers.h"
 #include "opt-sched/Scheduler/defines.h"
 #include "opt-sched/Scheduler/sched_basic_data.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include
 
@@ -291,7 +292,24 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase,
 
   RegisterFile *getRegFiles() { return RegFiles.get(); }
 
+  // Memory clustering helper functions
+  int getMinClusterCount() { return MinClusterCount; }
+  void setMinClusterCount(int Max) { MinClusterCount = Max; }
+  int getTotalInstructionsInAllClusters() {
+    return TotalInstructionsInAllClusters;
+  }
+  void setTotalInstructionsInAllClusters(int Max) {
+    TotalInstructionsInAllClusters = Max;
+  }
+  int getTotalInstructionsInCluster(int Cluster);
+
 protected:
+  int MinClusterCount;
+  int TotalInstructionsInAllClusters;
+  /// Map the cluster block to the total number of instructions found in the
+  /// block
+  MapVector<int, int> MaxInstructionsInEachClusters;
+
   // TODO(max): Get rid of this.
   // Number of basic blocks
   int32_t bscBlkCnt_;
@@ -391,7 +409,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase,
                         InstCount fileUB, int blkNum);
   FUNC_RESULT FinishNode_(InstCount nodeNum, InstCount edgeCnt = -1);
   void CreateEdge_(InstCount frmInstNum, InstCount toInstNum, int ltncy,
-                   DependenceType depType);
+                   DependenceType depType, bool IsArtificial = false);
 
   FUNC_RESULT Finish_();
 
@@ -629,6 +647,9 @@ class InstSchedule {
   // The schedule's spill cost according to the cost function used
   InstCount spillCost_;
 
+  // The number of cluster blocks in the schedule
+  int ClusterSize;
+
   // An array of peak reg pressures for all reg types in the schedule
   InstCount *peakRegPressures_;
 
@@ -676,6 +697,8 @@ class InstSchedule {
   InstCount GetExecCost() const;
   void SetSpillCost(InstCount cost);
   InstCount GetSpillCost() const;
+  void setClusterSize(int size);
+  int getClusterSize() const;
 
   void ResetInstIter();
   InstCount GetFrstInst(InstCount &cycleNum, InstCount &slotNum);
@@ -699,6 +722,7 @@ class InstSchedule {
   void Print(std::ostream &out, char const *const title);
   void PrintInstList(FILE *file, DataDepGraph *dataDepGraph,
                      const char *title) const;
+  void Print(std::ostream &out, char const *const title, DataDepGraph *ddg);
   void PrintRegPressures() const;
   bool Verify(MachineModel *machMdl, DataDepGraph *dataDepGraph);
   void PrintClassData();
diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h
index be2f376f..d165ddd0 100644
--- a/include/opt-sched/Scheduler/enumerator.h
+++ b/include/opt-sched/Scheduler/enumerator.h
@@ -153,6 +153,12 @@ class EnumTreeNode {
   InstCount peakSpillCost_;
   InstCount spillCostSum_;
   InstCount totalCost_ = -1;
+  int ClusterCost;
+  int ClusterActiveGroup;
+  int ClusterAbsorbCount;
+  int ClusterDLB;
+  int ClusterTotalCost = -1;
+  int ClusterBestCost;
   bool totalCostIsActualCost_ = false;
   ReserveSlot *rsrvSlots_;
 
@@ -276,6 +282,18 @@ class EnumTreeNode {
   inline void SetSpillCostSum(InstCount cost);
   inline InstCount GetSpillCostSum();
 
+  inline void setClusteringCost(int Cost);
+  inline int getClusteringCost();
+  inline void setCurClusteringGroup(int Group);
+  inline int getCurClusteringGroup();
+  inline void setClusterAbsorbCount(int Absorb);
+  inline int getClusterAbsorbCount();
+  inline void setClusterLwrBound(int ClusterDynamicLowerBound);
+  inline int getClusterLwrBound();
+  inline void setTotalClusterCost(int Cost);
+  inline int getTotalClusterCost();
+  inline bool isClustering();
+
   bool ChkInstRdndncy(SchedInstruction *inst, int brnchNum);
   bool IsNxtSlotStall();
 
@@ -317,6 +335,9 @@ class Enumerator : public ConstrainedScheduler {
   friend class HistEnumTreeNode;
   friend class CostHistEnumTreeNode;
 
+  // Should we cluster memory operations
+  bool Clustering;
+
   // TODO(max): Document.
   bool isCnstrctd_;
 
@@ -508,7 +529,7 @@ class Enumerator : public ConstrainedScheduler {
              InstCount schedUprBound, int16_t sigHashSize,
              SchedPriorities prirts, Pruning PruningStrategy,
              bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout,
-             InstCount preFxdInstCnt = 0,
+             bool ClusteringEnabled, InstCount preFxdInstCnt = 0,
              SchedInstruction *preFxdInsts[] = NULL);
   virtual ~Enumerator();
   virtual void Reset();
@@ -525,6 +546,8 @@ class Enumerator : public ConstrainedScheduler {
   // (Chris)
   inline bool IsSchedForRPOnly() const { return SchedForRPOnly_; }
 
+  inline bool isClustering() const { return Clustering; }
+
   // Calculates the schedule and returns it in the passed argument.
   FUNC_RESULT FindSchedule(InstSchedule *sched, SchedRegion *rgn) {
     return RES_ERROR;
@@ -586,6 +609,7 @@ class LengthCostEnumerator : public Enumerator {
   bool WasObjctvMet_();
   bool BackTrack_();
   InstCount GetBestCost_();
+  int GetBestClusterCost_();
   void CreateRootNode_();
 
   // Check if branching from the current node by scheduling this instruction
@@ -603,7 +627,7 @@ class LengthCostEnumerator : public Enumerator {
                       SchedPriorities prirts, Pruning PruningStrategy,
                       bool SchedForRPOnly, bool enblStallEnum,
                       Milliseconds timeout, SPILL_COST_FUNCTION spillCostFunc,
-                      InstCount preFxdInstCnt = 0,
+                      bool ClusteringEnabled, InstCount preFxdInstCnt = 0,
                       SchedInstruction *preFxdInsts[] = NULL);
   virtual ~LengthCostEnumerator();
   void Reset();
@@ -616,6 +640,7 @@ class LengthCostEnumerator : public Enumerator {
   bool IsCostEnum();
   SPILL_COST_FUNCTION GetSpillCostFunc() { return spillCostFunc_; }
   inline InstCount GetBestCost() { return GetBestCost_(); }
+  int getBestClusterCost() { return GetBestClusterCost_(); }
 };
 /*****************************************************************************/
 
@@ -851,6 +876,44 @@ void EnumTreeNode::SetSpillCostSum(InstCount cost) {
 InstCount EnumTreeNode::GetSpillCostSum() { return spillCostSum_; }
 /*****************************************************************************/
 
+void EnumTreeNode::setClusteringCost(int Cost) {
+  assert(Cost >= 0);
+  ClusterCost = Cost;
+}
+
+int EnumTreeNode::getClusteringCost() { return ClusterCost; }
+
+void EnumTreeNode::setCurClusteringGroup(int Group) {
+  assert(Group >= 0);
+  ClusterActiveGroup = Group;
+}
+
+int EnumTreeNode::getCurClusteringGroup() { return ClusterActiveGroup; }
+
+void EnumTreeNode::setClusterAbsorbCount(int Absorb) {
+  assert(Absorb >= 0);
+  ClusterAbsorbCount = Absorb;
+}
+
+int EnumTreeNode::getClusterAbsorbCount() { return ClusterAbsorbCount; }
+
+void EnumTreeNode::setClusterLwrBound(int ClusterDynamicLowerBound) {
+  assert(ClusterDynamicLowerBound >= 0);
+  ClusterDLB = ClusterDynamicLowerBound;
+}
+
+int EnumTreeNode::getClusterLwrBound() { return ClusterDLB; }
+
+void EnumTreeNode::setTotalClusterCost(int Cost) {
+  assert(Cost >= 0);
+  ClusterTotalCost = Cost;
+}
+
+int EnumTreeNode::getTotalClusterCost() { return ClusterTotalCost; }
+
+bool EnumTreeNode::isClustering() { return enumrtr_->isClustering(); }
+/*****************************************************************************/
+
 bool EnumTreeNode::IsNxtCycleNew_() {
   if (enumrtr_->issuRate_ == 1) {
     return true;
diff --git a/include/opt-sched/Scheduler/graph.h b/include/opt-sched/Scheduler/graph.h
index af8ba8f2..fea0576f 100644
--- a/include/opt-sched/Scheduler/graph.h
+++ b/include/opt-sched/Scheduler/graph.h
@@ -49,11 +49,15 @@ struct GraphEdge {
   UDT_GEDGES predOrder;
   // The second node's order in the first node's successor list.
   UDT_GEDGES succOrder;
+  // Whether the edge is an artificial dependence, i.e., one that is not
+  // required for correctness
+  bool IsArtificial;
 
   // Creates an edge between two nodes with labels label and label2.
   GraphEdge(GraphNode *from, GraphNode *to, UDT_GLABEL label,
-            UDT_GLABEL label2 = 0)
-      : from(from), to(to), label(label), label2(label2) {}
+            UDT_GLABEL label2 = 0, bool IsArtificial = false)
+      : from(from), to(to), label(label), label2(label2),
+        IsArtificial(IsArtificial) {}
 
   // Returns the node on the other side of the edge from the provided node.
   // Assumes that the argument is one of the nodes on the sides of the edge.
@@ -512,7 +516,7 @@ inline UDT_GEDGES GraphNode::GetRcrsvScsrCnt() const {
 }
 
 inline LinkedList<GraphEdge> *GraphNode::GetNghbrLst(DIRECTION dir) {
-  return dir == DIR_FRWRD ? scsrLst_ : prdcsrLst_;
+  return dir == DIR_FRWRD ? prdcsrLst_ : scsrLst_;
 }
 
 inline GraphEdge *GraphNode::GetFrstScsrEdge() {
diff --git a/include/opt-sched/Scheduler/hist_table.h b/include/opt-sched/Scheduler/hist_table.h
index 982c87a6..85f6592b 100644
--- a/include/opt-sched/Scheduler/hist_table.h
+++ b/include/opt-sched/Scheduler/hist_table.h
@@ -109,6 +109,10 @@ class CostHistEnumTreeNode : public HistEnumTreeNode {
   InstCount cost_;
   InstCount peakSpillCost_;
   InstCount spillCostSum_;
+  int ClusterCost;
+  int ClusterActiveGroup;
+  int ClusterAbsorbCount;
+  int ClusterTotalCost;
 
   // (Chris)
   InstCount totalCost_ = -1;
@@ -119,7 +123,6 @@ class CostHistEnumTreeNode : public HistEnumTreeNode {
 #ifdef IS_DEBUG
   bool costInfoSet_;
 #endif
-
   bool ChkCostDmntnForBBSpill_(EnumTreeNode *node, Enumerator *enumrtr);
   bool ChkCostDmntn_(EnumTreeNode *node, Enumerator *enumrtr,
                      InstCount &maxShft);
diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h
index 8861d843..537de59a 100644
--- a/include/opt-sched/Scheduler/lnkd_lst.h
+++ b/include/opt-sched/Scheduler/lnkd_lst.h
@@ -573,43 +573,66 @@ inline T *PriorityList<T, K>::GetNxtPriorityElmnt(K &key) {
   }
 }
 
+//(Vlad) Added functionality to decrease an entry's priority, used for
+// lowering the priority of clusterable instructions when leaving a cluster.
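+// A minimal usage sketch (illustrative, not part of this patch): when the
+// enumerator leaves a cluster, the ready list recomputes each clusterable
+// instruction's key and calls BoostEntry with the new, now smaller key; the
+// second branch below then moves the entry down the list instead of tripping
+// the old boost-only assert.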
 template <class T, class K>
 void PriorityList<T, K>::BoostEntry(KeyedEntry<T, K> *entry, K newKey) {
   KeyedEntry<T, K> *crnt;
   KeyedEntry<T, K> *next = entry->GetNext();
   KeyedEntry<T, K> *prev = entry->GetPrev();
 
-  assert(newKey > entry->key);
   assert(LinkedList<T>::topEntry_ != NULL);
 
-  entry->key = newKey;
+  if (entry->key < newKey) // behave normally
+  {
+    entry->key = newKey;
 
-  // If it is already at the top, or its previous still has a larger key,
-  // then the entry is already in place and no boosting is needed
-  if (entry == LinkedList<T>::topEntry_ || prev->key >= newKey)
-    return;
+    // If it is already at the top, or its previous still has a larger key,
+    // then the entry is already in place and no boosting is needed
+    if (entry == LinkedList<T>::topEntry_ || prev->key >= newKey)
+      return;
 
-  prev = NULL;
+    prev = NULL;
 
-  for (crnt = entry->GetPrev(); crnt != NULL; crnt = crnt->GetPrev()) {
-    if (crnt->key >= newKey) {
-      assert(crnt != entry);
-      assert(crnt != entry->GetPrev());
-      prev = crnt;
-      break;
+    for (crnt = entry->GetPrev(); crnt != NULL; crnt = crnt->GetPrev()) {
+      if (crnt->key >= newKey) {
+        assert(crnt != entry);
+        assert(crnt != entry->GetPrev());
+        prev = crnt;
+        break;
+      }
     }
-  }
 
-  if (prev == NULL) {
-    next = (KeyedEntry<T, K> *)LinkedList<T>::topEntry_;
-  } else {
-    next = prev->GetNext();
-    assert(next != NULL);
-  }
+    if (prev == NULL) {
+      next = (KeyedEntry<T, K> *)LinkedList<T>::topEntry_;
+    } else {
+      next = prev->GetNext();
+      assert(next != NULL);
+    }
 
-  assert(next != entry->GetNext());
-  LinkedList<T>::RmvEntry_(entry, false);
-  InsrtEntry_(entry, next);
+    assert(next != entry->GetNext());
+    LinkedList<T>::RmvEntry_(entry, false);
+    InsrtEntry_(entry, next);
+  } else // move entry down on priority list
+  {
+    entry->key = newKey;
+
+    // if it is at the bottom or next entry still has a smaller key,
+    // then the entry is already in place
+    if (entry == LinkedList<T>::bottomEntry_ || next->key <= newKey)
+      return;
+
+    for (crnt = entry->GetNext(); crnt != NULL; crnt = crnt->GetNext()) {
+      if (crnt->key <= newKey) {
+        next = crnt;
+        break;
+      }
+    }
+
+    LinkedList<T>::RmvEntry_(entry, false);
+    InsrtEntry_(entry, next);
+  }
 
   this->itrtrReset_ = true;
 }
diff --git a/include/opt-sched/Scheduler/ready_list.h b/include/opt-sched/Scheduler/ready_list.h
index 3c7bb1a6..054b19f1 100644
--- a/include/opt-sched/Scheduler/ready_list.h
+++ b/include/opt-sched/Scheduler/ready_list.h
@@ -115,6 +115,7 @@ class ReadyList {
   int16_t ltncySumBits_;
   int16_t nodeID_Bits_;
   int16_t inptSchedOrderBits_;
+  int16_t ClusterBit;
 
   // Constructs the priority-list key based on the schemes listed in prirts_.
   unsigned long CmputKey_(SchedInstruction *inst, bool isUpdate, bool &changed);
diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h
index c177c77f..46117e9e 100644
--- a/include/opt-sched/Scheduler/sched_basic_data.h
+++ b/include/opt-sched/Scheduler/sched_basic_data.h
@@ -8,14 +8,11 @@ Last Update: Sept. 2013
 #ifndef OPTSCHED_BASIC_SCHED_BASIC_DATA_H
 #define OPTSCHED_BASIC_SCHED_BASIC_DATA_H
 
-// For class string.
-#include <string>
-// For class ostream.
 #include "opt-sched/Scheduler/defines.h"
 #include "opt-sched/Scheduler/graph.h"
 #include "opt-sched/Scheduler/hash_table.h"
 #include "opt-sched/Scheduler/machine_model.h"
-#include <iostream>
+#include <string>
 
 namespace llvm {
 namespace opt_sched {
@@ -51,7 +48,11 @@ enum LISTSCHED_HEURISTIC {
   LSH_LS = 7,
 
   // LLVM list scheduler order
-  LSH_LLVM = 8
+  LSH_LLVM = 8,
+
+  // Dynamic memory clustering heuristic: favor instructions that are part of
+  // an active cluster
+  LSH_CLUSTER = 9
 };
 
 #define MAX_SCHED_PRIRTS 10
@@ -204,12 +205,14 @@ class SchedInstruction : public GraphNode {
   // depType: the type of dependence between this node and the successor.
   SchedInstruction *GetFrstScsr(InstCount *prdcsrNum = NULL,
                                 UDT_GLABEL *ltncy = NULL,
-                                DependenceType *depType = NULL);
+                                DependenceType *depType = NULL,
+                                bool *IsArtificial = nullptr);
   // Returns the next successor of this instruction node and moves the
   // successor iterator forward. Fills parameters as above.
   SchedInstruction *GetNxtScsr(InstCount *prdcsrNum = NULL,
                                UDT_GLABEL *ltncy = NULL,
-                               DependenceType *depType = NULL);
+                               DependenceType *depType = NULL,
+                               bool *IsArtificial = nullptr);
 
   // Returns the last successor of this instruction node and moves the
   // successor iterator to the end of the list. If prdcsrNum is provided, this
@@ -414,6 +417,15 @@ class SchedInstruction : public GraphNode {
 
   InstType GetCrtclPathFrmRoot() { return crtclPathFrmRoot_; }
 
+  /// Set MayCluster to true if clustering memory operations was found
+  /// to be possible.
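+  /// A ClusteringGroup value of 0 means the instruction belongs to no
+  /// cluster; in that case the setter leaves the flags unchanged.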
+  void SetMayCluster(int ClusteringGroup);
+  bool GetMayCluster() { return MayCluster; }
+  int GetClusterGroup() { return ClusterGroup; }
+  static int GetActiveCluster() { return ActiveCluster; }
+  static void SetActiveCluster(int Active) { ActiveCluster = Active; }
+  bool getWasActive() { return WasActive; }
+  bool computeWasActive();
 
   friend class SchedRange;
 
 protected:
   string name_;
   // The mnemonic of this instruction, e.g. "add" or "jmp".
   string opCode_;
+
+  bool WasActive;
+
+  /// The cluster group that the current instruction is a part of.
+  /// Default of 0 means that it is not part of any cluster.
+  int ClusterGroup;
+  /// This value should be set to true if clustering may be possible.
+  bool MayCluster;
+  /// Currently active cluster. Used for ready list.
+  static int ActiveCluster;
   // A numerical ID for this instruction.
   int nodeID_;
   // The type of this instruction.
diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h
index de36f85b..2685b7d0 100644
--- a/include/opt-sched/Scheduler/sched_region.h
+++ b/include/opt-sched/Scheduler/sched_region.h
@@ -52,12 +52,18 @@ class SchedRegion {
   // Destroys the region. Must be overridden by child classes.
   virtual ~SchedRegion() {}
 
+  bool PrintClustering;
+  bool TwoPassEnabled;
+  virtual void computeAndPrintClustering(InstSchedule *Sched) = 0;
+
+  virtual void printCurrentClustering() = 0;
   // Returns the dependence graph of this region.
   inline DataDepGraph *GetDepGraph() { return dataDepGraph_; }
   // Returns the lower bound on the cost of this region.
   inline int GetCostLwrBound() { return costLwrBound_; }
   // Returns the best cost found so far for this region.
   inline InstCount GetBestCost() { return bestCost_; }
+  inline int getBestClusterCost() { return BestClusterCost; }
   // Returns a pointer to the list scheduler heuristics.
   inline SchedPriorities GetHeuristicPriorities() { return hurstcPrirts_; }
   // Get the number of simulated spills code added for this block.
@@ -107,6 +113,9 @@ class SchedRegion {
   // Initialize variables for the second pass of the two-pass-optsched
   void InitSecondPass();
 
+  bool enumFoundSchedule() { return EnumFoundSchedule; }
+  void setEnumFoundSchedule() { EnumFoundSchedule = true; }
+
 private:
   // The algorithm to use for calculating lower bounds.
   LB_ALG lbAlg_;
@@ -127,11 +136,14 @@ class SchedRegion {
   // Used for two-pass-optsched to enable second pass functionalities.
   bool isSecondPass_;
 
+  bool EnumFoundSchedule;
+
   // The absolute cost lower bound to be used as a ref for normalized costs.
   InstCount costLwrBound_ = 0;
-
+
   // The best results found so far.
   InstCount bestCost_;
+  int BestClusterCost;
   InstCount bestSchedLngth_;
 
   // (Chris): The cost function. Defaults to PERP.
@@ -160,6 +172,11 @@ class SchedRegion {
   InstSchedule *enumBestSched_;
   // The best schedule found so far (may be heuristic or enumerator generated)
   InstSchedule *bestSched_;
+  /// Flag to enable or disable clustering memory operations in the ILP pass.
+  /// Read from the sched.ini file, which sets the flag accordingly.
+  bool ClusterMemoryOperations;
+  /// The weight for memory ops clustering.
+  int ClusteringWeight;
 
   // TODO(max): Document.
   InstCount schedLwrBound_;
 
@@ -180,9 +197,13 @@ class SchedRegion {
 
   void SetBestCost(InstCount bestCost) { bestCost_ = bestCost; }
 
-  void SetBestSchedLength(InstCount bestSchedLngth) { bestSchedLngth_ = bestSchedLngth; }
+  void setBestClusterCost(int BestCost) { BestClusterCost = BestCost; }
+
+  void SetBestSchedLength(InstCount bestSchedLngth) {
+    bestSchedLngth_ = bestSchedLngth;
+  }
 
-  const SchedPriorities& GetEnumPriorities() const { return enumPrirts_; }
+  const SchedPriorities &GetEnumPriorities() const { return enumPrirts_; }
 
   int16_t GetSigHashSize() const { return sigHashSize_; }
 
diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp
index 4acd4903..ee817ec7 100644
--- a/lib/Scheduler/bb_spill.cpp
+++ b/lib/Scheduler/bb_spill.cpp
@@ -25,7 +25,10 @@ extern bool OPTSCHED_gPrintSpills;
 using namespace llvm::opt_sched;
 
 // The denominator used when calculating cost weight.
-static const int COST_WGHT_BASE = 10;
+static const int COST_WGHT_BASE = 100;
+
+// The max number of instructions in a cluster
+static const unsigned MAX_INSTR_IN_CLUSTER = 15;
 
 BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph,
                          long rgnNum, int16_t sigHashSize, LB_ALG lbAlg,
@@ -67,9 +70,36 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph,
   schduldEntryInstCnt_ = 0;
   schduldExitInstCnt_ = 0;
   schduldInstCnt_ = 0;
+  ClusterGroupCount = dataDepGraph_->getMinClusterCount();
+  MinClusterBlocks = 0;
+// if (ClusterMemoryOperations && ClusterGroupCount > 0) {
+  if (ClusterGroupCount > 0) {
+    ClusterCount.resize(ClusterGroupCount + 1);
+    ClusterInstrRemainderCount.resize(ClusterGroupCount + 1);
+    MinClusterBlocks = calculateClusterStaticLB();
+    initForClustering();
+  }
 }
 /****************************************************************************/
 
+void BBWithSpill::initForClustering() {
+  // Memory clustering variables initialization
+  SchedInstruction::SetActiveCluster(0);
+  CurrentClusterSize = 0;
+  ClusterActiveGroup = 0;
+  CurrentClusterCost = 0;
+  PastClustersList.clear();
+  LastCluster.reset();
+  InstrList.reset();
+  DynamicClusterLowerBound = 0;
+
+  for (int begin = 1; begin <= ClusterGroupCount; begin++) {
+    ClusterCount[begin] = 0;
+    ClusterInstrRemainderCount[begin] =
+        dataDepGraph_->getTotalInstructionsInCluster(begin);
+  }
+}
+
 BBWithSpill::~BBWithSpill() {
   if (enumrtr_ != NULL) {
     delete enumrtr_;
@@ -82,6 +112,26 @@ BBWithSpill::~BBWithSpill() {
 }
 /*****************************************************************************/
 
+int BBWithSpill::calculateClusterStaticLB() {
+  // No cluster in this scheduling region
+  if (ClusterGroupCount == 0)
+    return 0;
+
+  // Calculate the minimum number of cluster blocks needed to cluster all of
+  // the instructions. The maximum number of instructions in a cluster block
+  // is determined by the constant MAX_INSTR_IN_CLUSTER.
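+  // Illustrative example (values assumed): with MAX_INSTR_IN_CLUSTER = 15, a
+  // group of 32 clusterable instructions needs at least ceil(32 / 15) = 3
+  // cluster blocks.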
+  int ClusterCost = 0;
+  for (int begin = 1; begin <= ClusterGroupCount; begin++) {
+    int InstructionCount = dataDepGraph_->getTotalInstructionsInCluster(begin);
+    int CurrentClusterCost =
+        std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER);
+    Logger::Info("Cost for block %d is %d", begin, CurrentClusterCost);
+    ClusterCost += CurrentClusterCost;
+  }
+
+  return ClusterCost;
+}
+
 bool BBWithSpill::EnableEnum_() {
   return true;
   /*
@@ -305,6 +355,11 @@ InstCount BBWithSpill::CmputCostLwrBound() {
   InstCount staticLowerBound =
       schedLwrBound_ * schedCostFactor_ + spillCostLwrBound * SCW_;
 
+  // Add the minimum of the possible clusters to the lower bound
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    staticLowerBound += MinClusterBlocks * ClusteringWeight;
+  }
+
 #if defined(IS_DEBUG_STATIC_LOWER_BOUND)
   Logger::Info(
       "DAG %s spillCostLB %d scFactor %d lengthLB %d lenFactor %d staticLB %d",
@@ -326,6 +381,9 @@ void BBWithSpill::InitForSchdulng() {
 /*****************************************************************************/
 
 void BBWithSpill::InitForCostCmputtn_() {
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled))
+    initForClustering();
+
   int i;
 
   crntCycleNum_ = 0;
@@ -376,8 +434,23 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched,
 
 InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode,
                                   InstCount &execCost, bool trackCnflcts) {
+
+  InstCount instNum;
+  InstCount cycleNum;
+  InstCount slotNum;
+  SchedInstruction *inst;
+
   if (compMode == CCM_STTC) {
-    if (GetSpillCostFunc() == SCF_SPILLS) {
+    if (GetSpillCostFunc() != SCF_SPILLS) {
+      InitForCostCmputtn_();
+
+      for (instNum = sched->GetFrstInst(cycleNum, slotNum);
+           instNum != INVALID_VALUE;
+           instNum = sched->GetNxtInst(cycleNum, slotNum)) {
+        inst = dataDepGraph_->GetInstByIndx(instNum);
+        SchdulInst(inst, cycleNum, slotNum, trackCnflcts);
+      }
+    } else {
       LocalRegAlloc regAlloc(sched, dataDepGraph_);
       regAlloc.SetupForRegAlloc();
       regAlloc.AllocRegs();
@@ -389,6 +462,13 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode,
   InstCount cost = sched->GetCrntLngth() * schedCostFactor_;
   execCost = cost;
   cost += crntSpillCost_ * SCW_;
+  // Add the current clustering cost
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    cost += CurrentClusterCost * ClusteringWeight;
+    assert(calculateClusterDLB() == CurrentClusterCost);
+    sched->setClusterSize(CurrentClusterCost);
+  }
+
   sched->SetSpillCosts(spillCosts_);
   sched->SetPeakRegPressures(peakRegPressures_);
   sched->SetSpillCost(crntSpillCost_);
@@ -421,8 +501,108 @@ void BBWithSpill::CmputCrntSpillCost_() {
 }
 /*****************************************************************************/
 
+void BBWithSpill::computeAndPrintClustering(InstSchedule *Sched) {
+  InstCount instNum;
+  InstCount cycleNum;
+  InstCount slotNum;
+  SchedInstruction *inst;
+  bool temp = ClusterMemoryOperations;
+
+  ClusterMemoryOperations = true;
+  InitForCostCmputtn_();
+  for (instNum = Sched->GetFrstInst(cycleNum, slotNum);
+       instNum != INVALID_VALUE;
+       instNum = Sched->GetNxtInst(cycleNum, slotNum)) {
+    inst = dataDepGraph_->GetInstByIndx(instNum);
+    SchdulInst(inst, cycleNum, slotNum, false);
+  }
+  printCurrentClustering();
+  ClusterMemoryOperations = temp;
+}
+
+void BBWithSpill::saveCluster(SchedInstruction *inst) {
+  if (LastCluster)
+    // Save previous clusters in a vector except the last cluster
+    // that we just exited out of.
+    PastClustersList.push_back(std::move(LastCluster));
+
+  // Last cluster that we just exited out of, used for fast accessing
+  // to its contents.
+  LastCluster = llvm::make_unique<PastClusters>(
+      ClusterActiveGroup, CurrentClusterSize, inst->GetNum(), StartCycle);
+
+  LastCluster->InstrList = std::move(InstrList);
+}
+
+void BBWithSpill::initCluster(SchedInstruction *inst) {
+  ClusterActiveGroup = inst->GetClusterGroup();
+  inst->SetActiveCluster(ClusterActiveGroup);
+  CurrentClusterSize = 1;
+  ClusterInstrRemainderCount[ClusterActiveGroup]--;
+  InstrList = llvm::make_unique<llvm::SmallVector<llvm::StringRef, 4>>();
+  InstrList->push_back(inst->GetName());
+  ClusterCount[ClusterActiveGroup]++;
+  CurrentClusterCost++;
+}
+
+void BBWithSpill::resetActiveCluster(SchedInstruction *inst) {
+  ClusterActiveGroup = 0;
+  inst->SetActiveCluster(0);
+  CurrentClusterSize = 0;
+}
+
+void BBWithSpill::restorePreviousCluster(SchedInstruction *inst) {
+  CurrentClusterSize = LastCluster->ClusterSize;
+  ClusterActiveGroup = LastCluster->ClusterGroup;
+  StartCycle = LastCluster->Start;
+  inst->SetActiveCluster(ClusterActiveGroup);
+  InstrList = std::move(LastCluster->InstrList);
+  LastCluster.reset(); // Release current cluster pointer
+
+  // Get previous cluster from vector list
+  if (!PastClustersList.empty()) {
+    LastCluster = std::move(PastClustersList.back());
+    PastClustersList.pop_back();
+  }
+}
+
+bool BBWithSpill::isClusterFinished() {
+  assert(ClusterActiveGroup != 0);
+  if (ClusterInstrRemainderCount[ClusterActiveGroup] == 0 ||
+      CurrentClusterSize == MAX_INSTR_IN_CLUSTER) {
+    return true;
+  }
+  return false;
+}
+
+int BBWithSpill::calculateClusterDLB() {
+  int OptimisticLowerBound = 0;
+
+  for (int begin = 1; begin <= ClusterGroupCount; begin++) {
+    if (begin != ClusterActiveGroup)
+      OptimisticLowerBound += std::ceil(
+          double(ClusterInstrRemainderCount[begin]) / MAX_INSTR_IN_CLUSTER);
+    else {
+      // The number of instructions remaining that the current open cluster
+      // can add
+      int AbsorbCount = MAX_INSTR_IN_CLUSTER - CurrentClusterSize;
+      // Assume the current open cluster can absorb up to the maximum number
+      // of instructions that a cluster can contain.
+      int Remainder = ClusterInstrRemainderCount[begin] - AbsorbCount;
+      // If the remainder is negative then the open cluster can
+      // absorb all of the remaining instructions.
+      if (Remainder < 0)
+        Remainder = 0;
+      // Estimate the optimistic dynamic lower bound for the current cluster
+      OptimisticLowerBound +=
+          std::ceil(double(Remainder) / MAX_INSTR_IN_CLUSTER);
+    }
+  }
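+  // Illustrative example (values assumed): with MAX_INSTR_IN_CLUSTER = 15, an
+  // open cluster of size 10 whose group has 8 instructions left can absorb
+  // 15 - 10 = 5 of them, leaving ceil(3 / 15) = 1 extra block; a second group
+  // with 20 instructions left needs ceil(20 / 15) = 2 blocks, so the bound
+  // below evaluates to CurrentClusterCost + 3.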
+  return CurrentClusterCost + OptimisticLowerBound;
+}
+
 void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst,
-                                            bool trackCnflcts) {
+                                            bool trackCnflcts, int Start) {
   int16_t regType;
   int defCnt, useCnt, regNum, physRegNum;
   Register **defs, **uses;
@@ -430,6 +610,72 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst,
   int liveRegs;
   InstCount newSpillCost;
 
+  // Conditions for creating a new cluster block:
+  // 1.) A block was ended before it reached MAX_INSTR_IN_CLUSTER and there
+  //     are remaining instructions in its group
+
+  // Conditions for removing a cluster block:
+  // 1.) The block is not at MAX_INSTR_IN_CLUSTER and there are remaining
+  //     instructions in its group
+
+  // Scheduling cases for the clustering project:
+  // 1.) Same Cluster -> Same Cluster
+  //     If size == MAX_INSTR_IN_CLUSTER
+  //       Save cluster to restore
+  //       Set active to 0
+  // 2.) Cluster -> Different Cluster
+  // 3.) Non-Cluster -> Cluster
+  // 4.) Cluster -> Non-Cluster
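+  // Illustrative walk-through (assumed sequence): scheduling A (group 1),
+  // B (group 1), C (group 2), then D (no group) hits case 3 at A (open a
+  // cluster for group 1), case 1 at B (grow it), case 2 at C (save the
+  // group 1 cluster, open one for group 2), and case 4 at D (save and close
+  // the group 2 cluster).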
+
+  // Possibly keep track of the current memory clustering size here
+  // and in UpdateSpillInfoForUnSchdul_()
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) {
+    // Check if the current instruction is part of a cluster
+    if (inst->GetMayCluster()) {
+      // Check if there is a currently active cluster. ClusterActiveGroup == 0
+      // indicates that there is no active clustering, while
+      // ClusterActiveGroup != 0 indicates that there is.
+      if (ClusterActiveGroup != 0) {
+        // Check if the instruction is in the same cluster group as the active
+        // cluster
+        if (ClusterActiveGroup == inst->GetClusterGroup()) {
+          // Case 1: Simple case where the current instruction is part of an
+          // already active cluster.
+          CurrentClusterSize++;
+          ClusterInstrRemainderCount[ClusterActiveGroup]--;
+          InstrList->push_back(inst->GetName());
+
+          // If we reach the max size for this cluster then save the cluster
+          // and reset.
+          if (isClusterFinished()) {
+            saveCluster(inst);
+            resetActiveCluster(inst);
+          }
+        } else {
+          // Case 2: Else the instruction is part of a different cluster that
+          // is not currently active. Store the information of the old cluster
+          // group and start clustering for the new cluster.
+          saveCluster(inst);
+
+          // Finish setting up the new cluster
+          initCluster(inst);
+          StartCycle = Start;
+        }
+      } else {
+        // Case 3: Not currently clustering. Initialize clustering
+        initCluster(inst);
+        StartCycle = Start;
+      }
+    } else if (ClusterActiveGroup != 0) {
+      // Case 4: Exiting out of an active cluster
+      // Save the cluster to restore when backtracking.
+      saveCluster(inst);
+
+      // Reset active cluster
+      resetActiveCluster(inst);
+    }
+  }
+
   defCnt = inst->GetDefs(defs);
   useCnt = inst->GetUses(uses);
 
@@ -621,6 +867,56 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst)
                inst->GetNum());
 #endif
 
+  // Backtracking cases for the clustering project:
+  // 1.) Same Cluster <- Same Cluster
+  // 2.) Non-Cluster <- Cluster
+  // 3.) Different Cluster <- Cluster
+  // 4.) Cluster <- Non-Cluster
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) {
+    // If the instruction we are backtracking from is part of a cluster
+    if (inst->GetMayCluster()) {
+      if (CurrentClusterSize != 0) {
+        // Cases 1, 2, and 3
+        // Reduce the cluster size
+        CurrentClusterSize--;
+        ClusterInstrRemainderCount[ClusterActiveGroup]++;
+        // Remove the instruction's name from the list
+        InstrList->pop_back();
+
+        // Case 2: If there are no more instructions in the currently active
+        // cluster then it indicates that we backtracked out of a cluster.
+        if (CurrentClusterSize == 0) {
+          ClusterCount[ClusterActiveGroup]--;
+          assert(ClusterCount[ClusterActiveGroup] >= 0);
+          CurrentClusterCost--;
+          // Set active cluster to none.
+          resetActiveCluster(inst);
+
+          // Case 3: Check if this instruction ended another cluster
+          if (LastCluster && LastCluster->InstNum == inst->GetNum()) {
+            // If so, then we need to restore the state of the previous cluster
+            restorePreviousCluster(inst);
+          }
+        }
+      }
+      // A cluster size of 0 while the instruction may cluster indicates that
+      // the current instruction is at the end of a finished cluster
+      else {
+        assert(inst->GetNum() == LastCluster->InstNum);
+        restorePreviousCluster(inst);
+
+        CurrentClusterSize--;
+        ClusterInstrRemainderCount[ClusterActiveGroup]++;
+        // Remove the instruction's name from the list
+        InstrList->pop_back();
+      }
+    } else if (LastCluster && LastCluster->InstNum == inst->GetNum()) {
+      // Case 4: If there was a previous cluster and this instruction
+      // ended the cluster then restore the previous cluster's state
+      restorePreviousCluster(inst);
+    }
+  }
+
   defCnt = inst->GetDefs(defs);
   useCnt = inst->GetUses(uses);
 
@@ -728,7 +1024,7 @@ void BBWithSpill::SchdulInst(SchedInstruction *inst, InstCount cycleNum,
   if (inst == NULL)
     return;
   assert(inst != NULL);
-  UpdateSpillInfoForSchdul_(inst, trackCnflcts);
+  UpdateSpillInfoForSchdul_(inst, trackCnflcts, crntCycleNum_);
 }
 /*****************************************************************************/
 
@@ -764,7 +1060,7 @@ void BBWithSpill::FinishHurstc_() {
 
 void BBWithSpill::FinishOptml_() {
 #ifdef IS_DEBUG_BBSPILL_COST
-  stats::traceOptimalCost.Record(bestCost_);
+  stats::traceOptimalCost.Record(GetBestCost());
   stats::traceOptimalScheduleLength.Record(bestSchedLngth_);
 #endif
 }
@@ -772,6 +1068,7 @@ void BBWithSpill::FinishOptml_() {
 
 Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) {
   bool enblStallEnum = enblStallEnum_;
+  bool ClusteringEnabled = IsSecondPass() && ClusterMemoryOperations;
   /*  if (!dataDepGraph_->IncludesUnpipelined()) {
       enblStallEnum = false;
     }*/
@@ -779,7 +1076,7 @@ Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) {
   enumrtr_ = new LengthCostEnumerator(
       dataDepGraph_, machMdl_, schedUprBound_, GetSigHashSize(),
       GetEnumPriorities(), GetPruningStrategy(), SchedForRPOnly_, enblStallEnum,
-      timeout, GetSpillCostFunc(), 0, NULL);
+      timeout, GetSpillCostFunc(), ClusteringEnabled, 0, NULL);
 
   return enumrtr_;
 }
@@ -813,26 +1110,14 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime,
     HandlEnumrtrRslt_(rslt, trgtLngth);
 
     if (GetBestCost() == 0 || rslt == RES_ERROR ||
-        (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT) ||
-        (rslt == RES_SUCCESS && IsSecondPass())) {
-
-      // If doing two pass optsched and on the second pass then terminate if a
-      // schedule is found with the same min-RP found in first pass.
-      if (rslt == RES_SUCCESS && IsSecondPass()) {
-        Logger::Info("Schedule found in second pass, terminating BB loop.");
-
-        if (trgtLngth < schedUprBound_)
-          Logger::Info("Schedule found with length %d is shorter than current schedule with length %d.", trgtLngth, schedUprBound_);
-      }
-
+        (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) {
       break;
     }
 
     enumrtr_->Reset();
     enumCrntSched_->Reset();
 
-    if (!IsSecondPass())
-      CmputSchedUprBound_();
+    CmputSchedUprBound_();
 
     iterCnt++;
     costLwrBound += 1;
@@ -880,14 +1165,54 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched,
       Logger::Info("$$$ GOOD_HIT: Better spill cost for a longer schedule");
 
     SetBestCost(crntCost);
+    if (IsSecondPass() && ClusterMemoryOperations)
+      setBestClusterCost(CurrentClusterCost);
     optmlSpillCost_ = crntSpillCost_;
     SetBestSchedLength(crntSched->GetCrntLngth());
     enumBestSched_->Copy(crntSched);
     bestSched_ = enumBestSched_;
+    if (!enumFoundSchedule())
+      setEnumFoundSchedule();
   }
 
   return GetBestCost();
 }
+
+void BBWithSpill::printCurrentClustering() {
+  // Print the instructions in the clusters after finding a schedule.
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) {
+    dbgs() << "Printing clustered instructions:\n";
+    int i = 1;
+    for (const auto &clusters : PastClustersList) {
+      dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start
+             << "): ";
+      for (const auto &instr : *clusters->InstrList) {
+        dbgs() << instr << " ";
+      }
+      i++;
+      dbgs() << '\n';
+    }
+
+    if (LastCluster) {
+      dbgs() << "Printing cluster " << i << ", start cycle ("
+             << LastCluster->Start << "): ";
+      for (const auto &instr : *(LastCluster->InstrList)) {
+        dbgs() << instr << " ";
+      }
+      i++;
+      dbgs() << '\n';
+    }
+
+    if (InstrList && InstrList->size() > 0) {
+      dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle
+             << "): ";
+      for (const auto &instr : *InstrList) {
+        dbgs() << instr << " ";
+      }
+      dbgs() << '\n';
+    }
+  }
+}
 /*****************************************************************************/
 
 void BBWithSpill::SetupForSchdulng_() {
@@ -914,17 +1239,32 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) {
   bool fsbl = true;
   InstCount crntCost, dynmcCostLwrBound;
+  int ClusterDynamicLowerBound;
   if (GetSpillCostFunc() == SCF_SLIL) {
     crntCost = dynamicSlilLowerBound_ * SCW_ + trgtLngth * schedCostFactor_;
   } else {
     crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_;
   }
+  // Add the cost of clustering
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    ClusterDynamicLowerBound = calculateClusterDLB();
+    crntCost += ClusterDynamicLowerBound * ClusteringWeight;
+  }
+
   crntCost -= GetCostLwrBound();
   dynmcCostLwrBound = crntCost;
 
   // assert(cost >= 0);
   assert(dynmcCostLwrBound >= 0);
 
+  /*
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " <<
+    dynmcCostLwrBound << ". Current best is: " << GetBestCost() << '\n';
+    printCurrentClustering();
+  }
+  */
+
   fsbl = dynmcCostLwrBound < GetBestCost();
 
   // FIXME: RP tracking should be limited to the current SCF. We need RP
@@ -934,6 +1274,16 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) {
     node->SetCostLwrBound(dynmcCostLwrBound);
     node->SetPeakSpillCost(peakSpillCost_);
     node->SetSpillCostSum(totSpillCost_);
+    if (IsSecondPass() && ClusterMemoryOperations) {
+      node->setClusteringCost(CurrentClusterCost);
+      node->setCurClusteringGroup(ClusterActiveGroup);
+      node->setClusterLwrBound(ClusterDynamicLowerBound);
+      if (ClusterActiveGroup != 0) {
+        node->setClusterAbsorbCount(MAX_INSTR_IN_CLUSTER - CurrentClusterSize);
+      } else {
+        node->setClusterAbsorbCount(0);
+      }
+    }
   }
   return fsbl;
 }
diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp
index c7273b78..ef6e2cda 100644
--- a/lib/Scheduler/data_dep.cpp
+++ b/lib/Scheduler/data_dep.cpp
@@ -197,6 +197,9 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn)
   exitInstCnt_ = 0;
 
   RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt());
+
+  MinClusterCount = 0;
+  TotalInstructionsInAllClusters = 0;
 }
 
 DataDepGraph::~DataDepGraph() {
@@ -211,6 +214,11 @@ DataDepGraph::~DataDepGraph() {
   delete[] instCntPerType_;
 }
 
+int DataDepGraph::getTotalInstructionsInCluster(int Cluster) {
+  assert(Cluster > 0);
+  return MaxInstructionsInEachClusters[Cluster];
+}
+
 FUNC_RESULT DataDepGraph::SetupForSchdulng(bool cmputTrnstvClsr) {
   assert(wasSetupForSchduling_ == false);
 
@@ -899,7 +907,8 @@ void DataDepGraph::CreateEdge(SchedInstruction *frmNode,
 }
 
 void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum,
-                               int ltncy, DependenceType depType) {
+                               int ltncy, DependenceType depType,
+                               bool IsArtificial) {
   GraphEdge *edge;
 
   assert(frmNodeNum < instCnt_);
@@ -928,7 +937,7 @@ void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum,
   Logger::Info("Creating edge from %d to %d of type %d and latency %d",
                frmNodeNum, toNodeNum, depType, ltncy);
 #endif
-  edge = new GraphEdge(frmNode, toNode, ltncy, depType);
+  edge = new GraphEdge(frmNode, toNode, ltncy, depType, IsArtificial);
 
   frmNode->AddScsr(edge);
   toNode->AddPrdcsr(edge);
@@ -2753,6 +2762,7 @@ void InstSchedule::Copy(InstSchedule *src) {
 
   SetSpillCosts(src->spillCosts_);
   SetPeakRegPressures(src->peakRegPressures_);
+  setClusterSize(src->getClusterSize());
   cost_ = src->cost_;
   execCost_ = src->execCost_;
   spillCost_ = src->spillCost_;
@@ -2827,6 +2837,44 @@ void InstSchedule::Print(std::ostream &out, char const *const label) {
   }
 }
 
+
+void InstSchedule::Print(std::ostream &out, char const *const title,
+                         DataDepGraph *ddg) {
+  InstCount slotInCycle = 0;
+  InstCount cycleNum = 0;
+  InstCount i;
+
+  // out << '\n' << label << " Schedule";
+  Logger::Info("Printing Schedule");
+
+  for (i = 0; i < crntSlotNum_; i++) {
+    if (slotInCycle == 0) {
+      if (instInSlot_[i] != SCHD_STALL) {
+        InstCount instNum = instInSlot_[i];
+        SchedInstruction *inst = ddg->GetInstByIndx(instNum);
+        Logger::Info("Cycle# %d : %d - %s", cycleNum, instInSlot_[i], inst->GetName());
+      } else
+        Logger::Info("Cycle# %d : %d -", cycleNum, instInSlot_[i]);
+    }
+    /*
+    out << "\nCycle# " << cycleNum << ": ";
+
+    if (instInSlot_[i] == SCHD_STALL) {
+      out << "X ";
+    } else {
+      out << instInSlot_[i] << ' ';
+    }
+    */
+
+    slotInCycle++;
+
+    if (slotInCycle == issuRate_) {
+      slotInCycle = 0;
+      cycleNum++;
+    }
+  }
+}
+
 #if defined(IS_DEBUG_PEAK_PRESSURE) || defined(IS_DEBUG_OPTSCHED_PRESSURES)
 void InstSchedule::PrintRegPressures() const {
   Logger::Info("OptSched max reg pressures:");
@@ -2972,8 +3020,15 @@ bool InstSchedule::VerifyDataDeps_(DataDepGraph *dataDepGraph) {
     UDT_GLABEL ltncy;
     DependenceType depType;
-    for (SchedInstruction *scsr = inst->GetFrstScsr(NULL, &ltncy, &depType);
-         scsr != NULL; scsr = inst->GetNxtScsr(NULL, &ltncy, &depType)) {
+    bool IsArtificial;
+    for (SchedInstruction *scsr =
+             inst->GetFrstScsr(NULL, &ltncy, &depType, &IsArtificial);
+         scsr != NULL;
+         scsr = inst->GetNxtScsr(NULL, &ltncy, &depType, &IsArtificial)) {
+      // Artificial edges are not required for the schedule to be correct
+      if (IsArtificial)
+        continue;
+
       InstCount scsrCycle = GetSchedCycle(scsr);
       if (scsrCycle < (instCycle + ltncy)) {
         Logger::Error("Invalid schedule: Latency from %d to %d not satisfied",
@@ -3043,6 +3098,10 @@ void InstSchedule::SetSpillCost(InstCount cost) { spillCost_ = cost; }
 
 InstCount InstSchedule::GetSpillCost() const { return spillCost_; }
 
+void InstSchedule::setClusterSize(int size) { ClusterSize = size; }
+
+int InstSchedule::getClusterSize() const { return ClusterSize; }
+
 /*******************************************************************************
  * Previously inlined functions
  ******************************************************************************/
@@ -3205,7 +3264,6 @@ bool DataDepGraph::DoesFeedUser(SchedInstruction *inst) {
       // If there is a successor instruction that decreases live intervals
       // or one that does not increase live intervals, then return true.
       return true;
-
   }
   // Return false if there is no recursive successor of inst
   // that uses a live register.
diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp
index d9c4e3b1..43bf6ed6 100644
--- a/lib/Scheduler/enumerator.cpp
+++ b/lib/Scheduler/enumerator.cpp
@@ -64,6 +64,12 @@ void EnumTreeNode::Init_() {
   isLeaf_ = false;
   cost_ = INVALID_VALUE;
   costLwrBound_ = INVALID_VALUE;
+  ClusterCost = INVALID_VALUE;
+  ClusterActiveGroup = INVALID_VALUE;
+  ClusterAbsorbCount = INVALID_VALUE;
+  ClusterDLB = INVALID_VALUE;
+  ClusterTotalCost = -1;
+  ClusterBestCost = 99999999;
   crntCycleBlkd_ = false;
   rsrvSlots_ = NULL;
   totalCostIsActualCost_ = false;
@@ -434,8 +440,8 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl,
                        InstCount schedUprBound, int16_t sigHashSize,
                        SchedPriorities prirts, Pruning PruningStrategy,
                        bool SchedForRPOnly, bool enblStallEnum,
-                       Milliseconds timeout, InstCount preFxdInstCnt,
-                       SchedInstruction *preFxdInsts[])
+                       Milliseconds timeout, bool ClusteringEnabled,
+                       InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[])
     : ConstrainedScheduler(dataDepGraph, machMdl, schedUprBound) {
   memAllocBlkSize_ = (int)timeout / TIMEOUT_TO_MEMBLOCK_RATIO;
   assert(preFxdInstCnt >= 0);
@@ -454,6 +460,7 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl,
   prune_ = PruningStrategy;
   SchedForRPOnly_ = SchedForRPOnly;
   enblStallEnum_ = enblStallEnum;
+  Clustering = ClusteringEnabled;
 
   isEarlySubProbDom_ = true;
 
@@ -1316,17 +1323,27 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode,
     Logger::Info("Leaf node total cost %d", currentNode->GetCost());
#endif
     currentNode->SetTotalCost(currentNode->GetCost());
+    if (currentNode->isClustering())
+      currentNode->setTotalClusterCost(currentNode->getClusteringCost());
     currentNode->SetTotalCostIsActualCost(true);
   } else {
-    if (!currentNode->GetTotalCostIsActualCost() &&
-        (currentNode->GetTotalCost() == -1 ||
-         currentNode->GetCostLwrBound() < currentNode->GetTotalCost())) {
-#if defined(IS_DEBUG_ARCHIVE)
-      Logger::Info("Inner node doesn't have a real cost yet. Setting total "
-                   "cost to dynamic lower bound %d",
-                   currentNode->GetCostLwrBound());
-#endif
-      currentNode->SetTotalCost(currentNode->GetCostLwrBound());
+    if (!currentNode->GetTotalCostIsActualCost()) {
+      // Set overall weighted sum cost
+      if (currentNode->GetTotalCost() == -1 ||
+          currentNode->GetCostLwrBound() < currentNode->GetTotalCost()) {
+        #if defined(IS_DEBUG_ARCHIVE)
+        Logger::Info("Inner node doesn't have a real cost yet. Setting total "
+                     "cost to dynamic lower bound %d",
+                     currentNode->GetCostLwrBound());
+        #endif
+        currentNode->SetTotalCost(currentNode->GetCostLwrBound());
+      }
+
+      // Set clustering cost
+      if (currentNode->isClustering() &&
+          (currentNode->getTotalClusterCost() == -1 ||
+           currentNode->getClusterLwrBound() < currentNode->getTotalClusterCost())) {
+        currentNode->setTotalClusterCost(currentNode->getClusterLwrBound());
+      }
     }
   }
 
@@ -1359,16 +1376,25 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode,
                    currentNode->GetTotalCost());
#endif
       parentNode->SetTotalCost(currentNode->GetTotalCost());
+      if (currentNode->isClustering())
+        parentNode->setTotalClusterCost(currentNode->getTotalClusterCost());
       parentNode->SetTotalCostIsActualCost(true);
       parentNode->SetSuffix(std::move(parentSuffix));
-    } else if (currentNode->GetTotalCost() < parentNode->GetTotalCost()) {
-#if defined(IS_DEBUG_ARCHIVE)
-      Logger::Info(
-          "Current node has a real cost (%d), and so does parent. (%d)",
(%d)", + currentNode->GetTotalCost(), parentNode->GetTotalCost()); + #endif + parentNode->SetTotalCost(currentNode->GetTotalCost()); + parentNode->SetSuffix(std::move(parentSuffix)); + } + + // Set clustering cost + if (currentNode->isClustering() && currentNode->getTotalClusterCost() < parentNode->getTotalClusterCost()) { + parentNode->setTotalClusterCost(currentNode->getTotalClusterCost()); + } } } } @@ -1856,7 +1882,7 @@ LengthEnumerator::LengthEnumerator( bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, - PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, + PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, false, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); tmpHstryNode_ = new HistEnumTreeNode; @@ -1941,11 +1967,11 @@ LengthCostEnumerator::LengthCostEnumerator( DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - SPILL_COST_FUNCTION spillCostFunc, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + SPILL_COST_FUNCTION spillCostFunc, bool ClusteringEnabled, + InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, - preFxdInstCnt, preFxdInsts) { + ClusteringEnabled, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); costChkCnt_ = 0; @@ -2141,6 +2167,7 @@ bool LengthCostEnumerator::BackTrack_() { /*****************************************************************************/ InstCount LengthCostEnumerator::GetBestCost_() { return rgn_->GetBestCost(); } +int LengthCostEnumerator::GetBestClusterCost_() { return rgn_->getBestClusterCost(); } /*****************************************************************************/ void LengthCostEnumerator::CreateRootNode_() { diff --git a/lib/Scheduler/hist_table.cpp b/lib/Scheduler/hist_table.cpp index a4c1cae7..8a9ff356 100644 --- a/lib/Scheduler/hist_table.cpp +++ b/lib/Scheduler/hist_table.cpp @@ -400,6 +400,10 @@ void CostHistEnumTreeNode::Init_() { costInfoSet_ = false; #endif cost_ = 0; + ClusterCost = 9999999; + ClusterTotalCost = 9999999; + ClusterActiveGroup = 0; + ClusterAbsorbCount = 0; } bool CostHistEnumTreeNode::DoesDominate(EnumTreeNode *node, @@ -467,6 +471,41 @@ static bool doesHistoryPeakCostDominate(InstCount OtherPrefixCost, return LCE->GetBestCost() <= OtherPrefixCost; } +static bool doesClusterCostDominate(EnumTreeNode *CurEnumNode, + int ClusterActiveGroup, int ClusterCost, + int ClusterAbsorbCount, int ClusterTotalCost, + int ClusterBest) { + // Correct but too restrictive + if (CurEnumNode->getCurClusteringGroup() != ClusterActiveGroup) + return false; + + // Count the instructions only if there is an instruction in the ready list that belongs + // to the open cluster. If there is none, you can't add any instructions. If there are no instructions + // on the ready list that belong to the open cluster, we can set the cluster absorb count to 0. 
+
+  // Two cases for a history node:
+  // 1.) One without a full schedule below it. Look at DLB.
+  // 2.) One with a full schedule below it. Look at the best found below the
+  //     history node.
+  if (ClusterBest != INVALID_VALUE && improvement <= ClusterTotalCost - ClusterBest)
+    return true;
+
+  return false;
+}
+
 // Should we prune the other node based on RP cost.
 bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node,
                                                    Enumerator *E) {
@@ -502,6 +541,10 @@ bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node,
       ShouldPrune =
          spillCostSum_ % instCnt >= Node->GetSpillCostSum() % instCnt;
     }
+    if (!ShouldPrune && LCE->isClustering()) {
+      int ClusterBest = LCE->getBestClusterCost();
+      ShouldPrune = doesClusterCostDominate(Node, ClusterActiveGroup, ClusterCost, ClusterAbsorbCount, ClusterTotalCost, ClusterBest);
+    }
   }
   return ShouldPrune;
 }
@@ -511,6 +554,10 @@ void CostHistEnumTreeNode::SetCostInfo(EnumTreeNode *node, bool, Enumerator *) {
   peakSpillCost_ = node->GetPeakSpillCost();
   spillCostSum_ = node->GetSpillCostSum();
   isLngthFsbl_ = node->IsLngthFsbl();
+  ClusterCost = node->getClusteringCost();
+  ClusterActiveGroup = node->getCurClusteringGroup();
+  ClusterAbsorbCount = node->getClusterAbsorbCount();
+  ClusterTotalCost = node->getTotalClusterCost();
 
   // (Chris)
   partialCost_ = node->GetCostLwrBound();
diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp
index 7abd3ff0..6bee513b 100644
--- a/lib/Scheduler/ready_list.cpp
+++ b/lib/Scheduler/ready_list.cpp
@@ -15,7 +15,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
   // enable fast updating for dynamic heuristics.
   if (prirts_.isDynmc)
     keyedEntries_ = new KeyedEntry<SchedInstruction, unsigned long>
-        *[dataDepGraph->GetInstCnt()];
+        *[dataDepGraph->GetInstCnt()]();
   else
     keyedEntries_ = nullptr;
 
@@ -34,9 +34,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
       break;
 
     case LSH_LUC:
-      for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) {
-        keyedEntries_[j] = NULL;
-      }
      maxUseCnt_ = dataDepGraph->GetMaxUseCnt();
      useCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxUseCnt_);
      totKeyBits += useCntBits_;
      break;
@@ -73,6 +70,14 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
      ltncySumBits_ = Utilities::clcltBitsNeededToHoldNum(maxLtncySum_);
      totKeyBits += ltncySumBits_;
      break;
+
+    case LSH_CLUSTER:
+      // Bits needed: 1
+      // 0: Not part of an active cluster
+      // 1: Part of an active cluster
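+      // Note (explanatory): CmputKey_ composes key fields in the order the
+      // priorities are listed, so listing Cluster first (e.g. in
+      // SECOND_PASS_ENUM_HEURISTIC) makes this bit the most significant
+      // field, ranking instructions of the active cluster above all others
+      // while the remaining heuristics break ties.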
+      ClusterBit = Utilities::clcltBitsNeededToHoldNum(1);
+      totKeyBits += ClusterBit;
+      break;
     } // end switch
   } // end for
 
@@ -116,6 +121,10 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
         AddPrirtyToKey_(maxPriority_, keySize, ltncySumBits_, maxLtncySum_,
                         maxLtncySum_);
         break;
+
+      case LSH_CLUSTER:
+        AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1);
+        break;
       }
     }
   }
@@ -152,6 +161,8 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate,
   int16_t keySize = 0;
   int i;
   int16_t oldLastUseCnt, newLastUseCnt;
+  unsigned long ValueForKey;
+  bool OldWasActive, NewWasActive;
   changed = true;
   if (isUpdate)
     changed = false;
@@ -198,6 +209,24 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate,
       AddPrirtyToKey_(key, keySize, ltncySumBits_, inst->GetLtncySum(),
                       maxLtncySum_);
       break;
+
+    case LSH_CLUSTER:
+      // Modeled on how the LUC priority is recomputed during updates.
+      if (inst->GetClusterGroup() == 0)
+        ValueForKey = 0;
+      else {
+        OldWasActive = inst->getWasActive();
+        NewWasActive = inst->computeWasActive();
+
+        if (OldWasActive != NewWasActive) {
+          changed = true;
+        }
+        ValueForKey =
+            inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1
+                                                                            : 0;
+      }
+      AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1);
+      break;
     }
   }
   return key;
@@ -214,14 +243,17 @@ void ReadyList::AddLatestSubLists(LinkedList<SchedInstruction> *lst1,
 }
 
 void ReadyList::Print(std::ostream &out) {
+  PriorityList<SchedInstruction> *OutList = new PriorityList<SchedInstruction>;
+  OutList->CopyList(prirtyLst_, nullptr);
   out << "Ready List: ";
-  for (const auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL;
-       crntInst = prirtyLst_->GetNxtElmnt()) {
-    out << " " << crntInst->GetNum();
+  for (auto *crntInst = OutList->GetFrstElmnt(); crntInst != NULL;
+       crntInst = OutList->GetNxtElmnt()) {
+    out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup()
+        << ")";
   }
   out << '\n';
 
-  prirtyLst_->ResetIterator();
+  delete OutList;
 }
 
 void ReadyList::AddLatestSubList_(LinkedList<SchedInstruction> *lst) {
@@ -280,6 +312,7 @@ void ReadyList::AddInst(SchedInstruction *inst) {
   assert(changed == true);
   KeyedEntry<SchedInstruction, unsigned long> *entry =
       prirtyLst_->InsrtElmnt(inst, key, true);
+
   InstCount instNum = inst->GetNum();
   if (prirts_.isDynmc)
     keyedEntries_[instNum] = entry;
diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp
index 2c2e4752..4aec6ec6 100644
--- a/lib/Scheduler/sched_basic_data.cpp
+++ b/lib/Scheduler/sched_basic_data.cpp
@@ -4,6 +4,9 @@
 
 using namespace llvm::opt_sched;
 
+// Initially set the active clustering to 0 for none.
diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp
index 2c2e4752..4aec6ec6 100644
--- a/lib/Scheduler/sched_basic_data.cpp
+++ b/lib/Scheduler/sched_basic_data.cpp
@@ -4,6 +4,9 @@

 using namespace llvm::opt_sched;

+// Initially set the active cluster to 0, meaning no cluster is active.
+int SchedInstruction::ActiveCluster = 0;
+
 SchedInstruction::SchedInstruction(InstCount num, const string &name,
                                    InstType instType, const string &opCode,
                                    InstCount maxInstCnt, int nodeID,
@@ -15,6 +18,8 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name,
   name_ = name;
   opCode_ = opCode;
   instType_ = instType;
+  ClusterGroup = 0;
+  MayCluster = false;

   frwrdLwrBound_ = INVALID_VALUE;
   bkwrdLwrBound_ = INVALID_VALUE;
@@ -60,6 +65,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name,

   mustBeInBBEntry_ = false;
   mustBeInBBExit_ = false;
+  WasActive = false;
 }

 SchedInstruction::~SchedInstruction() {
@@ -68,6 +74,11 @@ SchedInstruction::~SchedInstruction() {
   delete crntRange_;
 }

+bool SchedInstruction::computeWasActive() {
+  WasActive = GetActiveCluster() == GetClusterGroup();
+  return WasActive;
+}
+
 void SchedInstruction::SetupForSchdulng(InstCount instCnt, bool isCP_FromScsr,
                                         bool isCP_FromPrdcsr) {
   if (memAllocd_)
@@ -373,7 +384,8 @@ SchedInstruction *SchedInstruction::GetNxtPrdcsr(InstCount *scsrNum,

 SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum,
                                                 UDT_GLABEL *ltncy,
-                                                DependenceType *depType) {
+                                                DependenceType *depType,
+                                                bool *IsArtificial) {
   GraphEdge *edge = GetFrstScsrEdge();
   if (!edge)
     return NULL;
@@ -383,12 +395,15 @@ SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum,
     *ltncy = edge->label;
   if (depType)
     *depType = (DependenceType)edge->label2;
+  if (IsArtificial)
+    *IsArtificial = edge->IsArtificial;
   return (SchedInstruction *)(edge->to);
 }

 SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum,
                                                UDT_GLABEL *ltncy,
-                                               DependenceType *depType) {
+                                               DependenceType *depType,
+                                               bool *IsArtificial) {
   GraphEdge *edge = GetNxtScsrEdge();
   if (!edge)
     return NULL;
@@ -398,6 +413,8 @@ SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum,
     *ltncy = edge->label;
   if (depType)
     *depType = (DependenceType)edge->label2;
+  if (IsArtificial)
+    *IsArtificial = edge->IsArtificial;
   return (SchedInstruction *)(edge->to);
 }
@@ -717,6 +734,13 @@ int16_t SchedInstruction::CmputLastUseCnt() {
   return lastUseCnt_;
 }

+void SchedInstruction::SetMayCluster(int ClusteringGroup) {
+  if (ClusteringGroup > 0) {
+    ClusterGroup = ClusteringGroup;
+    MayCluster = true;
+  }
+}
+
 /******************************************************************************
  * SchedRange                                                                 *
  ******************************************************************************/
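A typical consumer of the new IsArtificial out-parameter on GetFrstScsr/GetNxtScsr would walk the successor list and skip cluster or other artificial edges. This is a hypothetical usage sketch written against the signatures above; countRealSuccessors is not part of this patch and it assumes the OptSched headers are available:

#include "opt-sched/Scheduler/sched_basic_data.h"

using namespace llvm::opt_sched;

// Count only the real (non-artificial) successors of an instruction,
// ignoring edges added for clustering.
static int countRealSuccessors(SchedInstruction *inst) {
  int count = 0;
  InstCount scsrNum;
  UDT_GLABEL ltncy;
  DependenceType depType;
  bool isArtificial;
  for (SchedInstruction *scsr =
           inst->GetFrstScsr(&scsrNum, &ltncy, &depType, &isArtificial);
       scsr != NULL;
       scsr = inst->GetNxtScsr(&scsrNum, &ltncy, &depType, &isArtificial)) {
    if (!isArtificial)
      ++count;
  }
  return count;
}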
diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp
index 23dfd165..64e4bc56 100644
--- a/lib/Scheduler/sched_region.cpp
+++ b/lib/Scheduler/sched_region.cpp
@@ -2,6 +2,7 @@
 #include
 #include
+#include "Wrapper/OptSchedDDGWrapperBasic.h"
 #include "opt-sched/Scheduler/aco.h"
 #include "opt-sched/Scheduler/bb_spill.h"
 #include "opt-sched/Scheduler/config.h"
@@ -39,6 +40,7 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph,
   totalSimSpills_ = INVALID_VALUE;
   bestCost_ = INVALID_VALUE;
+  BestClusterCost = INVALID_VALUE;
   bestSchedLngth_ = INVALID_VALUE;
   hurstcCost_ = INVALID_VALUE;
   enumCrntSched_ = NULL;
@@ -47,6 +49,8 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph,
   schedUprBound_ = INVALID_VALUE;
   spillCostFunc_ = spillCostFunc;
+
+  PrintClustering = false;
+  EnumFoundSchedule = false;
 }

 void SchedRegion::UseFileBounds_() {
@@ -122,6 +126,10 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule(
   // heuristic scheduler or ACO before the branch & bound enumerator must be
   // enabled.
   Config &schedIni = SchedulerOptions::getInstance();
+  PrintClustering = schedIni.GetBool("PRINT_CLUSTER");
+  TwoPassEnabled = schedIni.GetBool("USE_TWO_PASS");
+  ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS");
+  ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT");
   bool HeuristicSchedulerEnabled = schedIni.GetBool("HEUR_ENABLED");
   bool AcoSchedulerEnabled = schedIni.GetBool("ACO_ENABLED");
   bool BbSchedulerEnabled = isBbEnabled(schedIni, rgnTimeout);
@@ -176,17 +184,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule(
   CmputAbslutUprBound_();
   schedLwrBound_ = dataDepGraph_->GetSchedLwrBound();

-  // We can calculate lower bounds here since it is only dependent
-  // on schedLwrBound_
-  if (!BbSchedulerEnabled)
-    costLwrBound_ = CmputCostLwrBound();
-  else
-    CmputLwrBounds_(false);
-
-  // Log the lower bound on the cost, allowing tools reading the log to compare
-  // absolute rather than relative costs.
-  Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_);
-
   // Step #1: Find the heuristic schedule if enabled.
   // Note: Heuristic scheduler is required for the two-pass scheduler
   // to use the sequential list scheduler which inserts stalls into
@@ -208,9 +205,37 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule(
     hurstcTime = Utilities::GetProcessorTime() - hurstcStart;
     stats::heuristicTime.Record(hurstcTime);
+
     if (hurstcTime > 0)
       Logger::Info("Heuristic_Time %d", hurstcTime);
+  }
+
+  // After the sequential scheduler runs in the second pass, add the artificial
+  // edges to the DDG. Some DAG mutations add artificial edges that conflict
+  // with the sequential scheduler, so wait until it has finished before adding
+  // them.
+  if (IsSecondPass()) {
+    static_cast<OptSchedDDGWrapperBasic *>(dataDepGraph_)->addArtificialEdges();
+    rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure);
+    if (rslt != RES_SUCCESS) {
+      Logger::Info("Invalid DAG after adding artificial cluster edges");
+      return rslt;
+    }
+  }
+
+  // This must be done after SetupForSchdulng() or UpdateSetupForSchdulng() to
+  // avoid resetting lower bound values.
+  if (!BbSchedulerEnabled)
+    costLwrBound_ = CmputCostLwrBound();
+  else
+    CmputLwrBounds_(false);
+
+  // Log the lower bound on the cost, allowing tools reading the log to compare
+  // absolute rather than relative costs.
+ Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_); + // Cost calculation must be below lower bounds calculation + if (HeuristicSchedulerEnabled || IsSecondPass()) { heuristicScheduleLength = lstSched->GetCrntLngth(); InstCount hurstcExecCost; // Compute cost for Heuristic list scheduler, this must be called before @@ -225,6 +250,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } FinishHurstc_(); @@ -279,6 +306,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); } } @@ -294,6 +323,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } // B) Heuristic was never run. In that case, just use ACO and run with its // results, into B&B. @@ -301,6 +332,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); // C) Neither scheduler was optimal. In that case, compare the two // schedules and use the one that's better as the input (initialSched) for // B&B. @@ -309,6 +342,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_; bestSchedLngth_ = bestSched_->GetCrntLngth(); bestCost_ = bestSched_->GetCost(); + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(bestSched_->getClusterSize()); } } // Step #3: Compute the cost upper bound. @@ -376,6 +411,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( InitialSchedule = bestSched_; InitialScheduleCost = bestCost_; InitialScheduleLength = bestSchedLngth_; + /*Logger::Info("Printing Initiial schedule"); + InitialSchedule->Print(Logger::GetLogStream(), "InitialSched", dataDepGraph_); + Logger::Info("Finish printing initial schedule");*/ // Step #4: Find the optimal schedule if the heuristc and ACO was not optimal. if (BbSchedulerEnabled) { @@ -606,6 +644,14 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( Logger::Info("DAG %s PEAK %d", dataDepGraph_->GetDagID(), maxSpillCost); } #endif + + if (PrintClustering && bestSched != NULL && (IsSecondPass() || !TwoPassEnabled)) { + computeAndPrintClustering(bestSched); + } + + //if (bestSched != NULL) + // bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); + return rslt; } @@ -708,11 +754,6 @@ bool SchedRegion::CmputUprBounds_(InstSchedule *schedule, bool useFileBounds) { // If the heuristic schedule is optimal, we are done! schedUprBound_ = bestSchedLngth_; return true; - } else if (IsSecondPass()) { - // In the second pass, the upper bound is the length of the min-RP schedule - // that was found in the first pass with stalls inserted. 
-    schedUprBound_ = schedule->GetCrntLngth();
-    return false;
   } else {
     CmputSchedUprBound_();
     return false;
diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp
index 46d6c1a3..915f4e6b 100644
--- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp
+++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp
@@ -7,8 +7,13 @@
 #include "GCNOptSched.h"
 #include "AMDGPUMacroFusion.h"
 #include "GCNSchedStrategy.h"
+#include "OptSchedGCNTarget.h"
 #include "SIMachineFunctionInfo.h"
+//#include "llvm/CodeGen/OptSequential.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <memory>
+#include <string>

 #define DEBUG_TYPE "optsched"
@@ -43,7 +48,31 @@ static void getRealRegionPressure(MachineBasicBlock::const_iterator Begin,

 ScheduleDAGOptSchedGCN::ScheduleDAGOptSchedGCN(
     llvm::MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
-    : ScheduleDAGOptSched(C, std::move(S)) {}
+    : ScheduleDAGOptSched(C, std::move(S)) {
+  MinOcc = getMinOcc();
+}
+
+unsigned ScheduleDAGOptSchedGCN::getMinOcc() {
+  SchedulerOptions &schedIni = SchedulerOptions::getInstance();
+  int MinOcc = schedIni.GetInt("MIN_OCCUPANCY_FOR_RESCHEDULE");
+  if (MinOcc <= 10 && MinOcc >= 1)
+    return MinOcc;
+
+  llvm::report_fatal_error(
+      "Invalid value for MIN_OCCUPANCY_FOR_RESCHEDULE setting: " +
+          std::to_string(MinOcc),
+      false);
+}
+
+int ScheduleDAGOptSchedGCN::getMinILPImprovement() {
+  SchedulerOptions &schedIni = SchedulerOptions::getInstance();
+  int MinIlpImprovement = schedIni.GetInt("MIN_ILP_IMPROVEMENT");
+  if (MinIlpImprovement <= 100 && MinIlpImprovement >= 0)
+    return MinIlpImprovement;
+
+  llvm::report_fatal_error(
+      "Invalid value for MIN_ILP_IMPROVEMENT setting: " +
+          std::to_string(MinIlpImprovement),
+      false);
+}

 void ScheduleDAGOptSchedGCN::initSchedulers() {
   // Add DAG mutations that apply to both GCN and OptSched DAG's
@@ -58,22 +87,54 @@ void ScheduleDAGOptSchedGCN::initSchedulers() {
   // First
   SchedPasses.push_back(OptSchedMaxOcc);
-  // Second
+  // Second: the ILP passes
   SchedPasses.push_back(OptSchedBalanced);
+  SchedPasses.push_back(OptSchedLowerOccAnalysis);
+  SchedPasses.push_back(OptSchedCommitLowerOcc);
+}
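Two gates control the new GCN passes: the lower-occupancy analysis pass (pass 3) only runs while some region still needs rescheduling and the occupancy floor has not been reached, and the commit pass (pass 4) only runs if the analysis met the minimum ILP improvement. A standalone sketch of the first gate, using the default MIN_OCCUPANCY_FOR_RESCHEDULE of 3; the function name is illustrative, not part of this patch:

#include <cstdio>

// Stand-in for the gate that decides whether the lower-occupancy analysis
// pass should run at all.
static bool shouldTryLowerOccupancy(bool anyRegionPending, unsigned targetOcc,
                                    unsigned minOcc) {
  return anyRegionPending && targetOcc > minOcc;
}

int main() {
  // At target occupancy 4 we may try occupancy 3, but at target 3 the
  // configured floor stops any further lowering.
  std::printf("occ 4 -> %d\n", shouldTryLowerOccupancy(true, 4, 3)); // 1
  std::printf("occ 3 -> %d\n", shouldTryLowerOccupancy(true, 3, 3)); // 0
  return 0;
}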
 // Execute scheduling passes.
 // Partially copied from GCNScheduleDAGMILive::finalizeSchedule.
 void ScheduleDAGOptSchedGCN::finalizeSchedule() {
   if (TwoPassEnabled && OptSchedEnabled) {
     initSchedulers();
+    RescheduleRegions.resize(Regions.size());
+    ILPAnalysis.resize(Regions.size());
+    CostAnalysis.resize(Regions.size());
+    LowerOccScheds.resize(Regions.size());
+    RescheduleRegions.set();

     LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n");
     TwoPassSchedulingStarted = true;
     for (const SchedPassStrategy &S : SchedPasses) {
       MachineBasicBlock *MBB = nullptr;
       // Reset
-      RegionNumber = ~0u;
+      RegionIdx = 0;
+
+      if (S == OptSchedLowerOccAnalysis) {
+        if (RescheduleRegions.none())
+          break;
+        else {
+          auto GCNOST = static_cast<OptSchedGCNTarget *>(OST.get());
+          unsigned TargetOccupancy = GCNOST->getTargetOcc();
+          if (TargetOccupancy <= MinOcc)
+            break;
+
+          unsigned NewTarget = TargetOccupancy - 1u;
+          dbgs() << "Decreasing current target occupancy " << TargetOccupancy
+                 << " to new target " << NewTarget << '\n';
+          GCNOST->limitOccupancy(NewTarget);
+        }
+      }
+
+      if (S == OptSchedCommitLowerOcc) {
+        if (!shouldCommitLowerOccSched()) {
+          dbgs()
+              << "Lower occupancy schedule did not meet minimum improvement.\n";
+          break;
+        }
+        dbgs() << "Lower occupancy met minimum improvement requirement!\n";
+      }

       for (auto &Region : Regions) {
         RegionBegin = Region.first;
@@ -93,36 +154,42 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() {
           exitRegion();
           continue;
         }
-        LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before"));
+        LLVM_DEBUG(
+            getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before"));
         runSchedPass(S);
         LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "After"));
         Region = std::make_pair(RegionBegin, RegionEnd);
         exitRegion();
+        ++RegionIdx;
       }
       finishBlock();
     }
   }
   ScheduleDAGMILive::finalizeSchedule();
-
-  LLVM_DEBUG(if (isSimRegAllocEnabled()) {
-    dbgs() << "*************************************\n";
-    dbgs() << "Function: " << MF.getName()
-           << "\nTotal Simulated Spills: " << SimulatedSpills << "\n";
-    dbgs() << "*************************************\n";
-  });
 }

 void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) {
+  RescheduleRegions[RegionIdx] = false;
   switch (S) {
   case GCNMaxOcc:
     scheduleGCNMaxOcc();
     break;
   case OptSchedMaxOcc:
     scheduleOptSchedMaxOcc();
+    Logger::Info("End of first pass through");
     break;
   case OptSchedBalanced:
     scheduleOptSchedBalanced();
+    Logger::Info("End of second pass through");
+    break;
+  case OptSchedLowerOccAnalysis:
+    scheduleOptSchedLowerOccAnalysis();
+    Logger::Info("End of third pass through");
+    break;
+  case OptSchedCommitLowerOcc:
+    scheduleCommitLowerOcc();
+    Logger::Info("End of fourth pass through");
     break;
   }
 }
@@ -144,3 +211,37 @@ void ScheduleDAGOptSchedGCN::scheduleOptSchedMaxOcc() {
 void ScheduleDAGOptSchedGCN::scheduleOptSchedBalanced() {
   ScheduleDAGOptSched::scheduleOptSchedBalanced();
 }
+
+void ScheduleDAGOptSchedGCN::scheduleOptSchedLowerOccAnalysis() {
+  IsThirdPass = true;
+  ScheduleDAGOptSched::scheduleOptSchedBalanced();
+  IsThirdPass = false;
+}
+
+void ScheduleDAGOptSchedGCN::scheduleCommitLowerOcc() {
+  IsFourthPass = true;
+  ScheduleDAGOptSched::scheduleOptSchedBalanced();
+  IsFourthPass = false;
+}
+
+bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() {
+  // First analyze the ILP improvement.
+  int FirstPassLengthSum = 0;
+  int SecondPassLengthSum = 0;
+  int MinILPImprovement = getMinILPImprovement();
+  for (std::pair<int, int> &RegionLength : ILPAnalysis) {
+    FirstPassLengthSum += RegionLength.first;
+    SecondPassLengthSum += RegionLength.second;
+  }
+  double FirstPassAverageLength = (double)FirstPassLengthSum / Regions.size();
+  double SecondPassAverageLength = (double)SecondPassLengthSum / Regions.size();
+  double ILPImprovement = ((FirstPassAverageLength - SecondPassAverageLength) /
+                           FirstPassAverageLength) *
+                          100.0;
+  dbgs() << "ILP improvement from the second ILP pass is " << ILPImprovement
+         << ", min improvement is: " << MinILPImprovement << '\n';
+  return ILPImprovement >= MinILPImprovement;
+}
diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h
index f08056aa..c24c93c1 100644
--- a/lib/Wrapper/AMDGPU/GCNOptSched.h
+++ b/lib/Wrapper/AMDGPU/GCNOptSched.h
@@ -9,17 +9,37 @@

 #include "../OptimizingScheduler.h"
 #include "GCNRegPressure.h"
+#include "OptSchedGCNTarget.h"

 namespace llvm {
 namespace opt_sched {

 class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched {
 private:
-  enum SchedPassStrategy { GCNMaxOcc, OptSchedMaxOcc, OptSchedBalanced };
+  enum SchedPassStrategy {
+    GCNMaxOcc,
+    OptSchedMaxOcc,
+    OptSchedBalanced,
+    OptSchedLowerOccAnalysis,
+    OptSchedCommitLowerOcc
+  };
+
+  /// Get the minimum occupancy value from the sched.ini settings file. Checks
+  /// that the value is between 1 and 10 and reports a fatal error if it is
+  /// outside the valid range.
+  unsigned getMinOcc();
+
+  /// Get the minimum required ILP improvement percentage from the sched.ini
+  /// settings file.
+  int getMinILPImprovement();
+
+  /// Analyze the possible improvements from lowering the target occupancy
+  /// and decide if we should keep the schedules.
+  bool shouldCommitLowerOccSched();

   // Vector of scheduling passes to execute.
   SmallVector<SchedPassStrategy, 4> SchedPasses;

+  unsigned MinOcc;
+
 public:
   ScheduleDAGOptSchedGCN(llvm::MachineSchedContext *C,
                          std::unique_ptr<MachineSchedStrategy> S);
@@ -45,6 +65,13 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched {

   // Run OptSched in ILP/RP balanced mode.
   void scheduleOptSchedBalanced() override;
+
+  // Lower occupancy and run OptSched in ILP/RP balanced mode for analysis.
+  void scheduleOptSchedLowerOccAnalysis();
+
+  // Run OptSched in ILP/RP balanced mode at the lower occupancy to commit
+  // the schedules accepted by the analysis pass.
+  void scheduleCommitLowerOcc();
 };

 } // namespace opt_sched
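As a quick sanity check of the percentage math in shouldCommitLowerOccSched, here is a standalone program with hypothetical region lengths (all values made up):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Hypothetical (firstPassLength, analysisPassLength) pairs per region.
  std::vector<std::pair<int, int>> ILPAnalysis = {{40, 34}, {60, 54}};
  int First = 0, Second = 0;
  for (auto &P : ILPAnalysis) {
    First += P.first;
    Second += P.second;
  }
  double FirstAvg = (double)First / ILPAnalysis.size();   // 50.0
  double SecondAvg = (double)Second / ILPAnalysis.size(); // 44.0
  // (50 - 44) / 50 * 100 = 12%, which clears a MIN_ILP_IMPROVEMENT of 10,
  // so the lower-occupancy schedules would be committed.
  double Improvement = (FirstAvg - SecondAvg) / FirstAvg * 100.0;
  std::printf("ILP improvement: %.1f%%\n", Improvement);
  return 0;
}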
diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp
index 21faf51e..9f63a720 100644
--- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp
+++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp
@@ -3,6 +3,7 @@
 // AMDGCN OptSched target.
 //
 //===----------------------------------------------------------------------===//
+#include "OptSchedGCNTarget.h"
 #include "OptSchedDDGWrapperGCN.h"
 #include "SIMachineFunctionInfo.h"
 #include "Wrapper/OptSchedMachineWrapper.h"
@@ -22,7 +23,7 @@ using namespace llvm::opt_sched;

 // This is necessary because we cannot perfectly predict the number of
 // registers of each type that will be allocated.
-static const unsigned GPRErrorMargin = 3;
+static const unsigned GPRErrorMargin = 0;

 #ifndef NDEBUG
 static unsigned getOccupancyWeight(unsigned Occupancy) {
@@ -62,56 +63,6 @@ static unsigned getAdjustedOccupancy(const GCNSubtarget *ST, unsigned VGPRCount,

 namespace {

-class OptSchedGCNTarget : public OptSchedTarget {
-public:
-  std::unique_ptr<OptSchedMachineModel>
-  createMachineModel(const char *ConfigPath) override {
-    return llvm::make_unique<OptSchedMachineModel>(ConfigPath);
-  }
-
-  std::unique_ptr<OptSchedDDGWrapperBase>
-  createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG,
-                   OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision,
-                   const std::string &RegionID) override {
-    return llvm::make_unique<OptSchedDDGWrapperGCN>(Context, DAG, MM,
-                                                    LatencyPrecision, RegionID);
-  }
-
-  void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override;
-
-  void finalizeRegion(const InstSchedule *Schedule) override;
-
-  // Returns occupancy cost with number of VGPRs and SGPRs from PRP for
-  // a partial or complete schedule.
-  InstCount getCost(const llvm::SmallVectorImpl<unsigned> &PRP) const override;
-
-  void dumpOccupancyInfo(const InstSchedule *Schedule) const;
-
-  // Revert scheduling if we decrease occupancy.
-  bool shouldKeepSchedule() override;
-
-private:
-  const llvm::MachineFunction *MF;
-  SIMachineFunctionInfo *MFI;
-  ScheduleDAGOptSched *DAG;
-  const GCNSubtarget *ST;
-
-  unsigned RegionStartingOccupancy;
-  unsigned RegionEndingOccupancy;
-  unsigned TargetOccupancy;
-
-  // Max occupancy with local memory size.
-  unsigned MaxOccLDS;
-
-  // In RP only (max occupancy) scheduling mode we should try to find
-  // a min-RP schedule without considering perf hints which suggest limiting
-  // occupancy. Returns true if we should consider perf hints.
-  bool shouldLimitWaves() const;
-
-  // Find occupancy with spill cost.
-  unsigned getOccupancyWithCost(const InstCount Cost) const;
-};
-
 std::unique_ptr<OptSchedTarget> createOptSchedGCNTarget() {
   return llvm::make_unique<OptSchedGCNTarget>();
 }
@@ -161,9 +112,9 @@ void OptSchedGCNTarget::initRegion(llvm::ScheduleDAGInstrs *DAG_,
   TargetOccupancy =
       shouldLimitWaves() ? MFI->getMinAllowedOccupancy() : MFI->getOccupancy();
-  LLVM_DEBUG(dbgs() << "Region starting occupancy is "
-                    << RegionStartingOccupancy << "\n"
-                    << "Target occupancy is " << TargetOccupancy << "\n");
+  dbgs() << "Region starting occupancy is " << RegionStartingOccupancy << "\n"
+         << "Target occupancy is " << TargetOccupancy << "\n";
 }

 bool OptSchedGCNTarget::shouldLimitWaves() const {
@@ -173,6 +124,16 @@ bool OptSchedGCNTarget::shouldLimitWaves() const {
   return false;
 }

+void OptSchedGCNTarget::setTargetOcc(unsigned Target) {
+  dbgs() << "Setting target occupancy to " << Target << '\n';
+  TargetOccupancy = Target;
+}
+
+void OptSchedGCNTarget::limitOccupancy(unsigned Limit) {
+  dbgs() << "Limiting occupancy to " << Limit << '\n';
+  MFI->limitOccupancy(Limit);
+  TargetOccupancy = MFI->getOccupancy();
+}
+
 unsigned OptSchedGCNTarget::getOccupancyWithCost(const InstCount Cost) const {
   return TargetOccupancy - Cost;
 }
@@ -184,9 +145,9 @@ void OptSchedGCNTarget::finalizeRegion(const InstSchedule *Schedule) {
   // If we decrease occupancy we may revert scheduling.
   unsigned RegionOccupancy =
       std::max(RegionStartingOccupancy, RegionEndingOccupancy);
-  LLVM_DEBUG(if (RegionOccupancy < MFI->getOccupancy()) dbgs()
-             << "Limiting occupancy to " << RegionEndingOccupancy
-             << " waves.\n");
+  if (RegionOccupancy < MFI->getOccupancy())
+    dbgs() << "Limiting occupancy to " << RegionOccupancy << " waves.\n";
   MFI->limitOccupancy(RegionOccupancy);
 }
diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h
new file mode 100644
index 00000000..996caaff
--- /dev/null
+++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h
@@ -0,0 +1,73 @@
+#ifndef LLVM_GCN_OPT_SCHED_TARGET_H
+#define LLVM_GCN_OPT_SCHED_TARGET_H
+
+#include "OptSchedDDGWrapperGCN.h"
+#include "SIMachineFunctionInfo.h"
+#include "Wrapper/OptSchedMachineWrapper.h"
+#include "opt-sched/Scheduler/OptSchedTarget.h"
+#include "opt-sched/Scheduler/data_dep.h"
+#include "opt-sched/Scheduler/defines.h"
+#include "opt-sched/Scheduler/machine_model.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include <memory>
+#include <string>
+
+using namespace llvm;
+using namespace llvm::opt_sched;
+
+class OptSchedGCNTarget : public OptSchedTarget {
+public:
+  std::unique_ptr<OptSchedMachineModel>
+  createMachineModel(const char *ConfigPath) override {
+    return llvm::make_unique<OptSchedMachineModel>(ConfigPath);
+  }
+
+  std::unique_ptr<OptSchedDDGWrapperBase>
+  createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG,
+                   OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision,
+                   const std::string &RegionID) override {
+    return llvm::make_unique<OptSchedDDGWrapperGCN>(Context, DAG, MM,
+                                                    LatencyPrecision, RegionID);
+  }
+
+  void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override;
+
+  void finalizeRegion(const InstSchedule *Schedule) override;
+
+  // Returns occupancy cost with number of VGPRs and SGPRs from PRP for
+  // a partial or complete schedule.
+  InstCount getCost(const llvm::SmallVectorImpl<unsigned> &PRP) const override;
+
+  void dumpOccupancyInfo(const InstSchedule *Schedule) const;
+
+  // Revert scheduling if we decrease occupancy.
+  bool shouldKeepSchedule() override;
+
+  void limitOccupancy(unsigned Limit);
+  unsigned getTargetOcc() { return TargetOccupancy; }
+  void setTargetOcc(unsigned Target);
+
+private:
+  const llvm::MachineFunction *MF;
+  SIMachineFunctionInfo *MFI;
+  ScheduleDAGOptSched *DAG;
+  const GCNSubtarget *ST;
+
+  unsigned RegionStartingOccupancy;
+  unsigned RegionEndingOccupancy;
+  unsigned TargetOccupancy;
+
+  // Max occupancy with local memory size.
+  unsigned MaxOccLDS;
+
+  // In RP only (max occupancy) scheduling mode we should try to find
+  // a min-RP schedule without considering perf hints which suggest limiting
+  // occupancy. Returns true if we should consider perf hints.
+  bool shouldLimitWaves() const;
+
+  // Find occupancy with spill cost.
+  unsigned getOccupancyWithCost(const InstCount Cost) const;
+};
+
+#endif
diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp
index ba6985cf..f5b03fe7 100644
--- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp
+++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp
@@ -9,6 +9,8 @@
 #include "opt-sched/Scheduler/logger.h"
 #include "opt-sched/Scheduler/register.h"
 #include "opt-sched/Scheduler/sched_basic_data.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -26,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include

 #define DEBUG_TYPE "optsched-ddg-wrapper"
@@ -71,9 +74,12 @@ OptSchedDDGWrapperBasic::OptSchedDDGWrapperBasic(
   if (ShouldFilterRegisterTypes)
     RTFilter = createLLVMRegTypeFilter(MM, DAG->TRI,
                                        DAG->getRegPressure().MaxSetPressure);
+
+  ClusterCount = 0;
 }

-void OptSchedDDGWrapperBasic::convertSUnits() {
+void OptSchedDDGWrapperBasic::convertSUnits(bool IgnoreRealEdges,
+                                            bool IgnoreArtificialEdges) {
   LLVM_DEBUG(dbgs() << "Building opt_sched DAG\n");
   // The extra 2 are for the artificial root and leaf nodes.
   instCnt_ = nodeCnt_ = DAG->SUnits.size() + 2;
@@ -89,7 +95,7 @@ void OptSchedDDGWrapperBasic::convertSUnits() {

   // Create edges.
   for (const auto &SU : DAG->SUnits) {
-    convertEdges(SU);
+    convertEdges(SU, IgnoreRealEdges, IgnoreArtificialEdges);
   }

   // Add artificial root and leaf nodes and edges.
@@ -407,13 +413,27 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() {
     CreateEdge_(i, LeafNum, 0, DEP_OTHER);
 }

-void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU) {
+void OptSchedDDGWrapperBasic::addArtificialEdges() {
+  for (const auto &SU : DAG->SUnits) {
+    convertEdges(SU, true, false);
+  }
+}
+
+void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU,
+                                           bool IgnoreRealEdges,
+                                           bool IgnoreArtificialEdges) {
   const MachineInstr *instr = SU.getInstr();
   SUnit::const_succ_iterator I, E;
   for (I = SU.Succs.begin(), E = SU.Succs.end(); I != E; ++I) {
     if (I->getSUnit()->isBoundaryNode())
       continue;

+    bool IsArtificial = I->isArtificial() || I->isCluster();
+    if (IgnoreArtificialEdges && IsArtificial)
+      continue;
+    else if (IgnoreRealEdges && !IsArtificial)
+      continue;
+
     DependenceType DepType;
     switch (I->getKind()) {
     case SDep::Data:
@@ -440,7 +460,8 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU) {
     else
       Latency = 1; // unit latency = ignore ilp

-    CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType);
+    CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType,
+                IsArtificial);
   }
 }
@@ -500,6 +521,148 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness(
   }
 }

+// Iterate through all chains found by LLVM and verify that the instructions
+// are actually able to be clustered together.
+// Partially copied from
+// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554
+int OptSchedDDGWrapperBasic::clusterNeighboringMemOps(
+    ArrayRef<const SUnit *> MemOps) {
+  // True while the next clusterable pair must start a new cluster; cleared
+  // once the current chain has opened one.
+  bool InitForNewCluster = true;
+  // Keep track of the count of instructions that are able to be clustered
+  // and return that number.
+  int TotalInstructionsPossible = 0;
+  int InstructionsInEachCluster = 0;
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  for (const SUnit *SU : MemOps) {
+    MachineOperand *BaseOp;
+    int64_t Offset;
+    if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset,
+                                          DAG->TRI))
+      MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset));
+  }
+
+  if (MemOpRecords.size() < 2) {
+    LLVM_DEBUG(dbgs() << "  Unable to cluster a chain with fewer than 2 mem ops.\n");
+    return 0;
+  }
+
+  llvm::sort(MemOpRecords);
+  unsigned ClusterLength = 1;
+  for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+    const SUnit *SUa = MemOpRecords[Idx].SU;
+    const SUnit *SUb = MemOpRecords[Idx + 1].SU;
+    LLVM_DEBUG(dbgs() << "  Checking possible clustering of (" << SUa->NodeNum
+                      << ") and (" << SUb->NodeNum << ")\n");
+
+    // Pass a constant of 1 to AMD's function that determines clustering, to
+    // remove its limit of 15. Our enumerator can determine when it has reached
+    // the limit instead of depending on AMD's cutoff.
+    if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
+                                      *MemOpRecords[Idx + 1].BaseOp, 1u)) {
+      LLVM_DEBUG(dbgs() << "  Cluster possible at SU(" << SUa->NodeNum
+                        << ")- SU(" << SUb->NodeNum << ")\n");
+
+      // If clustering is possible then increase the cluster count. This only
+      // happens once per new cluster.
+      if (InitForNewCluster) {
+        InitForNewCluster = false;
+        ClusterCount++;
+        setMinClusterCount(ClusterCount);
+        dbgs() << "  Setting total cluster count to " << ClusterCount << "\n";
+      }
+
+      // Tell the instructions what cluster group they are in.
+      if (insts_[SUa->NodeNum]->GetClusterGroup() == 0) {
+        insts_[SUa->NodeNum]->SetMayCluster(ClusterCount);
+        InstructionsInEachCluster++;
+      }
+
+      if (insts_[SUb->NodeNum]->GetClusterGroup() == 0) {
+        insts_[SUb->NodeNum]->SetMayCluster(ClusterCount);
+        InstructionsInEachCluster++;
+      }
+
+      ++ClusterLength;
+    } else {
+      if (!InitForNewCluster) {
+        // If a cluster was initialized and started, save its information
+        // before starting a new one.
+        MaxInstructionsInEachClusters.insert(
+            std::make_pair(ClusterCount, InstructionsInEachCluster));
+        TotalInstructionsPossible += InstructionsInEachCluster;
+        InitForNewCluster = true;
+        InstructionsInEachCluster = 0;
+      }
+      ClusterLength = 1;
+    }
+  }
+  // Save the total instructions possible in this cluster. This number will be
+  // used in enumeration to estimate an optimistic cost on the remaining
+  // cluster blocks.
+  if (!InitForNewCluster) {
+    MaxInstructionsInEachClusters.insert(
+        std::make_pair(ClusterCount, InstructionsInEachCluster));
+    TotalInstructionsPossible += InstructionsInEachCluster;
+  }
+
+  // Return the total number of clusterable instructions found in this chain.
+  return TotalInstructionsPossible;
+}
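The pairwise scan in clusterNeighboringMemOps is easier to follow on a toy chain. The sketch below replaces TII->shouldClusterMemOps with a trivial offset-distance predicate and mirrors the open/close bookkeeping; everything here is illustrative and not OptSched code:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for shouldClusterMemOps: here, two ops cluster when their sorted
// offsets are within 4 bytes. Real targets apply their own rule.
static bool canCluster(int64_t OffA, int64_t OffB) { return OffB - OffA <= 4; }

int main() {
  // Sorted offsets of one chain: {0, 4, 8, 100}. The pairs (0,4) and (4,8)
  // both cluster, forming one cluster group of 3 instructions; the jump to
  // 100 closes the group, so the last op stays unclustered.
  std::vector<int64_t> Offsets = {0, 4, 8, 100};
  int ClusterCount = 0, InCurrentCluster = 0, Total = 0;
  bool Open = false;
  for (size_t i = 0; i + 1 < Offsets.size(); ++i) {
    if (canCluster(Offsets[i], Offsets[i + 1])) {
      if (!Open) { // open a new cluster and count the first op (SUa)
        Open = true;
        ++ClusterCount;
        ++InCurrentCluster;
      }
      ++InCurrentCluster; // the second op (SUb) joins the open cluster
    } else if (Open) {    // a gap closes the cluster; save its size
      Total += InCurrentCluster;
      Open = false;
      InCurrentCluster = 0;
    }
  }
  if (Open)
    Total += InCurrentCluster;
  std::printf("clusters=%d clustered instrs=%d\n", ClusterCount, Total);
  return 0;
}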
+// Iterate through the SUnits and find all possible clusters using LLVM/AMD's
+// clustering detection method, then transfer the information to our scheduler
+// so it can be accessed during enumeration.
+// Partially copied from
+// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595
+int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) {
+  // The count of all of the instructions that are in a load/store cluster.
+  int TotalInstructionsPossible = 0;
+  // Map DAG NodeNum to store chain ID.
+  DenseMap<unsigned, unsigned> StoreChainIDs;
+  // Map each store chain to a set of dependent MemOps.
+  SmallVector<SmallVector<const SUnit *, 4>, 32> StoreChainDependents;
+  for (const SUnit &SU : DAG->SUnits) {
+    if ((IsLoad && !SU.getInstr()->mayLoad()) ||
+        (!IsLoad && !SU.getInstr()->mayStore()))
+      continue;
+    auto MI = SU.getInstr();
+
+    // Print which instruction may load or store. Used for debugging purposes.
+    dbgs() << "Instruction (" << SU.NodeNum << ") "
+           << DAG->TII->getName(MI->getOpcode()) << " may "
+           << (IsLoad ? "load" : "store") << "\n";
+
+    unsigned ChainPredID = DAG->SUnits.size();
+    for (const SDep &Pred : SU.Preds) {
+      if (Pred.isCtrl() && !(Pred.isArtificial() || Pred.isCluster())) {
+        ChainPredID = Pred.getSUnit()->NodeNum;
+        break;
+      }
+    }
+    // Check if this chain-like pred has been seen
+    // before. ChainPredID==MaxNodeID at the top of the schedule.
+    unsigned NumChains = StoreChainDependents.size();
+    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+        StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+    if (Result.second)
+      StoreChainDependents.resize(NumChains + 1);
+    StoreChainDependents[Result.first->second].push_back(&SU);
+  }
+
+  // Iterate over the store chains.
+  for (auto &SCD : StoreChainDependents) {
+    // Print the chain that LLVM has found.
+    LLVM_DEBUG(dbgs() << "Printing the Node ID of the current chain: ");
+    for (auto SU1 : SCD)
+      LLVM_DEBUG(dbgs() << SU1->NodeNum << " ");
+    LLVM_DEBUG(dbgs() << '\n');
+
+    TotalInstructionsPossible += clusterNeighboringMemOps(SCD);
+  }
+  return TotalInstructionsPossible;
+}
+
 LLVMRegTypeFilter::LLVMRegTypeFilter(
     const MachineModel *MM, const llvm::TargetRegisterInfo *TRI,
     const std::vector &RegionPressure, float RegFilterFactor)
diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h
index 88631511..0679e2b8 100644
--- a/lib/Wrapper/OptSchedDDGWrapperBasic.h
+++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h
@@ -13,6 +13,7 @@
 #include "opt-sched/Scheduler/graph_trans.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include
 #include
@@ -47,8 +48,10 @@ class OptSchedDDGWrapperBasic : public DataDepGraph {
   /// Dump Optsched register def/use information for the region.
   void dumpOptSchedRegisters() const;

-  void convertSUnits() override;
+  void convertSUnits(bool IgnoreRealEdges, bool IgnoreArtificialEdges) override;
+  void addArtificialEdges();
   void convertRegFiles() override;
+  int findPossibleClusters(bool IsLoad) override;

 protected:
   // A convenience machMdl_ pointer casted to OptSchedMachineModel*.
@@ -123,7 +126,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph {
   void convertSUnit(const llvm::SUnit &SU);

   // Create edges between optsched graph nodes using SUnit successors.
-  void convertEdges(const llvm::SUnit &SU);
+  void convertEdges(const llvm::SUnit &SU, bool IgnoreRealEdges,
+                    bool IgnoreArtificialEdges);

   // Count the number of registers defined by the region boundary.
   void countBoundaryLiveness(std::vector &RegDefCounts,
@@ -133,6 +137,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph {
   // Find liveness info generated by the region boundary.
   void discoverBoundaryLiveness(const llvm::MachineInstr *MI);

+  int clusterNeighboringMemOps(ArrayRef<const SUnit *> MemOps);
+
   // Holds a register live range, mapping a producer to a set of consumers.
   struct LiveRange {
     // The node which defines the register tracked by this live range.
std::vector consumers; }; + + /// Count of the total clusters possible + int ClusterCount; + + // Copied from + // https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 + struct MemOpInfo { + const SUnit *SU; + MachineOperand *BaseOp; + int64_t Offset; + + MemOpInfo(const SUnit *su, MachineOperand *Op, int64_t ofs) + : SU(su), BaseOp(Op), Offset(ofs) {} + + bool operator<(const MemOpInfo &RHS) const { + if (BaseOp->getType() != RHS.BaseOp->getType()) + return BaseOp->getType() < RHS.BaseOp->getType(); + + if (BaseOp->isReg()) + return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) < + std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, + RHS.SU->NodeNum); + if (BaseOp->isFI()) { + const MachineFunction &MF = + *BaseOp->getParent()->getParent()->getParent(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + bool StackGrowsDown = TFI.getStackGrowthDirection() == + TargetFrameLowering::StackGrowsDown; + // Can't use tuple comparison here since we might need to use a + // different order when the stack grows down. + if (BaseOp->getIndex() != RHS.BaseOp->getIndex()) + return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex() + : BaseOp->getIndex() < RHS.BaseOp->getIndex(); + + if (Offset != RHS.Offset) + return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset; + + return SU->NodeNum < RHS.SU->NodeNum; + } + + llvm_unreachable("MemOpClusterMutation only supports register or frame " + "index bases."); + } + }; }; // Exclude certain registers from being visible to the scheduler. Use LLVM's -// register pressure tracker to find the MAX register pressure for each register -// type (pressure set). If the MAX pressure is below a certain threshold don't -// track that register. +// register pressure tracker to find the MAX register pressure for each +// register type (pressure set). If the MAX pressure is below a certain +// threshold don't track that register. 
 class LLVMRegTypeFilter {
 private:
   const MachineModel *MM;
diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp
index 5d0416c5..a3ad4e1c 100644
--- a/lib/Wrapper/OptimizingScheduler.cpp
+++ b/lib/Wrapper/OptimizingScheduler.cpp
@@ -16,10 +16,15 @@
 #include "opt-sched/Scheduler/register.h"
 #include "opt-sched/Scheduler/sched_region.h"
 #include "opt-sched/Scheduler/utilities.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+/*#include "llvm/CodeGen/OptSequential.h"*/
+#include "AMDGPU/OptSchedGCNTarget.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
@@ -39,20 +44,21 @@

 using namespace llvm::opt_sched;

+llvm::SmallVector UniqueRegionNames;
+llvm::DenseMap RegionCounter;
+
 // hack to print spills
 bool OPTSCHED_gPrintSpills;

 // An array of possible OptSched heuristic names
 constexpr struct {
-  const char* Name;
+  const char *Name;
   LISTSCHED_HEURISTIC HID;
-} HeuristicNames[] = {
-    {"CP", LSH_CP},   {"LUC", LSH_LUC},
-    {"UC", LSH_UC},   {"NID", LSH_NID},
-    {"CPR", LSH_CPR}, {"ISO", LSH_ISO},
-    {"SC", LSH_SC},   {"LS", LSH_LS},
-    {"LLVM", LSH_LLVM}
-};
+} HeuristicNames[] = {{"CP", LSH_CP},     {"LUC", LSH_LUC},
+                      {"UC", LSH_UC},     {"NID", LSH_NID},
+                      {"CPR", LSH_CPR},   {"ISO", LSH_ISO},
+                      {"SC", LSH_SC},     {"LS", LSH_LS},
+                      {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER}};

 // Default path to the configuration directory for opt-sched.
 static constexpr const char *DEFAULT_CFG_DIR = "~/.optsched-cfg/";
@@ -128,7 +134,8 @@
 nextIfDebug(MachineBasicBlock::iterator I,
   return I;
 }

-static bool scheduleSpecificRegion(const StringRef RegionName, const Config &SchedIni) {
+static bool scheduleSpecificRegion(const StringRef RegionName,
+                                   const Config &SchedIni) {
   const bool ScheduleSpecificRegions =
       SchedIni.GetBool("SCHEDULE_SPECIFIC_REGIONS");
@@ -259,10 +266,8 @@ void ScheduleDAGOptSched::schedule() {
   ShouldTrackLaneMasks = true;
   Config &schedIni = SchedulerOptions::getInstance();

-  ++RegionNumber;
   const std::string RegionName = C->MF->getFunction().getName().data() +
-                                 std::string(":") +
-                                 std::to_string(RegionNumber);
+                                 std::string(":") + std::to_string(RegionIdx);

   // If two pass scheduling is enabled then
   // first just record the scheduling region.
@@ -375,10 +380,59 @@ void ScheduleDAGOptSched::schedule() {
   // Build LLVM DAG
   SetupLLVMDag();
   OST->initRegion(this, MM.get());
+  /*if (IsSecondPass && !IsThirdPass && !IsFourthPass) {
+    auto GCNOST = static_cast<OptSchedGCNTarget *>(OST.get());
+    GCNOST->setTargetOcc(5);
+  }*/
+
   // Convert graph
   auto DDG =
       OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName);
-  DDG->convertSUnits();
+
+  // In the second pass, ignore artificial edges before running the sequential
+  // heuristic list scheduler.
+  if (IsSecondPass)
+    DDG->convertSUnits(/*IgnoreRealEdges=*/false,
+                       /*IgnoreArtificialEdges=*/true);
+  else
+    DDG->convertSUnits(/*IgnoreRealEdges=*/false,
+                       /*IgnoreArtificialEdges=*/false);
+
+  // Find all clusterable instructions for the second pass.
+  if (IsSecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) {
+    dbgs() << "Finding load clusters.\n";
+    int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true);
+    if (TotalLoadsInstructionsClusterable == 0)
+      dbgs() << "  No load clustering possible\n";
+
+    dbgs() << "Finding store clusters.\n";
+    int TotalStoreInstructionsClusterable = DDG->findPossibleClusters(false);
+    if (TotalStoreInstructionsClusterable == 0)
+      dbgs() << "  No store clustering possible\n";
+
+    Logger::Info("Total clusterable instructions: %d loads, %d stores",
+                 TotalLoadsInstructionsClusterable,
+                 TotalStoreInstructionsClusterable);
+
+    // Get the DDG instance so that we can set and get information that will
+    // be read later on during enumeration.
+    auto DataDepGraphInstance =
+        static_cast<OptSchedDDGWrapperBasic *>(DDG.get());
+    // Store the total instructions in all clusters in the DDG instance.
+    DataDepGraphInstance->setTotalInstructionsInAllClusters(
+        TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable);
+    int NumClusters = DataDepGraphInstance->getMinClusterCount();
+
+    // Iterate through all of the cluster blocks and print the total
+    // instructions in each block.
+    if (NumClusters > 0) {
+      Logger::Info("Total clusters in region: %d", NumClusters);
+      for (int Cluster = 1; Cluster <= NumClusters; Cluster++) {
+        Logger::Info(
+            "  Cluster %d has total instructions %d", Cluster,
+            DataDepGraphInstance->getTotalInstructionsInCluster(Cluster));
+      }
+    }
+  }
+
   DDG->convertRegFiles();

   auto *BDDG = static_cast<OptSchedDDGWrapperBasic *>(DDG.get());
@@ -409,30 +463,60 @@ void ScheduleDAGOptSched::schedule() {
   }

   // Used for two-pass-optsched to alter upper bound value.
-  if (SecondPass)
+  if (IsSecondPass)
     region->InitSecondPass();

   // Setup time before scheduling
   Utilities::startTime = std::chrono::high_resolution_clock::now();

   // Schedule region.
-  Rslt = region->FindOptimalSchedule(CurrentRegionTimeout, CurrentLengthTimeout,
-                                     IsEasy, NormBestCost, BestSchedLngth,
-                                     NormHurstcCost, HurstcSchedLngth, Sched,
-                                     FilterByPerp, blocksToKeep(schedIni));
-
-  if ((!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL)) {
-    LLVM_DEBUG(
-        Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.",
-                     Rslt, (void *)Sched));
-    // Scheduling with opt-sched failed.
-    // fallbackScheduler();
-    return;
+  if (!IsFourthPass) {
+    Rslt = region->FindOptimalSchedule(
+        CurrentRegionTimeout, CurrentLengthTimeout, IsEasy, NormBestCost,
+        BestSchedLngth, NormHurstcCost, HurstcSchedLngth, Sched, FilterByPerp,
+        blocksToKeep(schedIni));
+
+    if (!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL) {
+      LLVM_DEBUG(
+          Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.",
+                       Rslt, (void *)Sched));
+      // Scheduling with opt-sched failed.
+      // fallbackScheduler();
+      return;
+    }
+  } else {
+    dbgs() << "Processing DAG " << RegionName << '\n';
+    dbgs() << "Restoring schedule from second ILP pass:\n";
+    Sched = LowerOccScheds[RegionIdx];
+    dbgs() << "Applying lower occupancy schedule\n";
   }

+  // The B&B enumerator did not find a schedule; add the region to the list of
+  // regions to be rescheduled.
+  if (IsSecondPass && !region->enumFoundSchedule() && !IsEasy &&
+      !IsThirdPass && !IsFourthPass)
+    RescheduleRegions[RegionIdx] = true;
+
   LLVM_DEBUG(Logger::Info("OptSched succeeded."));
+
   OST->finalizeRegion(Sched);
-  if (!OST->shouldKeepSchedule())
+
+  if (!IsThirdPass && !IsFourthPass && (IsFirstPass || IsSecondPass) &&
+      !OST->shouldKeepSchedule()) {
+    if (IsSecondPass) {
+      // We are not keeping the schedule, so the result of the sequential
+      // heuristic scheduler is the final result for the second pass.
+      ILPAnalysis[RegionIdx].first = HurstcSchedLngth;
+    }
+    return;
+  }
+
+  if (IsSecondPass && !IsThirdPass && !IsFourthPass)
+    ILPAnalysis[RegionIdx].first = BestSchedLngth;
+  else if (IsThirdPass) {
+    ILPAnalysis[RegionIdx].second = BestSchedLngth;
+    LowerOccScheds[RegionIdx] = Sched;
     return;
+  }

   // Count simulated spills.
   if (isSimRegAllocEnabled()) {
@@ -529,7 +613,10 @@ void ScheduleDAGOptSched::loadOptSchedConfig() {
   OptSchedEnabled = isOptSchedEnabled();
   TwoPassEnabled = isTwoPassEnabled();
   TwoPassSchedulingStarted = false;
-  SecondPass = false;
+  IsFirstPass = false;
+  IsSecondPass = false;
+  IsThirdPass = false;
+  IsFourthPass = false;
   LatencyPrecision = fetchLatencyPrecision();

   TreatOrderAsDataDeps = schedIni.GetBool("TREAT_ORDER_DEPS_AS_DATA_DEPS");
@@ -665,6 +752,7 @@ SchedPriorities ScheduleDAGOptSched::parseHeuristic(const std::string &Str) {
     Priorities.vctr[Priorities.cnt++] = LSH;
     switch (LSH) {
     // LUC and CLUSTER are the dynamic heuristics.
+    case LSH_CLUSTER:
     case LSH_LUC:
       Priorities.isDynmc = true;
       break;
@@ -743,13 +831,14 @@ bool ScheduleDAGOptSched::rpMismatch(InstSchedule *sched) {
 void ScheduleDAGOptSched::finalizeSchedule() {
   if (TwoPassEnabled && OptSchedEnabled) {
     initSchedulers();
+    RescheduleRegions.resize(Regions.size());

     LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n");
     TwoPassSchedulingStarted = true;
     for (const SchedPassStrategy &S : SchedPasses) {
       MachineBasicBlock *MBB = nullptr;
       // Reset
-      RegionNumber = ~0u;
+      RegionIdx = 0;

       for (auto &Region : Regions) {
         RegionBegin = Region.first;
@@ -791,14 +880,17 @@ void ScheduleDAGOptSched::runSchedPass(SchedPassStrategy S) {
   switch (S) {
   case OptSchedMinRP:
     scheduleOptSchedMinRP();
+    Logger::Info("End of first pass through");
     break;
   case OptSchedBalanced:
     scheduleOptSchedBalanced();
+    Logger::Info("End of second pass through");
     break;
   }
 }

 void ScheduleDAGOptSched::scheduleOptSchedMinRP() {
+  IsFirstPass = true;
   LatencyPrecision = LTP_UNITY;
   // Set times for the first pass
   RegionTimeout = FirstPassRegionTimeout;
@@ -806,11 +898,11 @@ void ScheduleDAGOptSched::scheduleOptSchedMinRP() {
   HeurSchedType = SCHED_LIST;

   schedule();
-  Logger::Info("End of first pass through\n");
+  IsFirstPass = false;
 }

 void ScheduleDAGOptSched::scheduleOptSchedBalanced() {
-  SecondPass = true;
+  IsSecondPass = true;
   LatencyPrecision = LTP_ROUGH;

   // Set times for the second pass
@@ -837,7 +929,7 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() {
   MultiPassStaticNodeSup = false;

   schedule();
-  Logger::Info("End of second pass through");
+  IsSecondPass = false;
 }

 bool ScheduleDAGOptSched::isSimRegAllocEnabled() const {
diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h
index 13b92e7d..72191801 100644
--- a/lib/Wrapper/OptimizingScheduler.h
+++ b/lib/Wrapper/OptimizingScheduler.h
@@ -14,6 +14,7 @@
 #include "opt-sched/Scheduler/data_dep.h"
 #include "opt-sched/Scheduler/graph_trans.h"
 #include "opt-sched/Scheduler/sched_region.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallString.h"
"llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/Support/Debug.h" @@ -37,12 +38,21 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { SmallVector SchedPasses; protected: - // Vector of regions recorded for later rescheduling SmallVector< std::pair, 32> Regions; + /// Contains the results of the first ILP pass and second analysis ILP pass. + /// Used to calculate if we should keep the lower target occupancy schedules + /// in the second ILP pass. First element is the first ILP pass and second + /// element is the second analysis ILP pass. + SmallVector, 32> ILPAnalysis; + /// TODO: Same as above for cost analysis. + SmallVector, 32> CostAnalysis; + /// Store the lower occupancy schedules from the second ILP pass. + SmallVector LowerOccScheds; + // Path to opt-sched config options directory. SmallString<128> PathCfg; @@ -55,17 +65,27 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Path to the machine model specification file for opt-sched. SmallString<128> PathCfgMM; + bool IsFirstPass; + // Bool value indicating that the scheduler is in the second // pass. Used for the two pass scheduling approach. - bool SecondPass; + bool IsSecondPass; + + bool IsThirdPass; + + bool IsFourthPass; // Region number uniquely identifies DAGs. - unsigned RegionNumber = ~0u; + size_t RegionIdx; + + // Records if a region is not yet scheduled, or schedule has been reverted, + // or we generally desire to reschedule it. + llvm::BitVector RescheduleRegions; MachineSchedContext *C; // The OptSched target machine. - std::unique_ptr OST; + std::shared_ptr OST; // into the OptSched machine model std::unique_ptr MM; @@ -158,7 +178,10 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // The heuristic used for the enumerator. SchedPriorities EnumPriorities; - // The heuristic used for the second pass enumerator in the two-pass scheduling approach. + SchedPriorities SecondPassPriorities; + + // The heuristic used for the second pass enumerator in the two-pass + // scheduling approach. SchedPriorities SecondPassEnumPriorities; // Static node superiority RP only graph transformation. @@ -249,7 +272,7 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { void dumpLLVMRegisters() const; // Getter for region number - int getRegionNum() const { return RegionNumber; } + int getRegionNum() const { return RegionIdx; } // Return the boundary instruction for this region if it is not a sentinel // value.