From 0434e607c42c6be7b8f3cd43f83989eed650f0da Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Wed, 21 Jul 2021 09:42:08 -0700 Subject: [PATCH 1/7] Refactored out the key/heuristic computations --- include/opt-sched/Scheduler/defines.h | 6 + include/opt-sched/Scheduler/ready_list.h | 72 ++++--- lib/Scheduler/ready_list.cpp | 234 +++++++++++------------ 3 files changed, 168 insertions(+), 144 deletions(-) diff --git a/include/opt-sched/Scheduler/defines.h b/include/opt-sched/Scheduler/defines.h index ba3edcf8..fd68db4e 100644 --- a/include/opt-sched/Scheduler/defines.h +++ b/include/opt-sched/Scheduler/defines.h @@ -25,6 +25,12 @@ typedef int64_t Milliseconds; // Instruction count. typedef int InstCount; +// type for the aco heuristics and ready list keys +typedef unsigned long HeurType; + +// Pheromone type +typedef double pheromone_t; + // A generic sentinel value. Should be used with care. // TODO(max): Get rid of this in favor of type- or purpose-specific sentinels. const int INVALID_VALUE = -1; diff --git a/include/opt-sched/Scheduler/ready_list.h b/include/opt-sched/Scheduler/ready_list.h index 0a281dff..1b371cf9 100644 --- a/include/opt-sched/Scheduler/ready_list.h +++ b/include/opt-sched/Scheduler/ready_list.h @@ -21,6 +21,50 @@ Last Update: Sept. 2013 namespace llvm { namespace opt_sched { +struct PriorityEntry { + uint16_t Width; + uint16_t Offset; +}; + +class KeysHelper { + public: + KeysHelper(SchedPriorities Prirts) : Priorities(Prirts), Entries{} {}; + KeysHelper() : KeysHelper(SchedPriorities{}) {}; + + // pre-compute region info + void initForRegion(DataDepGraph *DDG); + + // compute key + HeurType computeKey(SchedInstruction *Inst, bool IncludeDynamic) const; + HeurType computeKey(const uint64_t *Values) const; + + // get information about a keys layout + PriorityEntry getPriorityEntry(int16_t Indx) const { return Entries[Indx]; } + + //get the max key size and value + HeurType getKeySizeInBits() const { return KeysSz; } + HeurType getMaxValue() const { return MaxValue; } + + private: + // private member variables + // scheduling priorities used for this KeysHelper + SchedPriorities Priorities; + + // width and offset info for each priority + PriorityEntry Entries[MAX_SCHED_PRIRTS]; + + // pre-computed size of all keys for this region + uint16_t KeysSz = 0; + + // pre-computed max key value; + HeurType MaxValue = 0; + HeurType MaxNID = 0; + HeurType MaxISO = 0; + + // Field to store if this KeyHelper was initialized + bool WasInitialized = false; +}; + // A priority list of instruction that are ready to schedule at a given point // during the scheduling process. class ReadyList { @@ -90,6 +134,9 @@ class ReadyList { // An ordered vector of priorities SchedPriorities prirts_; + // The KeysHelper for the key computations + KeysHelper KHelper; + // The priority list containing the actual instructions. PriorityList prirtyLst_; @@ -100,36 +147,13 @@ class ReadyList { llvm::SmallVector *, 0> keyedEntries_; - // Is there a priority scheme that needs to be changed dynamically - // bool isDynmcPrirty_; - - // The maximum values for each part of the priority key. - InstCount maxUseCnt_; - InstCount maxCrtclPath_; - InstCount maxScsrCnt_; - InstCount maxLtncySum_; - InstCount maxNodeID_; - InstCount maxInptSchedOrder_; - - unsigned long maxPriority_; - // The number of bits for each part of the priority key. int16_t useCntBits_; - int16_t crtclPathBits_; - int16_t scsrCntBits_; - int16_t ltncySumBits_; - int16_t nodeID_Bits_; - int16_t inptSchedOrderBits_; + int16_t LUCOffset; // Adds instructions at the bottom of a given list which have not been added // to the ready list already. void AddLatestSubList_(LinkedList *lst); - - // Calculates a new priority key given an existing key of size keySize by - // appending bitCnt bits holding the value val, assuming val < maxVal. - static void AddPrirtyToKey_(unsigned long &key, int16_t &keySize, - int16_t bitCnt, unsigned long val, - unsigned long maxVal); }; } // namespace opt_sched diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index adc29b86..15a57338 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -5,113 +5,155 @@ using namespace llvm::opt_sched; -ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { - prirts_ = prirts; - int i; - uint16_t totKeyBits = 0; +// pre-compute region info +void KeysHelper::initForRegion(DataDepGraph *DDG) { - // Initialize an array of KeyedEntry if a dynamic heuristic is used. This - // enable fast updating for dynamic heuristics. - if (prirts_.isDynmc) { - keyedEntries_.resize(dataDepGraph->GetInstCnt()); - } + uint16_t CurrentOffset = 0, CurrentWidth = 0; - useCntBits_ = crtclPathBits_ = scsrCntBits_ = ltncySumBits_ = nodeID_Bits_ = - inptSchedOrderBits_ = 0; + uint64_t MaxKVs[MAX_SCHED_PRIRTS] = { 0 }; // Calculate the number of bits needed to hold the maximum value of each // priority scheme - for (i = 0; i < prirts.cnt; i++) { - switch (prirts.vctr[i]) { + for (int I = 0; I < Priorities.cnt; ++I) { + LISTSCHED_HEURISTIC Heur = Priorities.vctr[I]; + uint64_t MaxV = 0; + switch (Heur) { case LSH_CP: case LSH_CPR: - maxCrtclPath_ = dataDepGraph->GetRootInst()->GetCrntLwrBound(DIR_BKWRD); - crtclPathBits_ = Utilities::clcltBitsNeededToHoldNum(maxCrtclPath_); - totKeyBits += crtclPathBits_; + MaxV = DDG->GetRootInst()->GetCrntLwrBound(DIR_BKWRD); break; case LSH_LUC: - for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) { - keyedEntries_[j] = NULL; - } - maxUseCnt_ = dataDepGraph->GetMaxUseCnt(); - useCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxUseCnt_); - totKeyBits += useCntBits_; - break; - case LSH_UC: - maxUseCnt_ = dataDepGraph->GetMaxUseCnt(); - useCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxUseCnt_); - totKeyBits += useCntBits_; + MaxV = DDG->GetMaxUseCnt(); break; case LSH_NID: case LSH_LLVM: - maxNodeID_ = dataDepGraph->GetInstCnt() - 1; - nodeID_Bits_ = Utilities::clcltBitsNeededToHoldNum(maxNodeID_); - totKeyBits += nodeID_Bits_; + MaxV = DDG->GetInstCnt() - 1; break; case LSH_ISO: - maxInptSchedOrder_ = dataDepGraph->GetMaxFileSchedOrder(); - inptSchedOrderBits_ = - Utilities::clcltBitsNeededToHoldNum(maxInptSchedOrder_); - totKeyBits += inptSchedOrderBits_; + MaxV = DDG->GetMaxFileSchedOrder(); break; case LSH_SC: - maxScsrCnt_ = dataDepGraph->GetMaxScsrCnt(); - scsrCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxScsrCnt_); - totKeyBits += scsrCntBits_; + MaxV = DDG->GetMaxScsrCnt(); break; case LSH_LS: - maxLtncySum_ = dataDepGraph->GetMaxLtncySum(); - ltncySumBits_ = Utilities::clcltBitsNeededToHoldNum(maxLtncySum_); - totKeyBits += ltncySumBits_; + MaxV = DDG->GetMaxLtncySum(); break; } // end switch + + // Track the size of the key and the width and location of our values + CurrentWidth = Utilities::clcltBitsNeededToHoldNum(MaxV); + Entries[Heur] = PriorityEntry{CurrentWidth, CurrentOffset}; + MaxKVs[Heur] = MaxV; + CurrentOffset += CurrentWidth; } // end for - assert(totKeyBits <= 8 * sizeof(unsigned long)); + // check to see if the key can fit in our type + assert(CurrentOffset <= 8 * sizeof(HeurType)); -#ifdef IS_DEBUG_READY_LIST2 - Logger::Info("The ready list key size is %d bits", totKeyBits); -#endif + // set the key size value to the final offset of the key + KeysSz = CurrentOffset; + + //set maximumvalues needed to compute keys + MaxNID = MaxKVs[LSH_NID]; + MaxISO = MaxKVs[LSH_ISO]; + + // mark the object as initialized + WasInitialized = true; - int16_t keySize = 0; - maxPriority_ = 0; - for (i = 0; i < prirts_.cnt; i++) { - switch (prirts_.vctr[i]) { + // set the max value using the values compute key + MaxValue = computeKey(MaxKVs); +} + +// compute key +HeurType KeysHelper::computeKey(SchedInstruction *Inst, bool IncludeDynamic) const { + assert(WasInitialized); + + HeurType Key= 0; + for (int I = 0; I < Priorities.cnt; ++I) { + LISTSCHED_HEURISTIC Heur = Priorities.vctr[I]; + HeurType PriorityValue = 0; + switch (Heur) { case LSH_CP: case LSH_CPR: - AddPrirtyToKey_(maxPriority_, keySize, crtclPathBits_, maxCrtclPath_, - maxCrtclPath_); + PriorityValue = Inst->GetCrtclPath(DIR_BKWRD); break; + case LSH_LUC: + PriorityValue = IncludeDynamic ? Inst->CmputLastUseCnt() : 0; + break; + case LSH_UC: - AddPrirtyToKey_(maxPriority_, keySize, useCntBits_, maxUseCnt_, - maxUseCnt_); + PriorityValue = Inst->NumUses(); break; + case LSH_NID: case LSH_LLVM: - AddPrirtyToKey_(maxPriority_, keySize, nodeID_Bits_, maxNodeID_, - maxNodeID_); + PriorityValue = MaxNID - Inst->GetNodeID(); break; + case LSH_ISO: - AddPrirtyToKey_(maxPriority_, keySize, inptSchedOrderBits_, - maxInptSchedOrder_, maxInptSchedOrder_); + PriorityValue = MaxISO - Inst->GetFileSchedOrder(); break; + case LSH_SC: - AddPrirtyToKey_(maxPriority_, keySize, scsrCntBits_, maxScsrCnt_, - maxScsrCnt_); + PriorityValue = Inst->GetScsrCnt(); break; + case LSH_LS: - AddPrirtyToKey_(maxPriority_, keySize, ltncySumBits_, maxLtncySum_, - maxLtncySum_); + PriorityValue = Inst->GetLtncySum(); break; } + + Key <<= Entries[Heur].Width; + Key |= PriorityValue; + } +} + +HeurType KeysHelper::computeKey(const uint64_t *Values) const { + assert(WasInitialized); + + HeurType Key = 0; + + for (int I = 0; I < Priorities.cnt; ++I) { + LISTSCHED_HEURISTIC Heur = Priorities.vctr[I]; + Key <<= Entries[Heur].Width; + Key |= Values[Heur]; } + + return Key; +} + +ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) { + prirts_ = prirts; + + // Initialize an array of KeyedEntry if a dynamic heuristic is used. This + // enable fast updating for dynamic heuristics. + if (prirts_.isDynmc) { + keyedEntries_.resize(dataDepGraph->GetInstCnt()); + } + + // Initialize the KeyHelper + KHelper = KeysHelper(prirts); + KHelper.initForRegion(dataDepGraph); + + // if we have an luc in the Priorities then lets store some info about it + // to improve efficiency + PriorityEntry LUCEntry = KHelper.getPriorityEntry(LSH_LUC); + if (LUCEntry.Width) { + useCntBits_ = LUCEntry.Width; + LUCOffset = LUCEntry.Offset; + } + +#ifdef IS_DEBUG_READY_LIST2 + Logger::Info("The ready list key size is %d bits", KHelper->getKeySizeInBits()); +#endif + } ReadyList::~ReadyList() { Reset(); } @@ -133,59 +175,21 @@ void ReadyList::CopyList(ReadyList *otherList) { unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate, bool &changed) { - unsigned long key = 0; - int16_t keySize = 0; - int i; - int16_t oldLastUseCnt, newLastUseCnt; - changed = true; - if (isUpdate) - changed = false; - - for (i = 0; i < prirts_.cnt; i++) { - switch (prirts_.vctr[i]) { - case LSH_CP: - case LSH_CPR: - AddPrirtyToKey_(key, keySize, crtclPathBits_, - inst->GetCrtclPath(DIR_BKWRD), maxCrtclPath_); - break; + int16_t OldLastUseCnt, NewLastUseCnt; - case LSH_LUC: - oldLastUseCnt = inst->GetLastUseCnt(); - newLastUseCnt = inst->CmputLastUseCnt(); - if (newLastUseCnt != oldLastUseCnt) - changed = true; + // if we have an LUC Priority then we need to save the oldLUC + OldLastUseCnt = inst->GetLastUseCnt(); - AddPrirtyToKey_(key, keySize, useCntBits_, newLastUseCnt, maxUseCnt_); - break; + HeurType Key = KHelper.computeKey(inst, /*IncludeDynamic*/ true); - case LSH_UC: - AddPrirtyToKey_(key, keySize, useCntBits_, inst->NumUses(), maxUseCnt_); - break; - - case LSH_NID: - case LSH_LLVM: - AddPrirtyToKey_(key, keySize, nodeID_Bits_, - maxNodeID_ - inst->GetNodeID(), maxNodeID_); - break; + //check if the luc value changed + HeurType Mask = (0x01 << useCntBits_) - 1; + HeurType LUCVal = (Key >> LUCOffset) & Mask; + NewLastUseCnt = (int16_t) LUCVal; + //set changed if the compute is not an update or the luc was changed + changed = !isUpdate || OldLastUseCnt != NewLastUseCnt; - case LSH_ISO: - AddPrirtyToKey_(key, keySize, inptSchedOrderBits_, - maxInptSchedOrder_ - inst->GetFileSchedOrder(), - maxInptSchedOrder_); - break; - - case LSH_SC: - AddPrirtyToKey_(key, keySize, scsrCntBits_, inst->GetScsrCnt(), - maxScsrCnt_); - break; - - case LSH_LS: - AddPrirtyToKey_(key, keySize, ltncySumBits_, inst->GetLtncySum(), - maxLtncySum_); - break; - } - } - return key; + return Key; } void ReadyList::AddLatestSubLists(LinkedList *lst1, @@ -312,14 +316,4 @@ bool ReadyList::FindInst(SchedInstruction *inst, int &hitCnt) { return prirtyLst_.FindElmnt(inst, hitCnt); } -void ReadyList::AddPrirtyToKey_(unsigned long &key, int16_t &keySize, - int16_t bitCnt, unsigned long val, - unsigned long maxVal) { - assert(val <= maxVal); - if (keySize > 0) - key <<= bitCnt; - key |= val; - keySize += bitCnt; -} - -unsigned long ReadyList::MaxPriority() { return maxPriority_; } +unsigned long ReadyList::MaxPriority() { return KHelper.getMaxValue(); } From 5bf0e93d0180c12c9d86c5649de43f993c271733 Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Wed, 21 Jul 2021 09:44:26 -0700 Subject: [PATCH 2/7] wip --- include/opt-sched/Scheduler/aco.h | 6 +- include/opt-sched/Scheduler/gen_sched.h | 6 +- .../opt-sched/Scheduler/simplified_aco_ds.h | 135 ++++++++++ lib/CMakeLists.txt | 1 + lib/Scheduler/aco.cpp | 36 ++- lib/Scheduler/gen_sched.cpp | 4 +- lib/Scheduler/simplified_aco_ds.cpp | 241 ++++++++++++++++++ 7 files changed, 422 insertions(+), 7 deletions(-) create mode 100644 include/opt-sched/Scheduler/simplified_aco_ds.h create mode 100644 lib/Scheduler/simplified_aco_ds.cpp diff --git a/include/opt-sched/Scheduler/aco.h b/include/opt-sched/Scheduler/aco.h index 9c1c775e..5e3ea1e1 100644 --- a/include/opt-sched/Scheduler/aco.h +++ b/include/opt-sched/Scheduler/aco.h @@ -10,6 +10,7 @@ Last Update: Jan. 2020 #define OPTSCHED_ACO_H #include "opt-sched/Scheduler/gen_sched.h" +#include "opt-sched/Scheduler/simplified_aco_ds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" @@ -20,8 +21,6 @@ Last Update: Jan. 2020 namespace llvm { namespace opt_sched { -typedef double pheromone_t; - enum class DCF_OPT { OFF, GLOBAL_ONLY, @@ -77,8 +76,11 @@ class ACOScheduler : public ConstrainedScheduler { Choice SelectInstruction(const llvm::ArrayRef &ready, SchedInstruction *lastInst); void UpdatePheromone(InstSchedule *schedule); + void UpdateACOReadyList(SchedInstruction *Inst); std::unique_ptr FindOneSchedule(InstCount TargetRPCost); llvm::SmallVector pheromone_; + //new ds representations + ACOReadyList ReadyLs; pheromone_t initialValue_; bool use_fixed_bias; int count_; diff --git a/include/opt-sched/Scheduler/gen_sched.h b/include/opt-sched/Scheduler/gen_sched.h index a7221822..53d088d2 100644 --- a/include/opt-sched/Scheduler/gen_sched.h +++ b/include/opt-sched/Scheduler/gen_sched.h @@ -105,7 +105,7 @@ class ConstrainedScheduler : public InstScheduler { // Constructs a constrained scheduler for the given machine and dependence // graph, with the specified upper bound. ConstrainedScheduler(DataDepGraph *dataDepGraph, MachineModel *machMdl, - InstCount schedUprBound); + InstCount schedUprBound, bool ACOEn=false); // Deallocates memory used by the scheduler. virtual ~ConstrainedScheduler(); @@ -113,6 +113,10 @@ class ConstrainedScheduler : public InstScheduler { virtual FUNC_RESULT FindSchedule(InstSchedule *sched, SchedRegion *rgn) = 0; protected: + // Whether this instance of ConstrainedScheduler is being used with ACO and + // therefore can use the aco optimizations + bool IsACO; + // The data dependence graph to be scheduled. DataDepGraph *dataDepGraph_; // The current schedule. diff --git a/include/opt-sched/Scheduler/simplified_aco_ds.h b/include/opt-sched/Scheduler/simplified_aco_ds.h new file mode 100644 index 00000000..d17eb708 --- /dev/null +++ b/include/opt-sched/Scheduler/simplified_aco_ds.h @@ -0,0 +1,135 @@ +/******************************************************************************* +Description: Implements smaller more performant data structures for ACO +Author: Paul McHugh +Created: Jun. 2021 +*******************************************************************************/ + +#ifndef OPTSCHED_SIMPLIFIED_ACO_H +#define OPTSCHED_SIMPLIFIED_ACO_H + +#include "opt-sched/Scheduler/defines.h" +#include "opt-sched/Scheduler/sched_basic_data.h" +#include + +namespace llvm { +namespace opt_sched { + +//forward declarations to reduce the number of classes that need to be included +class DataDepGraph; + +//class for tracking the schedule cycle state +class ACOCycleState { + +public: + ACOCycleState(InstCount IssueRate) : MIssueRate(IssueRate), cycle(0), slot(0) {} + + //stores the issue rate of the CPU (here for convienience) + const InstCount MIssueRate; + + //schedule cycle and slot + InstCount cycle; + InstCount slot; + +}; + +struct ACOReadyListEntry { + InstCount InstId, ReadyOn; + HeurType Heuristic; + pheromone_t Score; +}; + +//this aco specific readylist stores each ready instruction, its dynamic heuristic score, and the cycle it becomes ready +//It uses a (generous) heuristic to decide how much space to allocate. If that space is exceeded then it gracefully handles it +//by making a larger allocation and copying the data to it. THIS WILL KILL PERFORMANCE(ESPECIALLY ON THE GPU). That is why it +//will also make a report that its heuristic max size was overrun. Strongly consider fixing such warnings +class ACOReadyList { + +protected: + + //used for the sizing heuristic + InstCount InstrCount; + InstCount PrimaryBufferCapacity; + + bool Overflowed; + InstCount CurrentCapacity; + InstCount CurrentSize; + + //allocation pointers + InstCount *IntAllocation; + HeurType *HeurAllocation; + pheromone_t *ScoreAllocation; + + //pointers to areas in the InstCount allocation that store ready list entry attributes + InstCount *InstrBase; + InstCount *ReadyOnBase; + HeurType *HeurBase; + pheromone_t *ScoreBase; + + //function to decide how large the primary buffer's capacity should be + InstCount computePrimaryCapacity(InstCount RegionSize); + +public: + + ACOReadyList(); + explicit ACOReadyList(InstCount RegionSize); + ACOReadyList(const ACOReadyList &Other); + ACOReadyList &operator=(const ACOReadyList &Other); + ACOReadyList(ACOReadyList &&Other) noexcept; + ACOReadyList &operator=(ACOReadyList &&Other) noexcept; + ~ACOReadyList(); + + //used to store the total score of all instructions in the ready list + pheromone_t ScoreSum; + + //get the total size of both the primary and fallback allocations + size_t getTotalSizeInBytes() const; + + //gets the number of insturctions in the ready list + InstCount getReadyListSize() const { return CurrentSize; } + + //IMPORTANT NOTE: ADDING OR REMOVING INSTRUCTIONS CAN/WILL CAUSE THE INSTRUCTIONS IN THE READY LIST TO BE MOVED TO NEW INDICES + //DO NOT RELY ON AN INSTRUCTION'S INDEX IN THE READY LIST STAYING THE SAME FOLLOWING A REOMVAL/INSERTION + //get instruction into at an index + InstCount *getInstIdAtIndex(InstCount Indx) const; + InstCount *getInstReadyOnAtIndex(InstCount Indx) const; + HeurType *getInstHeuristicAtIndex(InstCount Indx) const; + pheromone_t *getInstScoreAtIndex(InstCount Indx) const; + + //add a new instruction to the ready list + void addInstructionToReadyList(const ACOReadyListEntry &Entry); + ACOReadyListEntry removeInstructionAtIndex(InstCount Indx); + void clearReadyList(); + +}; + +// ---- +// ACOReadyList +// ---- +inline size_t ACOReadyList::getTotalSizeInBytes() const { + return (2 * sizeof(*IntAllocation) + sizeof(*HeurAllocation) + sizeof(*ScoreAllocation)) * CurrentCapacity; +} + +inline InstCount *ACOReadyList::getInstIdAtIndex(InstCount Indx) const { + return InstrBase + Indx; +} + +inline InstCount *ACOReadyList::getInstReadyOnAtIndex(InstCount Indx) const { + return ReadyOnBase + Indx; +} + +inline HeurType *ACOReadyList::getInstHeuristicAtIndex(InstCount Indx) const { + return HeurBase + Indx; +} + +inline pheromone_t *ACOReadyList::getInstScoreAtIndex(InstCount Indx) const { + return ScoreBase + Indx; +} + +inline void ACOReadyList::clearReadyList() { + CurrentSize=0; +} + +} // namespace opt_sched +} // namespace llvm + +#endif diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index f831478b..ef8a7e4b 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -20,6 +20,7 @@ set(OPTSCHED_SRCS Scheduler/aco.cpp Scheduler/relaxed_sched.cpp Scheduler/sched_basic_data.cpp Scheduler/sched_region.cpp + Scheduler/simplified_aco_ds.cpp Scheduler/stats.cpp Wrapper/OptimizingScheduler.cpp Wrapper/OptSchedMachineWrapper.cpp diff --git a/lib/Scheduler/aco.cpp b/lib/Scheduler/aco.cpp index 9d2b8d71..e277d388 100644 --- a/lib/Scheduler/aco.cpp +++ b/lib/Scheduler/aco.cpp @@ -100,6 +100,10 @@ ACOScheduler::ACOScheduler(DataDepGraph *dataDepGraph, */ int pheromone_size = (count_ + 1) * count_; pheromone_.resize(pheromone_size); + + //construct the ACOReadyList member + ReadyLs = ACOReadyList(count_); + InitialSchedule = nullptr; } @@ -284,18 +288,20 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { SchedInstruction *lastInst = NULL; std::unique_ptr schedule = llvm::make_unique(machMdl_, dataDepGraph_, true); - InstCount maxPriority = rdyLst_->MaxPriority(); + + // The MaxPriority that we are getting from the ready list represents the maximum possible heuristic/key value that we can have + // I want to move all the heuristic computation stuff to another class for code tidiness reasons. + HeurType maxPriority = rdyLst_->MaxPriority(); if (maxPriority == 0) maxPriority = 1; // divide by 0 is bad Initialize_(); - rgn_->InitForSchdulng(); SchedInstruction *waitFor = NULL; InstCount waitUntil = 0; double maxPriorityInv = 1 / maxPriority; llvm::SmallVector ready; while (!IsSchedComplete_()) { - UpdtRdyLst_(crntCycleNum_, crntSlotNum_); + UpdtRdyLst_(crntCycleNum_, crntSlotNum_);//rm me // there are two steps to scheduling an instruction: // 1)Select the instruction(if we are not waiting on another instruction) @@ -398,6 +404,7 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { rgn_->SchdulInst(inst, crntCycleNum_, crntSlotNum_, false); DoRsrvSlots_(inst); // this is annoying + // remove me SchedInstruction *blah = rdyLst_->GetNextPriorityInst(); while (blah != NULL && blah != inst) { blah = rdyLst_->GetNextPriorityInst(); @@ -406,6 +413,10 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { rdyLst_->RemoveNextPriorityInst(); UpdtSlotAvlblty_(inst); + // new readylist update + UpdateACOReadyList(inst); + + if (rgn_->getUnnormalizedIncrementalRPCost() > TargetRPCost) { delete rdyLst_; rdyLst_ = new ReadyList(dataDepGraph_, prirts_); @@ -596,6 +607,25 @@ void ACOScheduler::UpdatePheromone(InstSchedule *schedule) { PrintPheromone(); } +void ACOScheduler::UpdateACOReadyList(SchedInstruction *Inst) { + InstCount prdcsrNum, scsrRdyCycle; + InstCount InstId = Inst->GetNum(); + + // Notify each successor of this instruction that it has been scheduled. + for (SchedInstruction *crntScsr = Inst->GetFrstScsr(&prdcsrNum); + crntScsr != NULL; crntScsr = Inst->GetNxtScsr(&prdcsrNum)) { + bool wasLastPrdcsr = + crntScsr->PrdcsrSchduld(prdcsrNum, crntCycleNum_, scsrRdyCycle); + + if (wasLastPrdcsr) { + // Add this successor to the first-ready list of the future cycle + // in which we now know it will become ready + //HeurType HeurWOLuc = + ReadyLs.addInstructionToReadyList(ACOReadyListEntry{InstId, scsrRdyCycle,100,1.5}); + } + } +} + // copied from Enumerator inline void ACOScheduler::UpdtRdyLst_(InstCount cycleNum, int slotNum) { InstCount prevCycleNum = cycleNum - 1; diff --git a/lib/Scheduler/gen_sched.cpp b/lib/Scheduler/gen_sched.cpp index 14e6ad10..37dafb3f 100644 --- a/lib/Scheduler/gen_sched.cpp +++ b/lib/Scheduler/gen_sched.cpp @@ -61,9 +61,11 @@ void ConstrainedScheduler::ResetRsrvSlots_() { ConstrainedScheduler::ConstrainedScheduler(DataDepGraph *dataDepGraph, MachineModel *machMdl, - InstCount schedUprBound) + InstCount schedUprBound, + bool ACOEn) : InstScheduler(dataDepGraph, machMdl, schedUprBound) { dataDepGraph_ = dataDepGraph; + IsACO = ACOEn; // Allocate the array of first-ready lists - one list per cycle. assert(schedUprBound_ > 0); diff --git a/lib/Scheduler/simplified_aco_ds.cpp b/lib/Scheduler/simplified_aco_ds.cpp new file mode 100644 index 00000000..6c8d84df --- /dev/null +++ b/lib/Scheduler/simplified_aco_ds.cpp @@ -0,0 +1,241 @@ +#include "opt-sched/Scheduler/logger.h" +#include "opt-sched/Scheduler/simplified_aco_ds.h" +#include "opt-sched/Scheduler/register.h" +#include "opt-sched/Scheduler/data_dep.h" +#include "opt-sched/Scheduler/sched_basic_data.h" +#include "opt-sched/Scheduler/machine_model.h" +#include +#include +#include +//aco simplified ds impl + +using namespace llvm::opt_sched; + +//use the log message macro to make GPU porting easier +#define LOG_MESSAGE(...) Logger::Info(__VA_ARGS__) + +// ---- +// ACOReadyList +// ---- + +ACOReadyList::ACOReadyList() { + InstrCount = 0; + CurrentSize = 0; + CurrentCapacity = PrimaryBufferCapacity = 0; + Overflowed = false; + + // create new allocations for the data + IntAllocation = nullptr; + HeurAllocation = nullptr; + ScoreAllocation = nullptr; + + //build shortcut pointers + InstrBase = nullptr; + ReadyOnBase = nullptr; + HeurBase = nullptr; + ScoreBase = nullptr; + +} + +ACOReadyList::ACOReadyList(InstCount RegionSize) { + InstrCount = RegionSize; + CurrentSize = 0; + CurrentCapacity = PrimaryBufferCapacity = computePrimaryCapacity(InstrCount); + Overflowed = false; + + // create new allocations for the data + IntAllocation = new InstCount[2*CurrentCapacity]; + HeurAllocation = new HeurType[CurrentCapacity]; + ScoreAllocation = new pheromone_t[CurrentCapacity]; + + //build shortcut pointers + InstrBase = IntAllocation; + ReadyOnBase = IntAllocation + CurrentCapacity; + HeurBase = HeurAllocation; + ScoreBase = ScoreBase; +} + +ACOReadyList::ACOReadyList(const ACOReadyList &Other) { + InstrCount = Other.InstrCount; + PrimaryBufferCapacity = Other.PrimaryBufferCapacity; + Overflowed = Other.Overflowed; + CurrentCapacity = Other.CurrentCapacity; + CurrentSize = Other.CurrentSize; + + // create new allocations for the data + IntAllocation = new InstCount[2*CurrentCapacity]; + HeurAllocation = new HeurType[CurrentCapacity]; + ScoreAllocation = new pheromone_t[CurrentCapacity]; + + //build shortcut pointers + InstrBase = IntAllocation; + ReadyOnBase = IntAllocation + CurrentCapacity; + HeurBase = HeurAllocation; + ScoreBase = ScoreBase; + + // copy the allocation's entries + for (InstCount I = 0; I < CurrentSize; ++I) { + InstrBase[I] = Other.InstrBase[I]; + ReadyOnBase[I] = Other.ReadyOnBase[I]; + HeurBase[I] = Other.HeurBase[I]; + ScoreBase[I] = Other.ScoreBase[I]; + } +} + +ACOReadyList &ACOReadyList::operator=(const ACOReadyList &Other) { + InstrCount = Other.InstrCount; + PrimaryBufferCapacity = Other.PrimaryBufferCapacity; + Overflowed = Other.Overflowed; + CurrentCapacity = Other.CurrentCapacity; + CurrentSize = Other.CurrentSize; + + // delete current allocations + delete[] IntAllocation; + delete[] HeurAllocation; + delete[] ScoreAllocation; + + // create new allocations for the data + IntAllocation = new InstCount[2*CurrentCapacity]; + HeurAllocation = new HeurType[CurrentCapacity]; + ScoreAllocation = new pheromone_t[CurrentCapacity]; + + //build shortcut pointers + InstrBase = IntAllocation; + ReadyOnBase = IntAllocation + CurrentCapacity; + HeurBase = HeurAllocation; + ScoreBase = ScoreBase; + + // copy over the allocation's entries + for (InstCount I = 0; I < CurrentSize; ++I) { + InstrBase[I] = Other.InstrBase[I]; + ReadyOnBase[I] = Other.ReadyOnBase[I]; + HeurBase[I] = Other.HeurBase[I]; + ScoreBase[I] = Other.ScoreBase[I]; + } + + return *this; +} + +ACOReadyList::ACOReadyList(ACOReadyList &&Other) noexcept { + InstrCount = Other.InstrCount; + PrimaryBufferCapacity = Other.PrimaryBufferCapacity; + Overflowed = Other.Overflowed; + CurrentCapacity = Other.CurrentCapacity; + CurrentSize = Other.CurrentSize; + + // copy over the old ready lists allocations and set them to NULL + // so that the data we took won't get deleted + IntAllocation = Other.IntAllocation; + HeurAllocation = Other.HeurAllocation; + ScoreAllocation = Other.ScoreAllocation; + Other.IntAllocation = nullptr; + Other.HeurAllocation = nullptr; + Other.ScoreAllocation = nullptr; + + InstrBase = Other.InstrBase; + ReadyOnBase = Other.ReadyOnBase; + HeurBase = Other.HeurBase; + ScoreBase = Other.ScoreBase; +} + +ACOReadyList &ACOReadyList::operator=(ACOReadyList &&Other) noexcept { + InstrCount = Other.InstrCount; + PrimaryBufferCapacity = Other.PrimaryBufferCapacity; + Overflowed = Other.Overflowed; + CurrentCapacity = Other.CurrentCapacity; + CurrentSize = Other.CurrentSize; + + // swap the allocations to give Other our allocations to delete + std::swap(IntAllocation, Other.IntAllocation); + std::swap(HeurAllocation, Other.HeurAllocation); + std::swap(ScoreAllocation, Other.ScoreAllocation); + + InstrBase = Other.InstrBase; + ReadyOnBase = Other.ReadyOnBase; + HeurBase = Other.HeurBase; + ScoreBase = Other.ScoreBase; + + return *this; +} + +ACOReadyList::~ACOReadyList() { + delete[] IntAllocation; + delete[] HeurAllocation; + delete[] ScoreAllocation; +} + + +// This is just a heuristic for the ready list size. +// A better function should be chosen experimentally +InstCount ACOReadyList::computePrimaryCapacity(InstCount RegionSize) { + return std::max(32, RegionSize/4); +} + +void ACOReadyList::addInstructionToReadyList(const ACOReadyListEntry &Entry) { + + // check to see if we need to expand the allocation/get a new allocation + if (CurrentSize == CurrentCapacity) { + int OldCap = CurrentCapacity; + bool PrevOverflowed = Overflowed; + + // get a new allocation to put the data in + // The expansion formula is to make the new allocation 1.5 times the size of the old one + // consider making this formula more aggressive + int NewCap = OldCap + OldCap/2 + 1; + InstCount *NewIntFallback = new InstCount[NewCap]; + HeurType *NewHeurFallback = new HeurType[NewCap]; + pheromone_t *NewScoreFallback = new pheromone_t[NewCap]; + + // copy the data + InstCount NewInstrOffset = 0, NewReadyOnOffset = NewCap, HeurOffset = 0, ScoreOffset = 0; + for (int I = 0; I < CurrentSize; ++I) { + NewIntFallback[NewInstrOffset + I] = InstrBase[I]; + NewIntFallback[NewReadyOnOffset + I] = ReadyOnBase[I]; + NewHeurFallback[HeurOffset + I] = HeurBase[I]; + NewScoreFallback[ScoreOffset + I] = ScoreBase[I]; + } + + //delete the old allocations + delete[] IntAllocation; + delete[] HeurAllocation; + delete[] ScoreAllocation; + + //copy the new allocations + IntAllocation = NewIntFallback; + HeurAllocation = NewHeurFallback; + ScoreAllocation = NewScoreFallback; + + // update/recompute pointers and other values + InstrBase = IntAllocation + NewInstrOffset; + ReadyOnBase = IntAllocation + NewReadyOnOffset; + HeurBase = HeurAllocation + HeurOffset; + ScoreBase = ScoreAllocation + ScoreOffset; + Overflowed = true; + CurrentCapacity = NewCap; + + //print out a notice/error message + //Welp this may be a performance disaster if this is happening too much + LOG_MESSAGE("Overflowed ReadyList capacity. Old Cap:%d, New Cap:%d, Primary Cap:%d, Prev Overflowed:%B", OldCap, NewCap, PrimaryBufferCapacity, PrevOverflowed); + } + + //add the instruction to the ready list + InstrBase[CurrentSize] = Entry.InstId; + ReadyOnBase[CurrentSize] = Entry.ReadyOn; + HeurBase[CurrentSize] = Entry.Heuristic; + ScoreBase[CurrentSize] = Entry.Score; + ++CurrentSize; +} + +// We copy the instruction at the end of the array to the instruction at the target index +// then we decrement the Ready List's CurrentSize +// This function has undefined behavior if CurrentSize == 0 +ACOReadyListEntry ACOReadyList::removeInstructionAtIndex(InstCount Indx) { + assert(CurrentSize != 0); + ACOReadyListEntry E{InstrBase[Indx], ReadyOnBase[Indx], HeurBase[Indx], ScoreBase[Indx]}; + InstCount EndIndx = --CurrentSize; + InstrBase[Indx] = InstrBase[EndIndx]; + ReadyOnBase[Indx] = ReadyOnBase[EndIndx]; + HeurBase[Indx] = HeurBase[EndIndx]; + ScoreBase[Indx] = ScoreBase[EndIndx]; + return E; +} From f0ef65038d3506a143b64c96e571df8f0ccaf9c9 Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Mon, 26 Jul 2021 13:57:05 -0700 Subject: [PATCH 3/7] readylist changes, needs bug fixes --- include/opt-sched/Scheduler/aco.h | 11 +- .../opt-sched/Scheduler/simplified_aco_ds.h | 3 +- lib/Scheduler/aco.cpp | 237 +++++++----------- lib/Scheduler/gen_sched.cpp | 36 +-- lib/Scheduler/simplified_aco_ds.cpp | 6 +- 5 files changed, 121 insertions(+), 172 deletions(-) diff --git a/include/opt-sched/Scheduler/aco.h b/include/opt-sched/Scheduler/aco.h index 5e3ea1e1..a67507c5 100644 --- a/include/opt-sched/Scheduler/aco.h +++ b/include/opt-sched/Scheduler/aco.h @@ -11,6 +11,7 @@ Last Update: Jan. 2020 #include "opt-sched/Scheduler/gen_sched.h" #include "opt-sched/Scheduler/simplified_aco_ds.h" +#include "opt-sched/Scheduler/ready_list.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" @@ -49,7 +50,7 @@ class ACOScheduler : public ConstrainedScheduler { private: pheromone_t &Pheromone(SchedInstruction *from, SchedInstruction *to); pheromone_t &Pheromone(InstCount from, InstCount to); - pheromone_t Score(SchedInstruction *from, Choice choice); + pheromone_t Score(InstCount FromId, InstCount ToId, HeurType ToHeuristic); bool shouldReplaceSchedule(InstSchedule *OldSched, InstSchedule *NewSched, bool IsGlobal); DCF_OPT ParseDCFOpt(const std::string &opt); @@ -72,15 +73,17 @@ class ACOScheduler : public ConstrainedScheduler { llvm::SetVector &Visited); // pheromone Graph Debugging end - - Choice SelectInstruction(const llvm::ArrayRef &ready, - SchedInstruction *lastInst); + InstCount SelectInstruction(SchedInstruction *lastInst); void UpdatePheromone(InstSchedule *schedule); void UpdateACOReadyList(SchedInstruction *Inst); std::unique_ptr FindOneSchedule(InstCount TargetRPCost); llvm::SmallVector pheromone_; //new ds representations ACOReadyList ReadyLs; + KeysHelper KHelper; + pheromone_t MaxPriorityInv; + InstCount MaxScoringInst; + pheromone_t initialValue_; bool use_fixed_bias; int count_; diff --git a/include/opt-sched/Scheduler/simplified_aco_ds.h b/include/opt-sched/Scheduler/simplified_aco_ds.h index d17eb708..ae74cfb7 100644 --- a/include/opt-sched/Scheduler/simplified_aco_ds.h +++ b/include/opt-sched/Scheduler/simplified_aco_ds.h @@ -126,7 +126,8 @@ inline pheromone_t *ACOReadyList::getInstScoreAtIndex(InstCount Indx) const { } inline void ACOReadyList::clearReadyList() { - CurrentSize=0; + CurrentSize = 0; + ScoreSum = 0; } } // namespace opt_sched diff --git a/lib/Scheduler/aco.cpp b/lib/Scheduler/aco.cpp index e277d388..f716e974 100644 --- a/lib/Scheduler/aco.cpp +++ b/lib/Scheduler/aco.cpp @@ -2,7 +2,6 @@ #include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/random.h" -#include "opt-sched/Scheduler/ready_list.h" #include "opt-sched/Scheduler/register.h" #include "opt-sched/Scheduler/sched_region.h" #include "llvm/ADT/STLExtras.h" @@ -57,7 +56,6 @@ ACOScheduler::ACOScheduler(DataDepGraph *dataDepGraph, VrfySched_ = vrfySched; this->IsPostBB = IsPostBB; prirts_ = priorities; - rdyLst_ = new ReadyList(dataDepGraph_, priorities); count_ = dataDepGraph->GetInstCnt(); Config &schedIni = SchedulerOptions::getInstance(); @@ -101,13 +99,15 @@ ACOScheduler::ACOScheduler(DataDepGraph *dataDepGraph, int pheromone_size = (count_ + 1) * count_; pheromone_.resize(pheromone_size); - //construct the ACOReadyList member + //construct the ACOReadyList member and a key helper ReadyLs = ACOReadyList(count_); + KHelper = KeysHelper(priorities); + KHelper.initForRegion(dataDepGraph); InitialSchedule = nullptr; } -ACOScheduler::~ACOScheduler() { delete rdyLst_; } +ACOScheduler::~ACOScheduler() {} // Pheromone table lookup // -1 means no instruction, so e.g. pheromone(-1, 10) gives pheromone on path @@ -128,12 +128,12 @@ pheromone_t &ACOScheduler::Pheromone(InstCount from, InstCount to) { return pheromone_[(row * count_) + to]; } -double ACOScheduler::Score(SchedInstruction *from, Choice choice) { +pheromone_t ACOScheduler::Score(InstCount FromId, InstCount ToId, HeurType ToHeuristic) { // tuneable heuristic importance is temporarily disabled - // return Pheromone(from, choice.inst) * - // pow(choice.heuristic, heuristicImportance_); - double hf = heuristicImportance_ ? choice.heuristic : 1.0; - return Pheromone(from, choice.inst) * hf; + // double Hf = pow(ToHeuristic, heuristicImportance_); + pheromone_t HeurScore = ToHeuristic * MaxPriorityInv + 1; + pheromone_t Hf = heuristicImportance_ ? HeurScore : 1.0; + return Pheromone(FromId, ToId) * Hf; } bool ACOScheduler::shouldReplaceSchedule(InstSchedule *OldSched, @@ -218,37 +218,29 @@ DCF_OPT ACOScheduler::ParseDCFOpt(const std::string &opt) { false); } -Choice ACOScheduler::SelectInstruction(const llvm::ArrayRef &ready, - SchedInstruction *lastInst) { +InstCount ACOScheduler::SelectInstruction(SchedInstruction *lastInst) { + + //compute the choose bese chance (0 if TWO_STEP is off) + double choose_best_chance = 0; #if TWO_STEP - double choose_best_chance; if (use_fixed_bias) choose_best_chance = fmax(0, 1 - (double)fixed_bias / count_); else choose_best_chance = bias_ratio; - - if (RandDouble(0, 1) < choose_best_chance) { - if (print_aco_trace) - std::cerr << "choose_best, use fixed bias: " << use_fixed_bias << "\n"; - pheromone_t max = -1; - Choice maxChoice; - for (auto &choice : ready) { - if (Score(lastInst, choice) > max) { - max = Score(lastInst, choice); - maxChoice = choice; - } - } - return maxChoice; - } #endif + + if (RandDouble(0, 1) < choose_best_chance) + return MaxScoringInst; + + // tournament code in case it is ever used again if (use_tournament) { - int POPULATION_SIZE = ready.size(); + int POPULATION_SIZE = ReadyLs.getReadyListSize(); int r_pos = (int)(RandDouble(0, 1) * POPULATION_SIZE); int s_pos = (int)(RandDouble(0, 1) * POPULATION_SIZE); // int t_pos = (int) (RandDouble(0, 1) *POPULATION_SIZE); - Choice r = ready[r_pos]; - Choice s = ready[s_pos]; - // Choice t = ready[t_pos]; + pheromone_t r = *ReadyLs.getInstScoreAtIndex(r_pos); + pheromone_t s = *ReadyLs.getInstScoreAtIndex(s_pos); + // pheromone_t t = *ReadyLs.getInstScoreAtIndex(t_pos); if (print_aco_trace) { std::cerr << "tournament Start \n"; std::cerr << "array_size:" << POPULATION_SIZE << "\n"; @@ -256,31 +248,27 @@ Choice ACOScheduler::SelectInstruction(const llvm::ArrayRef &ready, std::cerr << "s:\t" << s_pos << "\n"; // std::cerr<<"t:\t"<= - Score(lastInst, s)) //&& Score(lastInst, r) >= Score(lastInst, t)) + if (r >= s) //&& r >= t) return r; - // else if (Score(lastInst, s) >= Score(lastInst, r) && Score(lastInst, - // s) >= Score(lastInst, t)) + // else if (s >= r && s >= t) // return s; else return s; } - pheromone_t sum = 0; - for (auto choice : ready) - sum += Score(lastInst, choice); - pheromone_t point = RandDouble(0, sum); - for (auto choice : ready) { - point -= Score(lastInst, choice); - if (point <= 0) - return choice; + + pheromone_t Point = RandDouble(0, ReadyLs.ScoreSum); + for (InstCount I = 0; I < ReadyLs.getReadyListSize(); ++I) { + Point -= *ReadyLs.getInstScoreAtIndex(I); + if (Point <= 0) + return I; } std::cerr << "returning last instruction" << std::endl; - assert(point < 0.001); // floats should not be this inaccurate - return ready.back(); + assert(Point < 0.001); // floats should not be this inaccurate + return ReadyLs.getReadyListSize(); } std::unique_ptr @@ -291,78 +279,34 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { // The MaxPriority that we are getting from the ready list represents the maximum possible heuristic/key value that we can have // I want to move all the heuristic computation stuff to another class for code tidiness reasons. - HeurType maxPriority = rdyLst_->MaxPriority(); - if (maxPriority == 0) - maxPriority = 1; // divide by 0 is bad + HeurType MaxPriority = KHelper.getMaxValue(); + if (MaxPriority == 0) + MaxPriority = 1; // divide by 0 is bad Initialize_(); SchedInstruction *waitFor = NULL; InstCount waitUntil = 0; - double maxPriorityInv = 1 / maxPriority; - llvm::SmallVector ready; + MaxPriorityInv = 1 / (pheromone_t)MaxPriority; + + // initialize the aco ready list so that the start instruction is ready + // The luc component is 0 since the root inst uses no instructions + InstCount RootId = rootInst_->GetNum(); + HeurType RootHeuristic = KHelper.computeKey(rootInst_, true); + ACOReadyListEntry InitialRoot{RootId, 0, RootHeuristic, Score(-1, RootId, RootHeuristic)}; + ReadyLs.addInstructionToReadyList(InitialRoot); + while (!IsSchedComplete_()) { - UpdtRdyLst_(crntCycleNum_, crntSlotNum_);//rm me // there are two steps to scheduling an instruction: // 1)Select the instruction(if we are not waiting on another instruction) SchedInstruction *inst = NULL; - if (!waitFor) { - // if we have not already committed to schedule an instruction - // next then pick one. First add ready instructions. Including - //"illegal" e.g. blocked instructions - - // convert the ready list from a custom priority queue to a std::vector, - // much nicer for this particular scheduler - ready.reserve(rdyLst_->GetInstCnt()); - unsigned long heuristic; - SchedInstruction *rInst = rdyLst_->GetNextPriorityInst(heuristic); - while (rInst != NULL) { - if (ACO_SCHED_STALLS || ChkInstLglty_(rInst)) { - Choice c; - c.inst = rInst; - c.heuristic = (double)heuristic * maxPriorityInv + 1; - c.readyOn = 0; - ready.push_back(c); - if (IsDbg && lastInst) - LastHeu[std::make_pair(lastInst->GetNum(), rInst->GetNum())] = - c.heuristic; - } - rInst = rdyLst_->GetNextPriorityInst(heuristic); - } - rdyLst_->ResetIterator(); - -#if ACO_SCHED_STALLS - // add all instructions that are waiting due to latency to the choices - // list - for (InstCount fCycle = 1; fCycle < dataDepGraph_->GetMaxLtncy() && - crntCycleNum_ + fCycle < schedUprBound_; - ++fCycle) { - LinkedList *futureReady = - frstRdyLstPerCycle_[crntCycleNum_ + fCycle]; - if (!futureReady) - continue; - - for (SchedInstruction *fIns = futureReady->GetFrstElmnt(); fIns; - fIns = futureReady->GetNxtElmnt()) { - bool changed; - unsigned long heuristic = rdyLst_->CmputKey_(fIns, false, changed); - Choice c; - c.inst = fIns; - c.heuristic = (double)heuristic * maxPriorityInv + 1; - c.readyOn = crntCycleNum_ + fCycle; - ready.push_back(c); - if (IsDbg && lastInst) - LastHeu[std::make_pair(lastInst->GetNum(), fIns->GetNum())] = - c.heuristic; - } - futureReady->ResetIterator(); - } -#endif + if (waitFor) { - if (!ready.empty()) { - Choice Sel = SelectInstruction(ready, lastInst); - waitUntil = Sel.readyOn; - inst = Sel.inst; + if (ReadyLs.getReadyListSize()) { + InstCount SelIndx = SelectInstruction(lastInst); + waitUntil = *ReadyLs.getInstReadyOnAtIndex(SelIndx); + InstCount InstId = *ReadyLs.getInstIdAtIndex(SelIndx); + inst = dataDepGraph_->GetInstByIndx(InstId); if (waitUntil > crntCycleNum_ || !ChkInstLglty_(inst)) { waitFor = inst; inst = NULL; @@ -380,6 +324,8 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { CrntAntEdges.insert( std::make_pair(lastInst->GetNum(), inst->GetNum())); } + + //save the last instruction scheduled lastInst = inst; } } @@ -395,7 +341,7 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { // boilerplate, mostly copied from ListScheduler, try not to touch it InstCount instNum; - if (inst == NULL) { + if (!inst) { instNum = SCHD_STALL; } else { instNum = inst->GetNum(); @@ -404,22 +350,13 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { rgn_->SchdulInst(inst, crntCycleNum_, crntSlotNum_, false); DoRsrvSlots_(inst); // this is annoying - // remove me - SchedInstruction *blah = rdyLst_->GetNextPriorityInst(); - while (blah != NULL && blah != inst) { - blah = rdyLst_->GetNextPriorityInst(); - } - if (blah == inst) - rdyLst_->RemoveNextPriorityInst(); UpdtSlotAvlblty_(inst); // new readylist update UpdateACOReadyList(inst); - if (rgn_->getUnnormalizedIncrementalRPCost() > TargetRPCost) { - delete rdyLst_; - rdyLst_ = new ReadyList(dataDepGraph_, prirts_); + ReadyLs.clearReadyList(); return nullptr; } } @@ -427,8 +364,6 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { schedule->AppendInst(instNum); if (MovToNxtSlot_(inst)) InitNewCycle_(); - rdyLst_->ResetIterator(); - ready.clear(); } rgn_->UpdateScheduleCost(schedule.get()); return schedule; @@ -558,7 +493,7 @@ FUNC_RESULT ACOScheduler::FindSchedule(InstSchedule *schedule_out, bestSchedule->GetCost(), "iterations", iterations, "improvement", InitialCost - bestSchedule->GetCost()); PrintSchedule(bestSchedule.get()); - schedule_out->Copy(bestSchedule.release()); + schedule_out->Copy(bestSchedule.get()); Logger::Info("ACO finished after %d iterations", iterations); return RES_SUCCESS; @@ -620,36 +555,44 @@ void ACOScheduler::UpdateACOReadyList(SchedInstruction *Inst) { if (wasLastPrdcsr) { // Add this successor to the first-ready list of the future cycle // in which we now know it will become ready - //HeurType HeurWOLuc = - ReadyLs.addInstructionToReadyList(ACOReadyListEntry{InstId, scsrRdyCycle,100,1.5}); + HeurType HeurWOLuc = KHelper.computeKey(Inst, false); + ReadyLs.addInstructionToReadyList(ACOReadyListEntry{InstId, scsrRdyCycle, HeurWOLuc, 0}); } } -} -// copied from Enumerator -inline void ACOScheduler::UpdtRdyLst_(InstCount cycleNum, int slotNum) { - InstCount prevCycleNum = cycleNum - 1; - LinkedList *lst1 = NULL; - LinkedList *lst2 = frstRdyLstPerCycle_[cycleNum]; - - if (slotNum == 0 && prevCycleNum >= 0) { - // If at the begining of a new cycle other than the very first cycle, then - // we also have to include the instructions that might have become ready in - // the previous cycle due to a zero latency of the instruction scheduled in - // the very last slot of that cycle [GOS 9.8.02]. - lst1 = frstRdyLstPerCycle_[prevCycleNum]; - - if (lst1 != NULL) { - rdyLst_->AddList(lst1); - lst1->Reset(); - CleanupCycle_(prevCycleNum); + // Make sure the scores are valid. The scheduling of an instruction may + // have increased another instruction's LUC Score + pheromone_t MaxScore = -1; + InstCount MaxScoreIndx = 0; + ReadyLs.ScoreSum = 0; + PriorityEntry LUCEntry = KHelper.getPriorityEntry(LSH_LUC); + for (InstCount I = 0; I < ReadyLs.getReadyListSize(); ++I) { + //we first get the heuristic without the LUC component, add the LUC + //LUC component, and then compute the score + HeurType Heur = *ReadyLs.getInstHeuristicAtIndex(I); + if (LUCEntry.Width) { + HeurType LUCVal = Inst->CmputLastUseCnt(); + LUCVal <<= LUCEntry.Offset; + Heur &= LUCVal; } - } - if (lst2 != NULL) { - rdyLst_->AddList(lst2); - lst2->Reset(); + // compute the score + InstCount CandidateId = *ReadyLs.getInstIdAtIndex(I); + pheromone_t IScore = Score(InstId, CandidateId, Heur); + ReadyLs.ScoreSum += IScore; + *ReadyLs.getInstScoreAtIndex(I) = IScore; + if(IScore > MaxScore) { + MaxScoreIndx = I; + MaxScore = IScore; + } } + MaxScoringInst = MaxScore; +} + +// copied from Enumerator +inline void ACOScheduler::UpdtRdyLst_(InstCount cycleNum, int slotNum) { + assert(false); // do not use this function with aco + // it is only implemented b/c it is a pure virtual in ConstrainedScheduler } void ACOScheduler::PrintPheromone() { diff --git a/lib/Scheduler/gen_sched.cpp b/lib/Scheduler/gen_sched.cpp index 37dafb3f..4038b4fe 100644 --- a/lib/Scheduler/gen_sched.cpp +++ b/lib/Scheduler/gen_sched.cpp @@ -147,24 +147,26 @@ void ConstrainedScheduler::SchdulInst_(SchedInstruction *inst, InstCount) { InstCount prdcsrNum, scsrRdyCycle; // Notify each successor of this instruction that it has been scheduled. - for (SchedInstruction *crntScsr = inst->GetFrstScsr(&prdcsrNum); - crntScsr != NULL; crntScsr = inst->GetNxtScsr(&prdcsrNum)) { - bool wasLastPrdcsr = - crntScsr->PrdcsrSchduld(prdcsrNum, crntCycleNum_, scsrRdyCycle); - - if (wasLastPrdcsr) { - // If all other predecessors of this successor have been scheduled then - // we now know in which cycle this successor will become ready. - assert(scsrRdyCycle < schedUprBound_); - - // If the first-ready list of that cycle has not been created yet. - if (frstRdyLstPerCycle_[scsrRdyCycle] == NULL) { - frstRdyLstPerCycle_[scsrRdyCycle] = new LinkedList; + if(!IsACO) { + for (SchedInstruction *crntScsr = inst->GetFrstScsr(&prdcsrNum); + crntScsr != NULL; crntScsr = inst->GetNxtScsr(&prdcsrNum)) { + bool wasLastPrdcsr = + crntScsr->PrdcsrSchduld(prdcsrNum, crntCycleNum_, scsrRdyCycle); + + if (wasLastPrdcsr) { + // If all other predecessors of this successor have been scheduled then + // we now know in which cycle this successor will become ready. + assert(scsrRdyCycle < schedUprBound_); + + // If the first-ready list of that cycle has not been created yet. + if (frstRdyLstPerCycle_[scsrRdyCycle] == NULL) { + frstRdyLstPerCycle_[scsrRdyCycle] = new LinkedList; + } + + // Add this successor to the first-ready list of the future cycle + // in which we now know it will become ready + frstRdyLstPerCycle_[scsrRdyCycle]->InsrtElmnt(crntScsr); } - - // Add this successor to the first-ready list of the future cycle - // in which we now know it will become ready - frstRdyLstPerCycle_[scsrRdyCycle]->InsrtElmnt(crntScsr); } } diff --git a/lib/Scheduler/simplified_aco_ds.cpp b/lib/Scheduler/simplified_aco_ds.cpp index 6c8d84df..37774256 100644 --- a/lib/Scheduler/simplified_aco_ds.cpp +++ b/lib/Scheduler/simplified_aco_ds.cpp @@ -52,7 +52,7 @@ ACOReadyList::ACOReadyList(InstCount RegionSize) { InstrBase = IntAllocation; ReadyOnBase = IntAllocation + CurrentCapacity; HeurBase = HeurAllocation; - ScoreBase = ScoreBase; + ScoreBase = ScoreAllocation; } ACOReadyList::ACOReadyList(const ACOReadyList &Other) { @@ -71,7 +71,7 @@ ACOReadyList::ACOReadyList(const ACOReadyList &Other) { InstrBase = IntAllocation; ReadyOnBase = IntAllocation + CurrentCapacity; HeurBase = HeurAllocation; - ScoreBase = ScoreBase; + ScoreBase = ScoreAllocation; // copy the allocation's entries for (InstCount I = 0; I < CurrentSize; ++I) { @@ -103,7 +103,7 @@ ACOReadyList &ACOReadyList::operator=(const ACOReadyList &Other) { InstrBase = IntAllocation; ReadyOnBase = IntAllocation + CurrentCapacity; HeurBase = HeurAllocation; - ScoreBase = ScoreBase; + ScoreBase = ScoreAllocation; // copy over the allocation's entries for (InstCount I = 0; I < CurrentSize; ++I) { From 10e512ff31bf32430c9aeb17e838efe72b10798a Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Tue, 27 Jul 2021 00:51:54 -0700 Subject: [PATCH 4/7] fixed bugs and silly dedundant code --- lib/Scheduler/aco.cpp | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/lib/Scheduler/aco.cpp b/lib/Scheduler/aco.cpp index f716e974..1c5a424f 100644 --- a/lib/Scheduler/aco.cpp +++ b/lib/Scheduler/aco.cpp @@ -52,7 +52,7 @@ ACOScheduler::ACOScheduler(DataDepGraph *dataDepGraph, MachineModel *machineModel, InstCount upperBound, SchedPriorities priorities, bool vrfySched, bool IsPostBB) - : ConstrainedScheduler(dataDepGraph, machineModel, upperBound) { + : ConstrainedScheduler(dataDepGraph, machineModel, upperBound, true) { VrfySched_ = vrfySched; this->IsPostBB = IsPostBB; prirts_ = priorities; @@ -274,6 +274,7 @@ InstCount ACOScheduler::SelectInstruction(SchedInstruction *lastInst) { std::unique_ptr ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { SchedInstruction *lastInst = NULL; + ACOReadyListEntry LastInstInfo; std::unique_ptr schedule = llvm::make_unique(machMdl_, dataDepGraph_, true); @@ -300,18 +301,25 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { // there are two steps to scheduling an instruction: // 1)Select the instruction(if we are not waiting on another instruction) SchedInstruction *inst = NULL; - if (waitFor) { - - if (ReadyLs.getReadyListSize()) { - InstCount SelIndx = SelectInstruction(lastInst); - waitUntil = *ReadyLs.getInstReadyOnAtIndex(SelIndx); - InstCount InstId = *ReadyLs.getInstIdAtIndex(SelIndx); - inst = dataDepGraph_->GetInstByIndx(InstId); - if (waitUntil > crntCycleNum_ || !ChkInstLglty_(inst)) { - waitFor = inst; - inst = NULL; - } + if (!waitFor) { + // If an instruction is ready select it + assert(ReadyLs.getReadyListSize()); // we should always have something in the rl + + // select the instruction and get info on it + InstCount SelIndx = SelectInstruction(lastInst); + LastInstInfo = ReadyLs.removeInstructionAtIndex(SelIndx); + waitUntil = LastInstInfo.ReadyOn; + InstCount InstId = LastInstInfo.InstId; + inst = dataDepGraph_->GetInstByIndx(InstId); + ReadyLs.ScoreSum -= LastInstInfo.Score; + + // potentially wait on the current instruction + if (waitUntil > crntCycleNum_ || !ChkInstLglty_(inst)) { + waitFor = inst; + inst = NULL; } + + // if (inst != NULL) { #if USE_ACS // local pheromone decay @@ -555,8 +563,8 @@ void ACOScheduler::UpdateACOReadyList(SchedInstruction *Inst) { if (wasLastPrdcsr) { // Add this successor to the first-ready list of the future cycle // in which we now know it will become ready - HeurType HeurWOLuc = KHelper.computeKey(Inst, false); - ReadyLs.addInstructionToReadyList(ACOReadyListEntry{InstId, scsrRdyCycle, HeurWOLuc, 0}); + HeurType HeurWOLuc = KHelper.computeKey(crntScsr, false); + ReadyLs.addInstructionToReadyList(ACOReadyListEntry{crntScsr->GetNum(), scsrRdyCycle, HeurWOLuc, 0}); } } @@ -570,14 +578,15 @@ void ACOScheduler::UpdateACOReadyList(SchedInstruction *Inst) { //we first get the heuristic without the LUC component, add the LUC //LUC component, and then compute the score HeurType Heur = *ReadyLs.getInstHeuristicAtIndex(I); + InstCount CandidateId = *ReadyLs.getInstIdAtIndex(I); if (LUCEntry.Width) { - HeurType LUCVal = Inst->CmputLastUseCnt(); + SchedInstruction *ScsrInst = dataDepGraph_->GetInstByIndx(CandidateId); + HeurType LUCVal = ScsrInst->CmputLastUseCnt(); LUCVal <<= LUCEntry.Offset; Heur &= LUCVal; } // compute the score - InstCount CandidateId = *ReadyLs.getInstIdAtIndex(I); pheromone_t IScore = Score(InstId, CandidateId, Heur); ReadyLs.ScoreSum += IScore; *ReadyLs.getInstScoreAtIndex(I) = IScore; From 2bb98111ba2b589425d6b341bd056c851c89a3d5 Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Tue, 27 Jul 2021 02:13:10 -0700 Subject: [PATCH 5/7] fixed bug --- CMakeLists.txt | 2 ++ lib/Scheduler/ready_list.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b0eab82..c045bb3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,8 @@ endif() # If asserts are enabled opt-sched must be built with "IS_DEBUG". set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DIS_DEBUG") +add_compile_options(-g) + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp index 15a57338..cbab958b 100644 --- a/lib/Scheduler/ready_list.cpp +++ b/lib/Scheduler/ready_list.cpp @@ -113,6 +113,7 @@ HeurType KeysHelper::computeKey(SchedInstruction *Inst, bool IncludeDynamic) con Key <<= Entries[Heur].Width; Key |= PriorityValue; } + return Key; } HeurType KeysHelper::computeKey(const uint64_t *Values) const { From edbedbf451baaac651e40aa3f30dfda830e0262d Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Wed, 28 Jul 2021 04:57:36 -0700 Subject: [PATCH 6/7] fixed intermittent crashes --- lib/Scheduler/aco.cpp | 10 ++++++---- lib/Scheduler/simplified_aco_ds.cpp | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/Scheduler/aco.cpp b/lib/Scheduler/aco.cpp index 1c5a424f..95cc710b 100644 --- a/lib/Scheduler/aco.cpp +++ b/lib/Scheduler/aco.cpp @@ -268,7 +268,7 @@ InstCount ACOScheduler::SelectInstruction(SchedInstruction *lastInst) { } std::cerr << "returning last instruction" << std::endl; assert(Point < 0.001); // floats should not be this inaccurate - return ReadyLs.getReadyListSize(); + return ReadyLs.getReadyListSize() - 1; } std::unique_ptr @@ -293,8 +293,11 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { // The luc component is 0 since the root inst uses no instructions InstCount RootId = rootInst_->GetNum(); HeurType RootHeuristic = KHelper.computeKey(rootInst_, true); - ACOReadyListEntry InitialRoot{RootId, 0, RootHeuristic, Score(-1, RootId, RootHeuristic)}; + pheromone_t RootScore = Score(-1, RootId, RootHeuristic); + ACOReadyListEntry InitialRoot{RootId, 0, RootHeuristic, RootScore}; ReadyLs.addInstructionToReadyList(InitialRoot); + ReadyLs.ScoreSum = RootScore; + MaxScoringInst = 0; while (!IsSchedComplete_()) { @@ -311,7 +314,6 @@ ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { waitUntil = LastInstInfo.ReadyOn; InstCount InstId = LastInstInfo.InstId; inst = dataDepGraph_->GetInstByIndx(InstId); - ReadyLs.ScoreSum -= LastInstInfo.Score; // potentially wait on the current instruction if (waitUntil > crntCycleNum_ || !ChkInstLglty_(inst)) { @@ -595,7 +597,7 @@ void ACOScheduler::UpdateACOReadyList(SchedInstruction *Inst) { MaxScore = IScore; } } - MaxScoringInst = MaxScore; + MaxScoringInst = MaxScoreIndx; } // copied from Enumerator diff --git a/lib/Scheduler/simplified_aco_ds.cpp b/lib/Scheduler/simplified_aco_ds.cpp index 37774256..abb37a1a 100644 --- a/lib/Scheduler/simplified_aco_ds.cpp +++ b/lib/Scheduler/simplified_aco_ds.cpp @@ -230,7 +230,7 @@ void ACOReadyList::addInstructionToReadyList(const ACOReadyListEntry &Entry) { // then we decrement the Ready List's CurrentSize // This function has undefined behavior if CurrentSize == 0 ACOReadyListEntry ACOReadyList::removeInstructionAtIndex(InstCount Indx) { - assert(CurrentSize != 0); + assert(CurrentSize <= 0 || Indx >= CurrentSize || Indx < 0); ACOReadyListEntry E{InstrBase[Indx], ReadyOnBase[Indx], HeurBase[Indx], ScoreBase[Indx]}; InstCount EndIndx = --CurrentSize; InstrBase[Indx] = InstrBase[EndIndx]; From a2e359973ec58aef1f2c135ca61f4b18fa1ac0a0 Mon Sep 17 00:00:00 2001 From: Paul McHugh Date: Thu, 29 Jul 2021 00:00:09 -0700 Subject: [PATCH 7/7] fixed rare memory bug --- lib/Scheduler/simplified_aco_ds.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Scheduler/simplified_aco_ds.cpp b/lib/Scheduler/simplified_aco_ds.cpp index abb37a1a..24496aed 100644 --- a/lib/Scheduler/simplified_aco_ds.cpp +++ b/lib/Scheduler/simplified_aco_ds.cpp @@ -182,7 +182,7 @@ void ACOReadyList::addInstructionToReadyList(const ACOReadyListEntry &Entry) { // The expansion formula is to make the new allocation 1.5 times the size of the old one // consider making this formula more aggressive int NewCap = OldCap + OldCap/2 + 1; - InstCount *NewIntFallback = new InstCount[NewCap]; + InstCount *NewIntFallback = new InstCount[2*NewCap]; HeurType *NewHeurFallback = new HeurType[NewCap]; pheromone_t *NewScoreFallback = new pheromone_t[NewCap];