Skip to content

Commit

Permalink
Merge pull request CSUS-LLVM#9 from balinck/HIP_ACO
Browse files Browse the repository at this point in the history
HIP_ACO smaller SchedInstruction & flattened arrays
  • Loading branch information
balinck authored Jan 13, 2023
2 parents b28c2c4 + eaa2f92 commit 7347d2b
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 131 deletions.
18 changes: 13 additions & 5 deletions include/opt-sched/Scheduler/bb_spill.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ class BBWithSpill : public SchedRegion {
int *sumOfLiveIntervalLengths_;
// pointer to a device array used to store sumOfLiveIntervalLengths_ for
// each thread by parallel ACO
int **dev_sumOfLiveIntervalLengths_;
// Indexed by register type * numThreads_ + GLOBALTID
int *dev_sumOfLiveIntervalLengths_;

InstCount staticSlilLowerBound_ = 0;

Expand All @@ -89,16 +90,19 @@ class BBWithSpill : public SchedRegion {
InstCount *spillCosts_;
// pointer to a device array used to store spillCosts_ for
// each thread by parallel ACO
InstCount **dev_spillCosts_;
// Indexed by instruction number * numThreads_ + GLOBALTID
InstCount *dev_spillCosts_;
// Current register pressure for each register type.
SmallVector<unsigned, 8> regPressures_;
// pointer to a device array used to store regPressures_ for
// each thread by parallel ACO
unsigned **dev_regPressures_;
// Indexed by register type * numThreads_ + GLOBALTID
unsigned *dev_regPressures_;
InstCount *peakRegPressures_;
// pointer to a device array used to store peakRegPressures_ for
// each thread by parallel ACO
InstCount **dev_peakRegPressures_;
// Indexed by register type * numThreads_ + GLOBALTID
InstCount *dev_peakRegPressures_;

InstCount crntStepNum_;
// pointer to a device array used to store crntStepNum_ for
Expand Down Expand Up @@ -178,6 +182,10 @@ class BBWithSpill : public SchedRegion {
MachineModel *dev_machMdl);
~BBWithSpill();

__device__
InstCount getAMDGPUCost(unsigned * PRP, unsigned TargetOccupancy,
unsigned MaxOccLDS, int16_t regTypeCnt);

InstCount CmputCostLwrBound();
InstCount CmputExecCostLwrBound();
InstCount CmputRPCostLwrBound();
Expand Down Expand Up @@ -233,7 +241,7 @@ class BBWithSpill : public SchedRegion {
__host__ __device__
bool IsRPHigh(int regType) const {
#ifdef __HIP_DEVICE_COMPILE__
return dev_regPressures_[regType][GLOBALTID] > (unsigned int) machMdl_->GetPhysRegCnt(regType);
return dev_regPressures_[regType*numThreads_+GLOBALTID] > (unsigned int) machMdl_->GetPhysRegCnt(regType);
#else
return regPressures_[regType] > (unsigned int) machMdl_->GetPhysRegCnt(regType);
#endif
Expand Down
27 changes: 25 additions & 2 deletions include/opt-sched/Scheduler/data_dep.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase,
__host__
virtual ~DataDepGraph();

void SetNumThreads(int numThreads);

//Prevent DDG from being abstract, these should not actually be invoked
virtual void convertSUnits(bool IgnoreRealEdges, bool IgnoreArtificialEdges) {
Logger::Fatal("Wrong convertSUnits called");
Expand Down Expand Up @@ -346,12 +348,26 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase,
Register *getRegByTuple(RegIndxTuple *tuple) {
return RegFiles[tuple->regType_].GetReg(tuple->regNum_);
}
__host__ __device__
RegIndxTuple *getUseByIndex(int index) {
return uses_ + index;
}

__host__ __device__
RegIndxTuple *getDefByIndex(int index) {
return defs_ + index;
}

int* scsrs_;
int* latencies_;
int* predOrder_;
RegIndxTuple* uses_;
RegIndxTuple* defs_;
int* ltncyPerPrdcsr_;

// Number of threads used by parallel ACO.
int numThreads_;

// Tracks all registers in the scheduling region. Each RegisterFile
// object holds all registers for a given register type.
RegisterFile *RegFiles;
Expand Down Expand Up @@ -742,6 +758,9 @@ class InstSchedule {
int totalStalls_, unnecessaryStalls_;
bool isZeroPerp_;

// Number of threads used by parallel ACO.
int numThreads_;

bool VerifySlots_(MachineModel *machMdl, DataDepGraph *dataDepGraph);
bool VerifyDataDeps_(DataDepGraph *dataDepGraph);
__host__ __device__
Expand Down Expand Up @@ -818,12 +837,12 @@ class InstSchedule {
void SetSpillCosts(InstCount *spillCosts);
// Device version of set spill costs
__device__
void Dev_SetSpillCosts(InstCount **spillCosts);
void Dev_SetSpillCosts(InstCount *spillCosts);
__host__ __device__
void SetPeakRegPressures(InstCount *regPressures);
// Device version of PeakRegPressures
__device__
void Dev_SetPeakRegPressures(InstCount **regPressures);
void Dev_SetPeakRegPressures(InstCount *regPressures);
InstCount GetPeakRegPressures(const InstCount *&regPressures) const;
__host__ __device__
InstCount GetSpillCost(InstCount stepNum);
Expand Down Expand Up @@ -853,6 +872,10 @@ class InstSchedule {
// Copies device arrays to host
void CopyArraysToHost();
void FreeDeviceArrays();

__host__ __device__
void SetNumThreads(int numThreads);

// Initializes schedules on device, used between iterations of ACO
__device__
void Initialize();
Expand Down
6 changes: 4 additions & 2 deletions include/opt-sched/Scheduler/gen_sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,15 @@ class ConstrainedScheduler : public InstScheduler {
int16_t *avlblSlotsInCrntCycle_;
// pointer to a device array used to store avlblSlotsInCrntCycle_ for
// each thread by parallel ACO
int16_t **dev_avlblSlotsInCrntCycle_;
// Indexed by GLOBALTID * num issue types + issue type
int16_t *dev_avlblSlotsInCrntCycle_;

// The reserved scheduling slots.
ReserveSlot *rsrvSlots_;
// pointer to a device array used to store rsrvSlots_ for
// each thread by parallel ACO
ReserveSlot **dev_rsrvSlots_;
// Indexed by GLOBALTID * issue rate + issue slot number
ReserveSlot *dev_rsrvSlots_;
// The number of elements in rsrvSlots_.
int16_t rsrvSlotCnt_;
// pointer to a device array used to store rsrvSlotCnt_ for
Expand Down
20 changes: 12 additions & 8 deletions include/opt-sched/Scheduler/sched_basic_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -488,10 +488,10 @@ class SchedInstruction : public GraphNode {
void SetMustBeInBBExit(bool val);

// Add a register definition to this instruction node.
__host__ __device__
__host__
void AddDef(Register *reg);
// Add a register usage to this instruction node.
__host__ __device__
__host__
void AddUse(Register *reg);
// Returns whether this instruction defines the specified register.
__host__ __device__
Expand All @@ -502,11 +502,11 @@ class SchedInstruction : public GraphNode {

// Retrieves the list of registers defined by this node. The array is put
// into defs and the number of elements is returned.
__host__ __device__
__host__
int16_t GetDefs(RegIndxTuple *&defs);
// Retrieves the list of registers used by this node. The array is put
// into uses and the number of elements is returned.
__host__ __device__
__host__
int16_t GetUses(RegIndxTuple *&uses);

__host__ __device__
Expand All @@ -518,7 +518,7 @@ class SchedInstruction : public GraphNode {
__host__ __device__
int16_t GetAdjustedUseCnt() { return adjustedUseCnt_; }
// Computer the adjusted use count. Update "adjustedUseCnt_".
__host__ __device__
__host__
void ComputeAdjustedUseCnt(SchedInstruction *inst);

__host__ __device__
Expand Down Expand Up @@ -567,6 +567,10 @@ class SchedInstruction : public GraphNode {
// This instruction's index in the ltncyPerPrdcsr_ array in the DDG.
int ddgPredecessorIndex;

// This instruction's indices for the uses_ and defs_ arrays in the DDG.
int ddgUseIndex;
int ddgDefIndex;

__device__
int GetScsrCnt_();

Expand Down Expand Up @@ -705,11 +709,11 @@ class SchedInstruction : public GraphNode {
// Pointer to RegFiles
RegisterFile *RegFiles_;
// The registers defined by this instruction node.
RegIndxTuple defs_[MAX_DEFS_PER_INSTR];
RegIndxTuple *defs_;
// The number of elements in defs.
int16_t defCnt_;
// The registers used by this instruction node.
RegIndxTuple uses_[MAX_USES_PER_INSTR];
RegIndxTuple *uses_;
// The number of elements in uses.
int16_t useCnt_;
// The number of uses minus live-out registers. Live-out registers are uses
Expand Down Expand Up @@ -761,7 +765,7 @@ class SchedInstruction : public GraphNode {
__host__
void SetScsrNums_();
// Computer the adjusted use count. Update "adjustedUseCnt_".
__host__ __device__
__host__
void ComputeAdjustedUseCnt_();
};

Expand Down
4 changes: 4 additions & 0 deletions include/opt-sched/Scheduler/sched_region.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class SchedRegion {
// Destroys the region. Must be overriden by child classes.
virtual ~SchedRegion() {}

void SetNumThreads(int numThreads_);

// Returns the dependence graph of this region.
inline DataDepGraph *GetDepGraph() { return dataDepGraph_; }
//for updating DDG pointer to DDG created on device
Expand Down Expand Up @@ -187,6 +189,8 @@ class SchedRegion {
// Pointer to machMdl_ on the device
MachineModel *dev_machMdl_;

int numThreads_;

// The schedule currently used by the enumerator
InstSchedule *enumCrntSched_;
// The best schedule found by the enumerator so far
Expand Down
37 changes: 13 additions & 24 deletions lib/Scheduler/aco.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ ACOScheduler::ACOScheduler(DataDepGraph *dataDepGraph,
numThreads_ = numBlocks_ * NUMTHREADSPERBLOCK;
if(!DEV_ACO || count_ < REGION_MIN_SIZE)
numThreads_ = schedIni.GetInt("HOST_ANTS");
else {
dev_rgn_->SetNumThreads(numThreads_);
dev_DDG_->SetNumThreads(numThreads_);
}

use_fixed_bias = schedIni.GetBool("ACO_USE_FIXED_BIAS");
use_tournament = schedIni.GetBool("ACO_TOURNAMENT");
Expand Down Expand Up @@ -301,11 +305,13 @@ InstCount ACOScheduler::SelectInstruction(SchedInstruction *lastInst, InstCount

// check if any reg types used by the instructions are above the physical register limit
SchedInstruction *tempInst = dataDepGraph_->GetInstByIndx(*dev_readyLs->getInstIdAtIndex(I));
// TODO(bruce): convert to dev uses
RegIndxTuple *uses;
Register *use;
uint16_t usesCount = tempInst->GetUses(uses);
uint16_t usesCount = tempInst->GetUseCnt();
int useStart = tempInst->ddgUseIndex;
for (uint16_t i = 0; i < usesCount; i++) {
use = dataDepGraph_->getRegByTuple(&uses[i]);
use = dataDepGraph_->getRegByTuple(dataDepGraph_->getUseByIndex(useStart + i));
int16_t regType = use->GetType();
if ( ((BBWithSpill *)rgn)->IsRPHigh(regType) ) {
RPIsHigh = true;
Expand Down Expand Up @@ -536,6 +542,7 @@ InstSchedule *ACOScheduler::FindOneSchedule(InstCount RPTarget,
SchedInstruction *lastInst = NULL;
ACOReadyListEntry LastInstInfo;
InstSchedule *schedule = dev_schedule;
schedule->SetNumThreads(numThreads_);
bool IsSecondPass = dev_rgn_->IsSecondPass();
dev_readyLs->clearReadyList();
ScRelMax = dev_rgn_->GetHeuristicCost();
Expand Down Expand Up @@ -1854,20 +1861,11 @@ void ACOScheduler::AllocDevArraysForParallelACO() {
memSize = sizeof(InstCount) * numThreads_;
gpuErrchk(hipMalloc(&dev_MaxScoringInst, memSize));
// Alloc dev array for avlblSlotsInCrntCycle_
memSize = sizeof(int16_t *) * numThreads_;
gpuErrchk(hipMallocManaged(&dev_avlblSlotsInCrntCycle_, memSize));
// Alloc dev arrays of avlblSlotsInCrntCycle_ for each thread
memSize = sizeof(int16_t) * issuTypeCnt_;
for (int i = 0; i < numThreads_; i++) {
gpuErrchk(hipMalloc(&dev_avlblSlotsInCrntCycle_[i], memSize));
}
memSize = sizeof(int16_t) * issuTypeCnt_ * numThreads_;
gpuErrchk(hipMalloc(&dev_avlblSlotsInCrntCycle_, memSize));
// Alloc dev arrays for rsrvSlots_
memSize = sizeof(ReserveSlot *) * numThreads_;
gpuErrchk(hipMallocManaged(&dev_rsrvSlots_, memSize));
memSize = sizeof(ReserveSlot) * issuRate_;
for (int i = 0; i < numThreads_; i++) {
gpuErrchk(hipMalloc(&dev_rsrvSlots_[i], memSize));
}
memSize = sizeof(ReserveSlot) * issuRate_ * numThreads_;
gpuErrchk(hipMalloc(&dev_rsrvSlots_, memSize));
memSize = sizeof(int16_t) * numThreads_;
gpuErrchk(hipMalloc(&dev_rsrvSlotCnt_, memSize));
}
Expand Down Expand Up @@ -1921,11 +1919,6 @@ void ACOScheduler::CopyPointersToDevice(ACOScheduler *dev_ACOSchedulr) {
gpuErrchk(hipMalloc(&dev_ACOSchedulr->dev_kHelper, memSize));
gpuErrchk(hipMemcpy(dev_ACOSchedulr->dev_kHelper, kHelper, memSize,
hipMemcpyHostToDevice));
// make sure hipMallocManaged memory is copied to device before kernel start
memSize = sizeof(int16_t *) * numThreads_;
gpuErrchk(hipMemPrefetchAsync(dev_avlblSlotsInCrntCycle_, memSize, 0));
memSize = sizeof(ReserveSlot *) * numThreads_;
gpuErrchk(hipMemPrefetchAsync(dev_rsrvSlots_, memSize, 0));
}

void ACOScheduler::FreeDevicePointers() {
Expand All @@ -1935,10 +1928,6 @@ void ACOScheduler::FreeDevicePointers() {
hipFree(dev_isCrntCycleBlkd_);
hipFree(slotsPerTypePerCycle_);
hipFree(instCntPerIssuType_);
for (int i = 0; i < numThreads_; i++){
hipFree(dev_avlblSlotsInCrntCycle_[i]);
hipFree(dev_rsrvSlots_[i]);
}
hipFree(dev_MaxScoringInst);
readyLs->FreeDevicePointers();
hipFree(dev_avlblSlotsInCrntCycle_);
Expand Down
Loading

0 comments on commit 7347d2b

Please sign in to comment.