Bug fix: a missing write to memory on DMAWr
- Parameterize Optimization 3: no WB clean victims (risky, but worth testing)
- Add dedicated debug flags for the 3 (so far) optimizations
- Clearer comments
mysoreanoop committed Nov 25, 2023
1 parent 86ffd92 commit db0d2fa
Showing 5 changed files with 78 additions and 78 deletions.
5 changes: 3 additions & 2 deletions configs/ruby/GPU_VIPER.py
@@ -554,7 +554,8 @@ def construct_dirs(options, system, ruby_system, network)
dir_cntrl = DirCntrl(noTCCdir=True, TCC_select_num_bits=TCC_bits)
dir_cntrl.create(options, dir_ranges, ruby_system, system)
dir_cntrl.number_of_TBEs = options.num_tbes
dir_cntrl.useL3OnWT = True
dir_cntrl.useL3OnWT = options.use_L3_on_WT
dir_cntrl.noWBCleanVictims = options.no_WB_clean_victims
dir_cntrl.L2isWB = options.WB_L2
# the number_of_TBEs is inclusive of TBEs below

@@ -618,7 +619,7 @@ def construct_gpudirs(options, system, ruby_system, network)
dir_cntrl = DirCntrl(noTCCdir=True, TCC_select_num_bits=TCC_bits)
dir_cntrl.create(options, [addr_range], ruby_system, system)
dir_cntrl.number_of_TBEs = options.num_tbes
dir_cntrl.useL3OnWT = True
dir_cntrl.useL3OnWT = False
dir_cntrl.L2isWB = options.WB_L2

# Connect the Directory controller to the ruby network
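
The new assignments above read their values off the `options` object (options.use_L3_on_WT, options.no_WB_clean_victims); the definitions of those command-line options are not part of this diff. Below is a minimal sketch of how such toggles are typically registered with gem5's argparse-based option parser; the flag spellings and the helper name are assumptions, not code from this commit:

```python
# Hypothetical sketch (not part of this commit): registering the toggles
# that GPU_VIPER.py reads from `options`. gem5 configs pass around an
# argparse.ArgumentParser, so a helper like this could sit next to the
# other Ruby/GPU options. argparse maps "--use-L3-on-WT" to the attribute
# options.use_L3_on_WT, and so on.
def add_viper_directory_options(parser):
    parser.add_argument("--use-L3-on-WT", action="store_true", default=False,
                        help="install write-through data in the directory L3 "
                             "(maps to DirCntrl.useL3OnWT)")
    parser.add_argument("--no-WB-clean-victims", action="store_true", default=False,
                        help="optimization 3: do not write clean L2 victims "
                             "into L3 (maps to DirCntrl.noWBCleanVictims)")
    parser.add_argument("--WB-L2", action="store_true", default=False,
                        help="model the GPU L2 as write-back "
                             "(maps to DirCntrl.L2isWB)")
```
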
18 changes: 9 additions & 9 deletions src/gpu-compute/shader.cc
@@ -463,12 +463,12 @@ Shader::sampleStore(const Tick accessTime, bool isAtomic, const Tick t, bool isS
{
stats.storeLatencyDist.sample(accessTime);
stats.allLatencyDist.sample(accessTime);
if(isAtomic)
std::cout << "ATOMIC_ST_LAT " << accessTime << " " << t << std::endl;
else if(isSync)
std::cout << "SYNC_LAT " << accessTime << " " << t << std::endl;
else
std::cout << "STORE_LAT " << accessTime << " " << t << std::endl;
//if(isAtomic)
// std::cout << "ATOMIC_ST_LAT " << accessTime << " " << t << std::endl;
//else if(isSync)
// std::cout << "SYNC_LAT " << accessTime << " " << t << std::endl;
//else
// std::cout << "STORE_LAT " << accessTime << " " << t << std::endl;
}

/*
@@ -479,9 +479,9 @@ Shader::sampleLoad(const Tick accessTime, bool isAtomic, const Tick t)
{
stats.loadLatencyDist.sample(accessTime);
stats.allLatencyDist.sample(accessTime);
std::cout << "LOAD_LAT " << accessTime << " " << t << std::endl;
if(isAtomic)
std::cout << "ATOMIC_LD_LAT " << accessTime << " " << t << std::endl;
//std::cout << "LOAD_LAT " << accessTime << " " << t << std::endl;
//if(isAtomic)
// std::cout << "ATOMIC_LD_LAT " << accessTime << " " << t << std::endl;
}

void
3 changes: 3 additions & 0 deletions src/mem/SConscript
@@ -166,3 +166,6 @@ DebugFlag('TokenPort')
DebugFlag("MemChecker")
DebugFlag("MemCheckerMonitor")
DebugFlag("QOS")
DebugFlag("OPT1")
DebugFlag("OPT2")
DebugFlag("OPT3")
127 changes: 60 additions & 67 deletions src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -39,6 +39,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
bool GPUonly := "False";
int TCC_select_num_bits;
bool useL3OnWT := "False";
bool noWBCleanVictims := "False";
bool L2isWB;
Cycles to_memory_controller_latency := 1;

@@ -452,7 +453,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

action(s_sendResponseS, "s", desc="send Shared response") {
if(!tbe.responded) {
DPRINTF(RubySlicc, "NotEarly: Responding to RdBlkS\n", tbe.TBEState);
DPRINTF(OPT1, "NotEarly: Responding to RdBlkS\n");
enqueue(responseNetwork_out, ResponseMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -478,7 +479,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
if(!tbe.responded) {
DPRINTF(RubySlicc, "NotEarly: Responding to RdBlk\n", tbe.TBEState);
DPRINTF(OPT1, "NotEarly: Responding to RdBlk\n");
enqueue(responseNetwork_out, ResponseMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -510,7 +511,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

action(m_sendResponseM, "m", desc="send Modified response") {
if(!tbe.responded) {
DPRINTF(RubySlicc, "NotEarly: Responding to RdBlkM\n", tbe.TBEState);
DPRINTF(OPT1, "NotEarly: Responding to RdBlkM\n");
if (tbe.wtData) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
@@ -965,42 +966,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")

action(d_writeDataToMemory, "d", desc="Write data to memory") {
peek(responseNetwork_in, ResponseMsg) {
// here too memory write can be saved if L2 victim not dirty
// here too, memory write can be saved; only writeback to L3
// this action is entirely just a stat recorder
if (in_msg.Dirty) {
DPRINTF(RubySlicc, "Lazymuth: L2 victim dirty, still not immediately writing back\n");
//enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
// out_msg.addr := address;
// out_msg.Type := MemoryRequestType:MEMORY_WB;
// out_msg.Sender := machineID;
// out_msg.MessageSize := MessageSizeType:Writeback_Data;
// out_msg.DataBlk := in_msg.DataBlk;
//}
// tbe.Dirty is gratuitous here, but functional writes may apply
tbe.Dirty := true; // gratuitous
// PS we're not writing back in case of !in_msg.Dirty,
// Since the original FSM does not wait for the WBAck,
// we don't need to either (TODO safe?)
// if needed, use the trigger queue as so:
// enqueue(triggerQueue_out, TriggerMsg, 1) {
// out_msg.addr := address;
// out_msg.Type := TriggerType:WriteDone;
// // this is different from the actual usage of WriteDone
// }

} else {
DPRINTF(RubySlicc, "Lazymuth: L2 victim clean, saved a write back (or 2)\n");
assert (tbe.Dirty == false); //default unchanged
// we are skipping writing to the memory if victim is clean
// subsequent L3 write will still read dirtyness from in_msg
tbe.Dirty := true;
// TODO: If an intermediate GPU write has dirtied L3, would the
// victim be written back without incorporating GPU's stuff?
// The way it's done now, that's how it is

} else {
// have to update the TBE, too, because of how this
// directory deals with functional writes -- TODO what??
// gratuitous?
// directory deals with functional writes
// TODO gratuitous?
tbe.DataBlk := in_msg.DataBlk;
tbe.Dirty := false;// overrides the dirty bit
tbe.Dirty := tbe.Dirty || in_msg.Dirty;
}
DPRINTF(OPT2, "MemWrite saved; only L3 will be updated\n");
}
}

@@ -1031,6 +1013,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
tbe.Dirty := false;
tbe.Len := in_msg.Len;
if (in_msg.Type == DMARequestType:WRITE) {
tbe.TBEState := State:BDW_PM;
tbe.wtData := true;
tbe.Dirty := true;
tbe.DataBlk := in_msg.DataBlk;
@@ -1119,17 +1102,22 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
// Skipping writing to memory blindly, a compulsorily following
// action will determine if it's necessary to write back data.
// The above action on atomicData will still be needed.

//enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
// out_msg.addr := address;
// out_msg.Type := MemoryRequestType:MEMORY_WB;
// out_msg.Sender := machineID;
// out_msg.MessageSize := MessageSizeType:Writeback_Data;
// out_msg.DataBlk := tbe.DataBlk;
// DPRINTF(ProtocolTrace, "%s\n", out_msg);
//}
// action (alwt) will determine if it's necessary to write back data.
// Exception: DMA writes -- they do not write to L3, so memory writeback
// might be necessary for those;
// the above action on atomicData will still be needed though
if (tbe.TBEState == State:BDW_PM) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := address;
out_msg.Type := MemoryRequestType:MEMORY_WB;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := tbe.DataBlk;
DPRINTF(ProtocolTrace, "%s\n", out_msg);
}
} else {
DPRINTF(OPT2, "MemWrite saved\n");
}
}
}

@@ -1181,7 +1169,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
action(edc_earlyDataToCore, "edc", desc="early data to core") {
if(tbe.readyToResp && !tbe.responded) {
if (tbe.TBEState == State:BM_PM || tbe.TBEState == State:BM_Pm) {
DPRINTF(RubySlicc, "Early: Responding to RdBlkM/WT/Atomic\n", tbe.TBEState);
DPRINTF(OPT1, "Early: Responding to RdBlkM/WT/Atomic at %d probes\n", tbe.NumPendingAcks);
/* this would otherwise have been done before
* responding to the core, as part of writing back to memory
* since we are skipping that part now, we need to do that here
@@ -1228,7 +1216,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
} else if (tbe.TBEState == State:BS_PM || tbe.TBEState == State:BS_Pm) {
DPRINTF(RubySlicc, "Early: Responding to RdBlkS\n", tbe.TBEState);
DPRINTF(OPT1, "Early: Responding to RdBlkS at %d probes\n", tbe.NumPendingAcks);
enqueue(responseNetwork_out, ResponseMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -1250,7 +1238,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
DPRINTF(RubySlicc, "%s\n", out_msg);
}
} else if (tbe.TBEState == State:B_PM || tbe.TBEState == State:B_Pm) {
DPRINTF(RubySlicc, "Early: Responding to RdBlk\n", tbe.TBEState);
DPRINTF(OPT1, "Early: Responding to RdBlk at %d probes\n", tbe.NumPendingAcks);
enqueue(responseNetwork_out, ResponseMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -1312,12 +1300,22 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}

action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
// this is for when a victim block is being processed
// we have already written to memory if in_msg.Dirty
// rewrite L3 iff in_msg.Dirty -- best case, this saves a memory read
// worst case, this may need to be evicted on a critical path
/* Optimization 3 (risky!): do not write clean victims back to L3.
* Naively this can perform worse: clean victims are lost "in the air", since
* L3 -- being a victim cache -- did not keep a copy on the refill path, so the
* next read/write miss to those lines forces the directory to fetch from memory.
* The argument for the optimization: when data flows from CPU to GPU, what
* matters is the data the CPU modified, and those victims will be dirty. There
* is little reason for the CPU to read data, leave it unmodified, evict it, and
* then have the GPU need it -- unless it is an atomic variable, which is
* handled differently anyway. Data evicted as an unintended consequence
* (like a capacity-induced writeback) is also not obviously needed by the GPU.
* Best case: this saves an L3 write, potentially a memory write, and pollution.
* Worst case: if the GPU does end up needing the line, the directory must
* fetch it from memory.
*/
peek(responseNetwork_in, ResponseMsg) {
if (in_msg.Dirty) {
if (!noWBCleanVictims || (noWBCleanVictims && in_msg.Dirty)) {
if (L3CacheMemory.isTagPresent(address)) {
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
@@ -1327,7 +1325,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// (due to a prior dirty write, not yet written back)
// retain the dirty bit, even if this victim is clean
// TODO does the consistency model allow this?
entry.Dirty := true;
entry.Dirty := true; // overrides dirtyness
assert(is_valid(tbe));
//The controller always allocates a TBE entry upon receipt of a request from L2 caches.
//L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss.
@@ -1347,11 +1345,10 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
}
DPRINTF(RubySlicc, "L3VictimWriteBack NotSaved\n");
} else {
// do not write-back, directly deallocate
// since it's guaranteed memory has not diverged from L3
DPRINTF(RubySlicc, "L3VictimWriteBack Saved\n");
DPRINTF(OPT2, "MemWrite saved\n");
}
L3CacheMemory.deallocate(victim);
}
@@ -1361,17 +1358,14 @@ machine(MachineType:Directory, "AMD Baseline protocol")
entry.DataBlk := in_msg.DataBlk;

entry.LastSender := in_msg.Sender;
entry.Dirty := true;
entry.Dirty := true; // fresh entry, make the dirtiness explicit
}
DPRINTF(RubySlicc, "L2VictimWriteToL3 Happened\n");
} else {
// do not allocate L3 block
// best case: the original L3 from which the victim was originally sourced
// has not been evicted, so a reallocation is redundant
// worst case: functionally correct, but a missed free caching opportunity
DPRINTF(RubySlicc, "L2VictimWriteToL3 DidNotHappen\n");
DPRINTF(OPT3, "L2DirtyVictimToL3\n");
} else { // never entered on !noWBCleanVictims
DPRINTF(OPT3, "NoL2CleanVictimToL3\n");
// TODO how to capture the consequential read misses?
}
}
}
}

action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
@@ -1392,7 +1386,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
entry.LastSender := tbe.LastSender;
// this ensures dirty probes are recognized
// and also carries forward the original dirtyness of the line
entry.Dirty := true; // override
entry.Dirty := entry.Dirty || tbe.Dirty;
// atomic can be just a read, so not necessarily dirty
} else {
if (L3CacheMemory.cacheAvail(address) == false) {
Addr victim := L3CacheMemory.cacheProbe(address);
@@ -1413,12 +1408,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
}
DPRINTF(RubySlicc, "L3VictimWriteBack NotSaved\n");
// TODO do we need to wait for WBAck?
} else {
// else, drop WB because memory and clean L3 are
// by definition reconciled
DPRINTF(RubySlicc, "L3VictimWriteBack Saved\n");
DPRINTF(OPT2, "MemWrite saved\n");
}
L3CacheMemory.deallocate(victim);
}
@@ -1427,8 +1421,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
entry.DataBlk := tbe.DataBlk;
entry.LastSender := tbe.LastSender;
assert(tbe.Dirty == true);
entry.Dirty := tbe.Dirty;
entry.Dirty := tbe.Dirty; // same reason as in the L3 victimization case above
/* because the new allocation can now be dirty, the memory writeback can be skipped
* Next time an L3 line is victimized, it will be written back only if it is also dirty
*/
3 changes: 3 additions & 0 deletions src/mem/slicc/symbols/StateMachine.py
@@ -129,6 +129,9 @@ def __init__(self, symtab, ident, location, pairs, config_parameters):
self.debug_flags = set()
self.debug_flags.add("RubyGenerated")
self.debug_flags.add("RubySlicc")
self.debug_flags.add("OPT1")
self.debug_flags.add("OPT2")
self.debug_flags.add("OPT3")

def __repr__(self):
return f"[StateMachine: {self.ident}]"
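
Adding OPT1, OPT2 and OPT3 to StateMachine.debug_flags makes the SLICC code generator include those debug-flag headers in every generated controller, so the DPRINTF(OPT1, ...), DPRINTF(OPT2, ...) and DPRINTF(OPT3, ...) calls added in MOESI_AMD_Base-dir.sm compile without any per-protocol plumbing. The flags can also be toggled from a configuration script instead of the command line; a minimal sketch, assuming m5.debug.flags maps flag names to objects exposing enable()/disable(), as in recent gem5 releases:

```python
from m5 import debug

# Sketch only: enable the optimization trace flags programmatically from a
# config script, as an alternative to --debug-flags on the command line.
# Assumes debug.flags behaves as a name -> flag mapping whose entries
# expose enable()/disable(); verify against the gem5 version in use.
for name in ("OPT1", "OPT2", "OPT3"):
    try:
        debug.flags[name].enable()
    except KeyError:
        print(f"debug flag {name} is not compiled into this gem5 binary")
```
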
