From db0d2fa23f79a2d212a11a9e42687249e3a8e9e4 Mon Sep 17 00:00:00 2001
From: mysoreanoop
Date: Sat, 25 Nov 2023 22:25:24 +0100
Subject: [PATCH] 1. Fix a missing write to memory on DMAWr
 2. Parameterize optimization 3: no WB of clean victims (risky, but worth
    testing)
 3. Add dedicated debug flags for the three optimizations (so far)
 4. Clearer comments

---
 configs/ruby/GPU_VIPER.py                   |   5 +-
 src/gpu-compute/shader.cc                   |  18 +--
 src/mem/SConscript                          |   3 +
 src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 127 +++++++++-----------
 src/mem/slicc/symbols/StateMachine.py       |   3 +
 5 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index a60288923e..d528b02f1b 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -554,7 +554,8 @@ def construct_dirs(options, system, ruby_system, network):
         dir_cntrl = DirCntrl(noTCCdir=True, TCC_select_num_bits=TCC_bits)
         dir_cntrl.create(options, dir_ranges, ruby_system, system)
         dir_cntrl.number_of_TBEs = options.num_tbes
-        dir_cntrl.useL3OnWT = True
+        dir_cntrl.useL3OnWT = options.use_L3_on_WT
+        dir_cntrl.noWBCleanVictims = options.no_WB_clean_victims
         dir_cntrl.L2isWB = options.WB_L2

         # the number_of_TBEs is inclusive of TBEs below
@@ -618,7 +619,7 @@ def construct_gpudirs(options, system, ruby_system, network):
         dir_cntrl = DirCntrl(noTCCdir=True, TCC_select_num_bits=TCC_bits)
         dir_cntrl.create(options, [addr_range], ruby_system, system)
         dir_cntrl.number_of_TBEs = options.num_tbes
-        dir_cntrl.useL3OnWT = True
+        dir_cntrl.useL3OnWT = False
         dir_cntrl.L2isWB = options.WB_L2

         # Connect the Directory controller to the ruby network
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index fa7ca3c4c7..ad736ee0b4 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -463,12 +463,12 @@ Shader::sampleStore(const Tick accessTime, bool isAtomic, const Tick t, bool isS
 {
     stats.storeLatencyDist.sample(accessTime);
     stats.allLatencyDist.sample(accessTime);
-    if(isAtomic)
-        std::cout << "ATOMIC_ST_LAT " << accessTime << " " << t << std::endl;
-    else if(isSync)
-        std::cout << "SYNC_LAT " << accessTime << " " << t << std::endl;
-    else
-        std::cout << "STORE_LAT " << accessTime << " " << t << std::endl;
+    //if(isAtomic)
+    //    std::cout << "ATOMIC_ST_LAT " << accessTime << " " << t << std::endl;
+    //else if(isSync)
+    //    std::cout << "SYNC_LAT " << accessTime << " " << t << std::endl;
+    //else
+    //    std::cout << "STORE_LAT " << accessTime << " " << t << std::endl;
 }

 /*
@@ -479,9 +479,9 @@ Shader::sampleLoad(const Tick accessTime, bool isAtomic, const Tick t)
 {
     stats.loadLatencyDist.sample(accessTime);
     stats.allLatencyDist.sample(accessTime);
-    std::cout << "LOAD_LAT " << accessTime << " " << t << std::endl;
-    if(isAtomic)
-        std::cout << "ATOMIC_LD_LAT " << accessTime << " " << t << std::endl;
+    //std::cout << "LOAD_LAT " << accessTime << " " << t << std::endl;
+    //if(isAtomic)
+    //    std::cout << "ATOMIC_LD_LAT " << accessTime << " " << t << std::endl;
 }

 void
diff --git a/src/mem/SConscript b/src/mem/SConscript
index e2a91146d0..50a902deca 100644
--- a/src/mem/SConscript
+++ b/src/mem/SConscript
@@ -166,3 +166,6 @@ DebugFlag('TokenPort')
 DebugFlag("MemChecker")
 DebugFlag("MemCheckerMonitor")
 DebugFlag("QOS")
+DebugFlag("OPT1")
+DebugFlag("OPT2")
+DebugFlag("OPT3")
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 43fe461fab..6b20d4f07d 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -39,6 +39,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
   bool GPUonly := "False";
   int TCC_select_num_bits;
   bool useL3OnWT := "False";
+  bool noWBCleanVictims := "False";
   bool L2isWB;

   Cycles to_memory_controller_latency := 1;
@@ -452,7 +453,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(s_sendResponseS, "s", desc="send Shared response") {
     if(!tbe.responded) {
-      DPRINTF(RubySlicc, "NotEarly: Responding to RdBlkS\n", tbe.TBEState);
+      DPRINTF(OPT1, "NotEarly: Responding to RdBlkS\n");
       enqueue(responseNetwork_out, ResponseMsg, response_latency) {
         out_msg.addr := address;
         out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -478,7 +479,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
     if(!tbe.responded) {
-      DPRINTF(RubySlicc, "NotEarly: Responding to RdBlk\n", tbe.TBEState);
+      DPRINTF(OPT1, "NotEarly: Responding to RdBlk\n");
       enqueue(responseNetwork_out, ResponseMsg, response_latency) {
         out_msg.addr := address;
         out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -510,7 +511,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(m_sendResponseM, "m", desc="send Modified response") {
     if(!tbe.responded) {
-      DPRINTF(RubySlicc, "NotEarly: Responding to RdBlkM\n", tbe.TBEState);
+      DPRINTF(OPT1, "NotEarly: Responding to RdBlkM\n");
       if (tbe.wtData) {
         enqueue(triggerQueue_out, TriggerMsg, 1) {
           out_msg.addr := address;
@@ -965,42 +966,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(d_writeDataToMemory, "d", desc="Write data to memory") {
     peek(responseNetwork_in, ResponseMsg) {
-      // here too memory write can be saved if L2 victim not dirty
+      // here, too, the memory write can be saved; only the L3 is written back,
+      // so this action now just updates the TBE (no memory write is issued)
       if (in_msg.Dirty) {
-        DPRINTF(RubySlicc, "Lazymuth: L2 victim dirty, still not immediately writing back\n");
-        //enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
-        //  out_msg.addr := address;
-        //  out_msg.Type := MemoryRequestType:MEMORY_WB;
-        //  out_msg.Sender := machineID;
-        //  out_msg.MessageSize := MessageSizeType:Writeback_Data;
-        //  out_msg.DataBlk := in_msg.DataBlk;
-        //}
         // tbe.Dirty is gratuitous here, but functional writes may apply
-        tbe.Dirty := true; // gratuitous
-        // PS we're not writing back in case of !in_msg.Dirty,
-        // Since the original FSM does not wait for the WBAck,
-        // we don't need to either (TODO safe?)
-        // if needed, use the trigger queue as so:
-        // enqueue(triggerQueue_out, TriggerMsg, 1) {
-        //   out_msg.addr := address;
-        //   out_msg.Type := TriggerType:WriteDone;
-        //   // this is different from the actual usage of WriteDone
-        // }
-
-      } else {
-        DPRINTF(RubySlicc, "Lazymuth: L2 victim clean, saved a write back (or 2)\n");
-        assert (tbe.Dirty == false); //default unchanged
-        // we are skipping writing to the memory if victim is clean
+        // the subsequent L3 write will still read the dirtiness from in_msg
+        tbe.Dirty := true;
         // TODO: If an intermediate GPU write has dirtied L3, would the
         // victim be written back without incorporating GPU's stuff?
         // The way it's done now, that's how it is
-
+      } else {
         // have to update the TBE, too, because of how this
-        // directory deals with functional writes -- TODO what??
-        // gratuitous?
+        // directory deals with functional writes
+        // TODO: is this gratuitous?
         tbe.DataBlk := in_msg.DataBlk;
-        tbe.Dirty := false;// overrides the dirty bit
+        tbe.Dirty := tbe.Dirty || in_msg.Dirty;
       }
+      DPRINTF(OPT2, "MemWrite saved; only L3 will be updated\n");
     }
   }
@@ -1031,6 +1013,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
       tbe.Dirty := false;
       tbe.Len := in_msg.Len;
       if (in_msg.Type == DMARequestType:WRITE) {
+        tbe.TBEState := State:BDW_PM; // DMA write: checked later to issue the memory WB
         tbe.wtData := true;
         tbe.Dirty := true;
         tbe.DataBlk := in_msg.DataBlk;
@@ -1119,17 +1102,22 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        }
      }
      // Skipping writing to memory blindly, a compulsorily following
-      // action will determine if it's necessary to write back data.
-      // The above action on atomicData will still be needed.
-
-      //enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
-      //  out_msg.addr := address;
-      //  out_msg.Type := MemoryRequestType:MEMORY_WB;
-      //  out_msg.Sender := machineID;
-      //  out_msg.MessageSize := MessageSizeType:Writeback_Data;
-      //  out_msg.DataBlk := tbe.DataBlk;
-      //  DPRINTF(ProtocolTrace, "%s\n", out_msg);
-      //}
+      // action (alwt) will determine whether the data must be written back.
+      // Exception: DMA writes do not allocate in L3, so the memory writeback
+      // is issued here for those;
+      // the atomicData handling above is still needed though
+      if (tbe.TBEState == State:BDW_PM) {
+        enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
+          out_msg.addr := address;
+          out_msg.Type := MemoryRequestType:MEMORY_WB;
+          out_msg.Sender := machineID;
+          out_msg.MessageSize := MessageSizeType:Writeback_Data;
+          out_msg.DataBlk := tbe.DataBlk;
+          DPRINTF(ProtocolTrace, "%s\n", out_msg);
+        }
+      } else {
+        DPRINTF(OPT2, "MemWrite saved\n");
+      }
    }
  }

@@ -1181,7 +1169,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
  action(edc_earlyDataToCore, "edc", desc="early data to core") {
    if(tbe.readyToResp && !tbe.responded) {
      if (tbe.TBEState == State:BM_PM || tbe.TBEState == State:BM_Pm) {
-        DPRINTF(RubySlicc, "Early: Responding to RdBlkM/WT/Atomic\n", tbe.TBEState);
+        DPRINTF(OPT1, "Early: Responding to RdBlkM/WT/Atomic at %d probes\n", tbe.NumPendingAcks);
        /* this would otherwise have been done before
         * responding to the core, as part of writing back to memory
         * since we are skipping that part now, we need to do that here
@@ -1228,7 +1216,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          }
        }
      } else if (tbe.TBEState == State:BS_PM || tbe.TBEState == State:BS_Pm) {
-        DPRINTF(RubySlicc, "Early: Responding to RdBlkS\n", tbe.TBEState);
+        DPRINTF(OPT1, "Early: Responding to RdBlkS at %d probes\n", tbe.NumPendingAcks);
        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
          out_msg.addr := address;
          out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -1250,7 +1238,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          DPRINTF(RubySlicc, "%s\n", out_msg);
        }
      } else if (tbe.TBEState == State:B_PM || tbe.TBEState == State:B_Pm) {
-        DPRINTF(RubySlicc, "Early: Responding to RdBlk\n", tbe.TBEState);
+        DPRINTF(OPT1, "Early: Responding to RdBlk at %d probes\n", tbe.NumPendingAcks);
        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
          out_msg.addr := address;
          out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -1312,12 +1300,22 @@ machine(MachineType:Directory, "AMD Baseline protocol")
  }

  action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
-    // this is for when a victim block is being processed
-    // we have already written to memory if in_msg.Dirty
-    // rewrite L3 iff in_msg.Dirty -- best case, this saves a memory read
-    // worst case, this may need to be evicted on a critical path
+    /* Optimization 3 (risky!): do not write clean victims back to L3.
+     * Naively, this could perform worse: the clean victims are lost "in the air",
+     * since L3, being a victim cache, did not cache them on the refill path, so the
+     * next read/write miss to those lines forces the directory to fetch from memory.
+     * The argument for the optimization: when data must flow from CPU to GPU, what
+     * matters is the data the CPU modified, and those victims will be dirty. There is
+     * little reason for the CPU to read data, leave it unmodified, evict it, and then
+     * have the GPU need it, unless it is an atomic variable, which is handled
+     * differently anyway. Data evicted as an unintended consequence (e.g. a
+     * capacity-induced writeback) is likewise not obviously needed by the GPU.
+     * Best case: this saves an L3 write, possibly a memory write, and L3 pollution.
+     * Worst case: if the GPU does end up needing the line, the directory has to
+     * fetch it from memory.
+     */
    peek(responseNetwork_in, ResponseMsg) {
-      if (in_msg.Dirty) {
+      if (!noWBCleanVictims || in_msg.Dirty) {
        if (L3CacheMemory.isTagPresent(address)) {
          CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
          APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
@@ -1327,7 +1325,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          // (due to a prior dirty write, not yet written back)
          // retain the dirty bit, even if this victim is clean
          // TODO does the consistency model allow this?
-          entry.Dirty := true;
+          entry.Dirty := true; // overrides dirtiness
          assert(is_valid(tbe));
          //The controller always allocates a TBE entry upon receipt of a request from L2 caches.
          //L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss.
@@ -1347,11 +1345,10 @@ machine(MachineType:Directory, "AMD Baseline protocol")
              out_msg.MessageSize := MessageSizeType:Writeback_Data;
              out_msg.DataBlk := victim_entry.DataBlk;
            }
-            DPRINTF(RubySlicc, "L3VictimWriteBack NotSaved\n");
          } else {
            // do not write-back, directly deallocate
            // since it's guaranteed memory has not diverged from L3
-            DPRINTF(RubySlicc, "L3VictimWriteBack Saved\n");
+            DPRINTF(OPT2, "MemWrite saved\n");
          }
          L3CacheMemory.deallocate(victim);
        }
@@ -1361,17 +1358,14 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          entry.DataBlk := in_msg.DataBlk;
          entry.LastSender := in_msg.Sender;
-          entry.Dirty := true;
+          entry.Dirty := true; // fresh entry, mark it dirty explicitly
        }
-        DPRINTF(RubySlicc, "L2VictimWriteToL3 Happened\n");
-      } else {
-        // do not allocate L3 block
-        // best case: the original L3 from which the victim was originally sourced
-        // has not been evicted, so a reallocation is redundant
-        // worst case: functionally correct, but a missed free caching opportunity
-        DPRINTF(RubySlicc, "L2VictimWriteToL3 DidNotHappen\n");
+        DPRINTF(OPT3, "L2DirtyVictimToL3\n");
+      } else { // reached only when noWBCleanVictims is set and the victim is clean
+        DPRINTF(OPT3, "NoL2CleanVictimToL3\n");
+        // TODO: how to capture the consequential read misses?
+      }
    }
-  }
+    }
  }

  action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
@@ -1392,7 +1386,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        entry.LastSender := tbe.LastSender;
        // this ensures dirty probes are recognized
        // and also carries forward the original dirtyness of the line
-        entry.Dirty := true; // override
+        entry.Dirty := entry.Dirty || tbe.Dirty;
+        // atomic can be just a read, so not necessarily dirty
      } else {
        if (L3CacheMemory.cacheAvail(address) == false) {
          Addr victim := L3CacheMemory.cacheProbe(address);
@@ -1413,12 +1408,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
            out_msg.MessageSize := MessageSizeType:Writeback_Data;
            out_msg.DataBlk := victim_entry.DataBlk;
          }
-          DPRINTF(RubySlicc, "L3VictimWriteBack NotSaved\n");
          // TODO do we need to wait for WBAck?
        } else {
          // else, drop WB because memory and clean L3 are
          // by definition reconciled
-          DPRINTF(RubySlicc, "L3VictimWriteBack Saved\n");
+          DPRINTF(OPT2, "MemWrite saved\n");
        }
        L3CacheMemory.deallocate(victim);
      }
@@ -1427,8 +1421,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
        entry.DataBlk := tbe.DataBlk;
        entry.LastSender := tbe.LastSender;
-        assert(tbe.Dirty == true);
-        entry.Dirty := tbe.Dirty;
+        entry.Dirty := tbe.Dirty; // same reason as in the L3 victimization case above
        /* because the new allocation can now be dirty, the memory writeback can be skipped
         * Next time an L3 line is victimized, it will only be written back iff also dirty
         */
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index 68a1a6a8af..f2b26bf494 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -129,6 +129,9 @@ def __init__(self, symtab, ident, location, pairs, config_parameters):
         self.debug_flags = set()
         self.debug_flags.add("RubyGenerated")
         self.debug_flags.add("RubySlicc")
+        self.debug_flags.add("OPT1")
+        self.debug_flags.add("OPT2")
+        self.debug_flags.add("OPT3")

     def __repr__(self):
         return f"[StateMachine: {self.ident}]"
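
Note: GPU_VIPER.py above reads options.use_L3_on_WT and options.no_WB_clean_victims, but the
registration of those switches is not shown in this diff. A minimal sketch of how they could be
declared, assuming the argparse-based define_options() hook that GPU_VIPER.py already uses for its
other options; the flag spellings and help strings below are illustrative, only the dest names are
taken from the code above:

    def define_options(parser):
        # ... existing VIPER options (the ones read as options.num_tbes,
        # options.WB_L2, etc.) are registered here ...
        parser.add_argument(
            "--use-L3-on-WT",
            dest="use_L3_on_WT",
            action="store_true",
            default=False,
            help="allocate/update the L3 block on write-through at the CPU directory",
        )
        parser.add_argument(
            "--no-WB-clean-victims",
            dest="no_WB_clean_victims",
            action="store_true",
            default=False,
            help="optimization 3 (risky): do not install clean L2 victims in L3",
        )

The new DPRINTFs are guarded by the OPT1/OPT2/OPT3 flags registered in src/mem/SConscript (and
whitelisted for SLICC in StateMachine.py), so they can be enabled at run time with gem5's standard
--debug-flags switch, for example:

    gem5.opt --debug-flags=OPT1,OPT2,OPT3 configs/example/apu_se.py ...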