From db0d2fa23f79a2d212a11a9e42687249e3a8e9e4 Mon Sep 17 00:00:00 2001
From: mysoreanoop
Date: Sat, 25 Nov 2023 22:25:24 +0100
Subject: [PATCH] 1. Fix a missing write to memory on DMAWr
 2. Parameterize optimization 3: no WB of clean victims (risky, but worth
    testing)
 3. Add dedicated debug flags for the three optimizations (so far)
 4. Clearer comments

---
 configs/ruby/GPU_VIPER.py                   |   5 +-
 src/gpu-compute/shader.cc                   |  18 +--
 src/mem/SConscript                          |   3 +
 src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 127 +++++++++-----------
 src/mem/slicc/symbols/StateMachine.py       |   3 +
 5 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index a60288923e..d528b02f1b 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -554,7 +554,8 @@ def construct_dirs(options, system, ruby_system, network):
         dir_cntrl = DirCntrl(noTCCdir=True, TCC_select_num_bits=TCC_bits)
         dir_cntrl.create(options, dir_ranges, ruby_system, system)
         dir_cntrl.number_of_TBEs = options.num_tbes
-        dir_cntrl.useL3OnWT = True
+        dir_cntrl.useL3OnWT = options.use_L3_on_WT
+        dir_cntrl.noWBCleanVictims = options.no_WB_clean_victims
         dir_cntrl.L2isWB = options.WB_L2

         # the number_of_TBEs is inclusive of TBEs below
@@ -618,7 +619,7 @@ def construct_gpudirs(options, system, ruby_system, network):
         dir_cntrl = DirCntrl(noTCCdir=True, TCC_select_num_bits=TCC_bits)
         dir_cntrl.create(options, [addr_range], ruby_system, system)
         dir_cntrl.number_of_TBEs = options.num_tbes
-        dir_cntrl.useL3OnWT = True
+        dir_cntrl.useL3OnWT = False
         dir_cntrl.L2isWB = options.WB_L2

         # Connect the Directory controller to the ruby network
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index fa7ca3c4c7..ad736ee0b4 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -463,12 +463,12 @@ Shader::sampleStore(const Tick accessTime, bool isAtomic, const Tick t, bool isS
 {
     stats.storeLatencyDist.sample(accessTime);
     stats.allLatencyDist.sample(accessTime);
-    if(isAtomic)
-        std::cout << "ATOMIC_ST_LAT " << accessTime << " " << t << std::endl;
-    else if(isSync)
-        std::cout << "SYNC_LAT " << accessTime << " " << t << std::endl;
-    else
-        std::cout << "STORE_LAT " << accessTime << " " << t << std::endl;
+    //if(isAtomic)
+    //    std::cout << "ATOMIC_ST_LAT " << accessTime << " " << t << std::endl;
+    //else if(isSync)
+    //    std::cout << "SYNC_LAT " << accessTime << " " << t << std::endl;
+    //else
+    //    std::cout << "STORE_LAT " << accessTime << " " << t << std::endl;
 }

 /*
@@ -479,9 +479,9 @@ Shader::sampleLoad(const Tick accessTime, bool isAtomic, const Tick t)
 {
     stats.loadLatencyDist.sample(accessTime);
     stats.allLatencyDist.sample(accessTime);
-    std::cout << "LOAD_LAT " << accessTime << " " << t << std::endl;
-    if(isAtomic)
-        std::cout << "ATOMIC_LD_LAT " << accessTime << " " << t << std::endl;
+    //std::cout << "LOAD_LAT " << accessTime << " " << t << std::endl;
+    //if(isAtomic)
+    //    std::cout << "ATOMIC_LD_LAT " << accessTime << " " << t << std::endl;
 }

 void
diff --git a/src/mem/SConscript b/src/mem/SConscript
index e2a91146d0..50a902deca 100644
--- a/src/mem/SConscript
+++ b/src/mem/SConscript
@@ -166,3 +166,6 @@ DebugFlag('TokenPort')
 DebugFlag("MemChecker")
 DebugFlag("MemCheckerMonitor")
 DebugFlag("QOS")
+DebugFlag("OPT1")
+DebugFlag("OPT2")
+DebugFlag("OPT3")
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 43fe461fab..6b20d4f07d 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -39,6 +39,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
   bool GPUonly := "False";
   int TCC_select_num_bits;
   bool useL3OnWT := "False";
+  bool noWBCleanVictims := "False";
   bool L2isWB;

   Cycles to_memory_controller_latency := 1;
@@ -452,7 +453,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(s_sendResponseS, "s", desc="send Shared response") {
     if(!tbe.responded) {
-      DPRINTF(RubySlicc, "NotEarly: Responding to RdBlkS\n", tbe.TBEState);
+      DPRINTF(OPT1, "NotEarly: Responding to RdBlkS\n");
       enqueue(responseNetwork_out, ResponseMsg, response_latency) {
         out_msg.addr := address;
         out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -478,7 +479,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
     if(!tbe.responded) {
-      DPRINTF(RubySlicc, "NotEarly: Responding to RdBlk\n", tbe.TBEState);
+      DPRINTF(OPT1, "NotEarly: Responding to RdBlk\n");
       enqueue(responseNetwork_out, ResponseMsg, response_latency) {
         out_msg.addr := address;
         out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -510,7 +511,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(m_sendResponseM, "m", desc="send Modified response") {
     if(!tbe.responded) {
-      DPRINTF(RubySlicc, "NotEarly: Responding to RdBlkM\n", tbe.TBEState);
+      DPRINTF(OPT1, "NotEarly: Responding to RdBlkM\n");
       if (tbe.wtData) {
         enqueue(triggerQueue_out, TriggerMsg, 1) {
           out_msg.addr := address;
@@ -965,42 +966,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")

   action(d_writeDataToMemory, "d", desc="Write data to memory") {
     peek(responseNetwork_in, ResponseMsg) {
-      // here too memory write can be saved if L2 victim not dirty
+      // here, too, the memory write can be saved; only the L3 is written back,
+      // so this action now just updates the TBE (no memory write is issued)
       if (in_msg.Dirty) {
-        DPRINTF(RubySlicc, "Lazymuth: L2 victim dirty, still not immediately writing back\n");
-        //enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
-        //  out_msg.addr := address;
-        //  out_msg.Type := MemoryRequestType:MEMORY_WB;
-        //  out_msg.Sender := machineID;
-        //  out_msg.MessageSize := MessageSizeType:Writeback_Data;
-        //  out_msg.DataBlk := in_msg.DataBlk;
-        //}
         // tbe.Dirty is gratuitous here, but functional writes may apply
-        tbe.Dirty := true; // gratuitous
-        // PS we're not writing back in case of !in_msg.Dirty,
-        // Since the original FSM does not wait for the WBAck,
-        // we don't need to either (TODO safe?)
-        // if needed, use the trigger queue as so:
-        // enqueue(triggerQueue_out, TriggerMsg, 1) {
-        //   out_msg.addr := address;
-        //   out_msg.Type := TriggerType:WriteDone;
-        //   // this is different from the actual usage of WriteDone
-        // }
-
-      } else {
-        DPRINTF(RubySlicc, "Lazymuth: L2 victim clean, saved a write back (or 2)\n");
-        assert (tbe.Dirty == false); //default unchanged
-        // we are skipping writing to the memory if victim is clean
+        // the subsequent L3 write will still read the dirtiness from in_msg
+        tbe.Dirty := true;
         // TODO: If an intermediate GPU write has dirtied L3, would the
         // victim be written back without incorporating GPU's stuff?
         // The way it's done now, that's how it is
-
+      } else {
         // have to update the TBE, too, because of how this
-        // directory deals with functional writes -- TODO what??
-        // gratuitous?
+        // directory deals with functional writes
+        // TODO: is this gratuitous?
         tbe.DataBlk := in_msg.DataBlk;
-        tbe.Dirty := false;// overrides the dirty bit
+        tbe.Dirty := tbe.Dirty || in_msg.Dirty;
       }
+      DPRINTF(OPT2, "MemWrite saved; only L3 will be updated\n");
     }
   }
@@ -1031,6 +1013,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
       tbe.Dirty := false;
       tbe.Len := in_msg.Len;
       if (in_msg.Type == DMARequestType:WRITE) {
+        tbe.TBEState := State:BDW_PM; // DMA write: checked later to issue the memory WB
         tbe.wtData := true;
         tbe.Dirty := true;
         tbe.DataBlk := in_msg.DataBlk;
@@ -1119,17 +1102,22 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        }
      }
      // Skipping writing to memory blindly, a compulsorily following
-      // action will determine if it's necessary to write back data.
-      // The above action on atomicData will still be needed.
-
-      //enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
-      //  out_msg.addr := address;
-      //  out_msg.Type := MemoryRequestType:MEMORY_WB;
-      //  out_msg.Sender := machineID;
-      //  out_msg.MessageSize := MessageSizeType:Writeback_Data;
-      //  out_msg.DataBlk := tbe.DataBlk;
-      //  DPRINTF(ProtocolTrace, "%s\n", out_msg);
-      //}
+      // action (alwt) will determine whether the data must be written back.
+      // Exception: DMA writes do not allocate in L3, so the memory writeback
+      // is issued here for those;
+      // the atomicData handling above is still needed though
+      if (tbe.TBEState == State:BDW_PM) {
+        enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
+          out_msg.addr := address;
+          out_msg.Type := MemoryRequestType:MEMORY_WB;
+          out_msg.Sender := machineID;
+          out_msg.MessageSize := MessageSizeType:Writeback_Data;
+          out_msg.DataBlk := tbe.DataBlk;
+          DPRINTF(ProtocolTrace, "%s\n", out_msg);
+        }
+      } else {
+        DPRINTF(OPT2, "MemWrite saved\n");
+      }
    }
  }

@@ -1181,7 +1169,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
  action(edc_earlyDataToCore, "edc", desc="early data to core") {
    if(tbe.readyToResp && !tbe.responded) {
      if (tbe.TBEState == State:BM_PM || tbe.TBEState == State:BM_Pm) {
-        DPRINTF(RubySlicc, "Early: Responding to RdBlkM/WT/Atomic\n", tbe.TBEState);
+        DPRINTF(OPT1, "Early: Responding to RdBlkM/WT/Atomic at %d probes\n", tbe.NumPendingAcks);
        /* this would otherwise have been done before
         * responding to the core, as part of writing back to memory
         * since we are skipping that part now, we need to do that here
@@ -1228,7 +1216,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          }
        }
      } else if (tbe.TBEState == State:BS_PM || tbe.TBEState == State:BS_Pm) {
-        DPRINTF(RubySlicc, "Early: Responding to RdBlkS\n", tbe.TBEState);
+        DPRINTF(OPT1, "Early: Responding to RdBlkS at %d probes\n", tbe.NumPendingAcks);
        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
          out_msg.addr := address;
          out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -1250,7 +1238,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          DPRINTF(RubySlicc, "%s\n", out_msg);
        }
      } else if (tbe.TBEState == State:B_PM || tbe.TBEState == State:B_Pm) {
-        DPRINTF(RubySlicc, "Early: Responding to RdBlk\n", tbe.TBEState);
+        DPRINTF(OPT1, "Early: Responding to RdBlk at %d probes\n", tbe.NumPendingAcks);
        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
          out_msg.addr := address;
          out_msg.Type := CoherenceResponseType:NBSysResp;
@@ -1312,12 +1300,22 @@ machine(MachineType:Directory, "AMD Baseline protocol")
  }

  action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
-    // this is for when a victim block is being processed
-    // we have already written to memory if in_msg.Dirty
-    // rewrite L3 iff in_msg.Dirty -- best case, this saves a memory read
-    // worst case, this may need to be evicted on a critical path
+    /* Optimization 3 (risky!): do not write clean victims back to L3.
+     * Naively, this could perform worse: the clean victims are lost "in the air",
+     * since L3, being a victim cache, did not cache them on the refill path, so the
+     * next read/write miss to those lines forces the directory to fetch from memory.
+     * The argument for the optimization: when data must flow from CPU to GPU, what
+     * matters is the data the CPU modified, and those victims will be dirty. There is
+     * little reason for the CPU to read data, leave it unmodified, evict it, and then
+     * have the GPU need it, unless it is an atomic variable, which is handled
+     * differently anyway. Data evicted as an unintended consequence (e.g. a
+     * capacity-induced writeback) is likewise not obviously needed by the GPU.
+     * Best case: this saves an L3 write, possibly a memory write, and L3 pollution.
+     * Worst case: if the GPU does end up needing the line, the directory has to
+     * fetch it from memory.
+     */
    peek(responseNetwork_in, ResponseMsg) {
-      if (in_msg.Dirty) {
+      if (!noWBCleanVictims || in_msg.Dirty) {
        if (L3CacheMemory.isTagPresent(address)) {
          CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
          APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
@@ -1327,7 +1325,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          // (due to a prior dirty write, not yet written back)
          // retain the dirty bit, even if this victim is clean
          // TODO does the consistency model allow this?
-          entry.Dirty := true;
+          entry.Dirty := true; // overrides dirtiness
          assert(is_valid(tbe));
          //The controller always allocates a TBE entry upon receipt of a request from L2 caches.
          //L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss.
@@ -1347,11 +1345,10 @@ machine(MachineType:Directory, "AMD Baseline protocol")
              out_msg.MessageSize := MessageSizeType:Writeback_Data;
              out_msg.DataBlk := victim_entry.DataBlk;
            }
-            DPRINTF(RubySlicc, "L3VictimWriteBack NotSaved\n");
          } else {
            // do not write-back, directly deallocate
            // since it's guaranteed memory has not diverged from L3
-            DPRINTF(RubySlicc, "L3VictimWriteBack Saved\n");
+            DPRINTF(OPT2, "MemWrite saved\n");
          }
          L3CacheMemory.deallocate(victim);
        }
@@ -1361,17 +1358,14 @@ machine(MachineType:Directory, "AMD Baseline protocol")
          entry.DataBlk := in_msg.DataBlk;
          entry.LastSender := in_msg.Sender;
-          entry.Dirty := true;
+          entry.Dirty := true; // fresh entry, mark it dirty explicitly
        }
-        DPRINTF(RubySlicc, "L2VictimWriteToL3 Happened\n");
-      } else {
-        // do not allocate L3 block
-        // best case: the original L3 from which the victim was originally sourced
-        // has not been evicted, so a reallocation is redundant
-        // worst case: functionally correct, but a missed free caching opportunity
-        DPRINTF(RubySlicc, "L2VictimWriteToL3 DidNotHappen\n");
+        DPRINTF(OPT3, "L2DirtyVictimToL3\n");
+      } else { // reached only when noWBCleanVictims is set and the victim is clean
+        DPRINTF(OPT3, "NoL2CleanVictimToL3\n");
+        // TODO: how to capture the consequential read misses?
+      }
    }
-  }
+    }
  }

  action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
@@ -1392,7 +1386,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        entry.LastSender := tbe.LastSender;
        // this ensures dirty probes are recognized
        // and also carries forward the original dirtyness of the line
-        entry.Dirty := true; // override
+        entry.Dirty := entry.Dirty || tbe.Dirty;
+        // atomic can be just a read, so not necessarily dirty
      } else {
        if (L3CacheMemory.cacheAvail(address) == false) {
          Addr victim := L3CacheMemory.cacheProbe(address);
@@ -1413,12 +1408,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
            out_msg.MessageSize := MessageSizeType:Writeback_Data;
            out_msg.DataBlk := victim_entry.DataBlk;
          }
-          DPRINTF(RubySlicc, "L3VictimWriteBack NotSaved\n");
          // TODO do we need to wait for WBAck?
        } else {
          // else, drop WB because memory and clean L3 are
          // by definition reconciled
-          DPRINTF(RubySlicc, "L3VictimWriteBack Saved\n");
+          DPRINTF(OPT2, "MemWrite saved\n");
        }
        L3CacheMemory.deallocate(victim);
      }
@@ -1427,8 +1421,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
        entry.DataBlk := tbe.DataBlk;
        entry.LastSender := tbe.LastSender;
-        assert(tbe.Dirty == true);
-        entry.Dirty := tbe.Dirty;
+        entry.Dirty := tbe.Dirty; // same reason as in the L3 victimization case above
        /* because the new allocation can now be dirty, the memory writeback can be skipped
         * Next time an L3 line is victimized, it will only be written back iff also dirty
         */
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index 68a1a6a8af..f2b26bf494 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -129,6 +129,9 @@ def __init__(self, symtab, ident, location, pairs, config_parameters):
         self.debug_flags = set()
         self.debug_flags.add("RubyGenerated")
         self.debug_flags.add("RubySlicc")
+        self.debug_flags.add("OPT1")
+        self.debug_flags.add("OPT2")
+        self.debug_flags.add("OPT3")

     def __repr__(self):
         return f"[StateMachine: {self.ident}]"
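
Note: GPU_VIPER.py above reads options.use_L3_on_WT and options.no_WB_clean_victims, but the
registration of those switches is not shown in this diff. A minimal sketch of how they could be
declared, assuming the argparse-based define_options() hook that GPU_VIPER.py already uses for its
other options; the flag spellings and help strings below are illustrative, only the dest names are
taken from the code above:

    def define_options(parser):
        # ... existing VIPER options (the ones read as options.num_tbes,
        # options.WB_L2, etc.) are registered here ...
        parser.add_argument(
            "--use-L3-on-WT",
            dest="use_L3_on_WT",
            action="store_true",
            default=False,
            help="allocate/update the L3 block on write-through at the CPU directory",
        )
        parser.add_argument(
            "--no-WB-clean-victims",
            dest="no_WB_clean_victims",
            action="store_true",
            default=False,
            help="optimization 3 (risky): do not install clean L2 victims in L3",
        )

The new DPRINTFs are guarded by the OPT1/OPT2/OPT3 flags registered in src/mem/SConscript (and
whitelisted for SLICC in StateMachine.py), so they can be enabled at run time with gem5's standard
--debug-flags switch, for example:

    gem5.opt --debug-flags=OPT1,OPT2,OPT3 configs/example/apu_se.py ...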