From 5b4afdae3a9023b5fec48b121dc32aa329e8a734 Mon Sep 17 00:00:00 2001
From: Nick <CoffeeBeforeArch@gmail.com>
Date: Sun, 5 Apr 2020 15:14:34 -0700
Subject: [PATCH 1/2] Remove runtime aerialvision overhead w/ preprocessor

---
 setup_environment          | 15 ++++++++++++++-
 src/Makefile               |  4 ++--
 src/cuda-sim/Makefile      |  7 +++----
 src/gpgpu-sim/Makefile     |  8 +++-----
 src/gpgpu-sim/gpu-cache.cc | 18 ++++++++++++++++++
 src/gpgpu-sim/gpu-sim.cc   |  2 ++
 src/gpgpu-sim/l2cache.cc   |  7 ++++++-
 7 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/setup_environment b/setup_environment
index ca60d6bd9..6b1d27009 100644
--- a/setup_environment
+++ b/setup_environment
@@ -62,7 +62,20 @@ fi
 
 if [ $# = '1' ] ;
 then
-    export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1
+    if [ $1 = 'debug' ] ;
+    then
+      export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1
+    elif [ $1 = 'av_enabled' ] ;
+    then
+        export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release
+        export AV_ENABLED=1
+    elif [ $1 = 'release' ] ;
+    then
+        export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release
+    else
+        echo "ERROR - BAD SETUP VARIABLE"
+        return;
+    fi
 else
     export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release
 fi
diff --git a/src/Makefile b/src/Makefile
index 3ad511e20..85a599421 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -34,7 +34,7 @@ TRACE?=1
 
 include ../version_detection.mk
 
-CXXFLAGS = -Wall -DDEBUG
+CXXFLAGS = -Wall
 CXXFLAGS += -DCUDART_VERSION=$(CUDART_VERSION)
 
 ifeq ($(GNUC_CPP0X), 1)
@@ -48,7 +48,7 @@ endif
 ifneq ($(DEBUG),1)
 	OPTFLAGS += -O3
 else
-	CXXFLAGS += 
+	CXXFLAGS += -DDEBUG
 endif
 
 CXXFLAGS += -I$(CUDA_INSTALL_PATH)/include
diff --git a/src/cuda-sim/Makefile b/src/cuda-sim/Makefile
index 85d1c8c01..fb430ee1d 100644
--- a/src/cuda-sim/Makefile
+++ b/src/cuda-sim/Makefile
@@ -42,12 +42,11 @@ include ../../version_detection.mk
 
 OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/cuda-sim
 
-OPT	:=  -O3 -g3 -Wall -Wno-unused-function -Wno-sign-compare
-ifeq ($(DEBUG),1)
-	OPT := -g3 -Wall  -Wno-unused-function -Wno-sign-compare
+OPT	:= -g3 -Wall -Wno-unused-function -Wno-sign-compare -fPIC
+ifneq ($(DEBUG),1)
+	OPT += -O3
 endif
 OPT += -I$(CUDA_INSTALL_PATH)/include  -I$(OUTPUT_DIR)/ -I. -I$(SIM_OBJ_FILES_DIR)
-OPT += -fPIC 
 
 ifeq ($(TRACE),1)
 	OPT += -DTRACING_ON=1
diff --git a/src/gpgpu-sim/Makefile b/src/gpgpu-sim/Makefile
index 4994577cd..cc6e371b2 100644
--- a/src/gpgpu-sim/Makefile
+++ b/src/gpgpu-sim/Makefile
@@ -34,7 +34,7 @@ TRACE?=0
 ifeq ($(DEBUG),1)
 	CXXFLAGS = -Wall -DDEBUG
 else
-	CXXFLAGS = -Wall
+	CXXFLAGS = -Wall -O3
 endif
 
 ifeq ($(TRACE),1)
@@ -47,10 +47,8 @@ ifeq ($(GNUC_CPP0X), 1)
     CXXFLAGS += -std=c++0x
 endif
 
-ifneq ($(DEBUG),1)
-	OPTFLAGS += -O3
-else
-	CXXFLAGS += 
+ifneq ($(AV_ENABLED),1)
+	CXXFLAGS += -DAV_ENABLED
 endif
 
 CXXFLAGS += -I$(CUDA_INSTALL_PATH)/include
diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc
index af22c4c2c..e8a346124 100644
--- a/src/gpgpu-sim/gpu-cache.cc
+++ b/src/gpgpu-sim/gpu-cache.cc
@@ -605,11 +605,15 @@ void mshr_table::display(FILE *fp) const {
  * *****************************************************************/
 cache_stats::cache_stats() {
   m_stats.resize(NUM_MEM_ACCESS_TYPE);
+#ifdef AV_ENABLED
   m_stats_pw.resize(NUM_MEM_ACCESS_TYPE);
+#endif
   m_fail_stats.resize(NUM_MEM_ACCESS_TYPE);
   for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) {
     m_stats[i].resize(NUM_CACHE_REQUEST_STATUS, 0);
+#ifdef AV_ENABLED
     m_stats_pw[i].resize(NUM_CACHE_REQUEST_STATUS, 0);
+#endif
     m_fail_stats[i].resize(NUM_CACHE_RESERVATION_FAIL_STATUS, 0);
   }
   m_cache_port_available_cycles = 0;
@@ -630,6 +634,7 @@ void cache_stats::clear() {
   m_cache_fill_port_busy_cycles = 0;
 }
 
+#ifdef AV_ENABLED
 void cache_stats::clear_pw() {
   ///
   /// Zero out per-window cache statistics
@@ -638,6 +643,7 @@ void cache_stats::clear_pw() {
     std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0);
   }
 }
+#endif
 
 void cache_stats::inc_stats(int access_type, int access_outcome) {
   ///
@@ -649,6 +655,7 @@ void cache_stats::inc_stats(int access_type, int access_outcome) {
   m_stats[access_type][access_outcome]++;
 }
 
+#ifdef AV_ENABLED
 void cache_stats::inc_stats_pw(int access_type, int access_outcome) {
   ///
   /// Increment the corresponding per-window cache stat
@@ -657,6 +664,7 @@ void cache_stats::inc_stats_pw(int access_type, int access_outcome) {
     assert(0 && "Unknown cache access type or access outcome");
   m_stats_pw[access_type][access_outcome]++;
 }
+#endif
 
 void cache_stats::inc_fail_stats(int access_type, int fail_outcome) {
   if (!check_fail_valid(access_type, fail_outcome))
@@ -751,9 +759,11 @@ cache_stats &cache_stats::operator+=(const cache_stats &cs) {
     for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) {
       m_stats[type][status] += cs(type, status, false);
     }
+#ifdef AV_ENABLED
     for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) {
       m_stats_pw[type][status] += cs(type, status, false);
     }
+#endif
     for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS;
          ++status) {
       m_fail_stats[type][status] += cs(type, status, true);
@@ -872,6 +882,7 @@ void cache_stats::get_sub_stats(struct cache_sub_stats &css) const {
   css = t_css;
 }
 
+#ifdef AV_ENABLED
 void cache_stats::get_sub_stats_pw(struct cache_sub_stats_pw &css) const {
   ///
   /// Overwrites "css" with the appropriate statistics from this cache.
@@ -921,6 +932,7 @@ void cache_stats::get_sub_stats_pw(struct cache_sub_stats_pw &css) const {
 
   css = t_css;
 }
+#endif
 
 bool cache_stats::check_valid(int type, int status) const {
   ///
@@ -1588,8 +1600,10 @@ enum cache_request_status read_only_cache::access(
 
   m_stats.inc_stats(mf->get_access_type(),
                     m_stats.select_stats_status(status, cache_status));
+#ifdef AV_ENABLED
   m_stats.inc_stats_pw(mf->get_access_type(),
                        m_stats.select_stats_status(status, cache_status));
+#endif
   return cache_status;
 }
 
@@ -1655,8 +1669,10 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf,
       process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events);
   m_stats.inc_stats(mf->get_access_type(),
                     m_stats.select_stats_status(probe_status, access_status));
+#ifdef AV_ENABLED
   m_stats.inc_stats_pw(mf->get_access_type(), m_stats.select_stats_status(
                                                   probe_status, access_status));
+#endif
   return access_status;
 }
 
@@ -1719,8 +1735,10 @@ enum cache_request_status tex_cache::access(new_addr_type addr, mem_fetch *mf,
   }
   m_stats.inc_stats(mf->get_access_type(),
                     m_stats.select_stats_status(status, cache_status));
+#ifdef AV_ENABLED
   m_stats.inc_stats_pw(mf->get_access_type(),
                        m_stats.select_stats_status(status, cache_status));
+#endif
   return cache_status;
 }
 
diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index a6a39ab0a..97fbbf527 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -960,11 +960,13 @@ void gpgpu_sim::init() {
     m_cluster[i]->reinit();
   m_shader_stats->new_grid();
   // initialize the control-flow, memory access, memory latency logger
+#ifdef AV_ENABLED
   if (m_config.g_visualizer_enabled) {
     create_thread_CFlogger(gpgpu_ctx, m_config.num_shader(),
                            m_shader_config->n_thread_per_shader, 0,
                            m_config.gpgpu_cflog_interval);
   }
+#endif
   shader_CTA_count_create(m_config.num_shader(), m_config.gpgpu_cflog_interval);
   if (m_config.gpgpu_cflog_interval != 0) {
     insn_warp_occ_create(m_config.num_shader(), m_shader_config->warp_size);
diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc
index ab6e5c228..5bfe6b0e1 100644
--- a/src/gpgpu-sim/l2cache.cc
+++ b/src/gpgpu-sim/l2cache.cc
@@ -624,19 +624,22 @@ void memory_stats_t::visualizer_print(gzFile visualizer_file) {
   gzprintf(visualizer_file, "Ltwowritehit: %d\n", L2_write_hit);
   gzprintf(visualizer_file, "Ltworeadmiss: %d\n", L2_read_miss);
   gzprintf(visualizer_file, "Ltworeadhit: %d\n", L2_read_hit);
+#ifdef AV_EANBLED
   clear_L2_stats_pw();
-
+#endif
   if (num_mfs)
     gzprintf(visualizer_file, "averagemflatency: %lld\n",
              mf_total_lat / num_mfs);
 }
 
+#ifdef AV_ENABLED
 void memory_stats_t::clear_L2_stats_pw() {
   L2_write_miss = 0;
   L2_write_hit = 0;
   L2_read_miss = 0;
   L2_read_hit = 0;
 }
+#endif
 
 void gpgpu_sim::print_dram_stats(FILE *fout) const {
   unsigned cmd = 0;
@@ -832,6 +835,7 @@ void memory_sub_partition::get_L2cache_sub_stats(
   }
 }
 
+#ifdef AV_ENABLED
 void memory_sub_partition::get_L2cache_sub_stats_pw(
     struct cache_sub_stats_pw &css) const {
   if (!m_config->m_L2_config.disabled()) {
@@ -858,3 +862,4 @@ void memory_sub_partition::visualizer_print(gzFile visualizer_file) {
 
   clear_L2cache_stats_pw();
 }
+#endif

From 97550da53f6554230358375cc8045d11736725b4 Mon Sep 17 00:00:00 2001
From: Nick <CoffeeBeforeArch@gmail.com>
Date: Tue, 7 Apr 2020 21:03:35 -0700
Subject: [PATCH 2/2] Fix AV_ENABLED preprocessor flags (define as 0/1, test with #if)

---
 setup_environment          |  1 +
 src/gpgpu-sim/Makefile     |  6 ++++--
 src/gpgpu-sim/gpu-cache.cc | 18 +++++++++---------
 src/gpgpu-sim/gpu-sim.cc   |  2 +-
 src/gpgpu-sim/l2cache.cc   |  6 +++---
 src/gpgpu-sim/shader.cc    |  2 ++
 6 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/setup_environment b/setup_environment
index 02ba50aa9..88b2f25ee 100644
--- a/setup_environment
+++ b/setup_environment
@@ -61,6 +61,7 @@ if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then
 fi
 
 
+export AV_ENABLED=0
 if [ $# = '1' ] ;
 then
     if [ $1 = 'debug' ] ;
diff --git a/src/gpgpu-sim/Makefile b/src/gpgpu-sim/Makefile
index cc6e371b2..669e0aba0 100644
--- a/src/gpgpu-sim/Makefile
+++ b/src/gpgpu-sim/Makefile
@@ -47,8 +47,10 @@ ifeq ($(GNUC_CPP0X), 1)
     CXXFLAGS += -std=c++0x
 endif
 
-ifneq ($(AV_ENABLED),1)
-	CXXFLAGS += -DAV_ENABLED
+ifeq ($(AV_ENABLED),1)
+	CXXFLAGS += -DAV_ENABLED=1
+else
+	CXXFLAGS += -DAV_ENABLED=0
 endif
 
 CXXFLAGS += -I$(CUDA_INSTALL_PATH)/include
diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc
index e8a346124..0f46e8189 100644
--- a/src/gpgpu-sim/gpu-cache.cc
+++ b/src/gpgpu-sim/gpu-cache.cc
@@ -605,13 +605,13 @@ void mshr_table::display(FILE *fp) const {
  * *****************************************************************/
 cache_stats::cache_stats() {
   m_stats.resize(NUM_MEM_ACCESS_TYPE);
-#ifdef AV_ENABLED
+#if AV_ENABLED
   m_stats_pw.resize(NUM_MEM_ACCESS_TYPE);
 #endif
   m_fail_stats.resize(NUM_MEM_ACCESS_TYPE);
   for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) {
     m_stats[i].resize(NUM_CACHE_REQUEST_STATUS, 0);
-#ifdef AV_ENABLED
+#if AV_ENABLED
     m_stats_pw[i].resize(NUM_CACHE_REQUEST_STATUS, 0);
 #endif
     m_fail_stats[i].resize(NUM_CACHE_RESERVATION_FAIL_STATUS, 0);
@@ -634,7 +634,7 @@ void cache_stats::clear() {
   m_cache_fill_port_busy_cycles = 0;
 }
 
-#ifdef AV_ENABLED
+#if AV_ENABLED
 void cache_stats::clear_pw() {
   ///
   /// Zero out per-window cache statistics
@@ -655,7 +655,7 @@ void cache_stats::inc_stats(int access_type, int access_outcome) {
   m_stats[access_type][access_outcome]++;
 }
 
-#ifdef AV_ENABLED
+#if AV_ENABLED
 void cache_stats::inc_stats_pw(int access_type, int access_outcome) {
   ///
   /// Increment the corresponding per-window cache stat
@@ -759,7 +759,7 @@ cache_stats &cache_stats::operator+=(const cache_stats &cs) {
     for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) {
       m_stats[type][status] += cs(type, status, false);
     }
-#ifdef AV_ENABLED
+#if AV_ENABLED
     for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) {
       m_stats_pw[type][status] += cs(type, status, false);
     }
@@ -882,7 +882,7 @@ void cache_stats::get_sub_stats(struct cache_sub_stats &css) const {
   css = t_css;
 }
 
-#ifdef AV_ENABLED
+#if AV_ENABLED
 void cache_stats::get_sub_stats_pw(struct cache_sub_stats_pw &css) const {
   ///
   /// Overwrites "css" with the appropriate statistics from this cache.
@@ -1600,7 +1600,7 @@ enum cache_request_status read_only_cache::access(
 
   m_stats.inc_stats(mf->get_access_type(),
                     m_stats.select_stats_status(status, cache_status));
-#ifdef AV_ENABLED
+#if AV_ENABLED
   m_stats.inc_stats_pw(mf->get_access_type(),
                        m_stats.select_stats_status(status, cache_status));
 #endif
@@ -1669,7 +1669,7 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf,
       process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events);
   m_stats.inc_stats(mf->get_access_type(),
                     m_stats.select_stats_status(probe_status, access_status));
-#ifdef AV_ENABLED
+#if AV_ENABLED
   m_stats.inc_stats_pw(mf->get_access_type(), m_stats.select_stats_status(
                                                   probe_status, access_status));
 #endif
@@ -1735,7 +1735,7 @@ enum cache_request_status tex_cache::access(new_addr_type addr, mem_fetch *mf,
   }
   m_stats.inc_stats(mf->get_access_type(),
                     m_stats.select_stats_status(status, cache_status));
-#ifdef AV_ENABLED
+#if AV_ENABLED
   m_stats.inc_stats_pw(mf->get_access_type(),
                        m_stats.select_stats_status(status, cache_status));
 #endif
diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 97fbbf527..9b207d844 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -960,7 +960,7 @@ void gpgpu_sim::init() {
     m_cluster[i]->reinit();
   m_shader_stats->new_grid();
   // initialize the control-flow, memory access, memory latency logger
-#ifdef AV_ENABLED
+#if AV_ENABLED
   if (m_config.g_visualizer_enabled) {
     create_thread_CFlogger(gpgpu_ctx, m_config.num_shader(),
                            m_shader_config->n_thread_per_shader, 0,
diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc
index 5bfe6b0e1..7cebb04e3 100644
--- a/src/gpgpu-sim/l2cache.cc
+++ b/src/gpgpu-sim/l2cache.cc
@@ -624,7 +624,7 @@ void memory_stats_t::visualizer_print(gzFile visualizer_file) {
   gzprintf(visualizer_file, "Ltwowritehit: %d\n", L2_write_hit);
   gzprintf(visualizer_file, "Ltworeadmiss: %d\n", L2_read_miss);
   gzprintf(visualizer_file, "Ltworeadhit: %d\n", L2_read_hit);
-#ifdef AV_EANBLED
+#if AV_ENABLED  // reset the per-window L2 counters only in AerialVision builds
   clear_L2_stats_pw();
 #endif
   if (num_mfs)
@@ -632,7 +632,7 @@ void memory_stats_t::visualizer_print(gzFile visualizer_file) {
              mf_total_lat / num_mfs);
 }
 
-#ifdef AV_ENABLED
+#if AV_ENABLED
 void memory_stats_t::clear_L2_stats_pw() {
   L2_write_miss = 0;
   L2_write_hit = 0;
@@ -835,7 +835,7 @@ void memory_sub_partition::get_L2cache_sub_stats(
   }
 }
 
-#ifdef AV_ENABLED
+#if AV_ENABLED
 void memory_sub_partition::get_L2cache_sub_stats_pw(
     struct cache_sub_stats_pw &css) const {
   if (!m_config->m_L2_config.disabled()) {
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
index b596c0daa..c18c4e3d1 100644
--- a/src/gpgpu-sim/shader.cc
+++ b/src/gpgpu-sim/shader.cc
@@ -4258,6 +4258,7 @@ void shader_core_ctx::checkExecutionStatusAndUpdate(warp_inst_t &inst,
   }
 
   // PC-Histogram Update
+#if AV_ENABLED
   unsigned warp_id = inst.warp_id();
   unsigned pc = inst.pc;
   for (unsigned t = 0; t < m_config->warp_size; t++) {
@@ -4266,4 +4267,5 @@ void shader_core_ctx::checkExecutionStatusAndUpdate(warp_inst_t &inst,
       cflog_update_thread_pc(m_sid, tid, pc);
     }
   }
+#endif
 }