Skip to content

Commit

Permalink
Add Scan, Read, and Write benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
gonzalobg committed Jun 10, 2024
1 parent 62a5051 commit 0b7fdb3
Show file tree
Hide file tree
Showing 33 changed files with 808 additions and 122 deletions.
13 changes: 8 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ endmacro()
# the final executable name
set(EXE_NAME babelstream)

# for chrono, make_unique, and some basic CXX features, models can overwrite this if required
set(CMAKE_CXX_STANDARD 14)
# for chrono, make_unique, exclusive_scan, and other basic features, models can override if required
set(CMAKE_CXX_STANDARD 17)

if (NOT CMAKE_BUILD_TYPE)
message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
Expand Down Expand Up @@ -177,10 +177,13 @@ else ()
message(STATUS "Selected model : ${MODEL}")
endif ()

if (MODEL STREQUAL "sycl2020")
if (MODEL STREQUAL "sycl2020-acc")
message(FATAL_ERROR "
Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM")
Model sycl2020-acc has been renamed to sycl2020 and may be enabled with the -DSYCL_ACCESS=ACCESSOR cmake option.")
endif ()
if (MODEL STREQUAL "sycl2020-usm")
message(FATAL_ERROR "
Model sycl2020-usm has been renamed to sycl2020 and may be enabled with the -DSYCL_ACCESS=USM cmake option.")
endif ()

# load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
Expand Down
10 changes: 9 additions & 1 deletion src/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@
#include <array>
#include <vector>
#include <string>
#include <type_traits>
#include "benchmark.h"

using std::intptr_t;

// Element type used by the Scan benchmark buffers: an unsigned integer with
// the same width as T (4 bytes -> uint32_t, 8 bytes -> uint64_t), so the
// exclusive prefix sum is exact and bit-reproducible; ill-formed (void) for
// any other width.
template <typename T>
using scan_t = std::conditional_t<sizeof(T) == 4, std::uint32_t,
std::conditional_t<sizeof(T) == 8, std::uint64_t, void>>;

template <class T>
class Stream
{
Expand All @@ -29,9 +34,12 @@ class Stream
virtual void triad() = 0;
virtual void nstream() = 0;
virtual T dot() = 0;
virtual void read() = 0;
virtual void write(T initA) = 0;
virtual void scan() = 0;

// Set pointers to read from arrays
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
virtual void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) = 0;
};

// Implementation specific device functions
Expand Down
62 changes: 61 additions & 1 deletion src/acc/ACCStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// For full license terms please see the LICENSE file distributed with this
// source code

#include <numeric>
#include "ACCStream.h"

template <class T>
Expand All @@ -24,6 +25,11 @@ ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_
T * restrict b = this->b;
T * restrict c = this->c;

if (needs_buffer(bs, 's')) {
s_i = new scan_t<T>[array_size];
s_o = new scan_t<T>[array_size];
}

#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}

Expand All @@ -46,6 +52,11 @@ ACCStream<T>::~ACCStream()
delete[] a;
delete[] b;
delete[] c;

if (s_i) {
delete[] s_i;
delete[] s_o;
}
}

template <class T>
Expand All @@ -62,10 +73,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
b[i] = initB;
c[i] = initC;
}

if (s_i) {
for (intptr_t i = 0; i < array_size; i++)
{
s_i[i] = scan_t<T>(i);
}
}
}

template <class T>
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c, scan_t<T> const*& h_s)
{
T *a = this->a;
T *b = this->b;
Expand All @@ -76,6 +94,10 @@ void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
h_a = a;
h_b = b;
h_c = c;

if (s_o) {
h_s = s_o;
}
}

template <class T>
Expand Down Expand Up @@ -169,6 +191,44 @@ T ACCStream<T>::dot()
return sum;
}

// Read benchmark: stream through `a` performing loads only.
// The comparison against a sentinel value (3.14, which no kernel ever
// stores) creates a control dependency on the loaded value so the compiler
// cannot elide the load; the store branch is effectively dead.
template <class T>
void ACCStream<T>::read()
{
  intptr_t array_size = this->array_size;
  T * restrict a = this->a;
  #pragma acc parallel loop present(a[0:array_size]) wait
  for (intptr_t i = 0; i < array_size; i++)
  {
    T tmp = a[i];
    if (tmp == T(3.14)) {
      a[i] *= 2;  // fixed: stray empty statement (double semicolon) removed
    }
  }
}

// Write benchmark: overwrite every element of `a` with initA (stores only).
template <class T>
void ACCStream<T>::write(T initA)
{
  intptr_t n = this->array_size;
  T * restrict dst = this->a;
  #pragma acc parallel loop present(dst[0:n]) wait
  for (intptr_t i = 0; i < n; i++)
  {
    dst[i] = initA;
  }
}

// Scan benchmark: exclusive prefix sum of s_i into s_o on the host.
// OpenACC has no scan directive, so this model falls back to the sequential
// std::exclusive_scan (C++17, <numeric>).
// NOTE(review): std::runtime_error requires <stdexcept> — presumably pulled
// in transitively via ACCStream.h; confirm it is included.
template <class T>
void ACCStream<T>::scan()
{
  // s_i/s_o are only allocated when the scan benchmark was requested
  // (see needs_buffer(bs, 's') in the constructor).
  if (!s_i) {
    throw std::runtime_error("Trying to run scan but storage not allocated");
  }

  // OpenACC doesn't have scan; run sequentially
  std::exclusive_scan(s_i, s_i + array_size, s_o, scan_t<T>(0));
}

void listDevices(void)
{
// Get number of devices
Expand Down
7 changes: 6 additions & 1 deletion src/acc/ACCStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class ACCStream : public Stream<T>
T* restrict a;
T* restrict b;
T* restrict c;
scan_t<T>* restrict s_i;
scan_t<T>* restrict s_o;

public:
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
Expand All @@ -37,7 +39,10 @@ class ACCStream : public Stream<T>
void triad() override;
void nstream() override;
T dot() override;
void read() override;
void write(T initA) override;
void scan() override;

void get_arrays(T const*& a, T const*& b, T const*& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) override;
void init_arrays(T initA, T initB, T initC);
};
20 changes: 13 additions & 7 deletions src/benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
// - All: all kernels.
// - Individual kernels only.
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
enum class BenchId : int {Write, Copy, Mul, Add, Triad, Dot, Nstream, Scan, Read, Classic, All};

struct Benchmark {
BenchId id;
Expand All @@ -28,14 +28,17 @@ struct Benchmark {
};

// Benchmarks in the order in which - if present - should be run for validation purposes:
constexpr size_t num_benchmarks = 6;
constexpr std::array<Benchmark, num_benchmarks> bench = {
constexpr size_t num_benchmarks = 9;
inline constexpr std::array<Benchmark, num_benchmarks> bench = {
Benchmark { .id = BenchId::Write, .label = "Write", .weight = 1, .classic = false },
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false },
Benchmark { .id = BenchId::Scan, .label = "Scan", .weight = 2, .classic = false },
Benchmark { .id = BenchId::Read, .label = "Read", .weight = 1, .classic = false }
};

// Which buffers are needed by each benchmark
Expand All @@ -44,14 +47,17 @@ inline bool needs_buffer(BenchId id, char n) {
return std::find(values.begin(), values.end(), n) != values.end();
};
switch(id) {
case BenchId::All: return in({'a','b','c'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::All: return in({'a','b','c', 's'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::Copy: return in({'a','c'});
case BenchId::Mul: return in({'b','c'});
case BenchId::Add: return in({'a','b','c'});
case BenchId::Triad: return in({'a','b','c'});
case BenchId::Dot: return in({'a','b'});
case BenchId::Nstream: return in({'a','b','c'});
case BenchId::Nstream: return in({'a','b','c'});
case BenchId::Read: return in({'a'});
case BenchId::Write: return in({'a'});
case BenchId::Scan: return in({'s'});
default:
std::cerr << "Unknown benchmark" << std::endl;
abort();
Expand Down
61 changes: 58 additions & 3 deletions src/cuda/CUDAStream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include "CUDAStream.h"
#include <nvml.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#if !defined(UNROLL_FACTOR)
#define UNROLL_FACTOR 4
Expand Down Expand Up @@ -128,7 +130,8 @@ CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int devic
// Size of partial sums for dot kernels
size_t sums_bytes = sizeof(T) * dot_num_blocks;
size_t array_bytes = sizeof(T) * array_size;
size_t total_bytes = array_bytes * size_t(3) + sums_bytes;
size_t scan_bytes = needs_buffer(bs, 's')? size_t(2) * array_size * sizeof(scan_t<T>) : 0;
size_t total_bytes = array_bytes * size_t(3) + scan_bytes + sums_bytes;
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;

// Check buffers fit on the device
Expand All @@ -144,6 +147,11 @@ CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int devic
d_c = alloc_device<T>(array_size);
sums = alloc_host<T>(dot_num_blocks);

if (needs_buffer(bs, 's')) {
d_si = alloc_device<scan_t<T>>(array_size);
d_so = alloc_device<scan_t<T>>(array_size);
}

// Initialize buffers:
init_arrays(initA, initB, initC);
}
Expand All @@ -156,6 +164,10 @@ CUDAStream<T>::~CUDAStream()
free_device(d_b);
free_device(d_c);
free_host(sums);
if (d_si) {
free_device(d_si);
free_device(d_so);
}
}

template <typename F>
Expand Down Expand Up @@ -203,22 +215,26 @@ void for_each(size_t array_size, F f) {
template <class T>
void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
{
for_each(array_size, [=,a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
for_each(array_size, [=,a=d_a,b=d_b,c=d_c,s=d_si] __device__ (size_t i) {
a[i] = initA;
b[i] = initB;
c[i] = initC;
if (s) {
s[i] = static_cast<scan_t<T>>(i);
}
});
}

template <class T>
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s)
{
CU(cudaStreamSynchronize(stream));
#if defined(PAGEFAULT) || defined(MANAGED)
// Unified memory: return pointers to device memory
a = d_a;
b = d_b;
c = d_c;
s = d_so;
#else
// No Unified memory: copy data to the host
size_t nbytes = array_size * sizeof(T);
Expand All @@ -231,7 +247,14 @@ void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
if (d_so) {
size_t nbytes = array_size * sizeof(scan_t<T>);
h_s.resize(array_size);
s = h_s.data();
CU(cudaMemcpy(h_s.data(), d_so, nbytes, cudaMemcpyDeviceToHost));
}
#endif
CU(cudaStreamSynchronize(stream));
}

template <class T>
Expand Down Expand Up @@ -308,6 +331,38 @@ T CUDAStream<T>::dot()
return sum;
}

// Scan benchmark: device-side exclusive prefix sum of d_si into d_so,
// run with Thrust on this object's CUDA stream, then synchronized so the
// timed region covers the whole operation.
template <class T>
void CUDAStream<T>::scan()
{
  if (d_so == nullptr) {
    std::cerr << "Trying to run scan but storage not allocated" << std::endl;
    std::terminate();
  }

  auto policy = thrust::cuda::par.on(stream);
  scan_t<T>* first = d_si;
  scan_t<T>* last  = d_si + array_size;
  thrust::exclusive_scan(policy, first, last, d_so);

  CU(cudaPeekAtLastError());
  CU(cudaStreamSynchronize(stream));
}

// Read benchmark: loads every element of d_a; comparing against a sentinel
// no kernel ever stores keeps the load from being optimized away.
template <class T>
void CUDAStream<T>::read()
{
  T* a = d_a;
  for_each(array_size, [a] __device__ (size_t i) {
    // Control-dependency on loading a[i]: never true, but checking it requires loading value:
    if (a[i] == T(3.14)) {
      a[i] *= 2;
    }
  });
}

// Write benchmark: stores initA into every element of d_a (no loads).
template <class T>
void CUDAStream<T>::write(T initA)
{
  for_each(array_size, [dst = d_a, v = initA] __device__ (size_t i) {
    dst[i] = v;
  });
}

void listDevices(void)
{
// Get number of devices
Expand Down
9 changes: 8 additions & 1 deletion src/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ class CUDAStream : public Stream<T>

// If UVM is disabled, host arrays for verification purposes
std::vector<T> h_a, h_b, h_c;
std::vector<scan_t<T>> h_s;

// Allocate memory for scan
scan_t<T> *d_si, *d_so;

// Number of blocks for dot kernel
intptr_t dot_num_blocks;
Expand All @@ -50,7 +54,10 @@ class CUDAStream : public Stream<T>
void triad() override;
void nstream() override;
T dot() override;
void read() override;
void write(T initA) override;
void scan() override;

void get_arrays(T const*& a, T const*& b, T const*& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) override;
void init_arrays(T initA, T initB, T initC);
};
Loading

0 comments on commit 0b7fdb3

Please sign in to comment.