Move host-side allocation to benchmarks and reuse device with UVM
This commit puts benchmarks in control of allocating the host
memory used for verifying the results.

This enables benchmarks that use Unified Memory for their device
allocations to skip the separate host-side allocation and instead pass
pointers to the device allocation straight to the benchmark driver.

Closes UoB-HPC#128.
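For illustration (a sketch, not code from this commit): a benchmark driver consuming the new interface can verify results without allocating anything itself. The verify function below is hypothetical; only get_arrays comes from the diff.

// Hypothetical driver-side sketch: the Stream implementation owns the memory
// behind these pointers (device memory under Unified Memory, or a host
// staging buffer it manages otherwise), so the driver allocates nothing.
template <class T>
bool verify(Stream<T>& stream, T expA, T expB, T expC, intptr_t n) {
  T const *a = nullptr, *b = nullptr, *c = nullptr;
  stream.get_arrays(a, b, c);  // borrow read-only pointers
  for (intptr_t i = 0; i < n; ++i)
    if (a[i] != expA || b[i] != expB || c[i] != expC) return false;
  return true;
}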
gonzalobg committed Jun 3, 2024
1 parent 321ba62 commit d5b776c
Showing 43 changed files with 578 additions and 509 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
endif ()

option(BUILD_NATIVE "Builds for the current system's CPU and GPU architecture." ON)

# set up some default flags for everything
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
set(DEFAULT_RELEASE_FLAGS -O3)
if (BUILD_NATIVE)
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
endif()

macro(hint_flag FLAG DESCRIPTION)
if (NOT DEFINED ${FLAG})
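With this option in place, architecture-specific tuning can be disabled at configure time (cmake -DBUILD_NATIVE=OFF ...), which matters when the resulting binary must run on machines other than the build host.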
13 changes: 4 additions & 9 deletions src/Stream.h
@@ -7,14 +7,10 @@

#pragma once

#include <array>
#include <vector>
#include <string>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)
#include "benchmark.h"

template <class T>
class Stream
@@ -31,9 +31,8 @@ class Stream
virtual void nstream() = 0;
virtual T dot() = 0;

// Copy memory between host and device
virtual void init_arrays(T initA, T initB, T initC) = 0;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
// Set pointers to read from arrays
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
};

// Implementation specific device functions
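A note on the revised interface (implied by the diff rather than stated): get_arrays hands out borrowed, read-only pointers. The Stream implementation retains ownership of the underlying memory, so callers must not free these pointers and should treat them as valid only until the stream is re-initialized or destroyed.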
34 changes: 17 additions & 17 deletions src/StreamModels.h
@@ -35,67 +35,67 @@
#include "FutharkStream.h"
#endif

template <typename T>
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
template <typename T, typename...Args>
std::unique_ptr<Stream<T>> make_stream(Args... args) {
#if defined(CUDA)
// Use the CUDA implementation
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
return std::make_unique<CUDAStream<T>>(args...);

#elif defined(HIP)
// Use the HIP implementation
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
return std::make_unique<HIPStream<T>>(args...);

#elif defined(HC)
// Use the HC implementation
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
return std::make_unique<HCStream<T>>(args...);

#elif defined(OCL)
// Use the OpenCL implementation
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
return std::make_unique<OCLStream<T>>(args...);

#elif defined(USE_RAJA)
// Use the RAJA implementation
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
return std::make_unique<RAJAStream<T>>(args...);

#elif defined(KOKKOS)
// Use the Kokkos implementation
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
return std::make_unique<KokkosStream<T>>(args...);

#elif defined(STD_DATA)
// Use the C++ STD data-oriented implementation
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
return std::make_unique<STDDataStream<T>>(args...);

#elif defined(STD_INDICES)
// Use the C++ STD index-oriented implementation
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDIndicesStream<T>>(args...);

#elif defined(STD_RANGES)
// Use the C++ STD ranges implementation
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDRangesStream<T>>(args...);

#elif defined(TBB)
// Use the TBB implementation
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
return std::make_unique<TBBStream<T>>(args...);

#elif defined(THRUST)
// Use the Thrust implementation
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
return std::make_unique<ThrustStream<T>>(args...);

#elif defined(ACC)
// Use the OpenACC implementation
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
return std::make_unique<ACCStream<T>>(args...);

#elif defined(SYCL) || defined(SYCL2020)
// Use the SYCL implementation
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
return std::make_unique<SYCLStream<T>>(args...);

#elif defined(OMP)
// Use the OpenMP implementation
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
return std::make_unique<OMPStream<T>>(args...);

#elif defined(FUTHARK)
// Use the Futhark implementation
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
return std::make_unique<FutharkStream<T>>(args...);

#else

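With the new constructor signatures introduced in this commit, a call to the variadic factory might look as follows; array_size and device_index are placeholders, while startA/startB/startC are the defaults from benchmark.h.

// The factory no longer hard-codes (array_size, deviceIndex); it forwards
// whatever argument list the backend selected at compile time expects.
auto stream = make_stream<double>(BenchId::All, array_size, device_index,
                                  startA, startB, startC);

Note that the arguments are taken by value (Args... rather than Args&&... with std::forward), which is adequate for the scalar arguments passed here.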
20 changes: 10 additions & 10 deletions src/acc/ACCStream.cpp
@@ -8,11 +8,12 @@
#include "ACCStream.h"

template <class T>
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC)
: array_size{array_size}
{
acc_device_t device_type = acc_get_device_type();
acc_set_device_num(device, device_type);
acc_set_device_num(device_id, device_type);

// Set up data region on device
this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)

#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}

init_arrays(initA, initB, initC);
}

template <class T>
@@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
}

template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}

for (intptr_t i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
h_a = a;
h_b = b;
h_c = c;
}

template <class T>
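The rewrite of get_arrays above works because ACCStream allocates its arrays on the host (new T[array_size]) and mirrors them on the device through the OpenACC data region: a single update host directive refreshes the host copies, making the old element-by-element copy into caller-provided vectors unnecessary.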
33 changes: 13 additions & 20 deletions src/acc/ACCStream.h
@@ -19,32 +19,25 @@
template <class T>
class ACCStream : public Stream<T>
{
struct A{
T *a;
T *b;
T *c;
};

protected:
// Size of arrays
intptr_t array_size;
A aa;
// Device side pointers
T *a;
T *b;
T *c;
T* restrict a;
T* restrict b;
T* restrict c;

public:
ACCStream(const intptr_t, int);
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC);
~ACCStream();

virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
void copy() override;
void add() override;
void mul() override;
void triad() override;
void nstream() override;
T dot() override;

virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c) override;
void init_arrays(T initA, T initB, T initC);
};
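One detail worth flagging: restrict is not a standard C++ keyword. It is accepted as an extension by the compilers this OpenACC backend targets (e.g. NVHPC), where it lets the optimizer assume the three arrays do not alias.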
66 changes: 66 additions & 0 deletions src/benchmark.h
@@ -0,0 +1,66 @@
#pragma once

#include <algorithm>
#include <array>
#include <cstdlib>
#include <initializer_list>
#include <iostream>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)

// Benchmark identifier: identifies a single benchmark or a group of benchmarks:
// - Classic: the 5 classic kernels: Copy, Mul, Add, Triad, Dot.
// - All: all kernels.
// - Otherwise: the named individual kernel.
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};

struct Benchmark {
BenchId id;
char const* label;
// Weight counts the array elements moved per loop iteration - used to calculate achieved bandwidth:
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
size_t weight;
// Is it one of: Copy, Mul, Add, Triad, Dot?
bool classic = false;
};

// Benchmarks listed in the order in which, if present, they should be run for validation purposes:
constexpr size_t num_benchmarks = 6;
constexpr std::array<Benchmark, num_benchmarks> bench = {
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
};

// Which buffers are needed by each benchmark
inline bool needs_buffer(BenchId id, char n) {
auto in = [n](std::initializer_list<char> values) {
return std::find(values.begin(), values.end(), n) != values.end();
};
switch(id) {
case BenchId::All: return in({'a','b','c'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::Copy: return in({'a','c'});
case BenchId::Mul: return in({'b','c'});
case BenchId::Add: return in({'a','b','c'});
case BenchId::Triad: return in({'a','b','c'});
case BenchId::Dot: return in({'a','b'});
case BenchId::Nstream: return in({'a','b','c'});
default:
std::cerr << "Unknown benchmark" << std::endl;
std::abort();
}
}

// Returns true if the benchmark needs to be run:
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
if (selection == BenchId::All) return true;
if (selection == BenchId::Classic && b.classic) return true;
return selection == b.id;
}
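As a worked example of the weight model (the numbers are mine, not from the commit): Triad reads b and c and writes a, hence weight 3, so on 2^25 doubles it moves 3 * 8 * 33554432 = 805306368 bytes per iteration; at 10 ms per iteration that is roughly 80.5 GB/s.

// Achieved bandwidth from the weight model: bytes = weight * sizeof(T) * ARRAY_SIZE,
// bw = bytes / dur. Returns GB/s given a kernel duration in seconds.
inline double achieved_gbs(size_t weight, size_t elem_size,
                           size_t array_size, double seconds) {
  return double(weight) * elem_size * array_size / seconds * 1e-9;
}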
40 changes: 26 additions & 14 deletions src/cuda/CUDAStream.cu
@@ -77,7 +77,8 @@ void free_host(T* p) {
}

template <class T>
CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index,
T initA, T initB, T initC)
: array_size(array_size)
{
// Set device
@@ -131,14 +132,20 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;

// Check buffers fit on the device
if (dprop.totalGlobalMem < total_bytes)
if (dprop.totalGlobalMem < total_bytes) {
std::cerr << "Requested array size of " << total_bytes * 1e-9
<< " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl;
throw std::runtime_error("Device does not have enough memory for all buffers");
}

// Allocate buffers:
d_a = alloc_device<T>(array_size);
d_b = alloc_device<T>(array_size);
d_c = alloc_device<T>(array_size);
sums = alloc_host<T>(dot_num_blocks);

// Initialize buffers:
init_arrays(initA, initB, initC);
}

template <class T>
@@ -204,21 +211,26 @@ void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
}

template <class T>
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
{
// Copy device memory to host
#if defined(PAGEFAULT) || defined(MANAGED)
CU(cudaStreamSynchronize(stream));
for (intptr_t i = 0; i < array_size; ++i)
{
a[i] = d_a[i];
b[i] = d_b[i];
c[i] = d_c[i];
}
#if defined(PAGEFAULT) || defined(MANAGED)
// Unified memory: return pointers to device memory
a = d_a;
b = d_b;
c = d_c;
#else
CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost));
CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost));
CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost));
// No Unified memory: copy data to the host
size_t nbytes = array_size * sizeof(T);
h_a.resize(array_size);
h_b.resize(array_size);
h_c.resize(array_size);
a = h_a.data();
b = h_b.data();
c = h_c.data();
CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
#endif
}

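The allocation helpers themselves fall outside this excerpt (only free_host appears above), but given the PAGEFAULT/MANAGED branches in get_arrays, a plausible sketch of alloc_device is the following. This is an assumption about the commit's implementation, not a quotation of it.

// Sketch only: the real alloc_device is not shown in this excerpt. The branch
// structure mirrors the PAGEFAULT/MANAGED macros used in get_arrays above.
template <typename T>
T* alloc_device(const intptr_t size) {
  size_t bytes = size * sizeof(T);
  T* p = nullptr;
#if defined(PAGEFAULT)
  p = (T*)std::malloc(bytes);          // plain pageable host memory
#elif defined(MANAGED)
  CU(cudaMallocManaged(&p, bytes));    // CUDA Unified Memory
#else
  CU(cudaMalloc(&p, bytes));           // ordinary device memory
#endif
  return p;
}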