diff --git a/CMakeLists.txt b/CMakeLists.txt
index c98cab48..0df3836f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
     message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
 endif ()
 
+option(BUILD_NATIVE "Builds for the current system's CPU and GPU architecture." ON)
+
 # setup some defaults flags for everything
 set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
-set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
+set(DEFAULT_RELEASE_FLAGS -O3)
+if (BUILD_NATIVE)
+    set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
+endif()
 
 macro(hint_flag FLAG DESCRIPTION)
     if (NOT DEFINED ${FLAG})
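Note: `-march=native` is now opt-out rather than hard-wired. The default `ON` preserves the old tuned behaviour, while configuring with `-DBUILD_NATIVE=OFF` yields a Release binary that is safe to run on machines other than the build host.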
diff --git a/src/Stream.h b/src/Stream.h
index 45c144c3..c8c6af1c 100644
--- a/src/Stream.h
+++ b/src/Stream.h
@@ -7,14 +7,10 @@
 
 #pragma once
 
+#include <cstdint>
 #include <vector>
 #include <string>
-
-// Array values
-#define startA (0.1)
-#define startB (0.2)
-#define startC (0.0)
-#define startScalar (0.4)
+#include "benchmark.h"
 
 template <class T>
 class Stream
@@ -31,9 +27,8 @@ class Stream
  virtual void nstream() = 0;
  virtual T dot() = 0;
 
-  // Copy memory between host and device
-  virtual void init_arrays(T initA, T initB, T initC) = 0;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
+  // Set pointers to read from arrays
+  virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
 };
 
 // Implementation specific device functions
diff --git a/src/StreamModels.h b/src/StreamModels.h
index 556beb4d..6a0836f3 100644
--- a/src/StreamModels.h
+++ b/src/StreamModels.h
@@ -35,67 +35,67 @@
 #include "FutharkStream.h"
 #endif
 
-template <typename T>
-std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
+template <typename T, typename... Args>
+std::unique_ptr<Stream<T>> make_stream(Args... args) {
 #if defined(CUDA)
   // Use the CUDA implementation
-  return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
+  return std::make_unique<CUDAStream<T>>(args...);
 
 #elif defined(HIP)
   // Use the HIP implementation
-  return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
+  return std::make_unique<HIPStream<T>>(args...);
 
 #elif defined(HC)
   // Use the HC implementation
-  return std::make_unique<HCStream<T>>(array_size, deviceIndex);
+  return std::make_unique<HCStream<T>>(args...);
 
 #elif defined(OCL)
   // Use the OpenCL implementation
-  return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
+  return std::make_unique<OCLStream<T>>(args...);
 
 #elif defined(USE_RAJA)
   // Use the RAJA implementation
-  return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
+  return std::make_unique<RAJAStream<T>>(args...);
 
 #elif defined(KOKKOS)
   // Use the Kokkos implementation
-  return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
+  return std::make_unique<KokkosStream<T>>(args...);
 
 #elif defined(STD_DATA)
   // Use the C++ STD data-oriented implementation
-  return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
+  return std::make_unique<STDDataStream<T>>(args...);
 
 #elif defined(STD_INDICES)
   // Use the C++ STD index-oriented implementation
-  return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
+  return std::make_unique<STDIndicesStream<T>>(args...);
 
 #elif defined(STD_RANGES)
   // Use the C++ STD ranges implementation
-  return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
+  return std::make_unique<STDRangesStream<T>>(args...);
 
 #elif defined(TBB)
  // Use the TBB implementation
-  return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
+  return std::make_unique<TBBStream<T>>(args...);
 
 #elif defined(THRUST)
   // Use the Thrust implementation
-  return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
+  return std::make_unique<ThrustStream<T>>(args...);
 
 #elif defined(ACC)
   // Use the OpenACC implementation
-  return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
+  return std::make_unique<ACCStream<T>>(args...);
 
 #elif defined(SYCL) || defined(SYCL2020)
   // Use the SYCL implementation
-  return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
+  return std::make_unique<SYCLStream<T>>(args...);
 
 #elif defined(OMP)
   // Use the OpenMP implementation
-  return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
+  return std::make_unique<OMPStream<T>>(args...);
 
 #elif defined(FUTHARK)
   // Use the Futhark implementation
-  return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
+  return std::make_unique<FutharkStream<T>>(args...);
 
 #else
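The factory now forwards an arbitrary argument pack to whichever backend is compiled in, so new constructor parameters only have to be added in one place. A minimal sketch of the intended call site under this PR's signatures (the concrete values are illustrative):

```cpp
// All arguments are forwarded as-is to the selected Stream implementation:
// (benchmark selection, array size, device index, initial values of a, b, c).
auto stream = make_stream<double>(BenchId::Classic, intptr_t{33554432}, 0,
                                  startA, startB, startC);
```

Taking `Args...` by value rather than by forwarding reference looks deliberate here: every forwarded argument is a scalar, so perfect forwarding would buy nothing.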
diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp
index a346a39c..034336a4 100644
--- a/src/acc/ACCStream.cpp
+++ b/src/acc/ACCStream.cpp
@@ -8,11 +8,12 @@
 #include "ACCStream.h"
 
 template <class T>
-ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
-  : array_size{ARRAY_SIZE}
+ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
+                        T initA, T initB, T initC)
+  : array_size{array_size}
 {
   acc_device_t device_type = acc_get_device_type();
-  acc_set_device_num(device, device_type);
+  acc_set_device_num(device_id, device_type);
 
   // Set up data region on device
   this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
   #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
   {}
+
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -62,7 +65,7 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
   T *a = this->a;
   T *b = this->b;
@@ -70,12 +73,9 @@ void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
   #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
   {}
 
-  for (intptr_t i = 0; i < array_size; i++)
-  {
-    h_a[i] = a[i];
-    h_b[i] = b[i];
-    h_c[i] = c[i];
-  }
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/acc/ACCStream.h b/src/acc/ACCStream.h
index 1b053cb4..8345b785 100644
--- a/src/acc/ACCStream.h
+++ b/src/acc/ACCStream.h
@@ -19,32 +19,25 @@
 template <class T>
 class ACCStream : public Stream<T>
 {
-  struct A{
-    T *a;
-    T *b;
-    T *c;
-  };
-
-  protected:
   // Size of arrays
   intptr_t array_size;
-  A aa;
+
   // Device side pointers
-  T *a;
-  T *b;
-  T *c;
+  T* restrict a;
+  T* restrict b;
+  T* restrict c;
 
   public:
-  ACCStream(const intptr_t, int);
+  ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
+            T initA, T initB, T initC);
   ~ACCStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/benchmark.h b/src/benchmark.h
new file mode 100644
index 00000000..95d675f7
--- /dev/null
+++ b/src/benchmark.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <iostream>
+
+// Array values
+#define startA (0.1)
+#define startB (0.2)
+#define startC (0.0)
+#define startScalar (0.4)
+
+// Benchmark Identifier: identifies individual & groups of benchmarks:
+// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
+// - All: all kernels.
+// - Individual kernels only.
+enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
+
+struct Benchmark {
+  BenchId id;
+  char const* label;
+  // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
+  // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
+  size_t weight;
+  // Is it one of: Copy, Mul, Add, Triad, Dot?
+  bool classic = false;
+};
+
+// Benchmarks in the order in which, if present, they should be run for validation purposes:
+constexpr size_t num_benchmarks = 6;
+constexpr std::array<Benchmark, num_benchmarks> bench = {
+  Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true },
+  Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true },
+  Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true },
+  Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true },
+  Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true },
+  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
+};
+
+// Which buffers are needed by each benchmark
+inline bool needs_buffer(BenchId id, char n) {
+  auto in = [n](std::initializer_list<char> values) {
+    return std::find(values.begin(), values.end(), n) != values.end();
+  };
+  switch(id) {
+  case BenchId::All:     return in({'a','b','c'});
+  case BenchId::Classic: return in({'a','b','c'});
+  case BenchId::Copy:    return in({'a','c'});
+  case BenchId::Mul:     return in({'b','c'});
+  case BenchId::Add:     return in({'a','b','c'});
+  case BenchId::Triad:   return in({'a','b','c'});
+  case BenchId::Dot:     return in({'a','b'});
+  case BenchId::Nstream: return in({'a','b','c'});
+  default:
+    std::cerr << "Unknown benchmark" << std::endl;
+    abort();
+  }
+}
+
+// Returns true if the benchmark needs to be run:
+inline bool run_benchmark(BenchId selection, Benchmark const& b) {
+  if (selection == BenchId::All) return true;
+  if (selection == BenchId::Classic && b.classic) return true;
+  return selection == b.id;
+}
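A short sketch of how the new header is meant to be consumed; `count_selected` and `triad_gb` are hypothetical helpers, not part of the PR:

```cpp
#include <cstdint>
#include "benchmark.h"

// Triad reads b and c and writes a, hence weight = 3:
// bytes moved = weight * sizeof(T) * array_size.
inline double triad_gb(intptr_t array_size) {
  return 3.0 * sizeof(double) * array_size * 1e-9;
}

inline int count_selected(BenchId selection) {
  int n = 0;
  for (Benchmark const& b : bench)
    if (run_benchmark(selection, b)) ++n; // Classic -> 5, All -> 6
  return n;
}
// needs_buffer(BenchId::Dot, 'c') == false: a backend asked to run only Dot
// never has to allocate or initialize c.
```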
diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index 9d63ff3f..4f5599a7 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -77,7 +77,8 @@ void free_host(T* p) {
 }
 
 template <typename T>
-CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
+CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
 : array_size(array_size)
 {
   // Set device
@@ -131,14 +132,20 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
 
   // Check buffers fit on the device
-  if (dprop.totalGlobalMem < total_bytes)
+  if (dprop.totalGlobalMem < total_bytes) {
+    std::cerr << "Requested array size of " << total_bytes * 1e-9
+              << " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB!"
+              << std::endl;
     throw std::runtime_error("Device does not have enough memory for all buffers");
+  }
 
   // Allocate buffers:
   d_a = alloc_device<T>(array_size);
   d_b = alloc_device<T>(array_size);
   d_c = alloc_device<T>(array_size);
   sums = alloc_host<T>(dot_num_blocks);
+
+  // Initialize buffers:
+  init_arrays(initA, initB, initC);
 }
 
 template <typename T>
@@ -204,21 +211,26 @@ void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <typename T>
-void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
 {
-  // Copy device memory to host
-#if defined(PAGEFAULT) || defined(MANAGED)
   CU(cudaStreamSynchronize(stream));
-  for (intptr_t i = 0; i < array_size; ++i)
-  {
-    a[i] = d_a[i];
-    b[i] = d_b[i];
-    c[i] = d_c[i];
-  }
+#if defined(PAGEFAULT) || defined(MANAGED)
+  // Unified memory: return pointers to device memory
+  a = d_a;
+  b = d_b;
+  c = d_c;
 #else
-  CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost));
-  CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost));
-  CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost));
+  // No Unified memory: copy data to the host
+  size_t nbytes = array_size * sizeof(T);
+  h_a.resize(array_size);
+  h_b.resize(array_size);
+  h_c.resize(array_size);
+  a = h_a.data();
+  b = h_b.data();
+  c = h_c.data();
+  CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
+  CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
+  CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
 #endif
 }
diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 5b739569..50e099dc 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -26,27 +26,31 @@ class CUDAStream : public Stream<T>
   intptr_t array_size;
 
   // Host array for partial sums for dot kernel
-  T *sums;
+  T* sums;
 
   // Device side pointers to arrays
-  T *d_a;
-  T *d_b;
-  T *d_c;
+  T* d_a;
+  T* d_b;
+  T* d_c;
+
+  // If UVM is disabled, host arrays for verification purposes
+  std::vector<T> h_a, h_b, h_c;
 
   // Number of blocks for dot kernel
   intptr_t dot_num_blocks;
 
 public:
-  CUDAStream(const intptr_t, const int);
+  CUDAStream(BenchId bs, const intptr_t array_size, const int device_id,
+             T initA, T initB, T initC);
   ~CUDAStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp
index ebd3633b..392ff898 100644
--- a/src/futhark/FutharkStream.cpp
+++ b/src/futhark/FutharkStream.cpp
@@ -11,9 +11,10 @@
 #include "FutharkStream.h"
 
 template <class T>
-FutharkStream<T>::FutharkStream(const int ARRAY_SIZE, int device)
+FutharkStream<T>::FutharkStream(BenchId bs, const intptr_t array_size, const int device,
+                                T initA, T initB, T initC)
+  : array_size(array_size)
 {
-  this->array_size = ARRAY_SIZE;
   this->cfg = futhark_context_config_new();
   this->device = "#" + std::to_string(device);
 #if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl)
@@ -23,6 +24,7 @@ FutharkStream<T>::FutharkStream(const int ARRAY_SIZE, int device)
   this->a = NULL;
   this->b = NULL;
   this->c = NULL;
+  init_arrays(initA, initB, initC);
 }
 
 template <>
@@ -98,19 +100,31 @@ void FutharkStream<double>::init_arrays(double initA, double initB, double initC)
 }
 
 template <>
-void FutharkStream<float>::read_arrays(std::vector<float>& h_a, std::vector<float>& h_b, std::vector<float>& h_c) {
+void FutharkStream<float>::get_arrays(float const*& a_, float const*& b_, float const*& c_) {
+  h_a.resize(array_size);
+  h_b.resize(array_size);
+  h_c.resize(array_size);
   futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data());
   futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data());
   futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data());
   futhark_context_sync(this->ctx);
+  a_ = h_a.data();
+  b_ = h_b.data();
+  c_ = h_c.data();
 }
 
 template <>
-void FutharkStream<double>::read_arrays(std::vector<double>& h_a, std::vector<double>& h_b, std::vector<double>& h_c) {
+void FutharkStream<double>::get_arrays(double const*& a_, double const*& b_, double const*& c_) {
+  h_a.resize(array_size);
+  h_b.resize(array_size);
+  h_c.resize(array_size);
   futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data());
   futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data());
   futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data());
   futhark_context_sync(this->ctx);
+  a_ = h_a.data();
+  b_ = h_b.data();
+  c_ = h_c.data();
 }
 
 template <>
diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h
index 6290e79a..eabdabbe 100644
--- a/src/futhark/FutharkStream.h
+++ b/src/futhark/FutharkStream.h
@@ -44,17 +44,21 @@ class FutharkStream : public Stream<T>
   void* b;
   void* c;
 
+  // Host side arrays for verification
+  std::vector<T> h_a, h_b, h_c;
+
 public:
-  FutharkStream(const int, int);
+  FutharkStream(BenchId bs, const intptr_t array_size, const int device_id,
+                T initA, T initB, T initC);
   ~FutharkStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake
index edd21fa6..d7b08795 100644
--- a/src/futhark/model.cmake
+++ b/src/futhark/model.cmake
@@ -44,6 +44,7 @@ macro(setup)
     elseif (${FUTHARK_BACKEND} STREQUAL "cuda")
         find_package(CUDA REQUIRED)
         register_link_library("nvrtc" "cuda" "cudart")
+        set(CMAKE_C_COMPILER "nvcc")
     else ()
         message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}")
     endif()
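The HIP changes below mirror the CUDA ones above. For reference, a condensed, hedged sketch of the shared `get_arrays` contract (illustration, not code from the PR): the returned pointers must stay valid after the call, so staging buffers live on the stream object rather than on the stack.

```cpp
#include <cstddef>
#include <vector>

// uvm == true: managed/pagefault allocations are already host-visible, so the
// device pointer is handed out directly and no copy is made.
template <typename T>
void readback(T const*& out, T* d_ptr, std::vector<T>& staging, std::size_t n, bool uvm) {
  if (uvm) {
    out = d_ptr;
  } else {
    staging.resize(n);  // member vector: outlives this call
    // cudaMemcpy / hipMemcpy device-to-host into staging.data() goes here
    out = staging.data();
  }
}
```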
<< "Memory: DEFAULT" << std::endl; #endif - array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; + array_bytes *= array_size; + size_t total_bytes = array_bytes * std::size_t{3}; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires @@ -65,7 +66,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers @@ -88,6 +89,8 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) hipMalloc(&d_c, array_bytes); check_error(); #endif + + init_arrays(initA, initB, initC); } @@ -127,24 +130,28 @@ void HIPStream::init_arrays(T initA, T initB, T initC) } template -void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void HIPStream::get_arrays(T const*& a, T const*& b, T const*& c) { - - // Copy device memory to host + hipDeviceSynchronize(); #if defined(PAGEFAULT) || defined(MANAGED) - hipDeviceSynchronize(); - for (intptr_t i = 0; i < array_size; i++) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + hipMemcpy(h_a.data(), d_a, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_b.data(), d_b, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_c.data(), d_c, nbytes, hipMemcpyDeviceToHost); check_error(); #endif } diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..a1c45802 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -48,20 +48,21 @@ class HIPStream : public Stream T *d_b; T *d_c; + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; public: - - HIPStream(const intptr_t, const int); + HIPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git 
a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index e49d5bcc..fcbdb7a7 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -8,21 +8,23 @@ #include "KokkosStream.hpp" template -KokkosStream::KokkosStream( - const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE) +KokkosStream::KokkosStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { Kokkos::initialize(Kokkos::InitializationSettings().set_device_id(device_index)); - d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); - d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); - d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), array_size); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), array_size); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), array_size); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); *hm_a = create_mirror_view(*d_a); *hm_b = create_mirror_view(*d_b); *hm_c = create_mirror_view(*d_c); + + init_arrays(initA, initB, initC); } template @@ -47,18 +49,14 @@ void KokkosStream::init_arrays(T initA, T initB, T initC) } template -void KokkosStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void KokkosStream::get_arrays(T const*& a, T const*& b, T const*& c) { deep_copy(*hm_a, *d_a); deep_copy(*hm_b, *d_b); deep_copy(*hm_c, *d_c); - for(intptr_t ii = 0; ii < array_size; ++ii) - { - a[ii] = (*hm_a)(ii); - b[ii] = (*hm_b)(ii); - c[ii] = (*hm_c)(ii); - } + a = hm_a->data(); + b = hm_b->data(); + c = hm_c->data(); } template diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 8e40119c..bc3ac3ee 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -22,27 +22,27 @@ class KokkosStream : public Stream intptr_t array_size; // Device side pointers to arrays - typename Kokkos::View* d_a; - typename Kokkos::View* d_b; - typename Kokkos::View* d_c; - typename Kokkos::View::HostMirror* hm_a; - typename Kokkos::View::HostMirror* hm_b; - typename Kokkos::View::HostMirror* hm_c; + typename Kokkos::View* d_a; + typename Kokkos::View* d_b; + typename Kokkos::View* d_c; + typename Kokkos::View::HostMirror* hm_a; + typename Kokkos::View::HostMirror* hm_b; + typename Kokkos::View::HostMirror* hm_c; public: - KokkosStream(const intptr_t, const int); + KokkosStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~KokkosStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 7457eebd..2223c753 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,5 +1,5 @@ 
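For reference, the mirror-view pattern the Kokkos backend now exposes through `get_arrays`, sketched with a `double` view (the wrapper type is hypothetical). The mirror must outlive the returned pointer, which is why the PR keeps `hm_a`/`hm_b`/`hm_c` alive as members:

```cpp
#include <Kokkos_Core.hpp>

struct HostCopy {
  Kokkos::View<double*>::HostMirror hm; // lives as long as this object
  const double* operator()(const Kokkos::View<double*>& d) {
    if (hm.extent(0) != d.extent(0)) hm = Kokkos::create_mirror_view(d);
    Kokkos::deep_copy(hm, d);  // device -> host
    return hm.data();          // no per-element copy loop needed
  }
};
```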
diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake
index 7457eebd..2223c753 100644
--- a/src/kokkos/model.cmake
+++ b/src/kokkos/model.cmake
@@ -1,5 +1,5 @@
 register_flag_optional(CMAKE_CXX_COMPILER
-        "Any CXX compiler that is supported by CMake detection and RAJA.
+        "Any CXX compiler that is supported by CMake detection and Kokkos.
          See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
         "c++")
 
@@ -21,7 +21,7 @@ macro(setup)
     set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
     cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
-
+    message("KOKKOS_IN_PACKAGE=${KOKKOS_IN_PACKAGE}")
     if (EXISTS "${KOKKOS_IN_TREE}")
         message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
         add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
diff --git a/src/main.cpp b/src/main.cpp
index c677f048..55c3a4e7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -23,7 +23,7 @@
 #include "Unit.h"
 
 // Default size of 2^25
-intptr_t ARRAY_SIZE = 33554432;
+intptr_t array_size = 33554432;
 size_t num_times = 100;
 size_t deviceIndex = 0;
 bool use_float = false;
@@ -33,42 +33,11 @@ Unit unit{Unit::Kind::MegaByte};
 bool silence_errors = false;
 std::string csv_separator = ",";
 
-// Benchmark Identifier: identifies individual & groups of benchmarks:
-// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
-// - All: all kernels.
-// - Individual kernels only.
-enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
-
-struct Benchmark {
-  BenchId id;
-  char const* label;
-  // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
-  // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
-  size_t weight;
-  // Is it one of: Copy, Mul, Add, Triad, Dot?
-  bool classic = false;
-};
-
-// Benchmarks in the order in which - if present - should be run for validation purposes:
-constexpr size_t num_benchmarks = 6;
-std::array<Benchmark, num_benchmarks> bench = {
-  Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true },
-  Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true },
-  Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
-};
-
 // Selected benchmarks to run: default is all 5 classic benchmarks.
 BenchId selection = BenchId::Classic;
 
 // Returns true if the benchmark needs to be run:
-bool run_benchmark(Benchmark const& b) {
-  if (selection == BenchId::All) return true;
-  if (selection == BenchId::Classic && b.classic) return true;
-  return selection == b.id;
-}
+bool run_benchmark(Benchmark const& b) { return run_benchmark(selection, b); }
 
 // Benchmark run order
 // - Classic: runs each bench once in the order above, and repeats n times.
@@ -174,8 +143,7 @@ std::vector<std::vector<double>> run_all(std::unique_ptr<Stream<T>>& stream, T&
 }
 
 template <typename T>
-void check_solution(const size_t ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c,
-                    T& sum);
+void check_solution(const size_t ntimes, T const* a, T const* b, T const* c, T sum);
 
 // Generic run routine
 // Runs the kernel(s) and prints output.
@@ -186,7 +154,7 @@ void run()
 
   // Formatting utilities:
   auto fmt_bw = [&](size_t weight, double dt) {
-    return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt);
+    return unit.fmt((weight * sizeof(T) * array_size)/dt);
   };
   auto fmt_csv_header = [] {
     std::cout
@@ -251,46 +219,37 @@ void run()
     default: std::cerr << "Error: Unknown order" << std::endl; abort();
     };
     std::cout << " order " << std::endl;
-    std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
+    std::cout << "Number of elements: " << array_size << std::endl;
     std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl;
 
-    size_t nbytes = ARRAY_SIZE * sizeof(T);
+    size_t nbytes = array_size * sizeof(T);
     std::cout << std::setprecision(1) << std::fixed
               << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl;
     std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl;
     std::cout.precision(ss);
   }
 
-  std::unique_ptr<Stream<T>> stream = make_stream<T>(ARRAY_SIZE, deviceIndex);
-  auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); });
+  std::unique_ptr<Stream<T>> stream
+    = make_stream<T>(selection, array_size, deviceIndex, startA, startB, startC);
 
   // Result of the Dot kernel, if used.
   T sum{};
   std::vector<std::vector<double>> timings = run_all<T>(stream, sum);
 
   // Create & read host vectors:
-  std::vector<T> a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE);
-  auto readElapsedS = time([&] { stream->read_arrays(a, b, c); });
+  T const* a;
+  T const* b;
+  T const* c;
+  stream->get_arrays(a, b, c);
 
   check_solution(num_times, a, b, c, sum);
-  auto initBWps = fmt_bw(3, initElapsedS);
-  auto readBWps = fmt_bw(3, readElapsedS);
 
   if (output_as_csv)
   {
     fmt_csv_header();
-    fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS);
-    fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS);
   }
   else
   {
-    std::cout << "Init: "
-              << std::setw(7)
-              << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl;
-    std::cout << "Read: "
-              << std::setw(7)
-              << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl;
-
     std::cout
       << std::left << std::setw(12) << "Function"
      << std::left << std::setw(12) << (std::string(unit.str()) + "/s")
@@ -313,15 +272,13 @@ void run()
       / (double)(num_times - 1);
 
     // Display results
-    fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T),
+    fmt_result(bench[i].label, num_times, array_size, sizeof(T),
                fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average);
   }
 }
 
 template <typename T>
-void check_solution(const size_t num_times,
-                    std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
-{
+void check_solution(const size_t num_times, T const* a, T const* b, T const* c, T sum) {
   // Generate correct solution
   T goldA = startA;
   T goldB = startB;
@@ -338,7 +295,7 @@ void check_solution(const size_t num_times,
       case BenchId::Add:     goldC = goldA + goldB; break;
      case BenchId::Triad:   goldA = goldB + scalar * goldC; break;
       case BenchId::Nstream: goldA += goldB + scalar * goldC; break;
-      case BenchId::Dot:     goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly
+      case BenchId::Dot:     goldS = goldA * goldB * T(array_size); break; // This calculates the answer exactly
       default:
         std::cerr << "Unimplemented Check: " << bench[b].label << std::endl;
         abort();
@@ -372,38 +329,38 @@ void check_solution(const size_t num_times,
 
   // Error relative tolerance check
   size_t failed = 0;
-  T eps = std::numeric_limits<T>::epsilon();
-  T epsi = eps * T(100000.0);
-  auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) {
-    if (e > epsi || std::isnan(e) || std::isnan(is)) {
+  T max_rel = std::numeric_limits<T>::epsilon() * T(1000000.0);
+  auto check = [&](const char* name, T is, T should, size_t i = size_t(-1)) {
+    // Relative difference:
+    T diff = std::abs(is - should);
+    T abs_is = std::abs(is);
+    T abs_sh = std::abs(should);
+    T largest = std::max(abs_is, abs_sh);
+    bool same = diff <= largest * max_rel;
+    if (!same || std::isnan(is)) {
      ++failed;
       if (failed > 10) return;
       std::cerr << "FAILED validation of " << name;
       if (i != size_t(-1)) std::cerr << "[" << i << "]";
-      std::cerr << ": " << is << " != " << should
-                << ", relative error=" << e << " > " << epsi << std::endl;
+      std::cerr << ": " << is << " (is) != " << should
+                << " (should)" << ", diff=" << diff << " > "
+                << largest * max_rel << std::endl;
     }
   };
 
   // Sum
-  T eS = std::fabs(sum - goldS) / std::fabs(goldS + eps);
   for (size_t i = 0; i < num_benchmarks; ++i) {
     if (bench[i].id != BenchId::Dot) continue;
     if (run_benchmark(bench[i]))
-      check("sum", sum, goldS, eS);
+      check("sum", sum, goldS);
     break;
   }
 
   // Calculate the L^infty-norm relative error
-  for (size_t i = 0; i < a.size(); ++i) {
-    T vA = a[i], vB = b[i], vC = c[i];
-    T eA = std::fabs(vA - goldA) / std::fabs(goldA + eps);
-    T eB = std::fabs(vB - goldB) / std::fabs(goldB + eps);
-    T eC = std::fabs(vC - goldC) / std::fabs(goldC + eps);
-
-    check("a", a[i], goldA, eA, i);
-    check("b", b[i], goldB, eB, i);
-    check("c", c[i], goldC, eC, i);
+  for (size_t i = 0; i < array_size; ++i) {
+    check("a", a[i], goldA, i);
+    check("b", b[i], goldB, i);
+    check("c", c[i], goldC, i);
   }
 
   if (failed > 0 && !silence_errors)
@@ -449,13 +406,11 @@ void parseArguments(int argc, char *argv[])
     else if (!std::string("--arraysize").compare(argv[i]) ||
             !std::string("-s").compare(argv[i]))
     {
-      intptr_t array_size;
       if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0)
       {
        std::cerr << "Invalid array size." << std::endl;
         std::exit(EXIT_FAILURE);
       }
-      ARRAY_SIZE = array_size;
     }
     else if (!std::string("--numtimes").compare(argv[i]) ||
              !std::string("-n").compare(argv[i]))
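The rewritten check compares an absolute difference against a tolerance scaled by the larger magnitude, instead of dividing by a value that may sit near zero (`goldC` starts at 0.0). A worked instance for `double`, matching the logic above (standalone sketch):

```cpp
#include <algorithm>
#include <cmath>
#include <limits>

// max_rel = eps * 1e6, i.e. about 2.22e-10 for double.
bool close_enough(double is, double should) {
  const double max_rel = std::numeric_limits<double>::epsilon() * 1e6;
  const double diff    = std::abs(is - should);
  const double largest = std::max(std::abs(is), std::abs(should));
  return !std::isnan(is) && diff <= largest * max_rel;
}
// e.g. close_enough(0.4, 0.4 + 1e-12) is true; close_enough(0.4, 0.41) is false.
```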
diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp
index c70a701d..fc1ae30c 100644
--- a/src/ocl/OCLStream.cpp
+++ b/src/ocl/OCLStream.cpp
@@ -100,8 +100,9 @@ std::string kernels{R"CLC(
 
 
 template <class T>
-OCLStream<T>::OCLStream(const intptr_t ARRAY_SIZE, const int device_index)
-  : array_size{ARRAY_SIZE}
+OCLStream<T>::OCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+                        T initA, T initB, T initC)
+  : array_size{array_size}
 {
   if (!cached)
     getDeviceList();
@@ -172,18 +173,20 @@ OCLStream<T>::OCLStream(const intptr_t ARRAY_SIZE, const int device_index)
   // Check buffers fit on the device
   cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
   cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
-  if (maxbuffer < sizeof(T)*ARRAY_SIZE)
+  if (maxbuffer < sizeof(T)*array_size)
     throw std::runtime_error("Device cannot allocate a buffer big enough");
-  if (totalmem < 3*sizeof(T)*ARRAY_SIZE)
+  if (totalmem < 3*sizeof(T)*array_size)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
   // Create buffers
-  d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
-  d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
-  d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
+  d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size);
+  d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size);
+  d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size);
   d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups);
 
   sums = std::vector<T>(dot_num_groups);
+
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -277,11 +280,17 @@ void OCLStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void OCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+void OCLStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
 {
-  cl::copy(queue, d_a, a.begin(), a.end());
-  cl::copy(queue, d_b, b.begin(), b.end());
-  cl::copy(queue, d_c, c.begin(), c.end());
+  h_a.resize(array_size);
+  h_b.resize(array_size);
+  h_c.resize(array_size);
+  a = h_a.data();
+  b = h_b.data();
+  c = h_c.data();
+  cl::copy(queue, d_a, h_a.begin(), h_a.end());
+  cl::copy(queue, d_b, h_b.begin(), h_b.end());
+  cl::copy(queue, d_c, h_c.begin(), h_c.end());
 }
 
 void getDeviceList(void)
diff --git a/src/ocl/OCLStream.h b/src/ocl/OCLStream.h
index e2366dad..e5405dde 100644
--- a/src/ocl/OCLStream.h
+++ b/src/ocl/OCLStream.h
@@ -42,6 +42,9 @@ class OCLStream : public Stream<T>
   cl::Buffer d_c;
   cl::Buffer d_sum;
 
+  // Host-side arrays for verification
+  std::vector<T> h_a, h_b, h_c;
+
   cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T> *init_kernel;
   cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
   cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
@@ -56,19 +59,19 @@ class OCLStream : public Stream<T>
 
 public:
-  OCLStream(const intptr_t, const int);
+  OCLStream(BenchId bs, const intptr_t array_size, const int device_id,
+            T initA, T initB, T initC);
   ~OCLStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
-
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
 
 // Populate the devices list
diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp
index 09b749fd..f0389373 100644
--- a/src/omp/OMPStream.cpp
+++ b/src/omp/OMPStream.cpp
@@ -13,10 +13,10 @@
 #endif
 
 template <class T>
-OMPStream<T>::OMPStream(const intptr_t ARRAY_SIZE, int device)
+OMPStream<T>::OMPStream(BenchId bs, const intptr_t array_size, const int device,
+                        T initA, T initB, T initC)
+  : array_size(array_size)
 {
-  array_size = ARRAY_SIZE;
-
   // Allocate on the host
   this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
   this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
@@ -32,6 +32,7 @@ OMPStream<T>::OMPStream(const intptr_t ARRAY_SIZE, int device)
   {}
 #endif
 
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -77,7 +78,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void OMPStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
 
 #ifdef OMP_TARGET_GPU
@@ -87,15 +88,9 @@ void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
   #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size])
   {}
 #endif
-
-  #pragma omp parallel for
-  for (intptr_t i = 0; i < array_size; i++)
-  {
-    h_a[i] = a[i];
-    h_b[i] = b[i];
-    h_c[i] = c[i];
-  }
-
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/omp/OMPStream.h b/src/omp/OMPStream.h
index 40770005..fca4906c 100644
--- a/src/omp/OMPStream.h
+++ b/src/omp/OMPStream.h
@@ -29,16 +29,17 @@ class OMPStream : public Stream<T>
   T *c;
 
 public:
-  OMPStream(const intptr_t, int);
+  OMPStream(BenchId bs, const intptr_t array_size, const int device_id,
+            T initA, T initB, T initC);
  ~OMPStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
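The host-resident backends (OpenMP above, RAJA and the std-* models below) now satisfy the same contract with zero copies: refresh from the device if offloading, then alias the benchmark's own allocation. A minimal sketch under the PR's `OMP_TARGET_GPU` guard (helper name is hypothetical):

```cpp
#include <cstddef>

template <typename T>
void host_get_array(T* a, std::size_t n, T const*& out) {
#ifdef OMP_TARGET_GPU
  #pragma omp target update from(a[0:n]) // device -> host, in place
#endif
  out = a; // valid for the stream's lifetime; the old parallel copy loop is gone
}
```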
diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp
index 6d6e8342..35fe6e8d 100644
--- a/src/raja/RAJAStream.cpp
+++ b/src/raja/RAJAStream.cpp
@@ -16,8 +16,9 @@ using RAJA::forall;
 #endif
 
 template <class T>
-RAJAStream<T>::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index)
-  : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE)
+RAJAStream<T>::RAJAStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
+  : array_size(array_size), range(0, array_size)
 {
 #ifdef RAJA_TARGET_CPU
   d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
@@ -25,11 +26,13 @@ RAJAStream<T>::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index)
   d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
   d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
-  cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
-  cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
-  cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
+  cudaMallocManaged((void**)&d_a, sizeof(T)*array_size, cudaMemAttachGlobal);
+  cudaMallocManaged((void**)&d_b, sizeof(T)*array_size, cudaMemAttachGlobal);
+  cudaMallocManaged((void**)&d_c, sizeof(T)*array_size, cudaMemAttachGlobal);
   cudaDeviceSynchronize();
 #endif
+
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -61,12 +64,11 @@ void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void RAJAStream<T>::read_arrays(
-    std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+void RAJAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
 {
-  std::copy(d_a, d_a + array_size, a.data());
-  std::copy(d_b, d_b + array_size, b.data());
-  std::copy(d_c, d_c + array_size, c.data());
+  a = d_a;
+  b = d_b;
+  c = d_c;
 }
 
 template <class T>
diff --git a/src/raja/RAJAStream.hpp b/src/raja/RAJAStream.hpp
index e98b0778..a2565ccc 100644
--- a/src/raja/RAJAStream.hpp
+++ b/src/raja/RAJAStream.hpp
@@ -50,19 +50,18 @@ class RAJAStream : public Stream<T>
   T* d_c;
 
 public:
-
-  RAJAStream(const intptr_t, const int);
+  RAJAStream(BenchId bs, const intptr_t array_size, const int device_id,
+             T initA, T initB, T initC);
   ~RAJAStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(
-      std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index 3efeb1b3..8c280f8a 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -7,9 +7,10 @@
 #include "STDDataStream.h"
 
 template <class T>
-STDDataStream<T>::STDDataStream(const intptr_t ARRAY_SIZE, int device)
-  noexcept : array_size{ARRAY_SIZE},
-  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
+STDDataStream<T>::STDDataStream(BenchId bs, const intptr_t array_size, const int device_id,
+                                T initA, T initB, T initC)
+  noexcept : array_size{array_size},
+  a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
 {
   std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 #ifdef USE_ONEDPL
@@ -25,6 +26,7 @@ STDDataStream<T>::STDDataStream(const intptr_t ARRAY_SIZE, int device)
 #endif
   std::cout << std::endl;
 #endif
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -43,11 +45,11 @@ void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void STDDataStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void STDDataStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  std::copy(a, a + array_size, h_a.begin());
-  std::copy(b, b + array_size, h_b.begin());
-  std::copy(c, c + array_size, h_c.begin());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index d92864be..6db998b2 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -25,17 +25,18 @@ class STDDataStream : public Stream<T>
   T *a, *b, *c;
 
 public:
-  STDDataStream(const intptr_t, int) noexcept;
+  STDDataStream(BenchId bs, const intptr_t array_size, const int device_id,
+                T initA, T initB, T initC) noexcept;
   ~STDDataStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
index 473d93d0..4f8efe20 100644
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -11,9 +11,10 @@
 #endif
 
 template <class T>
-STDIndicesStream<T>::STDIndicesStream(const intptr_t ARRAY_SIZE, int device)
-noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
-  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
+STDIndicesStream<T>::STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id,
+                                      T initA, T initB, T initC)
+noexcept : array_size{array_size}, range(0, array_size),
+  a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
 {
   std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 #ifdef USE_ONEDPL
@@ -29,6 +30,7 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
 #endif
   std::cout << std::endl;
 #endif
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -47,11 +49,11 @@ void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void STDIndicesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void STDIndicesStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  std::copy(a, a + array_size, h_a.begin());
-  std::copy(b, b + array_size, h_b.begin());
-  std::copy(c, c + array_size, h_c.begin());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
index 8a8f5de8..7a43b1ec 100644
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -80,17 +80,18 @@ class STDIndicesStream : public Stream<T>
   T *a, *b, *c;
 
 public:
-  STDIndicesStream(const intptr_t, int) noexcept;
+  STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id,
+                   T initA, T initB, T initC) noexcept;
   ~STDIndicesStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp
index 8b7ada4b..02bd56b2 100644
--- a/src/std-ranges/STDRangesStream.cpp
+++ b/src/std-ranges/STDRangesStream.cpp
@@ -12,9 +12,10 @@
 #endif
 
 template <class T>
-STDRangesStream<T>::STDRangesStream(const intptr_t ARRAY_SIZE, int device)
-noexcept : array_size{ARRAY_SIZE},
-  a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
+STDRangesStream<T>::STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id,
+                                    T initA, T initB, T initC)
+  noexcept : array_size{array_size},
+  a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
 {
   std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
 #ifdef USE_ONEDPL
@@ -30,6 +31,7 @@ noexcept : array_size{ARRAY_SIZE},
 #endif
   std::cout << std::endl;
 #endif
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -54,12 +56,11 @@ void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void STDRangesStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void STDRangesStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  // Element-wise copy.
-  std::copy(a, a + array_size, h_a.begin());
-  std::copy(b, b + array_size, h_b.begin());
-  std::copy(c, c + array_size, h_c.begin());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp
index 51680c62..da04f1f4 100644
--- a/src/std-ranges/STDRangesStream.hpp
+++ b/src/std-ranges/STDRangesStream.hpp
@@ -24,18 +24,18 @@ class STDRangesStream : public Stream<T>
   T *a, *b, *c;
 
 public:
-  STDRangesStream(const intptr_t, int) noexcept;
+  STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id,
+                  T initA, T initB, T initC) noexcept;
   ~STDRangesStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
-
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp
index e99454e6..5c00211e 100644
--- a/src/sycl/SYCLStream.cpp
+++ b/src/sycl/SYCLStream.cpp
@@ -17,13 +17,13 @@ std::vector<device> devices;
 void getDeviceList(void);
 
 template <class T>
-SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
+SYCLStream<T>::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
+  : array_size(array_size)
 {
   if (!cached)
     getDeviceList();
 
-  array_size = ARRAY_SIZE;
-
   if (device_index >= devices.size())
     throw std::runtime_error("Invalid device index");
 
   device dev = devices[device_index];
@@ -79,6 +79,8 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
   d_b = new buffer<T>(array_size);
   d_c = new buffer<T>(array_size);
   d_sum = new buffer<T>(dot_num_groups);
+
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -238,17 +240,14 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+void SYCLStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
 {
   auto _a = d_a->template get_access<access::mode::read>();
   auto _b = d_b->template get_access<access::mode::read>();
   auto _c = d_c->template get_access<access::mode::read>();
-  for (int i = 0; i < array_size; i++)
-  {
-    a[i] = _a[i];
-    b[i] = _b[i];
-    c[i] = _c[i];
-  }
+  a = &_a[0];
+  b = &_b[0];
+  c = &_c[0];
 }
 
 void getDeviceList(void)
diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h
index 1a40242d..94c3c4e9 100644
--- a/src/sycl/SYCLStream.h
+++ b/src/sycl/SYCLStream.h
@@ -54,19 +54,19 @@ class SYCLStream : public Stream<T>
 
 public:
-  SYCLStream(const intptr_t, const int);
+  SYCLStream(BenchId bs, const intptr_t array_size, const int device_id,
+             T initA, T initB, T initC);
   ~SYCLStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
-
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
 
 // Populate the devices list
diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake
index 3826c3c7..8f45186d 100644
--- a/src/sycl/model.cmake
+++ b/src/sycl/model.cmake
@@ -9,36 +9,32 @@ register_flag_required(SYCL_COMPILER
          ONEAPI-ICPX  - icpx as a standalone compiler
          ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`)
          DPCPP        - dpc++ as a standalone compiler (https://github.com/intel/llvm)
-         HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)")
+         AdaptiveCpp  - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)")
 
 register_flag_optional(SYCL_COMPILER_DIR
         "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
          ONEAPI-ICPX  - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
          ONEAPI-Clang - set to the directory that contains the Intel clang++ binary.
-         HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
+         AdaptiveCpp|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
         "")
 
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp)
 
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
-
-
-        set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)
-
-        if (NOT EXISTS "${hipSYCL_DIR}")
-            message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure")
-            set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure")
+            set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
         endif ()
-        if (NOT EXISTS "${hipSYCL_DIR}")
-            message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL")
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp")
         endif ()
 
 #        register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
-        find_package(hipSYCL CONFIG REQUIRED)
+        find_package(AdaptiveCpp CONFIG REQUIRED)
         message(STATUS "ok")
-
     elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
         set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
         include_directories(${SYCL_COMPILER_DIR}/include/sycl)
@@ -62,8 +58,8 @@ endmacro()
 
 macro(setup_target NAME)
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
-        # so hipSYCL has this weird (and bad) CMake usage where they append their
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        # so AdaptiveCpp has this weird (and bad) CMake usage where they append their
         # own custom integration header flags AFTER the target has been specified
         # hence this macro here
         add_sycl_to_target(
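Since the `HIPSYCL` value is gone, existing configurations must switch to, e.g., `-DSYCL_COMPILER=AdaptiveCpp -DSYCL_COMPILER_DIR=/opt/adaptivecpp` (path illustrative); the fallback branch above still covers pre-0.9.0 directory layouts.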
std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE}, - d_a {ARRAY_SIZE}, - d_b {ARRAY_SIZE}, - d_c {ARRAY_SIZE}, +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size), + d_a {array_size}, + d_b {array_size}, + d_c {array_size}, d_sum {1} { if (!cached) @@ -68,7 +69,7 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) devices.clear(); cached = true; - + init_arrays(initA, initB, initC); } @@ -164,18 +165,17 @@ T SYCLStream::dot() sycl::accessor kb {d_b, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + // Reduction object, to perform summation - initialises the result to zero + // AdaptiveCpp doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__) sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), #else - sycl::reduction(d_sum, cgh, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), -#endif + sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), +#endif [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; }); - }); // Get access on the host, and return a copy of the data (single number) @@ -206,17 +206,14 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { sycl::host_accessor _a {d_a, sycl::read_only}; sycl::host_accessor _b {d_b, sycl::read_only}; sycl::host_accessor _c {d_c, sycl::read_only}; - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } + a = &_a[0]; + b = &_b[0]; + c = &_c[0]; } void getDeviceList(void) diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h index cd515f87..c0caae2e 100644 --- a/src/sycl2020-acc/SYCLStream2020.h +++ b/src/sycl2020-acc/SYCLStream2020.h @@ -35,19 +35,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~SYCLStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake index 3826c3c7..c34051ea 100644 --- a/src/sycl2020-acc/model.cmake +++ b/src/sycl2020-acc/model.cmake @@ -9,34 +9,34 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh 
         DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm)
-        HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)")
+        AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)")

 register_flag_optional(SYCL_COMPILER_DIR
         "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
         ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later releases (i.e. `source /opt/intel/oneapi/setvars.sh` first)
         ONEAPI-Clang - set to the directory that contains the Intel clang++ binary.
-        HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
+        AdaptiveCpp|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
         "")

 macro(setup)
     set(CMAKE_CXX_STANDARD 17)

-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")

-        set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)
+        set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp)

-        if (NOT EXISTS "${hipSYCL_DIR}")
-            message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure")
-            set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure")
+            set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
         endif ()
-        if (NOT EXISTS "${hipSYCL_DIR}")
-            message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL")
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp")
         endif ()

         # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
-        find_package(hipSYCL CONFIG REQUIRED)
+        find_package(AdaptiveCpp CONFIG REQUIRED)
         message(STATUS "ok")

     elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
@@ -62,8 +62,8 @@ endmacro()

 macro(setup_target NAME)
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
-        # so hipSYCL has this weird (and bad) CMake usage where they append their
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        # so AdaptiveCpp has this weird (and bad) CMake usage where they append their
         # own custom integration header flags AFTER the target has been specified
         # hence this macro here
         add_sycl_to_target(
diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp
index e4c6ec27..c8b863ad 100644
--- a/src/sycl2020-usm/SYCLStream2020.cpp
+++ b/src/sycl2020-usm/SYCLStream2020.cpp
@@ -15,8 +15,9 @@
 std::vector<sycl::device> devices;
 void getDeviceList(void);

 template <class T>
-SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
-: array_size {ARRAY_SIZE}
+SYCLStream<T>::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
+  : array_size(array_size)
 {
   if (!cached)
     getDeviceList();
@@ -69,7 +70,7 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
   devices.clear();
   cached = true;
-
+  init_arrays(initA, initB, initC);
 }

 template <class T>
@@ -156,8 +157,8 @@ T SYCLStream<T>::dot()
 {
     cgh.parallel_for(sycl::range<1>{array_size},
       // Reduction object, to perform summation - initialises the result to zero
-      // hipSYCL doesn't sypport the initialize_to_identity property yet
-#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
+      // AdaptiveCpp doesn't support the initialize_to_identity property yet
+#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__)
       sycl::reduction(sum, sycl::plus<T>()),
 #else
       sycl::reduction(sum, sycl::plus<T>(), sycl::property::reduction::initialize_to_identity{}),
@@ -166,7 +167,6 @@ T SYCLStream<T>::dot()
       {
         sum += a[idx] * b[idx];
       });
-
   });
   queue->wait();
   return *sum;
 }
@@ -189,14 +189,11 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 }

 template <class T>
-void SYCLStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void SYCLStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  for (int i = 0; i < array_size; i++)
-  {
-    h_a[i] = a[i];
-    h_b[i] = b[i];
-    h_c[i] = c[i];
-  }
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }

 void getDeviceList(void)
diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h
index 811c26ef..c88c87a3 100644
--- a/src/sycl2020-usm/SYCLStream2020.h
+++ b/src/sycl2020-usm/SYCLStream2020.h
@@ -35,19 +35,19 @@ class SYCLStream : public Stream<T>

  public:
-  SYCLStream(const intptr_t, const int);
+  SYCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+             T initA, T initB, T initC);
   ~SYCLStream();

-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
-
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };

 // Populate the devices list
diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020-usm/model.cmake
index 950daefd..8db608ba 100644
--- a/src/sycl2020-usm/model.cmake
+++ b/src/sycl2020-usm/model.cmake
@@ -9,13 +9,13 @@ register_flag_required(SYCL_COMPILER
         ONEAPI-ICPX - icpx as a standalone compiler
         ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`)
         DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm)
-        HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)")
+        AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)")

 register_flag_optional(SYCL_COMPILER_DIR
         "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
         ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later releases (i.e. `source /opt/intel/oneapi/setvars.sh` first)
         ONEAPI-Clang - set to the directory that contains the Intel clang++ binary.
-        HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
+        AdaptiveCpp|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
         "")

@@ -23,21 +23,21 @@
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)

-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")

-        set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)
+        set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp)

-        if (NOT EXISTS "${hipSYCL_DIR}")
-            message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure")
-            set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure")
+            set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
         endif ()
-        if (NOT EXISTS "${hipSYCL_DIR}")
-            message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL")
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp")
         endif ()

         # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
-        find_package(hipSYCL CONFIG REQUIRED)
+        find_package(AdaptiveCpp CONFIG REQUIRED)
         message(STATUS "ok")

     elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
@@ -63,8 +63,8 @@ endmacro()

 macro(setup_target NAME)
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
-        # so hipSYCL has this weird (and bad) CMake usage where they append their
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        # so AdaptiveCpp has this weird (and bad) CMake usage where they append their
         # own custom integration header flags AFTER the target has been specified
         # hence this macro here
         add_sycl_to_target(
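A note on the #if guard that keeps recurring in the SYCL dot() kernels above: SYCL 2020's initialize_to_identity property tells the runtime to start the reduction from the identity value, whereas without it the result is combined into whatever the output variable already holds, so the accumulator must be reset before each launch. A minimal standalone sketch of that distinction, assuming a host-accessible (shared) USM accumulator named sum; all names here are illustrative and not taken from this diff:

    // Standalone sketch, not from this diff. `sum` must be host-accessible
    // USM so it can be zeroed from the host before the kernel runs.
    #include <sycl/sycl.hpp>

    template <typename T>
    T dot_sketch(sycl::queue& q, const T* a, const T* b, T* sum, size_t n)
    {
      *sum = T{0}; // required where initialize_to_identity is unavailable
      q.submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl::range<1>{n},
    #if defined(__ADAPTIVECPP__)
          // combines into the existing value of *sum, hence the reset above
          sycl::reduction(sum, sycl::plus<T>()),
    #else
          // the runtime initialises the reduction result to the identity (0)
          sycl::reduction(sum, sycl::plus<T>(),
                          sycl::property::reduction::initialize_to_identity{}),
    #endif
          [=](sycl::id<1> i, auto& acc) { acc += a[i] * b[i]; });
      }).wait();
      return *sum;
    }
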
diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp
index 75af6141..9338ae39 100644
--- a/src/tbb/TBBStream.cpp
+++ b/src/tbb/TBBStream.cpp
@@ -20,15 +20,16 @@
 #endif

 template <class T>
-TBBStream<T>::TBBStream(const intptr_t ARRAY_SIZE, int device)
-  : partitioner(), range(0, (size_t)ARRAY_SIZE),
+TBBStream<T>::TBBStream(BenchId bs, const intptr_t array_size, const int device,
+                        T initA, T initB, T initC)
+  : partitioner(), range(0, (size_t)array_size),
 #ifdef USE_VECTOR
-    a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+    a(array_size), b(array_size), c(array_size)
 #else
-    array_size(ARRAY_SIZE),
-    a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-    b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-    c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+    array_size(array_size),
+    a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)),
+    b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)),
+    c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size))
 #endif
 {
   if(device != 0){
@@ -36,6 +37,8 @@ TBBStream<T>::TBBStream(const intptr_t ARRAY_SIZE, int device)
   }
   std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
   std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
+
+  init_arrays(initA, initB, initC);
 }

@@ -54,12 +57,11 @@ void TBBStream<T>::init_arrays(T initA, T initB, T initC)
 }

 template <class T>
-void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void TBBStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  // Element-wise copy.
-  std::copy(BEGIN(a), END(a), h_a.begin());
-  std::copy(BEGIN(b), END(b), h_b.begin());
-  std::copy(BEGIN(c), END(c), h_c.begin());
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }

 template <class T>
diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp
index 80f11c17..0a73e892 100644
--- a/src/tbb/TBBStream.hpp
+++ b/src/tbb/TBBStream.hpp
@@ -31,7 +31,6 @@ using tbb_partitioner = tbb::auto_partitioner;
 #define PARTITIONER_NAME "auto_partitioner"
 #endif

-
 template <class T>
 class TBBStream : public Stream<T>
 {
@@ -48,17 +47,17 @@ class TBBStream : public Stream<T>
 #endif

  public:
-  TBBStream(const intptr_t, int);
+  TBBStream(BenchId bs, const intptr_t array_size, const int device_id,
+            T initA, T initB, T initC);
   ~TBBStream() = default;

-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;

-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
-
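Every backend in this change replaces the copying read_arrays(std::vector<T>&, ...) with a borrowing get_arrays(T const*&, ...). A sketch of how a caller might consume the new contract; the function name and the spot-check are hypothetical, not part of this diff:

    // Hypothetical caller: the stream keeps ownership of the storage and we
    // only borrow read-only views, so no per-element copy is paid any more.
    template <typename T>
    bool spot_check(Stream<T>& stream, T expectA, T expectB, T expectC)
    {
      T const *a = nullptr, *b = nullptr, *c = nullptr;
      stream.get_arrays(a, b, c);
      return a[0] == expectA && b[0] == expectB && c[0] == expectC;
    }

Note that how long the returned pointers stay valid is backend-defined: TBB hands out its own backing storage directly, while the non-managed Thrust path below repoints the arguments at host mirrors that are refreshed on every call.
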
diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 84b27b8e..321470b8 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -19,7 +19,8 @@ static inline void synchronise()
 }

 template <class T>
-ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
+ThrustStream<T>::ThrustStream(BenchId bs, const intptr_t array_size, const int device,
+                              T initA, T initB, T initC)
   : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
@@ -36,8 +37,6 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
   std::cout << "Thrust backend: TBB" << std::endl;
 #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
   std::cout << "Thrust backend: CPP" << std::endl;
-#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
-  std::cout << "Thrust backend: TBB" << std::endl;
 #else

 #if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
@@ -48,6 +47,7 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)

 #endif

+  init_arrays(initA, initB, initC);
 }

 template <class T>
@@ -60,11 +60,23 @@ void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
 }

 template <class T>
-void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
-{
+void ThrustStream<T>::get_arrays(T const*& a_, T const*& b_, T const*& c_)
+{
+  #if defined(MANAGED)
+  a_ = &*a.data();
+  b_ = &*b.data();
+  c_ = &*c.data();
+  #else
+  h_a.resize(array_size);
+  h_b.resize(array_size);
+  h_c.resize(array_size);
   thrust::copy(a.begin(), a.end(), h_a.begin());
   thrust::copy(b.begin(), b.end(), h_b.begin());
   thrust::copy(c.begin(), c.end(), h_c.begin());
+  a_ = h_a.data();
+  b_ = h_b.data();
+  c_ = h_c.data();
+  #endif
 }

 template <class T>
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index b0acd80f..676ecaeb 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -26,28 +26,25 @@ class ThrustStream : public Stream<T>
   intptr_t array_size;
 #if defined(MANAGED)
-  thrust::universtal_vector<T> a;
-  thrust::universtal_vector<T> b;
-  thrust::universtal_vector<T> c;
+  thrust::universal_vector<T> a, b, c;
 #else
-  thrust::device_vector<T> a;
-  thrust::device_vector<T> b;
-  thrust::device_vector<T> c;
+  thrust::device_vector<T> a, b, c;
+  std::vector<T> h_a, h_b, h_c;
 #endif

  public:
-  ThrustStream(const intptr_t, int);
+  ThrustStream(BenchId bs, const intptr_t array_size, const int device_id,
+               T initA, T initB, T initC);
   ~ThrustStream() = default;

-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
-
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake
index 6b82ef59..23627c11 100644
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@@ -18,8 +18,7 @@ register_flag_optional(BACKEND
         "
         "CUDA")

-        register_flag_optional(MANAGED "Enabled managed memory mode."
-        "OFF")
+register_flag_optional(MANAGED "Enables managed memory mode." "OFF")

 register_flag_optional(CMAKE_CUDA_COMPILER
         "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler
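Finally, since the constructors now receive the initial values and run init_arrays themselves, construction and initialisation collapse into a single step. A hedged usage sketch; the array size, device id, BenchId value, and the 0.1/0.2/0.0 initial values are illustrative only:

    // Illustrative only: BenchId{} stands in for a value parsed from the CLI,
    // and 2^25 elements is just a plausible benchmark size.
    BenchId bs{};
    TBBStream<double> stream(bs, intptr_t(1) << 25, /*device_id=*/0,
                             /*initA=*/0.1, /*initB=*/0.2, /*initC=*/0.0);
    // No separate initialisation call: the constructor already ran
    // init_arrays(initA, initB, initC) before returning.
    stream.triad();
    double result = stream.dot();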