From 616b3b27103734b293716489bbc42e93eb638f99 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi
Date: Thu, 30 May 2024 14:34:36 -0700
Subject: [PATCH] Move host-side allocation to benchmarks and reuse device
 memory with UVM

This commit puts benchmarks in control of allocating the host memory used
for verifying the results. This enables benchmarks that use Unified Memory
for their device allocations to avoid the host-side allocation altogether
and instead pass pointers to the device allocation straight to the
benchmark driver.

Closes #128.
---
 CMakeLists.txt                       |   7 +-
 src/Stream.h                         |  13 +---
 src/StreamModels.h                   |  34 ++++----
 src/acc/ACCStream.cpp                |  20 ++---
 src/acc/ACCStream.h                  |  33 ++++----
 src/benchmark.h                      |  66 ++++++++++++++++
 src/cuda/CUDAStream.cu               |  40 ++++++----
 src/cuda/CUDAStream.h                |  30 ++++----
 src/futhark/FutharkStream.cpp        |  22 +++++-
 src/futhark/FutharkStream.h          |  22 +++---
 src/futhark/model.cmake              |   1 +
 src/hip/HIPStream.cpp                |  47 +++++++-----
 src/hip/HIPStream.h                  |  23 +++---
 src/kokkos/KokkosStream.cpp          |  26 +++----
 src/kokkos/KokkosStream.hpp          |  32 ++++----
 src/kokkos/model.cmake               |   4 +-
 src/main.cpp                         | 109 ++++++++-------------------
 src/ocl/OCLStream.cpp                |  31 +++++---
 src/ocl/OCLStream.h                  |  23 +++---
 src/omp/OMPStream.cpp                |  21 ++----
 src/omp/OMPStream.h                  |  19 ++---
 src/raja/RAJAStream.cpp              |  22 +++---
 src/raja/RAJAStream.hpp              |  21 +++---
 src/std-data/STDDataStream.cpp       |  16 ++--
 src/std-data/STDDataStream.h         |  19 ++---
 src/std-indices/STDIndicesStream.cpp |  16 ++--
 src/std-indices/STDIndicesStream.h   |  19 ++---
 src/std-ranges/STDRangesStream.cpp   |  17 +++--
 src/std-ranges/STDRangesStream.hpp   |  20 ++---
 src/sycl/SYCLStream.cpp              |  19 +++--
 src/sycl/SYCLStream.h                |  20 ++---
 src/sycl/model.cmake                 |  30 ++++++--
 src/sycl2020-acc/SYCLStream2020.cpp  |  35 ++++-----
 src/sycl2020-acc/SYCLStream2020.h    |  20 ++---
 src/sycl2020-acc/model.cmake         |  29 +++++--
 src/sycl2020-usm/SYCLStream2020.cpp  |  23 +++---
 src/sycl2020-usm/SYCLStream2020.h    |  20 ++---
 src/sycl2020-usm/model.cmake         |  31 ++++++--
 src/tbb/TBBStream.cpp                |  32 +++++---
 src/tbb/TBBStream.hpp                |  21 +++---
 src/thrust/ThrustStream.cu           |  22 ++++--
 src/thrust/ThrustStream.h            |  29 ++++---
 src/thrust/model.cmake               |   3 +-
 43 files changed, 620 insertions(+), 487 deletions(-)
 create mode 100644 src/benchmark.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c98cab48..0df3836f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
     message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
 endif ()
 
+option(BUILD_NATIVE "Builds for the current system's CPU and GPU architecture."
+    ON)
+
 # setup some defaults flags for everything
 set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
-set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
+set(DEFAULT_RELEASE_FLAGS -O3)
+if (BUILD_NATIVE)
+    set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
+endif()
 
 macro(hint_flag FLAG DESCRIPTION)
     if (NOT DEFINED ${FLAG})
diff --git a/src/Stream.h b/src/Stream.h
index 45c144c3..c8c6af1c 100644
--- a/src/Stream.h
+++ b/src/Stream.h
@@ -7,14 +7,10 @@
 
 #pragma once
 
+#include <cstdint>
 #include <vector>
 #include <string>
-
-// Array values
-#define startA (0.1)
-#define startB (0.2)
-#define startC (0.0)
-#define startScalar (0.4)
+#include "benchmark.h"
 
 template <class T>
 class Stream
@@ -31,9 +27,8 @@ class Stream
   virtual void nstream() = 0;
   virtual T dot() = 0;
 
-  // Copy memory between host and device
-  virtual void init_arrays(T initA, T initB, T initC) = 0;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
+  // Set pointers to read from arrays
+  virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
 };
 
 // Implementation specific device functions
diff --git a/src/StreamModels.h b/src/StreamModels.h
index 556beb4d..6a0836f3 100644
--- a/src/StreamModels.h
+++ b/src/StreamModels.h
@@ -35,67 +35,67 @@
 #include "FutharkStream.h"
 #endif
 
-template <class T>
-std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
+template <class T, class... Args>
+std::unique_ptr<Stream<T>> make_stream(Args... args) {
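+  // Forwards the benchmark selection, array size, device id, and the three
+  // initial values to the selected implementation, e.g. (as invoked from the
+  // driver in main.cpp below):
+  //   make_stream<T>(selection, array_size, deviceIndex, startA, startB, startC)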
 #if defined(CUDA)
   // Use the CUDA implementation
-  return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
+  return std::make_unique<CUDAStream<T>>(args...);
 #elif defined(HIP)
   // Use the HIP implementation
-  return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
+  return std::make_unique<HIPStream<T>>(args...);
 #elif defined(HC)
   // Use the HC implementation
-  return std::make_unique<HCStream<T>>(array_size, deviceIndex);
+  return std::make_unique<HCStream<T>>(args...);
 #elif defined(OCL)
   // Use the OpenCL implementation
-  return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
+  return std::make_unique<OCLStream<T>>(args...);
 #elif defined(USE_RAJA)
   // Use the RAJA implementation
-  return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
+  return std::make_unique<RAJAStream<T>>(args...);
 #elif defined(KOKKOS)
   // Use the Kokkos implementation
-  return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
+  return std::make_unique<KokkosStream<T>>(args...);
 #elif defined(STD_DATA)
   // Use the C++ STD data-oriented implementation
-  return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
+  return std::make_unique<STDDataStream<T>>(args...);
 #elif defined(STD_INDICES)
   // Use the C++ STD index-oriented implementation
-  return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
+  return std::make_unique<STDIndicesStream<T>>(args...);
 #elif defined(STD_RANGES)
   // Use the C++ STD ranges implementation
-  return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
+  return std::make_unique<STDRangesStream<T>>(args...);
 #elif defined(TBB)
   // Use the C++20 implementation
-  return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
+  return std::make_unique<TBBStream<T>>(args...);
 #elif defined(THRUST)
   // Use the Thrust implementation
-  return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
+  return std::make_unique<ThrustStream<T>>(args...);
 #elif defined(ACC)
   // Use the OpenACC implementation
-  return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
+  return std::make_unique<ACCStream<T>>(args...);
 #elif defined(SYCL) || defined(SYCL2020)
   // Use the SYCL implementation
-  return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
+  return std::make_unique<SYCLStream<T>>(args...);
 #elif defined(OMP)
   // Use the OpenMP implementation
-  return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
+  return std::make_unique<OMPStream<T>>(args...);
 #elif defined(FUTHARK)
   // Use the Futhark implementation
-  return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
+  return std::make_unique<FutharkStream<T>>(args...);
 #else
diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp
index a346a39c..034336a4 100644
--- a/src/acc/ACCStream.cpp
+++ b/src/acc/ACCStream.cpp
@@ -8,11 +8,12 @@
 #include "ACCStream.h"
 
 template <class T>
-ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
-  : array_size{ARRAY_SIZE}
+ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
+                        T initA, T initB, T initC)
+  : array_size{array_size}
 {
   acc_device_t device_type = acc_get_device_type();
-  acc_set_device_num(device, device_type);
+  acc_set_device_num(device_id, device_type);
 
   // Set up data region on device
   this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
 #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
   {}
+
+  init_arrays(initA, initB, initC);
 }
 
 template <class T>
@@ -62,7 +65,7 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
 #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
   {}
-  for (intptr_t i = 0; i < array_size; i++)
-  {
-    h_a[i] = a[i];
-    h_b[i] = b[i];
-    h_c[i] = c[i];
-  }
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }
 
 template <class T>
diff --git a/src/acc/ACCStream.h b/src/acc/ACCStream.h
index 1b053cb4..8345b785 100644
--- a/src/acc/ACCStream.h
+++ b/src/acc/ACCStream.h
@@ -19,32 +19,25 @@
 template <class T>
 class ACCStream : public Stream<T>
 {
-  struct A{
-    T *a;
-    T *b;
-    T *c;
-  };
-
   protected:
   // Size of arrays
   intptr_t array_size;
-  A aa;
   // Device side pointers
-  T *a;
-  T *b;
-  T *c;
+  T* restrict a;
+  T* restrict b;
+  T* restrict c;
 
   public:
-  ACCStream(const intptr_t, int);
+  ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
+            T initA, T initB, T initC);
   ~ACCStream();
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
diff --git a/src/benchmark.h b/src/benchmark.h
new file mode 100644
index 00000000..95d675f7
--- /dev/null
+++ b/src/benchmark.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <iostream>
+
+// Array values
+#define startA (0.1)
+#define startB (0.2)
+#define startC (0.0)
+#define startScalar (0.4)
+
+// Benchmark Identifier: identifies individual & groups of benchmarks:
+// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
+// - All: all kernels.
+// - Individual kernels only.
+enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
+
+struct Benchmark {
+  BenchId id;
+  char const* label;
+  // Weight counts the data elements of the original arrays moved each loop
+  // iteration - it is used to calculate the achieved bandwidth:
+  //   bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
+  size_t weight;
+  // Is it one of: Copy, Mul, Add, Triad, Dot?
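+  // (For example, Triad reads b and c and writes a, so weight = 3: with
+  // double and the default 2^25 elements that is 3 * 8 B * 33554432 bytes,
+  // roughly 0.8 GB, moved per iteration.)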
+  bool classic = false;
+};
+
+// Benchmarks in the order in which - if present - they should be run for validation purposes:
+constexpr size_t num_benchmarks = 6;
+constexpr std::array<Benchmark, num_benchmarks> bench = {
+  Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true },
+  Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true },
+  Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true },
+  Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true },
+  Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true },
+  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
+};
+
+// Which buffers are needed by each benchmark
+inline bool needs_buffer(BenchId id, char n) {
+  auto in = [n](std::initializer_list<char> values) {
+    return std::find(values.begin(), values.end(), n) != values.end();
+  };
+  switch(id) {
+    case BenchId::All:     return in({'a','b','c'});
+    case BenchId::Classic: return in({'a','b','c'});
+    case BenchId::Copy:    return in({'a','c'});
+    case BenchId::Mul:     return in({'b','c'});
+    case BenchId::Add:     return in({'a','b','c'});
+    case BenchId::Triad:   return in({'a','b','c'});
+    case BenchId::Dot:     return in({'a','b'});
+    case BenchId::Nstream: return in({'a','b','c'});
+    default:
+      std::cerr << "Unknown benchmark" << std::endl;
+      abort();
+  }
+}
+
+// Returns true if the benchmark needs to be run:
+inline bool run_benchmark(BenchId selection, Benchmark const& b) {
+  if (selection == BenchId::All) return true;
+  if (selection == BenchId::Classic && b.classic) return true;
+  return selection == b.id;
+}
diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index 9d63ff3f..4f5599a7 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -77,7 +77,8 @@ void free_host(T* p) {
 }
 
 template <class T>
-CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
+CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
   : array_size(array_size)
 {
   // Set device
@@ -131,14 +132,20 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
 
   // Check buffers fit on the device
-  if (dprop.totalGlobalMem < total_bytes)
+  if (dprop.totalGlobalMem < total_bytes) {
+    std::cerr << "Requested array size of " << total_bytes * 1e-9
+              << " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !"
<< std::endl; throw std::runtime_error("Device does not have enough memory for all buffers"); + } // Allocate buffers: d_a = alloc_device(array_size); d_b = alloc_device(array_size); d_c = alloc_device(array_size); sums = alloc_host(dot_num_blocks); + + // Initialize buffers: + init_arrays(initA, initB, initC); } template @@ -204,21 +211,26 @@ void CUDAStream::init_arrays(T initA, T initB, T initC) } template -void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void CUDAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - // Copy device memory to host -#if defined(PAGEFAULT) || defined(MANAGED) CU(cudaStreamSynchronize(stream)); - for (intptr_t i = 0; i < array_size; ++i) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } +#if defined(PAGEFAULT) || defined(MANAGED) + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost)); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost)); #endif } diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 5b739569..50e099dc 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -26,27 +26,31 @@ class CUDAStream : public Stream intptr_t array_size; // Host array for partial sums for dot kernel - T *sums; + T* sums; // Device side pointers to arrays - T *d_a; - T *d_b; - T *d_c; + T* d_a; + T* d_b; + T* d_c; + + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; // Number of blocks for dot kernel intptr_t dot_num_blocks; public: - CUDAStream(const intptr_t, const int); + CUDAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~CUDAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp index ebd3633b..392ff898 100644 --- a/src/futhark/FutharkStream.cpp +++ b/src/futhark/FutharkStream.cpp @@ -11,9 +11,10 @@ #include "FutharkStream.h" template -FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) +FutharkStream::FutharkStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - this->array_size = ARRAY_SIZE; this->cfg = futhark_context_config_new(); this->device = "#" + std::to_string(device); #if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl) @@ -23,6 +24,7 @@ 
FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) this->a = NULL; this->b = NULL; this->c = NULL; + init_arrays(initA, initB, initC); } template <> @@ -98,19 +100,31 @@ void FutharkStream::init_arrays(double initA, double initB, double initC } template <> -void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(float const*& a_, float const*& b_, float const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> -void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(double const*& a_, double const*& b_, double const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h index 6290e79a..eabdabbe 100644 --- a/src/futhark/FutharkStream.h +++ b/src/futhark/FutharkStream.h @@ -44,17 +44,21 @@ class FutharkStream : public Stream void* b; void* c; + // Host side arrays for verification + std::vector h_a, h_b, h_c; + public: - FutharkStream(const int, int); + FutharkStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~FutharkStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake index edd21fa6..d7b08795 100644 --- a/src/futhark/model.cmake +++ b/src/futhark/model.cmake @@ -44,6 +44,7 @@ macro(setup) elseif (${FUTHARK_BACKEND} STREQUAL "cuda") find_package(CUDA REQUIRED) register_link_library("nvrtc" "cuda" "cudart") + set(CMAKE_C_COMPILER "nvcc") else () message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}") endif() diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ec02425a..e3878afd 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -25,7 +25,9 @@ void check_error(void) __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; } template -HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) +HIPStream::HIPStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { // Set device int count; @@ -47,13 +49,12 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) std::cout 
<< "Memory: DEFAULT" << std::endl; #endif - array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; + array_bytes *= array_size; + size_t total_bytes = array_bytes * std::size_t{3}; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires @@ -65,7 +66,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers @@ -88,6 +89,8 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) hipMalloc(&d_c, array_bytes); check_error(); #endif + + init_arrays(initA, initB, initC); } @@ -127,24 +130,28 @@ void HIPStream::init_arrays(T initA, T initB, T initC) } template -void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void HIPStream::get_arrays(T const*& a, T const*& b, T const*& c) { - - // Copy device memory to host + hipDeviceSynchronize(); #if defined(PAGEFAULT) || defined(MANAGED) - hipDeviceSynchronize(); - for (intptr_t i = 0; i < array_size; i++) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + hipMemcpy(h_a.data(), d_a, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_b.data(), d_b, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_c.data(), d_c, nbytes, hipMemcpyDeviceToHost); check_error(); #endif } diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..a1c45802 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -48,20 +48,21 @@ class HIPStream : public Stream T *d_b; T *d_c; + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; public: - - HIPStream(const intptr_t, const int); + HIPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git 
a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index e49d5bcc..fcbdb7a7 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -8,21 +8,23 @@ #include "KokkosStream.hpp" template -KokkosStream::KokkosStream( - const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE) +KokkosStream::KokkosStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { Kokkos::initialize(Kokkos::InitializationSettings().set_device_id(device_index)); - d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); - d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); - d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), array_size); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), array_size); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), array_size); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); *hm_a = create_mirror_view(*d_a); *hm_b = create_mirror_view(*d_b); *hm_c = create_mirror_view(*d_c); + + init_arrays(initA, initB, initC); } template @@ -47,18 +49,14 @@ void KokkosStream::init_arrays(T initA, T initB, T initC) } template -void KokkosStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void KokkosStream::get_arrays(T const*& a, T const*& b, T const*& c) { deep_copy(*hm_a, *d_a); deep_copy(*hm_b, *d_b); deep_copy(*hm_c, *d_c); - for(intptr_t ii = 0; ii < array_size; ++ii) - { - a[ii] = (*hm_a)(ii); - b[ii] = (*hm_b)(ii); - c[ii] = (*hm_c)(ii); - } + a = hm_a->data(); + b = hm_b->data(); + c = hm_c->data(); } template diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 8e40119c..bc3ac3ee 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -22,27 +22,27 @@ class KokkosStream : public Stream intptr_t array_size; // Device side pointers to arrays - typename Kokkos::View* d_a; - typename Kokkos::View* d_b; - typename Kokkos::View* d_c; - typename Kokkos::View::HostMirror* hm_a; - typename Kokkos::View::HostMirror* hm_b; - typename Kokkos::View::HostMirror* hm_c; + typename Kokkos::View* d_a; + typename Kokkos::View* d_b; + typename Kokkos::View* d_c; + typename Kokkos::View::HostMirror* hm_a; + typename Kokkos::View::HostMirror* hm_b; + typename Kokkos::View::HostMirror* hm_c; public: - KokkosStream(const intptr_t, const int); + KokkosStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~KokkosStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 7457eebd..2223c753 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,5 +1,5 @@ 
 register_flag_optional(CMAKE_CXX_COMPILER
-        "Any CXX compiler that is supported by CMake detection and RAJA.
+        "Any CXX compiler that is supported by CMake detection and Kokkos.
         See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
         "c++")
 
@@ -21,7 +21,7 @@ macro(setup)
     set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17
     cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
-
+    message("KOKKOS_IN_PACKAGE=${KOKKOS_IN_PACKAGE}")
     if (EXISTS "${KOKKOS_IN_TREE}")
         message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
         add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
diff --git a/src/main.cpp b/src/main.cpp
index c677f048..55c3a4e7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -23,7 +23,7 @@
 #include "Unit.h"
 
 // Default size of 2^25
-intptr_t ARRAY_SIZE = 33554432;
+intptr_t array_size = 33554432;
 size_t num_times = 100;
 size_t deviceIndex = 0;
 bool use_float = false;
@@ -33,42 +33,11 @@
 bool silence_errors = false;
 std::string csv_separator = ",";
 
-// Benchmark Identifier: identifies individual & groups of benchmarks:
-// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
-// - All: all kernels.
-// - Individual kernels only.
-enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
-
-struct Benchmark {
-  BenchId id;
-  char const* label;
-  // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
-  // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
-  size_t weight;
-  // Is it one of: Copy, Mul, Add, Triad, Dot?
-  bool classic = false;
-};
-
-// Benchmarks in the order in which - if present - should be run for validation purposes:
-constexpr size_t num_benchmarks = 6;
-std::array<Benchmark, num_benchmarks> bench = {
-  Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true },
-  Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true },
-  Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true },
-  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
-};
-
 // Selected benchmarks to run: default is all 5 classic benchmarks.
 BenchId selection = BenchId::Classic;
 
 // Returns true if the benchmark needs to be run:
-bool run_benchmark(Benchmark const& b) {
-  if (selection == BenchId::All) return true;
-  if (selection == BenchId::Classic && b.classic) return true;
-  return selection == b.id;
-}
+bool run_benchmark(Benchmark const& b) { return run_benchmark(selection, b); }
 
 // Benchmark run order
 // - Classic: runs each bench once in the order above, and repeats n times.
@@ -174,8 +143,7 @@ std::vector<std::vector<double>> run_all(std::unique_ptr<Stream<T>>& stream, T&
 }
 
 template <typename T>
-void check_solution(const size_t ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c,
-                    T& sum);
+void check_solution(const size_t ntimes, T const* a, T const* b, T const* c, T sum);
 
 // Generic run routine
 // Runs the kernel(s) and prints output.
@@ -186,7 +154,7 @@ void run()
 
   // Formatting utilities:
   auto fmt_bw = [&](size_t weight, double dt) {
-    return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt);
+    return unit.fmt((weight * sizeof(T) * array_size)/dt);
   };
   auto fmt_csv_header = [] {
     std::cout
@@ -251,46 +219,37 @@ void run()
     default: std::cerr << "Error: Unknown order" << std::endl; abort();
   };
   std::cout << " order " << std::endl;
-  std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
+  std::cout << "Number of elements: " << array_size << std::endl;
   std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl;
 
-  size_t nbytes = ARRAY_SIZE * sizeof(T);
+  size_t nbytes = array_size * sizeof(T);
   std::cout << std::setprecision(1) << std::fixed
             << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl;
   std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl;
   std::cout.precision(ss);
 }
 
-  std::unique_ptr<Stream<T>> stream = make_stream<T>(ARRAY_SIZE, deviceIndex);
-  auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); });
+  std::unique_ptr<Stream<T>> stream
+    = make_stream<T>(selection, array_size, deviceIndex, startA, startB, startC);
 
   // Result of the Dot kernel, if used.
   T sum{};
   std::vector<std::vector<double>> timings = run_all<T>(stream, sum);
 
   // Create & read host vectors:
-  std::vector<T> a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE);
-  auto readElapsedS = time([&] { stream->read_arrays(a, b, c); });
+  T const* a;
+  T const* b;
+  T const* c;
+  stream->get_arrays(a, b, c);
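+  // With Unified Memory (PAGEFAULT/MANAGED, USM) builds these pointers may
+  // alias the device allocations directly; other backends return pointers to
+  // host copies owned by the Stream implementation.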
   check_solution<T>(num_times, a, b, c, sum);
 
-  auto initBWps = fmt_bw(3, initElapsedS);
-  auto readBWps = fmt_bw(3, readElapsedS);
   if (output_as_csv)
   {
     fmt_csv_header();
-    fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS);
-    fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS);
   }
   else
   {
-    std::cout << "Init: "
-              << std::setw(7)
-              << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl;
-    std::cout << "Read: "
-              << std::setw(7)
-              << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl;
-
     std::cout
       << std::left << std::setw(12) << "Function"
       << std::left << std::setw(12) << (std::string(unit.str()) + "/s")
@@ -313,15 +272,13 @@ void run()
       / (double)(num_times - 1);
 
     // Display results
-    fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T),
+    fmt_result(bench[i].label, num_times, array_size, sizeof(T),
                fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average);
   }
 }
 
 template <typename T>
-void check_solution(const size_t num_times,
-                    std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
-{
+void check_solution(const size_t num_times, T const* a, T const* b, T const* c, T sum) {
   // Generate correct solution
   T goldA = startA;
   T goldB = startB;
@@ -338,7 +295,7 @@ void check_solution(const size_t num_times,
       case BenchId::Add:     goldC = goldA + goldB; break;
      case BenchId::Triad:   goldA = goldB + scalar * goldC; break;
       case BenchId::Nstream: goldA += goldB + scalar * goldC; break;
-      case BenchId::Dot:     goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly
+      case BenchId::Dot:     goldS = goldA * goldB * T(array_size); break; // This calculates the answer exactly
       default:
         std::cerr << "Unimplemented Check: " << bench[b].label << std::endl;
         abort();
@@ -372,38 +329,38 @@ void check_solution(const size_t num_times,
 
   // Error relative tolerance check
   size_t failed = 0;
-  T eps = std::numeric_limits<T>::epsilon();
-  T epsi = eps * T(100000.0);
+  T max_rel = std::numeric_limits<T>::epsilon() * T(1000000.0);
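+  // i.e. roughly 2.2e-16 * 1e6 ≈ 2.2e-10 for double, and 1.2e-7 * 1e6 ≈ 0.12
+  // for float.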
-  auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) {
-    if (e > epsi || std::isnan(e) || std::isnan(is)) {
+  auto check = [&](const char* name, T is, T should, size_t i = size_t(-1)) {
+    // Relative difference:
+    T diff = std::abs(is - should);
+    T abs_is = std::abs(is);
+    T abs_sh = std::abs(should);
+    T largest = std::max(abs_is, abs_sh);
+    bool same = diff <= largest * max_rel;
+    if (!same || std::isnan(is)) {
       ++failed;
       if (failed > 10) return;
       std::cerr << "FAILED validation of " << name;
       if (i != size_t(-1)) std::cerr << "[" << i << "]";
-      std::cerr << ": " << is << " != " << should
-                << ", relative error=" << e << " > " << epsi << std::endl;
+      std::cerr << ": " << is << " (is) != " << should
+                << " (should)" << ", diff=" << diff << " > "
+                << largest * max_rel << std::endl;
     }
   };
 
   // Sum
-  T eS = std::fabs(sum - goldS) / std::fabs(goldS + eps);
   for (size_t i = 0; i < num_benchmarks; ++i) {
     if (bench[i].id != BenchId::Dot) continue;
     if (run_benchmark(bench[i]))
-      check("sum", sum, goldS, eS);
+      check("sum", sum, goldS);
     break;
   }
 
   // Calculate the L^infty-norm relative error
-  for (size_t i = 0; i < a.size(); ++i) {
-    T vA = a[i], vB = b[i], vC = c[i];
-    T eA = std::fabs(vA - goldA) / std::fabs(goldA + eps);
-    T eB = std::fabs(vB - goldB) / std::fabs(goldB + eps);
-    T eC = std::fabs(vC - goldC) / std::fabs(goldC + eps);
-
-    check("a", a[i], goldA, eA, i);
-    check("b", b[i], goldB, eB, i);
-    check("c", c[i], goldC, eC, i);
+  for (size_t i = 0; i < array_size; ++i) {
+    check("a", a[i], goldA, i);
+    check("b", b[i], goldB, i);
+    check("c", c[i], goldC, i);
   }
 
   if (failed > 0 && !silence_errors)
@@ -449,13 +406,11 @@ void parseArguments(int argc, char *argv[])
     else if (!std::string("--arraysize").compare(argv[i]) ||
              !std::string("-s").compare(argv[i]))
     {
-      intptr_t array_size;
       if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0)
       {
         std::cerr << "Invalid array size."
<< std::endl; std::exit(EXIT_FAILURE); } - ARRAY_SIZE = array_size; } else if (!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i])) diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp index c70a701d..fc1ae30c 100644 --- a/src/ocl/OCLStream.cpp +++ b/src/ocl/OCLStream.cpp @@ -100,8 +100,9 @@ std::string kernels{R"CLC( template -OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE} +OCLStream::OCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size{array_size} { if (!cached) getDeviceList(); @@ -172,18 +173,20 @@ OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device cl_ulong totalmem = device.getInfo(); cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < sizeof(T)*ARRAY_SIZE) + if (maxbuffer < sizeof(T)*array_size) throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + if (totalmem < 3*sizeof(T)*array_size) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create buffers - d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); sums = std::vector(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -277,11 +280,17 @@ void OCLStream::init_arrays(T initA, T initB, T initC) } template -void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void OCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { - cl::copy(queue, d_a, a.begin(), a.end()); - cl::copy(queue, d_b, b.begin(), b.end()); - cl::copy(queue, d_c, c.begin(), c.end()); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + cl::copy(queue, d_a, h_a.begin(), h_a.end()); + cl::copy(queue, d_b, h_b.begin(), h_b.end()); + cl::copy(queue, d_c, h_c.begin(), h_c.end()); } void getDeviceList(void) diff --git a/src/ocl/OCLStream.h b/src/ocl/OCLStream.h index e2366dad..e5405dde 100644 --- a/src/ocl/OCLStream.h +++ b/src/ocl/OCLStream.h @@ -42,6 +42,9 @@ class OCLStream : public Stream cl::Buffer d_c; cl::Buffer d_sum; + // Host-side arrays for verification + std::vector h_a, h_b, h_c; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; @@ -56,19 +59,19 @@ class OCLStream : public Stream public: - OCLStream(const intptr_t, const int); + OCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T 
const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 09b749fd..f0389373 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -13,10 +13,10 @@ #endif template -OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) +OMPStream::OMPStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - array_size = ARRAY_SIZE; - // Allocate on the host this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); @@ -32,6 +32,7 @@ OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) {} #endif + init_arrays(initA, initB, initC); } template @@ -77,7 +78,7 @@ void OMPStream::init_arrays(T initA, T initB, T initC) } template -void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void OMPStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { #ifdef OMP_TARGET_GPU @@ -87,15 +88,9 @@ void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) {} #endif - - #pragma omp parallel for - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } - + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/omp/OMPStream.h b/src/omp/OMPStream.h index 40770005..fca4906c 100644 --- a/src/omp/OMPStream.h +++ b/src/omp/OMPStream.h @@ -29,16 +29,17 @@ class OMPStream : public Stream T *c; public: - OMPStream(const intptr_t, int); + OMPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OMPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp index 6d6e8342..35fe6e8d 100644 --- a/src/raja/RAJAStream.cpp +++ b/src/raja/RAJAStream.cpp @@ -16,8 +16,9 @@ using RAJA::forall; #endif template -RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE) +RAJAStream::RAJAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size), range(0, array_size) { #ifdef RAJA_TARGET_CPU @@ -25,11 +26,13 @@ RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #else - cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_a, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*array_size, cudaMemAttachGlobal); + 
cudaMallocManaged((void**)&d_c, sizeof(T)*array_size, cudaMemAttachGlobal); cudaDeviceSynchronize(); #endif + + init_arrays(initA, initB, initC); } template @@ -61,12 +64,11 @@ void RAJAStream::init_arrays(T initA, T initB, T initC) } template -void RAJAStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - std::copy(d_a, d_a + array_size, a.data()); - std::copy(d_b, d_b + array_size, b.data()); - std::copy(d_c, d_c + array_size, c.data()); + a = d_a; + b = d_b; + c = d_c; } template diff --git a/src/raja/RAJAStream.hpp b/src/raja/RAJAStream.hpp index e98b0778..a2565ccc 100644 --- a/src/raja/RAJAStream.hpp +++ b/src/raja/RAJAStream.hpp @@ -50,19 +50,18 @@ class RAJAStream : public Stream T* d_c; public: - - RAJAStream(const intptr_t, const int); + RAJAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~RAJAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 3efeb1b3..8c280f8a 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -7,9 +7,10 @@ #include "STDDataStream.h" template -STDDataStream::STDDataStream(const intptr_t ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +STDDataStream::STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -25,6 +26,7 @@ STDDataStream::STDDataStream(const intptr_t ARRAY_SIZE, int device) #endif std::cout << std::endl; #endif + init_arrays(initA, initB, initC); } template @@ -43,11 +45,11 @@ void STDDataStream::init_arrays(T initA, T initB, T initC) } template -void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDDataStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index d92864be..6db998b2 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -25,17 +25,18 @@ class STDDataStream : public Stream T *a, *b, *c; public: - STDDataStream(const intptr_t, int) noexcept; + STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; ~STDDataStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void 
copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 473d93d0..4f8efe20 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -11,9 +11,10 @@ #endif template -STDIndicesStream::STDIndicesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, range(0, array_size), - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +STDIndicesStream::STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) +noexcept : array_size{array_size}, range(0, array_size), + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -29,6 +30,7 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #endif std::cout << std::endl; #endif + init_arrays(initA, initB, initC); } template @@ -47,11 +49,11 @@ void STDIndicesStream::init_arrays(T initA, T initB, T initC) } template -void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDIndicesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 8a8f5de8..7a43b1ec 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -80,17 +80,18 @@ class STDIndicesStream : public Stream T *a, *b, *c; public: - STDIndicesStream(const intptr_t, int) noexcept; + STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; ~STDIndicesStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 8b7ada4b..02bd56b2 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -12,9 +12,10 @@ #endif template -STDRangesStream::STDRangesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +STDRangesStream::STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) { std::cout 
<< "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -30,6 +31,7 @@ noexcept : array_size{ARRAY_SIZE}, #endif std::cout << std::endl; #endif + init_arrays(initA, initB, initC); } template @@ -54,12 +56,11 @@ void STDRangesStream::init_arrays(T initA, T initB, T initC) } template -void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDRangesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - // Element-wise copy. - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 51680c62..da04f1f4 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -24,18 +24,18 @@ class STDRangesStream : public Stream T *a, *b, *c; public: - STDRangesStream(const intptr_t, int) noexcept; + STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; ~STDRangesStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index e99454e6..5c00211e 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -17,13 +17,13 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; @@ -79,6 +79,8 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = new buffer(array_size); d_c = new buffer(array_size); d_sum = new buffer(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -238,17 +240,14 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { auto _a = d_a->template get_access(); auto _b = d_b->template get_access(); auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } + a = &_a[0]; + b = &_b[0]; + c = &_c[0]; } void getDeviceList(void) diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 1a40242d..94c3c4e9 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -54,19 +54,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~SYCLStream(); - virtual void copy() override; - virtual void add() 
override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake index 3826c3c7..72aa7c40 100644 --- a/src/sycl/model.cmake +++ b/src/sycl/model.cmake @@ -9,22 +9,34 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") macro(setup) set(CMAKE_CXX_STANDARD 17) + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") + endif () + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(AdaptiveCpp CONFIG REQUIRED) + message(STATUS "ok") + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) if (NOT EXISTS "${hipSYCL_DIR}") @@ -38,7 +50,6 @@ macro(setup) # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) find_package(hipSYCL CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) @@ -62,7 +73,14 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") # so hipSYCL has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here diff --git 
a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp
index 742be95b..d0f97e68 100644
--- a/src/sycl2020-acc/SYCLStream2020.cpp
+++ b/src/sycl2020-acc/SYCLStream2020.cpp
@@ -15,11 +15,12 @@ std::vector<sycl::device> devices;
 void getDeviceList(void);
 
 template <class T>
-SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
-: array_size {ARRAY_SIZE},
-  d_a {ARRAY_SIZE},
-  d_b {ARRAY_SIZE},
-  d_c {ARRAY_SIZE},
+SYCLStream<T>::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
+  : array_size(array_size),
+    d_a {array_size},
+    d_b {array_size},
+    d_c {array_size},
   d_sum {1}
 {
   if (!cached)
@@ -68,7 +69,7 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
 
   devices.clear();
   cached = true;
-
+  init_arrays(initA, initB, initC);
 }
 
@@ -164,18 +165,17 @@ T SYCLStream<T>::dot()
     sycl::accessor kb {d_b, cgh, sycl::read_only};
 
     cgh.parallel_for(sycl::range<1>{array_size},
       // Reduction object, to perform summation - initialises the result to zero
-      // hipSYCL doesn't sypport the initialize_to_identity property yet
-#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
+      // AdaptiveCpp doesn't support the initialize_to_identity property yet
+#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__)
       sycl::reduction(d_sum.template get_access<sycl::access_mode::read_write>(cgh), sycl::plus<T>()),
 #else
       sycl::reduction(d_sum, cgh, sycl::plus<T>(), sycl::property::reduction::initialize_to_identity{}),
 #endif
       [=](sycl::id<1> idx, auto& sum) {
         sum += ka[idx] * kb[idx];
       });
-
   });
 
   // Get access on the host, and return a copy of the data (single number)
@@ -206,17 +206,14 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 }
 
 template <class T>
-void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+void SYCLStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
 {
   sycl::host_accessor _a {d_a, sycl::read_only};
   sycl::host_accessor _b {d_b, sycl::read_only};
   sycl::host_accessor _c {d_c, sycl::read_only};
-  for (int i = 0; i < array_size; i++)
-  {
-    a[i] = _a[i];
-    b[i] = _b[i];
-    c[i] = _c[i];
-  }
+  a = &_a[0];
+  b = &_b[0];
+  c = &_c[0];
 }
 
 void getDeviceList(void)
diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h
index cd515f87..c0caae2e 100644
--- a/src/sycl2020-acc/SYCLStream2020.h
+++ b/src/sycl2020-acc/SYCLStream2020.h
@@ -35,19 +35,19 @@ class SYCLStream : public Stream<T>
 
   public:
-  SYCLStream(const intptr_t, const int);
+  SYCLStream(BenchId bs, const intptr_t array_size, const int device_id,
+             T initA, T initB, T initC);
   ~SYCLStream() = default;
 
-  virtual void copy() override;
-  virtual void add() override;
-  virtual void mul() override;
-  virtual void triad() override;
-  virtual void nstream() override;
-  virtual T dot() override;
-
-  virtual void init_arrays(T initA, T initB, T initC) override;
-  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+  void copy() override;
+  void add() override;
+  void mul() override;
+  void triad() override;
+  void nstream() override;
+  T dot() override;
 
+  void get_arrays(T const*& a, T const*& b, T const*& c) override;
+  void init_arrays(T initA, T initB, T initC);
 };
 
 // Populate the devices list
diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake
index 3826c3c7..9847b348 100644
--- a/src/sycl2020-acc/model.cmake
+++
diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake
index 3826c3c7..9847b348 100644
--- a/src/sycl2020-acc/model.cmake
+++ b/src/sycl2020-acc/model.cmake
@@ -9,22 +9,35 @@ register_flag_required(SYCL_COMPILER
             ONEAPI-ICPX  - icpx as a standalone compiler
             ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`)
             DPCPP        - dpc++ as a standalone compiler (https://github.com/intel/llvm)
-            HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)")
+            HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
+            AdaptiveCpp  - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)")

 register_flag_optional(SYCL_COMPILER_DIR
         "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
             ONEAPI-ICPX  - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
             ONEAPI-Clang - set to the directory that contains the Intel clang++ binary.
-            HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
+            AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
         "")

 macro(setup)
     set(CMAKE_CXX_STANDARD 17)

-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp)
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure")
+            set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
+        endif ()
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp")
+        endif ()
+        # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
+        find_package(AdaptiveCpp CONFIG REQUIRED)
+        message(STATUS "ok")
+    elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL")
         set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)

         if (NOT EXISTS "${hipSYCL_DIR}")
@@ -38,7 +51,6 @@ macro(setup)
         # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
         find_package(hipSYCL CONFIG REQUIRED)
         message(STATUS "ok")
-
     elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
         set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
         include_directories(${SYCL_COMPILER_DIR}/include/sycl)
@@ -62,7 +74,14 @@ endmacro()

 macro(setup_target NAME)
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        # so AdaptiveCpp has this weird (and bad) CMake usage where they append their
+        # own custom integration header flags AFTER the target has been specified
+        # hence this macro here
+        add_sycl_to_target(
+                TARGET ${NAME}
+                SOURCES ${IMPL_SOURCES})
+    elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL")
         # so hipSYCL has this weird (and bad) CMake usage where they append their
         # own custom integration header flags AFTER the target has been specified
         # hence this macro here
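For context, the read_arrays-to-get_arrays change inverts ownership: the driver
no longer supplies host vectors to copy into, it borrows read-only views from
the stream. A sketch of the intended call site; the function name and the
verification loop are illustrative, not code from this patch:

    // Hypothetical driver-side use of the new interface.
    template <class T>
    void check_results(Stream<T>& stream, intptr_t array_size)
    {
      T const *a = nullptr, *b = nullptr, *c = nullptr;
      stream.get_arrays(a, b, c); // views into the benchmark's own storage
      for (intptr_t i = 0; i < array_size; i++)
      {
        // compare a[i], b[i], c[i] against the expected analytic values here
      }
    }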
diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp
index e4c6ec27..c8b863ad 100644
--- a/src/sycl2020-usm/SYCLStream2020.cpp
+++ b/src/sycl2020-usm/SYCLStream2020.cpp
@@ -15,8 +15,9 @@ std::vector<sycl::device> devices;
 void getDeviceList(void);

 template <class T>
-SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
-: array_size {ARRAY_SIZE}
+SYCLStream<T>::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+                          T initA, T initB, T initC)
+  : array_size(array_size)
 {
   if (!cached)
     getDeviceList();
@@ -69,7 +70,7 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)

   devices.clear();
   cached = true;
-
+  init_arrays(initA, initB, initC);
 }

 template <class T>
@@ -156,8 +157,8 @@ T SYCLStream<T>::dot()
   {
     cgh.parallel_for(sycl::range<1>{array_size},
       // Reduction object, to perform summation - initialises the result to zero
-      // hipSYCL doesn't sypport the initialize_to_identity property yet
-#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
+      // AdaptiveCpp doesn't support the initialize_to_identity property yet
+#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__)
       sycl::reduction(sum, sycl::plus<T>()),
 #else
       sycl::reduction(sum, sycl::plus<T>(), sycl::property::reduction::initialize_to_identity{}),
@@ -166,7 +167,6 @@ T SYCLStream<T>::dot()
       {
         sum += a[idx] * b[idx];
       });
-
   });

   queue->wait();
   return *sum;
@@ -189,14 +189,11 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 }

 template <class T>
-void SYCLStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void SYCLStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  for (int i = 0; i < array_size; i++)
-  {
-    h_a[i] = a[i];
-    h_b[i] = b[i];
-    h_c[i] = c[i];
-  }
+  h_a = a;
+  h_b = b;
+  h_c = c;
 }

 void getDeviceList(void)
diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h
index 811c26ef..c88c87a3 100644
--- a/src/sycl2020-usm/SYCLStream2020.h
+++ b/src/sycl2020-usm/SYCLStream2020.h
@@ -35,19 +35,19 @@ class SYCLStream : public Stream<T>

   public:
-    SYCLStream(const intptr_t, const int);
+    SYCLStream(BenchId bs, const intptr_t array_size, const int device_index,
+               T initA, T initB, T initC);
     ~SYCLStream();

-    virtual void copy() override;
-    virtual void add() override;
-    virtual void mul() override;
-    virtual void triad() override;
-    virtual void nstream() override;
-    virtual T dot() override;
-
-    virtual void init_arrays(T initA, T initB, T initC) override;
-    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+    void copy() override;
+    void add() override;
+    void mul() override;
+    void triad() override;
+    void nstream() override;
+    T dot() override;
+    void get_arrays(T const*& a, T const*& b, T const*& c) override;
+    void init_arrays(T initA, T initB, T initC);
 };

 // Populate the devices list
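This USM variant is the case the commit message targets: because the arrays are
unified-memory allocations that are dereferenceable on the host, get_arrays can
return the device pointers directly and the driver verifies results with no
staging copy. A minimal sketch of the underlying assumption (the allocation
itself is unchanged by these hunks):

    // Sketch only: with shared USM the same pointer is valid on host and
    // device, so handing it to the verification code involves no copies.
    T* a = sycl::malloc_shared<T>(array_size, *queue);
    // ... kernels read and write a ...
    T const* host_view = a; // what get_arrays() returns to the driver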
diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020-usm/model.cmake
index 950daefd..72aa7c40 100644
--- a/src/sycl2020-usm/model.cmake
+++ b/src/sycl2020-usm/model.cmake
@@ -9,23 +9,34 @@ register_flag_required(SYCL_COMPILER
             ONEAPI-ICPX  - icpx as a standalone compiler
             ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`)
             DPCPP        - dpc++ as a standalone compiler (https://github.com/intel/llvm)
-            HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)")
+            HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
+            AdaptiveCpp  - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)")

 register_flag_optional(SYCL_COMPILER_DIR
         "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
             ONEAPI-ICPX  - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
             ONEAPI-Clang - set to the directory that contains the Intel clang++ binary.
-            HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
+            AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
         "")
-
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp)
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
-
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure")
+            set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
+        endif ()
+        if (NOT EXISTS "${AdaptiveCpp_DIR}")
+            message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp")
+        endif ()
+        # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
+        find_package(AdaptiveCpp CONFIG REQUIRED)
+        message(STATUS "ok")
+    elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL")
         set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)

         if (NOT EXISTS "${hipSYCL_DIR}")
@@ -39,7 +50,6 @@ macro(setup)
         # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
         find_package(hipSYCL CONFIG REQUIRED)
         message(STATUS "ok")
-
     elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
         set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
         include_directories(${SYCL_COMPILER_DIR}/include/sycl)
@@ -63,7 +73,14 @@ endmacro()

 macro(setup_target NAME)
-    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
+    if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp")
+        # so AdaptiveCpp has this weird (and bad) CMake usage where they append their
+        # own custom integration header flags AFTER the target has been specified
+        # hence this macro here
+        add_sycl_to_target(
+                TARGET ${NAME}
+                SOURCES ${IMPL_SOURCES})
+    elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL")
         # so hipSYCL has this weird (and bad) CMake usage where they append their
         # own custom integration header flags AFTER the target has been specified
         # hence this macro here
diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp
index 75af6141..01508022 100644
--- a/src/tbb/TBBStream.cpp
+++ b/src/tbb/TBBStream.cpp
@@ -20,15 +20,16 @@
 #endif

 template <class T>
-TBBStream<T>::TBBStream(const intptr_t ARRAY_SIZE, int device)
-  : partitioner(), range(0, (size_t)ARRAY_SIZE),
+TBBStream<T>::TBBStream(BenchId bs, const intptr_t array_size, const int device,
+                        T initA, T initB, T initC)
+  : partitioner(), range(0, (size_t)array_size),
 #ifdef USE_VECTOR
-    a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
+    a(array_size), b(array_size), c(array_size)
 #else
-    array_size(ARRAY_SIZE),
-    a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-    b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)),
-    c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE))
+    array_size(array_size),
+    a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)),
+    b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)),
+    c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size))
 #endif
 {
   if(device != 0){
@@ -36,6 +37,8 @@ TBBStream<T>::TBBStream(const intptr_t ARRAY_SIZE, int device)
   }
   std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl;
   std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
+
+  init_arrays(initA, initB, initC);
 }

@@ -54,12 +57,17 @@ void TBBStream<T>::init_arrays(T initA, T initB, T initC)
 }

 template <class T>
-void TBBStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+void TBBStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
 {
-  // Element-wise copy.
-  std::copy(BEGIN(a), END(a), h_a.begin());
-  std::copy(BEGIN(b), END(b), h_b.begin());
-  std::copy(BEGIN(c), END(c), h_c.begin());
+#ifdef USE_VECTOR
+  h_a = a.data();
+  h_b = b.data();
+  h_c = c.data();
+#else
+  h_a = a;
+  h_b = b;
+  h_c = c;
+#endif
 }

 template <class T>
diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp
index 80f11c17..0a73e892 100644
--- a/src/tbb/TBBStream.hpp
+++ b/src/tbb/TBBStream.hpp
@@ -31,7 +31,6 @@ using tbb_partitioner = tbb::auto_partitioner;
 #define PARTITIONER_NAME "auto_partitioner"
 #endif

-
 template <class T>
 class TBBStream : public Stream<T>
 {
@@ -48,17 +47,17 @@ class TBBStream : public Stream<T>
 #endif

   public:
-    TBBStream(const intptr_t, int);
+    TBBStream(BenchId bs, const intptr_t array_size, const int device_id,
+              T initA, T initB, T initC);
     ~TBBStream() = default;

-    virtual void copy() override;
-    virtual void add() override;
-    virtual void mul() override;
-    virtual void triad() override;
-    virtual void nstream() override;
-    virtual T dot() override;
+    void copy() override;
+    void add() override;
+    void mul() override;
+    void triad() override;
+    void nstream() override;
+    T dot() override;

-    virtual void init_arrays(T initA, T initB, T initC) override;
-    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+    void get_arrays(T const*& a, T const*& b, T const*& c) override;
+    void init_arrays(T initA, T initB, T initC);
 };
-
diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 84b27b8e..321470b8 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -19,7 +19,8 @@ static inline void synchronise()
 }

 template <class T>
-ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
+ThrustStream<T>::ThrustStream(BenchId bs, const intptr_t array_size, const int device,
+                              T initA, T initB, T initC)
   : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
@@ -36,8 +37,6 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
   std::cout << "Thrust backend: TBB" << std::endl;
 #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
   std::cout << "Thrust backend: CPP" << std::endl;
-#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
-  std::cout << "Thrust backend: TBB" << std::endl;
 #else

 #if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP
@@ -48,6 +47,7 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)

 #endif

+  init_arrays(initA, initB, initC);
 }

 template <class T>
@@ -60,11 +60,23 @@ void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
 }

 template <class T>
-void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
-{
+void ThrustStream<T>::get_arrays(T const*& a_, T const*& b_, T const*& c_)
+{
+  #if defined(MANAGED)
+  a_ = &*a.data();
+  b_ = &*b.data();
+  c_ = &*c.data();
+  #else
+  h_a.resize(array_size);
+  h_b.resize(array_size);
+  h_c.resize(array_size);
   thrust::copy(a.begin(), a.end(), h_a.begin());
   thrust::copy(b.begin(), b.end(), h_b.begin());
   thrust::copy(c.begin(), c.end(), h_c.begin());
+  a_ = h_a.data();
+  b_ = h_b.data();
+  c_ = h_c.data();
+  #endif
 }

 template <class T>
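One subtlety in the MANAGED branch of get_arrays above: universal_vector's
data() returns a thrust::pointer wrapper rather than a raw T*, which is
presumably why the patch spells the conversion &*a.data().
thrust::raw_pointer_cast expresses the same unwrapping more explicitly:

    // Equivalent, more explicit form of &*a.data() for a universal_vector.
    T const* raw = thrust::raw_pointer_cast(a.data());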
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index b0acd80f..676ecaeb 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -26,28 +26,25 @@ class ThrustStream : public Stream<T>
   intptr_t array_size;

 #if defined(MANAGED)
-  thrust::universtal_vector<T> a;
-  thrust::universtal_vector<T> b;
-  thrust::universtal_vector<T> c;
+  thrust::universal_vector<T> a, b, c;
 #else
-  thrust::device_vector<T> a;
-  thrust::device_vector<T> b;
-  thrust::device_vector<T> c;
+  thrust::device_vector<T> a, b, c;
+  std::vector<T> h_a, h_b, h_c;
 #endif

   public:
-    ThrustStream(const intptr_t, int);
+    ThrustStream(BenchId bs, const intptr_t array_size, const int device_id,
+                 T initA, T initB, T initC);
     ~ThrustStream() = default;

-    virtual void copy() override;
-    virtual void add() override;
-    virtual void mul() override;
-    virtual void triad() override;
-    virtual void nstream() override;
-    virtual T dot() override;
-
-    virtual void init_arrays(T initA, T initB, T initC) override;
-    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+    void copy() override;
+    void add() override;
+    void mul() override;
+    void triad() override;
+    void nstream() override;
+    T dot() override;
+    void get_arrays(T const*& a, T const*& b, T const*& c) override;
+    void init_arrays(T initA, T initB, T initC);
 };

diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake
index 6b82ef59..23627c11 100644
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@@ -18,8 +18,7 @@ register_flag_optional(BACKEND
         "
         "CUDA")

-    register_flag_optional(MANAGED "Enabled managed memory mode."
-        "OFF")
+register_flag_optional(MANAGED "Enabled managed memory mode." "OFF")

 register_flag_optional(CMAKE_CUDA_COMPILER
         "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"