diff --git a/clic/include/array.hpp b/clic/include/array.hpp
index ca24a6d30..a9ed18572 100644
--- a/clic/include/array.hpp
+++ b/clic/include/array.hpp
@@ -53,7 +53,7 @@ class Array : public std::enable_shared_from_this<Array>
   fill(const float & value) const -> void;
 
   [[nodiscard]] auto
-  nbElements() const -> size_t;
+  size() const -> size_t;
   [[nodiscard]] auto
   width() const -> size_t;
   [[nodiscard]] auto
@@ -61,7 +61,7 @@ class Array : public std::enable_shared_from_this<Array>
   [[nodiscard]] auto
   depth() const -> size_t;
   [[nodiscard]] auto
-  bytesPerElements() const -> size_t;
+  itemSize() const -> size_t;
   [[nodiscard]] auto
   dtype() const -> dType;
   [[nodiscard]] auto
diff --git a/clic/include/device.hpp b/clic/include/device.hpp
index 09973db7b..38edd8cf4 100644
--- a/clic/include/device.hpp
+++ b/clic/include/device.hpp
@@ -162,6 +162,8 @@ class CUDADevice : public Device
   [[nodiscard]] auto
   getInfo() const -> std::string override;
   [[nodiscard]] auto
+  getArch() const -> std::string;
+  [[nodiscard]] auto
   getCache() -> std::map<std::string, CUmodule> &;
 
 private:
diff --git a/clic/include/execution.hpp b/clic/include/execution.hpp
index 809ff485a..29e797779 100644
--- a/clic/include/execution.hpp
+++ b/clic/include/execution.hpp
@@ -38,6 +38,13 @@ execute(const Device::Pointer & device,
         const RangeArray &      global_range = { 1, 1, 1 },
         const ConstantList &    constants = {}) -> void;
 
+auto
+native_execute(const Device::Pointer & device,
+               const KernelInfo &      kernel_func,
+               const ParameterList &   parameters,
+               const RangeArray &      global_range = { 1, 1, 1 },
+               const RangeArray &      local_range = { 1, 1, 1 }) -> void;
+
 auto
 loadSource(const std::string & source_path) -> std::string;
 
diff --git a/clic/include/utils.hpp b/clic/include/utils.hpp
index 8d1a854ae..4c6fe0029 100644
--- a/clic/include/utils.hpp
+++ b/clic/include/utils.hpp
@@ -41,57 +41,57 @@ enum class dType
 };
 
 inline auto
-operator<<(std::ostream & out, const dType & dtype) -> std::ostream &
+toString(const dType & dtype) -> std::string
 {
   switch (dtype)
   {
     case dType::FLOAT:
-      out << "float";
-      break;
+      return "float";
     case dType::INT32:
-      out << "int";
-      break;
+      return "int";
     case dType::UINT32:
-      out << "uint";
-      break;
+      return "uint";
     case dType::INT8:
-      out << "char";
-      break;
+      return "char";
     case dType::UINT8:
-      out << "uchar";
-      break;
+      return "uchar";
     case dType::INT16:
-      out << "short";
-      break;
+      return "short";
     case dType::UINT16:
-      out << "ushort";
-      break;
+      return "ushort";
     case dType::INT64:
-      out << "long";
-      break;
+      return "long";
     case dType::UINT64:
-      out << "ulong";
-      break;
+      return "ulong";
     default:
-      out << "unknown";
-      break;
+      return "unknown";
   }
-  return out;
 }
 
 inline auto
-operator<<(std::ostream & out, const mType & mtype) -> std::ostream &
+operator<<(std::ostream & out, const dType & dtype) -> std::ostream &
+{
+  return out << toString(dtype);
+}
+
+inline auto
+toString(const mType & mtype) -> std::string
 {
   switch (mtype)
   {
     case mType::BUFFER:
-      out << "Buffer";
-      break;
+      return "Buffer";
     case mType::IMAGE:
-      out << "Image";
-      break;
+      return "Image";
+    default:
+      return "unknown";
   }
-  return out;
+}
+
+inline auto
+operator<<(std::ostream & out, const mType & mtype) -> std::ostream &
+{
+  return out << toString(mtype);
 }
 
 template <typename T>
diff --git a/clic/src/array.cpp b/clic/src/array.cpp
index fe6233e9d..2ed8f5571 100644
--- a/clic/src/array.cpp
+++ b/clic/src/array.cpp
@@ -90,6 +90,10 @@ Array::allocate() -> void
 auto
 Array::write(const void * host_data) -> void
 {
+  if (host_data == nullptr)
+  {
+    throw std::runtime_error("Error: host_data is null");
+  }
   if (!initialized())
   {
     allocate();
@@ -101,6 +105,10 @@ Array::write(const void * host_data) -> void
 auto
 Array::read(void * host_data) const -> void
 {
+  if (host_data == nullptr)
+  {
+    throw std::runtime_error("Error: host_data is null");
+  }
   if (!initialized())
   {
     throw std::runtime_error("Error: Array is not initialized, it cannot be read");
@@ -119,8 +127,7 @@ Array::copy(const Array::Pointer & dst) const -> void
   {
     std::cerr << "Error: copying Arrays from different devices" << std::endl;
   }
-  if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() ||
-      bytesPerElements() != dst->bytesPerElements())
+  if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() || itemSize() != dst->itemSize())
   {
     std::cerr << "Error: Arrays dimensions do not match" << std::endl;
   }
@@ -161,7 +168,7 @@ Array::fill(const float & value) const -> void
 }
 
 auto
-Array::nbElements() const -> size_t
+Array::size() const -> size_t
 {
   return width_ * height_ * depth_;
 }
@@ -181,7 +188,7 @@ Array::depth() const -> size_t
   return depth_;
 }
 auto
-Array::bytesPerElements() const -> size_t
+Array::itemSize() const -> size_t
 {
   return toBytes(dataType_);
 }
@@ -224,29 +231,17 @@ Array::c_get() const -> const void **
 auto
 Array::shortType() const -> std::string
 {
-  switch (this->dataType_)
-  {
-    case dType::FLOAT:
-      return "f";
-    case dType::INT32:
-      return "i";
-    case dType::UINT32:
-      return "ui";
-    case dType::INT8:
-      return "c";
-    case dType::UINT8:
-      return "uc";
-    case dType::INT16:
-      return "s";
-    case dType::UINT16:
-      return "us";
-    case dType::INT64:
-      return "l";
-    case dType::UINT64:
-      return "ul";
-    default:
-      throw std::invalid_argument("Invalid Array::Type value");
-  }
+  // Suffix derived from the type name: its first letter, or its first two letters
+  // for unsigned types ("float" -> "f", "uint" -> "ui", "uchar" -> "uc").
+  const auto str_type = toString(dtype());
+  if (str_type == "unknown")
+  {
+    // toString() yields "unknown" for an unrecognised dType; reject it rather than
+    // return the meaningless suffix "un".
+    throw std::invalid_argument("Invalid Array::Type value");
+  }
+  return (str_type[0] == 'u') ? str_type.substr(0, 2) : str_type.substr(0, 1);
 }
 
+
 } // namespace cle
diff --git a/clic/src/cudabackend.cpp b/clic/src/cudabackend.cpp
index 35064522e..a1aecbe8c 100644
--- a/clic/src/cudabackend.cpp
+++ b/clic/src/cudabackend.cpp
@@ -1,6 +1,7 @@
 #include "backend.hpp"
 #include "cle_preamble_cu.h"
 #include <array>
+#include <chrono>
 
 namespace cle
 {
@@ -654,14 +655,15 @@ CUDABackend::loadProgramFromCache(const Device::Pointer & device, const std::str
   -> void
 {
 #if USE_CUDA
-  auto     cuda_device = std::dynamic_pointer_cast<CUDADevice>(device);
-  CUmodule module = nullptr;
-  auto     ite = cuda_device->getCache().find(hash);
-  if (ite != cuda_device->getCache().end())
+  if (auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device))
   {
-    module = ite->second;
+    const auto & cache = cuda_device->getCache();
+    auto         ite = cache.find(hash);
+    if (ite != cache.end())
+    {
+      *static_cast<CUmodule *>(program) = ite->second;
+    }
   }
-  program = module;
 #else
   throw std::runtime_error("Error: CUDA is not enabled");
 #endif
@@ -671,13 +673,16 @@ auto
 CUDABackend::saveProgramToCache(const Device::Pointer & device, const std::string & hash, void * program) const -> void
 {
 #if USE_CUDA
-  auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device);
-  cuda_device->getCache().emplace_hint(cuda_device->getCache().end(), hash, reinterpret_cast<CUmodule>(program));
+  if (auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device))
+  {
+    cuda_device->getCache().emplace(hash, *reinterpret_cast<CUmodule *>(program));
+  }
 #else
   throw std::runtime_error("Error: CUDA is not enabled");
 #endif
 }
 
+
 auto
 CUDABackend::buildKernel(const Device::Pointer & device,
                          const std::string &     kernel_source,
@@ -692,42 +697,59 @@ CUDABackend::buildKernel(const Device::Pointer & device,
     throw std::runtime_error("Error (cuda): Failed to set CUDA device before memory allocation.");
   }
 
-  nvrtcProgram prog;
-  auto         res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr);
-  if (res != NVRTC_SUCCESS)
-  {
-    throw std::runtime_error("Error (cuda): Failed to create program from source with error code " +
-                             std::to_string(res));
-  }
-  res = nvrtcCompileProgram(prog, 0, nullptr);
-  if (res != NVRTC_SUCCESS)
-  {
-    size_t log_size;
-    nvrtcGetProgramLogSize(prog, &log_size);
-    std::string log(log_size, '\0');
-    nvrtcGetProgramLog(prog, &log[0]);
-    std::cerr << "Build log: " << log << std::endl;
-    throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res));
-  }
-  size_t ptxSize;
-  nvrtcGetPTXSize(prog, &ptxSize);
-  std::vector<char> ptx(ptxSize);
-  nvrtcGetPTX(prog, ptx.data());
+  CUmodule          cuModule = nullptr;
+  const std::string hash = std::to_string(std::hash<std::string>{}(kernel_source));
 
-  CUmodule cuModule;
-  err = cuModuleLoadData(&cuModule, ptx.data());
-  if (err != CUDA_SUCCESS)
+  // Compile and load only on a cache miss; loadProgramFromCache leaves cuModule null in that case.
+  loadProgramFromCache(device, hash, &cuModule);
+  if (cuModule == nullptr)
   {
-    throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err));
-  }
+    nvrtcProgram prog;
+    auto         res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr);
+    if (res != NVRTC_SUCCESS)
+    {
+      throw std::runtime_error("Error (cuda): Failed to create program from source with error code " +
+                               std::to_string(res));
+    }
+
+    const std::string                 arch_comp = "-arch=compute_" + cuda_device->getArch();
+    const std::array<const char *, 1> options = { arch_comp.c_str() };
+    res = nvrtcCompileProgram(prog, static_cast<int>(options.size()), options.data());
+    if (res != NVRTC_SUCCESS)
+    {
+      size_t log_size = 0;
+      nvrtcGetProgramLogSize(prog, &log_size);
+      std::string log(log_size, '\0');
+      nvrtcGetProgramLog(prog, &log[0]);
+      std::cerr << "Build log: " << log << std::endl;
+      nvrtcDestroyProgram(&prog); // release the program on this path too; it is otherwise leaked
+      throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res));
+    }
+    size_t ptxSize;
+    nvrtcGetPTXSize(prog, &ptxSize);
+    std::vector<char> ptx(ptxSize);
+    nvrtcGetPTX(prog, ptx.data());
+    res = nvrtcDestroyProgram(&prog);
+    if (res != NVRTC_SUCCESS)
+    {
+      throw std::runtime_error("Error (cuda): Failed to destroy program with error code " + std::to_string(res));
+    }
 
+    err = cuModuleLoadData(&cuModule, ptx.data());
+    if (err != CUDA_SUCCESS)
+    {
+      throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err));
+    }
+
+
+    saveProgramToCache(device, hash, &cuModule);
+  }
   CUfunction cuFunction;
   err = cuModuleGetFunction(&cuFunction, cuModule, kernel_name.c_str());
   if (err != CUDA_SUCCESS)
   {
     throw std::runtime_error("Error (cuda): Getting function from module with error code " + std::to_string(err));
   }
-
   *(reinterpret_cast<CUfunction *>(kernel)) = cuFunction;
 #else
   throw std::runtime_error("Error: CUDA is not enabled");
diff --git a/clic/src/cudadevice.cpp b/clic/src/cudadevice.cpp
index bef3285d4..4b5648a4e 100644
--- a/clic/src/cudadevice.cpp
+++ b/clic/src/cudadevice.cpp
@@ -128,6 +128,29 @@ CUDADevice::getName() const -> std::string
   return std::string(device_name);
 }
 
+/**
+ * @brief Compute capability of the device as "<major><minor>", e.g. "75" for 7.5.
+ * @throws std::runtime_error if the driver fails to report either attribute.
+ */
+auto
+CUDADevice::getArch() const -> std::string
+{
+  int  major = 0;
+  int  minor = 0;
+  auto err = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, this->getCUDADevice());
+  if (err == CUDA_SUCCESS)
+  {
+    err = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, this->getCUDADevice());
+  }
+  if (err != CUDA_SUCCESS)
+  {
+    // Without this check a failed query would return "00" as if it were a real architecture.
+    throw std::runtime_error("Error (cuda): Failed to query compute capability with error code " +
+                             std::to_string(err));
+  }
+  return std::to_string(major) + std::to_string(minor);
+}
+
 auto
 CUDADevice::getInfo() const -> std::string
 {
diff --git a/clic/src/execution.cpp b/clic/src/execution.cpp
index 1aed81755..c5dcf227d 100644
--- a/clic/src/execution.cpp
+++ b/clic/src/execution.cpp
@@ -4,47 +4,64 @@
 #include <fstream>
 #include <regex>
 #include <string_view>
+#include <vector>
 
 namespace cle
 {
 
+/**
+ * @brief Translate CLIJ-flavoured OpenCL kernel source into CUDA C, in place.
+ *
+ * Each OpenCL token of the table below is replaced, wherever it occurs, by its CUDA
+ * counterpart. Matching is plain substring search, so only the exact spelling and
+ * spacing listed in the table is recognised.
+ *
+ * @param code kernel source, rewritten in place.
+ */
 auto
-srcOpenclToCuda(const std::string & opencl_code) -> std::string
+translateOpenclToCuda(std::string & code) -> void
 {
-  std::string cuda_code = opencl_code; // Start with a copy of the input code
-
-  // Precompile regular expressions
-  static const std::regex int2_float4_regex(R"(\((int2|int4|float4|float2)\)\s*\{\s*([^}]*)\s*\}\s*;)");
-  static const std::regex constant_sampler_regex(R"(__constant\s+sampler_t)");
-  static const std::regex kernel_inline_regex(R"(__kernel\s+)");
-  static const std::regex inline_regex(R"(inline)");
-  static const std::regex pragma_regex(R"(#pragma)");
-  static const std::regex kernel_void_regex(R"(\nkernel\s+void)");
-  static const std::regex kernel_regex(R"(__kernel\s+)");
-  static const std::regex global_id0_regex(R"(get_global_id\(0\))");
-  static const std::regex global_id1_regex(R"(get_global_id\(1\))");
-  static const std::regex global_id2_regex(R"(get_global_id\(2\))");
-
-  // Perform replacements in a single pass
-  cuda_code = std::regex_replace(cuda_code, int2_float4_regex, "make_$1($2);");
-  cuda_code = std::regex_replace(cuda_code, constant_sampler_regex, "__device__ int");
-  cuda_code = std::regex_replace(cuda_code, kernel_inline_regex, "extern \"C\" __global__ ");
-  cuda_code = std::regex_replace(cuda_code, inline_regex, "__device__ inline");
-  cuda_code = std::regex_replace(cuda_code, pragma_regex, "// #pragma");
-  cuda_code = std::regex_replace(cuda_code, kernel_void_regex, "\nextern \"C\" __global__ void");
-  cuda_code = std::regex_replace(cuda_code, kernel_regex, "extern \"C\" __global__ ");
-  cuda_code = std::regex_replace(cuda_code, global_id0_regex, "blockDim.x * blockIdx.x + threadIdx.x");
-  cuda_code = std::regex_replace(cuda_code, global_id1_regex, "blockDim.y * blockIdx.y + threadIdx.y");
-  cuda_code = std::regex_replace(cuda_code, global_id2_regex, "blockDim.z * blockIdx.z + threadIdx.z");
-
-  return cuda_code;
+  static const std::vector<std::pair<std::string, std::string>> replacements = {
+    { "(int2){", "make_int2(" },     // compound literal, closing '}' becomes ')'
+    { "(int4){", "make_int4(" },     // compound literal, closing '}' becomes ')'
+    { "(float4){", "make_float4(" }, // compound literal, closing '}' becomes ')'
+    { "(float2){", "make_float2(" }, // compound literal, closing '}' becomes ')'
+    { "__constant sampler_t", "__device__ int" },
+    { "inline", "__device__ inline" },
+    { "#pragma", "// #pragma" },
+    { "__kernel void", "extern \"C\" __global__ void" },
+    { "get_global_id(0)", "blockDim.x * blockIdx.x + threadIdx.x" },
+    { "get_global_id(1)", "blockDim.y * blockIdx.y + threadIdx.y" },
+    { "get_global_id(2)", "blockDim.z * blockIdx.z + threadIdx.z" }
+  };
+  for (const auto & [to_replace, replace_with] : replacements)
+  {
+    // Only the vector compound literals open a brace that has to be closed as a call.
+    const bool is_vector_literal = to_replace.back() == '{';
+    size_t     pos = 0;
+    while ((pos = code.find(to_replace, pos)) != std::string::npos)
+    {
+      code.replace(pos, to_replace.length(), replace_with);
+      // Resume after the inserted text: "inline" is contained in its own replacement.
+      pos += replace_with.length();
+      if (is_vector_literal)
+      {
+        // The first '}' after the opening ends the literal. A missing brace is left
+        // alone instead of passing npos to replace(), which throws std::out_of_range.
+        const size_t closing = code.find('}', pos);
+        if (closing != std::string::npos)
+        {
+          code[closing] = ')';
+        }
+      }
+    }
+  }
 }
 
 auto
 cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_list) -> std::string
 {
   std::ostringstream defines;
-
   if (!constant_list.empty())
   {
     for (const auto & [key, value] : constant_list)
@@ -53,7 +53,6 @@ cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_
     }
     defines << "\n";
   }
-
   std::string size_params = "";
   for (const auto & param : parameter_list)
   {
@@ -63,46 +62,36 @@ cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_
     }
     const auto & arr = std::get<Array::Pointer>(param.second);
 
-    std::string ndim;
-    std::string pos_type;
-    std::string pos;
-    std::string pixel_type;
-    std::string type_id;
-    switch (arr->dim())
+    // Lookup tables for the dimension-dependent macro fragments, indexed by (dimension - 1).
+    static constexpr std::array<const char *, 3> ndimMap = { "1", "2", "3" };
+    static constexpr std::array<const char *, 3> posTypeMap = { "int", "int2", "int4" };
+    static constexpr std::array<const char *, 3> posMap = { "(pos0)", "(pos0, pos1)", "(pos0, pos1, pos2, 0)" };
+
+    // Any dimension other than 1 or 2 is treated as 3D, so the tables are never
+    // indexed out of range.
+    const auto   dim = arr->dim();
+    const size_t idx = (dim == 1) ? 0 : ((dim == 2) ? 1 : 2);
+    std::string  ndim = ndimMap[idx];
+    std::string  pos_type = posTypeMap[idx];
+    std::string  pos = posMap[idx];
+    if (pos_type == "int")
     {
-      case 1:
-        ndim = "1";
-        pos_type = "int";
-        pos = "(pos0)";
-        defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) " << pos;
-        break;
-      case 2:
-        ndim = "2";
-        pos_type = "int2";
-        pos = "(pos0, pos1)";
-        defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) make_" << pos_type << "" << pos;
-        break;
-      case 3:
-      default:
-        ndim = "3";
-        pos_type = "int4";
-        pos = "(pos0, pos1, pos2, 0)";
-        defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) make_" << pos_type << "" << pos;
-        break;
+      defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) " << pos;
+    }
+    else
+    {
+      defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) make_" << pos_type << "" << pos;
     }
-
     defines << "\n";
     defines << "\n#define CONVERT_" << param.first << "_PIXEL_TYPE clij_convert_" << arr->dtype() << "_sat";
     defines << "\n#define IMAGE_" << param.first << "_PIXEL_TYPE " << arr->dtype() << "";
     defines << "\n#define POS_" << param.first << "_TYPE " << pos_type;
     defines << "\n";
-
     defines << "\n";
     defines << "\n#define IMAGE_SIZE_" << param.first << "_WIDTH " << std::to_string(arr->width());
     defines << "\n#define IMAGE_SIZE_" << param.first << "_HEIGHT " << std::to_string(arr->height());
     defines << "\n#define IMAGE_SIZE_" << param.first << "_DEPTH " << std::to_string(arr->depth());
     defines << "\n";
-
     defines << "\n";
     defines << "\n#define IMAGE_" << param.first << "_TYPE " << size_params << "" << arr->dtype() << "*";
     defines << "\n#define READ_" << param.first << "_IMAGE(a,b,c) read_buffer" << ndim << "d" << arr->shortType()
@@ -110,11 +109,9 @@ cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_
     defines << "\n#define WRITE_" << param.first << "_IMAGE(a,b,c) write_buffer" << ndim << "d" << arr->shortType()
             << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)";
     defines << "\n";
-
     size_params = "";
   }
   defines << "\n";
-
   return defines.str();
 }
 
@@ -122,7 +119,6 @@ auto
 oclDefines(const ParameterList & parameter_list, const ConstantList & constant_list) -> std::string
 {
   std::ostringstream defines;
-
   if (!constant_list.empty())
   {
     for (const auto & [key, value] : constant_list)
@@ -131,12 +127,10 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l
     }
     defines << "\n";
   }
-
   defines << "\n#define GET_IMAGE_WIDTH(image_key) IMAGE_SIZE_ ## image_key ## _WIDTH";
   defines << "\n#define GET_IMAGE_HEIGHT(image_key) IMAGE_SIZE_ ## image_key ## _HEIGHT";
   defines << "\n#define GET_IMAGE_DEPTH(image_key) IMAGE_SIZE_ ## image_key ## _DEPTH";
   defines << "\n";
-
   for (const auto & param : parameter_list)
   {
     if (std::holds_alternative<const float>(param.second) || std::holds_alternative<const int>(param.second))
@@ -145,32 +139,14 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l
     }
     const auto & arr = std::get<Array::Pointer>(param.second);
 
-    std::string pos_type;
-    std::string pos;
-    std::string ndim;
-    switch (arr->dim())
-    {
-      case 1:
-        ndim = "1";
-        pos_type = "int";
-        pos = "(pos0)";
-        break;
-      case 2:
-        ndim = "2";
-        pos_type = "int2";
-        pos = "(pos0, pos1)";
-        break;
-      case 3:
-        ndim = "3";
-        pos_type = "int4";
-        pos = "(pos0, pos1, pos2, 0)";
-        break;
-      default:
-        ndim = "3";
-        pos_type = "int4";
-        pos = "(pos0, pos1, pos2, 0)";
-        break;
-    }
+    static constexpr std::array<const char *, 3> ndimMap = { "1", "2", "3" };
+    static constexpr std::array<const char *, 3> posTypeMap = { "int", "int2", "int4" };
+    static constexpr std::array<const char *, 3> posMap = { "(pos0)", "(pos0, pos1)", "(pos0, pos1, pos2, 0)" };
+    // Any dimension other than 1 or 2 is treated as 3D, so the tables are never indexed out of range.
+    const size_t idx = (arr->dim() == 1) ? 0 : ((arr->dim() == 2) ? 1 : 2);
+    std::string  ndim = ndimMap[idx];
+    std::string  pos_type = posTypeMap[idx];
+    std::string  pos = posMap[idx];
 
     defines << "\n";
     defines << "\n#define CONVERT_" << param.first << "_PIXEL_TYPE clij_convert_" << arr->dtype() << "_sat";
@@ -178,7 +154,6 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l
     defines << "\n#define POS_" << param.first << "_TYPE " << pos_type;
     defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) (" << pos_type << ")" << pos;
     defines << "\n";
-
     if (arr->mtype() == mType::BUFFER)
     {
       defines << "\n#define IMAGE_" << param.first << "_TYPE __global " << arr->dtype() << "*";
@@ -216,7 +191,6 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l
       defines << "\n#define READ_" << param.first << "_IMAGE(a,b,c) read_image" << prefix << "(a,b,c)";
       defines << "\n#define WRITE_" << param.first << "_IMAGE(a,b,c) write_image" << prefix << "(a,b,c)";
     }
-
     defines << "\n";
     defines << "\n#define IMAGE_SIZE_" << param.first << "_WIDTH " << std::to_string(arr->width());
     defines << "\n#define IMAGE_SIZE_" << param.first << "_HEIGHT " << std::to_string(arr->height());
@@ -224,7 +198,6 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l
     defines << "\n";
   }
   defines << "\n";
-
   return defines.str();
 }
 
@@ -235,25 +208,27 @@ execute(const Device::Pointer & device,
         const RangeArray &      global_range,
         const ConstantList &    constants) -> void
 {
-  // build program source
-  std::string program_source;
-  std::string preamble = cle::BackendManager::getInstance().getBackend().getPreamble();
-  std::string kernel_name = kernel_func.first;
-  std::string kernel_source = kernel_func.second;
+  // prepare kernel source for compilation and execution
+  auto        kernel_source = kernel_func.second;
+  const auto  kernel_name = kernel_func.first;
+  const auto  kernel_preamble = cle::BackendManager::getInstance().getBackend().getPreamble();
   std::string defines;
   switch (device->getType())
   {
-    case Device::Type::CUDA:
+    case Device::Type::CUDA: {
       defines = cle::cudaDefines(parameters, constants);
-      kernel_source = cle::srcOpenclToCuda(kernel_source);
+      cle::translateOpenclToCuda(kernel_source);
       break;
-    case Device::Type::OPENCL:
+    }
+    case Device::Type::OPENCL: {
       defines = cle::oclDefines(parameters, constants);
       break;
+    }
   }
-  program_source.reserve(preamble.size() + defines.size() + kernel_source.size());
+  std::string program_source;
+  program_source.reserve(kernel_preamble.size() + defines.size() + kernel_source.size());
   program_source += defines;
-  program_source += preamble;
+  program_source += kernel_preamble;
   program_source += kernel_source;
 
   // prepare parameters to be passed to the backend
@@ -291,7 +266,7 @@ execute(const Device::Pointer & device,
     }
   }
 
-  // execute kernel
+  // execute kernel in backend
   try
   {
     cle::BackendManager::getInstance().getBackend().executeKernel(
@@ -303,4 +278,16 @@ execute(const Device::Pointer & device,
   }
 }
 
+auto
+native_execute(const Device::Pointer & device,
+               const KernelInfo &      kernel_func,
+               const ParameterList &   parameters,
+               const RangeArray &      global_range,
+               const RangeArray &      local_range) -> void
+{
+  // TODO @StRigaud: Implement native execution for OpenCL and CUDA
+  // allows execution of pure CUDA or OpenCL code without CLIJ syntax
+  throw std::runtime_error("Error: Native execution is not implemented yet.");
+}
+
 } // namespace cle
diff --git a/clic/src/openclbackend.cpp b/clic/src/openclbackend.cpp
index c55c27d02..5e755f1fc 100644
--- a/clic/src/openclbackend.cpp
+++ b/clic/src/openclbackend.cpp
@@ -831,11 +831,14 @@ OpenCLBackend::loadProgramFromCache(const Device::Pointer & device, const std::s
   -> void
 {
 #if USE_OPENCL
-  auto opencl_device = std::dynamic_pointer_cast<OpenCLDevice>(device);
-  auto ite = opencl_device->getCache().find(hash);
-  if (ite != opencl_device->getCache().end())
+  if (auto opencl_device = std::dynamic_pointer_cast<OpenCLDevice>(device))
   {
-    *static_cast<cl_program *>(program) = ite->second;
+    const auto & cache = opencl_device->getCache();
+    auto         ite = cache.find(hash);
+    if (ite != cache.end())
+    {
+      *static_cast<cl_program *>(program) = ite->second;
+    }
   }
 #else
   throw std::runtime_error("Error: OpenCL is not enabled");
@@ -847,8 +850,10 @@ OpenCLBackend::saveProgramToCache(const Device::Pointer & device, const std::str
   -> void
 {
 #if USE_OPENCL
-  auto opencl_device = std::dynamic_pointer_cast<OpenCLDevice>(device);
-  opencl_device->getCache().emplace_hint(opencl_device->getCache().end(), hash, *static_cast<cl_program *>(program));
+  if (auto opencl_device = std::dynamic_pointer_cast<OpenCLDevice>(device))
+  {
+    opencl_device->getCache().emplace(hash, *static_cast<cl_program *>(program));
+  }
 #else
   throw std::runtime_error("Error: OpenCL is not enabled");
 #endif
diff --git a/clic/src/tier1.cpp b/clic/src/tier1.cpp
index e4edfbdb3..871467aa3 100644
--- a/clic/src/tier1.cpp
+++ b/clic/src/tier1.cpp
@@ -1913,8 +1913,8 @@ write_values_to_positions_func(const Device::Pointer & device, const Array::Poin
     // flatten the coords to get the max coordinate value in x,y,z
     // as well as the number of rows (2->1D, 3->2D, 4->3D)
     auto             temp = maximum_x_projection_func(device, list, nullptr);
-    auto             nb_max_position = temp->nbElements() - 1;
-    std::vector<int> max_position(temp->nbElements());
+    auto             nb_max_position = temp->size() - 1;
+    std::vector<int> max_position(temp->size());
     temp->read(max_position.data());
     size_t max_pos_x = max_position[0];
     size_t max_pos_y = (nb_max_position > 2) ? max_position[1] : 1;
diff --git a/clic/src/tier3.cpp b/clic/src/tier3.cpp
index 1e26f9675..a06daf748 100644
--- a/clic/src/tier3.cpp
+++ b/clic/src/tier3.cpp
@@ -56,7 +56,7 @@ exclude_labels_func(const Device::Pointer & device,
   {
     throw std::runtime_error("exclude_labels: label list must be of type uint32");
   }
-  std::vector<unsigned int> labels_list(list->nbElements());
+  std::vector<unsigned int> labels_list(list->size());
   list->read(labels_list.data());
   labels_list.front() = 0;
   unsigned int count = 1;
@@ -68,7 +68,7 @@ exclude_labels_func(const Device::Pointer & device,
       count++;
     }
   }
-  auto index_list = Array::create(list->nbElements(), 1, 1, dType::UINT32, mType::BUFFER, src->device());
+  auto index_list = Array::create(list->size(), 1, 1, dType::UINT32, mType::BUFFER, src->device());
   index_list->write(labels_list.data());
   tier1::replace_intensities_func(device, src, index_list, dst);
   return dst;
@@ -106,7 +106,7 @@ exclude_labels_on_edges_func(const Device::Pointer & device,
     const RangeArray range = { src->width(), src->height(), 1 };
     execute(device, kernel, params, range);
   }
-  std::vector<int> label_map_vector(label_map->nbElements());
+  std::vector<int> label_map_vector(label_map->size());
   label_map->read(label_map_vector.data());
   int count = 1;
   for (auto & i : label_map_vector)
@@ -194,7 +194,7 @@ auto
 mean_of_all_pixels_func(const Device::Pointer & device, const Array::Pointer & src) -> float
 {
   auto temp = tier2::sum_of_all_pixels_func(device, src);
-  return temp / src->nbElements();
+  return temp / src->size();
 }
 
 // auto mean_of_n_most_touching_neighbors_map_func
diff --git a/clic/src/tier4.cpp b/clic/src/tier4.cpp
index d6f7a9ece..1499060b4 100644
--- a/clic/src/tier4.cpp
+++ b/clic/src/tier4.cpp
@@ -44,7 +44,7 @@ threshold_otsu_func(const Device::Pointer & device, const Array::Pointer & src,
   const float   max_intensity = tier2::maximum_of_all_pixels_func(device, src);
   auto          hist_array = Array::create(bin, 1, 1, dType::FLOAT, mType::BUFFER, src->device());
   tier3::histogram_func(device, src, hist_array, bin, min_intensity, max_intensity);
-  std::vector<float> histogram_array(hist_array->nbElements());
+  std::vector<float> histogram_array(hist_array->size());
   hist_array->read(histogram_array.data());
   float              threshold = -1;
   float              max_variance = -1;
@@ -55,7 +55,7 @@ threshold_otsu_func(const Device::Pointer & device, const Array::Pointer & src,
   float              weight_2 = 0;
   float              mean_1 = 0;
   float              mean_2 = 0;
-  const float        nb_pixels = src->nbElements();
+  const float        nb_pixels = src->size();
   const float        intensity_factor = (max_intensity - min_intensity) / (bin - 1);
   std::vector<float> range(histogram_array.size());
   std::iota(range.begin(), range.end(), 0);
diff --git a/tests/absolute_test.cpp b/tests/absolute_test.cpp
index 562328e31..397ed4a3c 100644
--- a/tests/absolute_test.cpp
+++ b/tests/absolute_test.cpp
@@ -18,7 +18,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   gpu_input->write(input.data());
   auto gpu_output = cle::tier1::absolute_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/add_image_and_scalar_test.cpp b/tests/add_image_and_scalar_test.cpp
index b29d9f560..bfac70ce1 100644
--- a/tests/add_image_and_scalar_test.cpp
+++ b/tests/add_image_and_scalar_test.cpp
@@ -20,7 +20,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   gpu_input->write(input.data());
   auto gpu_output = cle::tier1::add_image_and_scalar_func(device, gpu_input, nullptr, scalar);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/add_image_weighted_test.cpp b/tests/add_image_weighted_test.cpp
index d51a3b518..536ee99e0 100644
--- a/tests/add_image_weighted_test.cpp
+++ b/tests/add_image_weighted_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::add_images_weighted_func(device, gpu_input1, gpu_input2, nullptr, factor1, factor2);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/array_test.cpp b/tests/array_test.cpp
index 1e7f85646..f8ef90762 100644
--- a/tests/array_test.cpp
+++ b/tests/array_test.cpp
@@ -18,7 +18,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
     gpu_input->write(input.data());
     auto gpu_copy = cle::Array::create(gpu_input);
     gpu_input->copy(gpu_copy);
-    std::vector<type> input_test(gpu_copy->nbElements());
+    std::vector<type> input_test(gpu_copy->size());
     gpu_copy->read(input_test.data());
     std::equal(input_test.begin(), input_test.end(), input.begin()) ? std::cout << "all good\n"
                                                                     : std::cout << "not good\n";
@@ -31,7 +31,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
     gpu_input->write(input.data());
     auto gpu_copy = cle::Array::create(gpu_input);
     gpu_input->copy(gpu_copy);
-    std::vector<type> input_test(gpu_copy->nbElements());
+    std::vector<type> input_test(gpu_copy->size());
     gpu_copy->read(input_test.data());
     std::equal(input_test.begin(), input_test.end(), input.begin()) ? std::cout << "all good\n"
                                                                     : std::cout << "not good\n";
@@ -44,7 +44,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
     gpu_input->write(input.data());
     auto gpu_copy = cle::Array::create(gpu_input);
     gpu_input->copy(gpu_copy);
-    std::vector<type> input_test(gpu_copy->nbElements());
+    std::vector<type> input_test(gpu_copy->size());
     gpu_copy->read(input_test.data());
     std::equal(input_test.begin(), input_test.end(), input.begin()) ? std::cout << "all good\n"
                                                                     : std::cout << "not good\n";
diff --git a/tests/binary_and_test.cpp b/tests/binary_and_test.cpp
index 1b698ed51..418e522ba 100644
--- a/tests/binary_and_test.cpp
+++ b/tests/binary_and_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::binary_and_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/binary_edge_detection_test.cpp b/tests/binary_edge_detection_test.cpp
index a6e36d546..8c39e05c4 100644
--- a/tests/binary_edge_detection_test.cpp
+++ b/tests/binary_edge_detection_test.cpp
@@ -24,7 +24,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::binary_edge_detection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/binary_not_test.cpp b/tests/binary_not_test.cpp
index 7fe7a848e..f84556e2e 100644
--- a/tests/binary_not_test.cpp
+++ b/tests/binary_not_test.cpp
@@ -24,7 +24,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::binary_not_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/binary_or_test.cpp b/tests/binary_or_test.cpp
index 92fa112a1..e60a43bb3 100644
--- a/tests/binary_or_test.cpp
+++ b/tests/binary_or_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::binary_or_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/binary_subtract_test.cpp b/tests/binary_subtract_test.cpp
index cb0b4a237..13284e1ab 100644
--- a/tests/binary_subtract_test.cpp
+++ b/tests/binary_subtract_test.cpp
@@ -29,7 +29,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::binary_subtract_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/binary_xor_test.cpp b/tests/binary_xor_test.cpp
index c14cf63e7..6d635c98c 100644
--- a/tests/binary_xor_test.cpp
+++ b/tests/binary_xor_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::binary_xor_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/block_enumerate_test.cpp b/tests/block_enumerate_test.cpp
index 4074b4a01..12179ac0f 100644
--- a/tests/block_enumerate_test.cpp
+++ b/tests/block_enumerate_test.cpp
@@ -35,7 +35,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   cle::tier1::sum_reduction_x_func(device, gpu_input, gpu_temp, blocksize);
   auto gpu_output = cle::tier1::block_enumerate_func(device, gpu_input, gpu_temp, nullptr, blocksize);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/connected_components_labeling_box_test.cpp b/tests/connected_components_labeling_box_test.cpp
index ea046248e..fd7a13039 100644
--- a/tests/connected_components_labeling_box_test.cpp
+++ b/tests/connected_components_labeling_box_test.cpp
@@ -18,7 +18,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier5::connected_components_labeling_box_func(device, gpu_input, nullptr);
 
-  std::vector<uint32_t> output(gpu_output->nbElements());
+  std::vector<uint32_t> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/convolve_test.cpp b/tests/convolve_test.cpp
index 110ec9c5e..3c6261bf7 100644
--- a/tests/convolve_test.cpp
+++ b/tests/convolve_test.cpp
@@ -23,7 +23,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::convolve_func(device, gpu_input, gpu_kernel, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/copy_test.cpp b/tests/copy_test.cpp
index 5206b3e24..270494c76 100644
--- a/tests/copy_test.cpp
+++ b/tests/copy_test.cpp
@@ -19,7 +19,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::copy_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/detect_maxima_test.cpp b/tests/detect_maxima_test.cpp
index 51562341f..3f11b08a2 100644
--- a/tests/detect_maxima_test.cpp
+++ b/tests/detect_maxima_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::detect_maxima_box_func(device, gpu_input, nullptr);
 
-  std::vector<uint8_t> output(gpu_output->nbElements());
+  std::vector<uint8_t> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/difference_of_gaussian_test.cpp b/tests/difference_of_gaussian_test.cpp
index e1734d2fe..764ecd714 100644
--- a/tests/difference_of_gaussian_test.cpp
+++ b/tests/difference_of_gaussian_test.cpp
@@ -33,7 +33,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier2::difference_of_gaussian_func(device, gpu_input, nullptr, 1, 1, 1, 3, 3, 3);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   // for (auto && i : output)
diff --git a/tests/dilate_box_test.cpp b/tests/dilate_box_test.cpp
index d439eeb51..72de18898 100644
--- a/tests/dilate_box_test.cpp
+++ b/tests/dilate_box_test.cpp
@@ -47,7 +47,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::dilate_box_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/dilate_sphere_test.cpp b/tests/dilate_sphere_test.cpp
index 964889fcb..081dd49fe 100644
--- a/tests/dilate_sphere_test.cpp
+++ b/tests/dilate_sphere_test.cpp
@@ -27,7 +27,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::dilate_sphere_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/divide_image_and_scalar_test.cpp b/tests/divide_image_and_scalar_test.cpp
index 189feee59..025706c4c 100644
--- a/tests/divide_image_and_scalar_test.cpp
+++ b/tests/divide_image_and_scalar_test.cpp
@@ -19,7 +19,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   gpu_input->write(input.data());
   auto gpu_output = cle::tier1::divide_image_and_scalar_func(device, gpu_input, nullptr, scalar);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/divide_images_test.cpp b/tests/divide_images_test.cpp
index 7df5799ad..4c8a08657 100644
--- a/tests/divide_images_test.cpp
+++ b/tests/divide_images_test.cpp
@@ -25,7 +25,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::divide_images_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/equal_constant_test.cpp b/tests/equal_constant_test.cpp
index 83deaa365..57ae0fe79 100644
--- a/tests/equal_constant_test.cpp
+++ b/tests/equal_constant_test.cpp
@@ -20,7 +20,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   gpu_input->write(input.data());
   auto gpu_output = cle::tier1::equal_constant_func(device, gpu_input, nullptr, 5);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/equal_test.cpp b/tests/equal_test.cpp
index cce665616..abfbbb275 100644
--- a/tests/equal_test.cpp
+++ b/tests/equal_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::equal_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/erode_box_test.cpp b/tests/erode_box_test.cpp
index 2e5393084..c0a822e1f 100644
--- a/tests/erode_box_test.cpp
+++ b/tests/erode_box_test.cpp
@@ -46,7 +46,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::erode_box_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/erode_sphere_test.cpp b/tests/erode_sphere_test.cpp
index 51d19ad92..2e0bc632f 100644
--- a/tests/erode_sphere_test.cpp
+++ b/tests/erode_sphere_test.cpp
@@ -25,7 +25,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::erode_sphere_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/extend_labeling_via_voronoi_test.cpp b/tests/extend_labeling_via_voronoi_test.cpp
index 856442d93..abb3f4bba 100644
--- a/tests/extend_labeling_via_voronoi_test.cpp
+++ b/tests/extend_labeling_via_voronoi_test.cpp
@@ -22,7 +22,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier2::extend_labeling_via_voronoi_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/gaussian_blur_test.cpp b/tests/gaussian_blur_test.cpp
index 096d8fb7b..7dcc5b8f4 100644
--- a/tests/gaussian_blur_test.cpp
+++ b/tests/gaussian_blur_test.cpp
@@ -47,7 +47,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::gaussian_blur_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   // round values of valid vector values to 6 decimals to avoid float precision errors in comparison
diff --git a/tests/gradient_x_test.cpp b/tests/gradient_x_test.cpp
index 50881c73e..d642c6cf5 100644
--- a/tests/gradient_x_test.cpp
+++ b/tests/gradient_x_test.cpp
@@ -16,7 +16,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::gradient_x_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/gradient_y_test.cpp b/tests/gradient_y_test.cpp
index 9b6c12ae6..16cd2c975 100644
--- a/tests/gradient_y_test.cpp
+++ b/tests/gradient_y_test.cpp
@@ -16,7 +16,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::gradient_y_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/gradient_z_test.cpp b/tests/gradient_z_test.cpp
index 534ee4d61..56ec3cc22 100644
--- a/tests/gradient_z_test.cpp
+++ b/tests/gradient_z_test.cpp
@@ -16,7 +16,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::gradient_z_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/greater_constant_test.cpp b/tests/greater_constant_test.cpp
index c999d9287..08dc8f725 100644
--- a/tests/greater_constant_test.cpp
+++ b/tests/greater_constant_test.cpp
@@ -20,7 +20,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   gpu_input->write(input.data());
   auto gpu_output = cle::tier1::greater_constant_func(device, gpu_input, nullptr, 5);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/greater_or_equal_constant_test.cpp b/tests/greater_or_equal_constant_test.cpp
index 9ce121e16..037997926 100644
--- a/tests/greater_or_equal_constant_test.cpp
+++ b/tests/greater_or_equal_constant_test.cpp
@@ -20,7 +20,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   gpu_input->write(input.data());
   auto gpu_output = cle::tier1::greater_or_equal_constant_func(device, gpu_input, nullptr, 5);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/greater_or_equal_test.cpp b/tests/greater_or_equal_test.cpp
index 957a30be8..259ef7fd8 100644
--- a/tests/greater_or_equal_test.cpp
+++ b/tests/greater_or_equal_test.cpp
@@ -28,7 +28,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::greater_or_equal_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/greater_test.cpp b/tests/greater_test.cpp
index d90878b63..d858c1337 100644
--- a/tests/greater_test.cpp
+++ b/tests/greater_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::greater_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/histogram_test.cpp b/tests/histogram_test.cpp
index 536298406..87a64bcb4 100644
--- a/tests/histogram_test.cpp
+++ b/tests/histogram_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier3::histogram_func(device, gpu_input, nullptr, shape[0], 0, shape[0] - 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/laplace_box_test.cpp b/tests/laplace_box_test.cpp
index e36ddfbd1..40a6d1ab3 100644
--- a/tests/laplace_box_test.cpp
+++ b/tests/laplace_box_test.cpp
@@ -19,7 +19,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::laplace_box_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/mask_test.cpp b/tests/mask_test.cpp
index 3e4aafbbc..ea1e30ea7 100644
--- a/tests/mask_test.cpp
+++ b/tests/mask_test.cpp
@@ -28,7 +28,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::mask_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/masked_voronoi_labeling_test.cpp b/tests/masked_voronoi_labeling_test.cpp
index 9623f7d31..ac1926b65 100644
--- a/tests/masked_voronoi_labeling_test.cpp
+++ b/tests/masked_voronoi_labeling_test.cpp
@@ -25,7 +25,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier6::masked_voronoi_labeling_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<uint32_t> output(gpu_output->nbElements());
+  std::vector<uint32_t> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/maximum_box_test.cpp b/tests/maximum_box_test.cpp
index f7a13c286..caa9f44df 100644
--- a/tests/maximum_box_test.cpp
+++ b/tests/maximum_box_test.cpp
@@ -34,7 +34,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::maximum_box_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/maximum_images_test.cpp b/tests/maximum_images_test.cpp
index e3eb8dfd4..c209a78b9 100644
--- a/tests/maximum_images_test.cpp
+++ b/tests/maximum_images_test.cpp
@@ -25,7 +25,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::maximum_images_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/maximum_sphere_test.cpp b/tests/maximum_sphere_test.cpp
index 10a89c015..62b118f6a 100644
--- a/tests/maximum_sphere_test.cpp
+++ b/tests/maximum_sphere_test.cpp
@@ -27,7 +27,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::maximum_sphere_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/maximum_x_projection_test.cpp b/tests/maximum_x_projection_test.cpp
index 48c75dd6b..28fbd830f 100644
--- a/tests/maximum_x_projection_test.cpp
+++ b/tests/maximum_x_projection_test.cpp
@@ -23,7 +23,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::maximum_x_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/maximum_y_projection_test.cpp b/tests/maximum_y_projection_test.cpp
index f8f3079bb..a59397d81 100644
--- a/tests/maximum_y_projection_test.cpp
+++ b/tests/maximum_y_projection_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::maximum_y_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/maximum_z_projection_test.cpp b/tests/maximum_z_projection_test.cpp
index b8b3c5eb6..4d7c98edc 100644
--- a/tests/maximum_z_projection_test.cpp
+++ b/tests/maximum_z_projection_test.cpp
@@ -23,7 +23,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::maximum_z_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/mean_box_test.cpp b/tests/mean_box_test.cpp
index 8cf9f7f7c..ac2d65741 100644
--- a/tests/mean_box_test.cpp
+++ b/tests/mean_box_test.cpp
@@ -41,7 +41,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::mean_box_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/mean_sphere_test.cpp b/tests/mean_sphere_test.cpp
index 2d30a054d..6baf26ddb 100644
--- a/tests/mean_sphere_test.cpp
+++ b/tests/mean_sphere_test.cpp
@@ -35,7 +35,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::mean_sphere_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/minimum_box_test.cpp b/tests/minimum_box_test.cpp
index 8e759c65e..743b03098 100644
--- a/tests/minimum_box_test.cpp
+++ b/tests/minimum_box_test.cpp
@@ -34,7 +34,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::minimum_box_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/minimum_images_test.cpp b/tests/minimum_images_test.cpp
index 418bdd91a..d59aa2e1f 100644
--- a/tests/minimum_images_test.cpp
+++ b/tests/minimum_images_test.cpp
@@ -25,7 +25,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::minimum_images_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/minimum_of_masked_pixels_test.cpp b/tests/minimum_of_masked_pixels_test.cpp
index 7b0b9ccb8..721fab77b 100644
--- a/tests/minimum_of_masked_pixels_test.cpp
+++ b/tests/minimum_of_masked_pixels_test.cpp
@@ -40,7 +40,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
   std::cout << "GPU output: " << output << std::endl;
   std::cout << "valid output: " << valid << std::endl;
 
-  // std::vector<type> output(gpu_output->nbElements());
+  // std::vector<type> output(gpu_output->size());
   // gpu_output->read(output.data());
 
   return output == valid ? 0 : 1;
diff --git a/tests/minimum_sphere_test.cpp b/tests/minimum_sphere_test.cpp
index fd889645d..0cded47ca 100644
--- a/tests/minimum_sphere_test.cpp
+++ b/tests/minimum_sphere_test.cpp
@@ -27,7 +27,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::minimum_sphere_func(device, gpu_input, nullptr, 1, 1, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/minimum_x_projection_test.cpp b/tests/minimum_x_projection_test.cpp
index 13b219946..d11b9fe94 100644
--- a/tests/minimum_x_projection_test.cpp
+++ b/tests/minimum_x_projection_test.cpp
@@ -23,7 +23,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::minimum_x_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/minimum_y_projection_test.cpp b/tests/minimum_y_projection_test.cpp
index ebed86b73..38ca3c046 100644
--- a/tests/minimum_y_projection_test.cpp
+++ b/tests/minimum_y_projection_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::minimum_y_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/minimum_z_projection_test.cpp b/tests/minimum_z_projection_test.cpp
index 609d04608..3b9af48be 100644
--- a/tests/minimum_z_projection_test.cpp
+++ b/tests/minimum_z_projection_test.cpp
@@ -23,7 +23,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::minimum_z_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/multiply_image_and_scalar_test.cpp b/tests/multiply_image_and_scalar_test.cpp
index e7ba7f4f6..62c832031 100644
--- a/tests/multiply_image_and_scalar_test.cpp
+++ b/tests/multiply_image_and_scalar_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::multiply_image_and_scalar_func(device, gpu_input, nullptr, scalar);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/multiply_images_test.cpp b/tests/multiply_images_test.cpp
index 72918d984..0a60dbc47 100644
--- a/tests/multiply_images_test.cpp
+++ b/tests/multiply_images_test.cpp
@@ -25,7 +25,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::multiply_images_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/nonzero_minimum_box_test.cpp b/tests/nonzero_minimum_box_test.cpp
index ce4cf52cc..74a349a0c 100644
--- a/tests/nonzero_minimum_box_test.cpp
+++ b/tests/nonzero_minimum_box_test.cpp
@@ -32,7 +32,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::nonzero_minimum_box_func(device, gpu_input, flag, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/not_equal_constant_test.cpp b/tests/not_equal_constant_test.cpp
index 876bcf19e..de23ea037 100644
--- a/tests/not_equal_constant_test.cpp
+++ b/tests/not_equal_constant_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::not_equal_constant_func(device, gpu_input, nullptr, 5);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/not_equal_test.cpp b/tests/not_equal_test.cpp
index 36ed888bd..911b01b64 100644
--- a/tests/not_equal_test.cpp
+++ b/tests/not_equal_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::not_equal_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/onlyzero_overwrite_maximum_box_test.cpp b/tests/onlyzero_overwrite_maximum_box_test.cpp
index 869f7916f..f1035e345 100644
--- a/tests/onlyzero_overwrite_maximum_box_test.cpp
+++ b/tests/onlyzero_overwrite_maximum_box_test.cpp
@@ -22,7 +22,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::onlyzero_overwrite_maximum_box_func(device, gpu_input, flag, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/onlyzero_overwrite_maximum_diamond_test.cpp b/tests/onlyzero_overwrite_maximum_diamond_test.cpp
index 189101603..106388761 100644
--- a/tests/onlyzero_overwrite_maximum_diamond_test.cpp
+++ b/tests/onlyzero_overwrite_maximum_diamond_test.cpp
@@ -22,7 +22,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::onlyzero_overwrite_maximum_diamond_func(device, gpu_input, flag, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/power_images_test.cpp b/tests/power_images_test.cpp
index 3c752206b..fc10e93be 100644
--- a/tests/power_images_test.cpp
+++ b/tests/power_images_test.cpp
@@ -24,7 +24,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::power_images_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/power_test.cpp b/tests/power_test.cpp
index d2beddc87..36b9459a3 100644
--- a/tests/power_test.cpp
+++ b/tests/power_test.cpp
@@ -19,7 +19,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::power_func(device, gpu_input, nullptr, 2);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/replace_intensities_test.cpp b/tests/replace_intensities_test.cpp
index c422ab605..78940ccf8 100644
--- a/tests/replace_intensities_test.cpp
+++ b/tests/replace_intensities_test.cpp
@@ -31,7 +31,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::replace_intensities_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/replace_intensity_test.cpp b/tests/replace_intensity_test.cpp
index 94b273148..8d8c87b14 100644
--- a/tests/replace_intensity_test.cpp
+++ b/tests/replace_intensity_test.cpp
@@ -30,7 +30,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::replace_intensity_func(device, gpu_input, nullptr, 5, 100);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/set_column_test.cpp b/tests/set_column_test.cpp
index 1192cd362..44230088e 100644
--- a/tests/set_column_test.cpp
+++ b/tests/set_column_test.cpp
@@ -27,7 +27,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   cle::tier1::set_column_func(device, gpu_input, 1, 100);
 
-  std::vector<type> output(gpu_input->nbElements());
+  std::vector<type> output(gpu_input->size());
   gpu_input->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/set_nonzero_pixels_to_pixelindex_test.cpp b/tests/set_nonzero_pixels_to_pixelindex_test.cpp
index bbad712d1..00b88a199 100644
--- a/tests/set_nonzero_pixels_to_pixelindex_test.cpp
+++ b/tests/set_nonzero_pixels_to_pixelindex_test.cpp
@@ -29,7 +29,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::set_nonzero_pixels_to_pixelindex_func(device, gpu_input, nullptr, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/set_row_test.cpp b/tests/set_row_test.cpp
index 048f8985d..809235a8d 100644
--- a/tests/set_row_test.cpp
+++ b/tests/set_row_test.cpp
@@ -31,7 +31,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   cle::tier1::set_row_func(device, gpu_input, 1, 100);
 
-  std::vector<type> output(gpu_input->nbElements());
+  std::vector<type> output(gpu_input->size());
   gpu_input->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/set_test.cpp b/tests/set_test.cpp
index c2dae36eb..1ce7a1cdc 100644
--- a/tests/set_test.cpp
+++ b/tests/set_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   cle::tier1::set_func(device, gpu_input, 10);
 
-  std::vector<type> output(gpu_input->nbElements());
+  std::vector<type> output(gpu_input->size());
   gpu_input->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/smaller_constant_test.cpp b/tests/smaller_constant_test.cpp
index 5de2dce69..fc0e2bc8f 100644
--- a/tests/smaller_constant_test.cpp
+++ b/tests/smaller_constant_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::smaller_constant_func(device, gpu_input, nullptr, 5);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/smaller_or_equal_constant_test.cpp b/tests/smaller_or_equal_constant_test.cpp
index 940a31719..b7eeb62a3 100644
--- a/tests/smaller_or_equal_constant_test.cpp
+++ b/tests/smaller_or_equal_constant_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::smaller_or_equal_constant_func(device, gpu_input, nullptr, 5);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/smaller_or_equal_test.cpp b/tests/smaller_or_equal_test.cpp
index e8ee733f3..230847d36 100644
--- a/tests/smaller_or_equal_test.cpp
+++ b/tests/smaller_or_equal_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::smaller_or_equal_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/smaller_test.cpp b/tests/smaller_test.cpp
index b91fcea8c..39b6b4ca7 100644
--- a/tests/smaller_test.cpp
+++ b/tests/smaller_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::smaller_func(device, gpu_input1, gpu_input2, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/sobel_test.cpp b/tests/sobel_test.cpp
index bf285d489..2031d7ea2 100644
--- a/tests/sobel_test.cpp
+++ b/tests/sobel_test.cpp
@@ -51,7 +51,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::sobel_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/sum_reduction_x_test.cpp b/tests/sum_reduction_x_test.cpp
index 97e61171b..44e9aa09e 100644
--- a/tests/sum_reduction_x_test.cpp
+++ b/tests/sum_reduction_x_test.cpp
@@ -21,7 +21,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::sum_reduction_x_func(device, gpu_input, nullptr, 4);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/sum_x_projection_test.cpp b/tests/sum_x_projection_test.cpp
index 8795375ea..1cc61bd89 100644
--- a/tests/sum_x_projection_test.cpp
+++ b/tests/sum_x_projection_test.cpp
@@ -18,7 +18,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::sum_x_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/sum_y_projection_test.cpp b/tests/sum_y_projection_test.cpp
index c63312251..817062071 100644
--- a/tests/sum_y_projection_test.cpp
+++ b/tests/sum_y_projection_test.cpp
@@ -18,7 +18,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::sum_y_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/sum_z_projection_test.cpp b/tests/sum_z_projection_test.cpp
index 9217b2160..807fb82c9 100644
--- a/tests/sum_z_projection_test.cpp
+++ b/tests/sum_z_projection_test.cpp
@@ -19,7 +19,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier1::sum_z_projection_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/tests/threshold_otsu_test.cpp b/tests/threshold_otsu_test.cpp
index f8655e1aa..fbea2b9c3 100644
--- a/tests/threshold_otsu_test.cpp
+++ b/tests/threshold_otsu_test.cpp
@@ -27,7 +27,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier4::threshold_otsu_func(device, gpu_input, nullptr);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   for (auto && i : input)
diff --git a/tests/voronoi_otsu_labeling_test.cpp b/tests/voronoi_otsu_labeling_test.cpp
index 7e057cccf..f122911b6 100644
--- a/tests/voronoi_otsu_labeling_test.cpp
+++ b/tests/voronoi_otsu_labeling_test.cpp
@@ -26,7 +26,7 @@ run_test(const std::array<size_t, 3> & shape, const cle::mType & mem_type) -> bo
 
   auto gpu_output = cle::tier7::voronoi_otsu_labeling_func(device, gpu_input, nullptr, 0, 1);
 
-  std::vector<type> output(gpu_output->nbElements());
+  std::vector<type> output(gpu_output->size());
   gpu_output->read(output.data());
 
   return std::equal(output.begin(), output.end(), valid.begin()) ? 0 : 1;
diff --git a/utilities/code_style_formating.sh b/utilities/code_style_formating.sh
index 5c9524762..7c9f96efd 100644
--- a/utilities/code_style_formating.sh
+++ b/utilities/code_style_formating.sh
@@ -1,7 +1,7 @@
-#!/bin/bash
+#!/bin/sh
 
 # Check if the project root directory is provided as an argument
-if [ $# -ne 1 ]; then
+if [ $# -ne 1 ] || [ ! -d "$1" ]; then
     echo "Usage: $0 <project_directory>"
     exit 1
 fi
@@ -13,6 +13,5 @@ files=$(find "$project_directory" -name "*.hpp" -o -name "*.cpp")
 
 # Run clang-format-14 on each file
 for file in $files; do
-    clang-format -i -style=file "$file"
-    echo "Formatted: $file"
+    clang-format -i -style=file -- "$file" || echo "Failed to format: $file" >&2
 done