Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimise-cuda #186

Merged
merged 12 commits into from
Sep 26, 2023
4 changes: 2 additions & 2 deletions clic/include/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ class Array : public std::enable_shared_from_this<Array>
fill(const float & value) const -> void;

[[nodiscard]] auto
nbElements() const -> size_t;
size() const -> size_t;
[[nodiscard]] auto
width() const -> size_t;
[[nodiscard]] auto
height() const -> size_t;
[[nodiscard]] auto
depth() const -> size_t;
[[nodiscard]] auto
bytesPerElements() const -> size_t;
itemSize() const -> size_t;
[[nodiscard]] auto
dtype() const -> dType;
[[nodiscard]] auto
Expand Down
2 changes: 2 additions & 0 deletions clic/include/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ class CUDADevice : public Device
[[nodiscard]] auto
getInfo() const -> std::string override;
/// Returns the device's compute capability as a string made of the major
/// version number immediately followed by the minor version number
/// (no separator between them).
[[nodiscard]] auto
getArch() const -> std::string;
/// Gives mutable access to this device's map from a string key to a CUmodule.
/// The CUDA backend uses it as a program cache: it looks modules up and stores
/// them under a hash string derived from the kernel source.
[[nodiscard]] auto
getCache() -> std::map<std::string, CUmodule> &;

private:
Expand Down
7 changes: 7 additions & 0 deletions clic/include/execution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ execute(const Device::Pointer & device,
const RangeArray & global_range = { 1, 1, 1 },
const ConstantList & constants = {}) -> void;

/// Declaration only; the definition is not part of this view.
/// Takes the target device, a KernelInfo, a ParameterList, and a global and a
/// local range that each default to { 1, 1, 1 }. Returns nothing.
/// NOTE(review): presumably launches a caller-provided kernel with explicit
/// global/local work sizes - confirm against the definition.
auto
native_execute(const Device::Pointer & device,
const KernelInfo & kernel_func,
const ParameterList & parameters,
const RangeArray & global_range = { 1, 1, 1 },
const RangeArray & local_range = { 1, 1, 1 }) -> void;

/// Declaration only; takes a source path and returns a std::string.
/// NOTE(review): presumably the returned string is the content of the file at
/// source_path - confirm against the definition.
auto
loadSource(const std::string & source_path) -> std::string;

Expand Down
56 changes: 28 additions & 28 deletions clic/include/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,57 +41,57 @@ enum class dType
};

inline auto
operator<<(std::ostream & out, const dType & dtype) -> std::ostream &
toString(const dType & dtype) -> std::string
{
switch (dtype)
{
case dType::FLOAT:
out << "float";
break;
return "float";
case dType::INT32:
out << "int";
break;
return "int";
case dType::UINT32:
out << "uint";
break;
return "uint";
case dType::INT8:
out << "char";
break;
return "char";
case dType::UINT8:
out << "uchar";
break;
return "uchar";
case dType::INT16:
out << "short";
break;
return "short";
case dType::UINT16:
out << "ushort";
break;
return "ushort";
case dType::INT64:
out << "long";
break;
return "long";
case dType::UINT64:
out << "ulong";
break;
return "ulong";
default:
out << "unknown";
break;
return "unknown";
}
return out;
}

inline auto
operator<<(std::ostream & out, const mType & mtype) -> std::ostream &
operator<<(std::ostream & out, const dType & dtype) -> std::ostream &
{
return out << toString(dtype);
}

inline auto
toString(const mType & mtype) -> std::string
{
switch (mtype)
{
case mType::BUFFER:
out << "Buffer";
break;
return "Buffer";
case mType::IMAGE:
out << "Image";
break;
return "Image";
default:
return "unknown";
}
return out;
}

/// Stream insertion for mType: writes the text produced by toString(mtype)
/// to the given stream and hands the same stream back so calls can be chained.
inline auto
operator<<(std::ostream & out, const mType & mtype) -> std::ostream &
{
  out << toString(mtype);
  return out;
}

template <typename T>
Expand Down
41 changes: 14 additions & 27 deletions clic/src/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ Array::allocate() -> void
auto
Array::write(const void * host_data) -> void
{
if (host_data == nullptr)
{
throw std::runtime_error("Error: host_data is null");
}
if (!initialized())
{
allocate();
Expand All @@ -101,6 +105,10 @@ Array::write(const void * host_data) -> void
auto
Array::read(void * host_data) const -> void
{
if (host_data == nullptr)
{
throw std::runtime_error("Error: host_data is null");
}
if (!initialized())
{
throw std::runtime_error("Error: Array is not initialized, it cannot be read");
Expand All @@ -119,8 +127,7 @@ Array::copy(const Array::Pointer & dst) const -> void
{
std::cerr << "Error: copying Arrays from different devices" << std::endl;
}
if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() ||
bytesPerElements() != dst->bytesPerElements())
if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() || itemSize() != dst->itemSize())
{
std::cerr << "Error: Arrays dimensions do not match" << std::endl;
}
Expand Down Expand Up @@ -161,7 +168,7 @@ Array::fill(const float & value) const -> void
}

auto
Array::nbElements() const -> size_t
Array::size() const -> size_t
{
return width_ * height_ * depth_;
}
Expand All @@ -181,7 +188,7 @@ Array::depth() const -> size_t
return depth_;
}
auto
Array::bytesPerElements() const -> size_t
Array::itemSize() const -> size_t
{
return toBytes(dataType_);
}
Expand Down Expand Up @@ -224,29 +231,9 @@ Array::c_get() const -> const void **
auto
Array::shortType() const -> std::string
{
switch (this->dataType_)
{
case dType::FLOAT:
return "f";
case dType::INT32:
return "i";
case dType::UINT32:
return "ui";
case dType::INT8:
return "c";
case dType::UINT8:
return "uc";
case dType::INT16:
return "s";
case dType::UINT16:
return "us";
case dType::INT64:
return "l";
case dType::UINT64:
return "ul";
default:
throw std::invalid_argument("Invalid Array::Type value");
}
const auto str_type = toString(dtype());
return (str_type[0] == 'u') ? str_type.substr(0, 2) : str_type.substr(0, 1);
}


} // namespace cle
92 changes: 57 additions & 35 deletions clic/src/cudabackend.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "backend.hpp"
#include "cle_preamble_cu.h"
#include <array>
#include <chrono>

namespace cle
{
Expand Down Expand Up @@ -654,14 +655,15 @@ CUDABackend::loadProgramFromCache(const Device::Pointer & device, const std::str
-> void
{
#if USE_CUDA
auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device);
CUmodule module = nullptr;
auto ite = cuda_device->getCache().find(hash);
if (ite != cuda_device->getCache().end())
if (auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device))
{
module = ite->second;
const auto & cache = cuda_device->getCache();
auto ite = cache.find(hash);
if (ite != cache.end())
{
*static_cast<CUmodule *>(program) = ite->second;
}
}
program = module;
#else
throw std::runtime_error("Error: CUDA is not enabled");
#endif
Expand All @@ -671,13 +673,16 @@ auto
CUDABackend::saveProgramToCache(const Device::Pointer & device, const std::string & hash, void * program) const -> void
{
#if USE_CUDA
auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device);
cuda_device->getCache().emplace_hint(cuda_device->getCache().end(), hash, reinterpret_cast<CUmodule>(program));
if (auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device))
{
cuda_device->getCache().emplace(hash, *reinterpret_cast<CUmodule *>(program));
}
#else
throw std::runtime_error("Error: CUDA is not enabled");
#endif
}


auto
CUDABackend::buildKernel(const Device::Pointer & device,
const std::string & kernel_source,
Expand All @@ -692,42 +697,59 @@ CUDABackend::buildKernel(const Device::Pointer & device,
throw std::runtime_error("Error (cuda): Failed to set CUDA device before memory allocation.");
}

nvrtcProgram prog;
auto res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr);
if (res != NVRTC_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to create program from source with error code " +
std::to_string(res));
}
res = nvrtcCompileProgram(prog, 0, nullptr);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
std::string log(log_size, '\0');
nvrtcGetProgramLog(prog, &log[0]);
std::cerr << "Build log: " << log << std::endl;
throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res));
}
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
std::vector<char> ptx(ptxSize);
nvrtcGetPTX(prog, ptx.data());
std::chrono::high_resolution_clock::time_point start_time, end_time;
std::chrono::microseconds duration;

CUmodule cuModule;
err = cuModuleLoadData(&cuModule, ptx.data());
if (err != CUDA_SUCCESS)
CUmodule cuModule = nullptr;
std::string hash = std::to_string(std::hash<std::string>{}(kernel_source));
loadProgramFromCache(device, hash, &cuModule);
if (cuModule == nullptr)
{
throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err));
}
nvrtcProgram prog;
auto res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr);
if (res != NVRTC_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to create program from source with error code " +
std::to_string(res));
}

const std::string arch_comp = "-arch=compute_" + cuda_device->getArch();
const std::array<const char *, 1> options = { arch_comp.c_str() };
res = nvrtcCompileProgram(prog, options.size(), options.data());
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
std::string log(log_size, '\0');
nvrtcGetProgramLog(prog, &log[0]);
std::cerr << "Build log: " << log << std::endl;
throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res));
}
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
std::vector<char> ptx(ptxSize);
nvrtcGetPTX(prog, ptx.data());
res = nvrtcDestroyProgram(&prog);
if (res != NVRTC_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to destroy program with error code " + std::to_string(res));
}

err = cuModuleLoadData(&cuModule, ptx.data());
if (err != CUDA_SUCCESS)
{
throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err));
}


saveProgramToCache(device, hash, &cuModule);
}
CUfunction cuFunction;
err = cuModuleGetFunction(&cuFunction, cuModule, kernel_name.c_str());
if (err != CUDA_SUCCESS)
{
throw std::runtime_error("Error (cuda): Getting function from module with error code " + std::to_string(err));
}

*(reinterpret_cast<CUfunction *>(kernel)) = cuFunction;
#else
throw std::runtime_error("Error: CUDA is not enabled");
Expand Down
9 changes: 9 additions & 0 deletions clic/src/cudadevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ CUDADevice::getName() const -> std::string
return std::string(device_name);
}

/// Builds the compute-capability string of this device: the decimal major
/// version immediately followed by the decimal minor version.
///
/// NOTE(review): the status codes returned by cuDeviceGetAttribute are not
/// inspected here; if a query fails, the corresponding number presumably keeps
/// its initial value of 0 - confirm and consider reporting the failure.
auto
CUDADevice::getArch() const -> std::string
{
  int cc_major = 0;
  int cc_minor = 0;

  // Query the two halves of the compute capability, major first.
  cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, this->getCUDADevice());
  cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, this->getCUDADevice());

  std::string arch = std::to_string(cc_major);
  arch += std::to_string(cc_minor);
  return arch;
}

auto
CUDADevice::getInfo() const -> std::string
{
Expand Down
Loading