Skip to content

Commit

Permalink
Merge pull request #186 from clEsperanto/optimise-cuda
Browse files Browse the repository at this point in the history
add JIT-cache to cuda
replace regexp by string find/replace (original design)
factorise some parts of the code

TODO in the future: store the JIT-cache on disk
  • Loading branch information
StRigaud authored Sep 26, 2023
2 parents 1199296 + 7b27502 commit 098a4d4
Show file tree
Hide file tree
Showing 91 changed files with 311 additions and 293 deletions.
4 changes: 2 additions & 2 deletions clic/include/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ class Array : public std::enable_shared_from_this<Array>
fill(const float & value) const -> void;

[[nodiscard]] auto
nbElements() const -> size_t;
size() const -> size_t;
[[nodiscard]] auto
width() const -> size_t;
[[nodiscard]] auto
height() const -> size_t;
[[nodiscard]] auto
depth() const -> size_t;
[[nodiscard]] auto
bytesPerElements() const -> size_t;
itemSize() const -> size_t;
[[nodiscard]] auto
dtype() const -> dType;
[[nodiscard]] auto
Expand Down
2 changes: 2 additions & 0 deletions clic/include/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ class CUDADevice : public Device
[[nodiscard]] auto
getInfo() const -> std::string override;
[[nodiscard]] auto
getArch() const -> std::string;
[[nodiscard]] auto
getCache() -> std::map<std::string, CUmodule> &;

private:
Expand Down
7 changes: 7 additions & 0 deletions clic/include/execution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ execute(const Device::Pointer & device,
const RangeArray & global_range = { 1, 1, 1 },
const ConstantList & constants = {}) -> void;

/**
 * @brief Declaration of native_execute (the definition is not in this header).
 *
 * NOTE(review): presumably runs the kernel described by `kernel_func` on
 * `device` with caller-supplied ranges; confirm against the implementation.
 *
 * @param device       device handle passed by const reference
 * @param kernel_func  kernel description (KernelInfo)
 * @param parameters   list of parameters forwarded to the kernel
 * @param global_range global range, defaults to { 1, 1, 1 }
 * @param local_range  local range, defaults to { 1, 1, 1 }
 */
auto
native_execute(const Device::Pointer & device,
const KernelInfo & kernel_func,
const ParameterList & parameters,
const RangeArray & global_range = { 1, 1, 1 },
const RangeArray & local_range = { 1, 1, 1 }) -> void;

auto
loadSource(const std::string & source_path) -> std::string;

Expand Down
56 changes: 28 additions & 28 deletions clic/include/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,57 +41,57 @@ enum class dType
};

inline auto
operator<<(std::ostream & out, const dType & dtype) -> std::ostream &
toString(const dType & dtype) -> std::string
{
switch (dtype)
{
case dType::FLOAT:
out << "float";
break;
return "float";
case dType::INT32:
out << "int";
break;
return "int";
case dType::UINT32:
out << "uint";
break;
return "uint";
case dType::INT8:
out << "char";
break;
return "char";
case dType::UINT8:
out << "uchar";
break;
return "uchar";
case dType::INT16:
out << "short";
break;
return "short";
case dType::UINT16:
out << "ushort";
break;
return "ushort";
case dType::INT64:
out << "long";
break;
return "long";
case dType::UINT64:
out << "ulong";
break;
return "ulong";
default:
out << "unknown";
break;
return "unknown";
}
return out;
}

inline auto
operator<<(std::ostream & out, const mType & mtype) -> std::ostream &
operator<<(std::ostream & out, const dType & dtype) -> std::ostream &
{
return out << toString(dtype);
}

inline auto
toString(const mType & mtype) -> std::string
{
switch (mtype)
{
case mType::BUFFER:
out << "Buffer";
break;
return "Buffer";
case mType::IMAGE:
out << "Image";
break;
return "Image";
default:
return "unknown";
}
return out;
}

/// Stream insertion for mType: writes the text returned by toString(mtype)
/// into `out` and hands the same stream back so that calls can be chained.
inline auto
operator<<(std::ostream & out, const mType & mtype) -> std::ostream &
{
  out << toString(mtype);
  return out;
}

template <typename T>
Expand Down
41 changes: 14 additions & 27 deletions clic/src/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ Array::allocate() -> void
auto
Array::write(const void * host_data) -> void
{
if (host_data == nullptr)
{
throw std::runtime_error("Error: host_data is null");
}
if (!initialized())
{
allocate();
Expand All @@ -101,6 +105,10 @@ Array::write(const void * host_data) -> void
auto
Array::read(void * host_data) const -> void
{
if (host_data == nullptr)
{
throw std::runtime_error("Error: host_data is null");
}
if (!initialized())
{
throw std::runtime_error("Error: Array is not initialized, it cannot be read");
Expand All @@ -119,8 +127,7 @@ Array::copy(const Array::Pointer & dst) const -> void
{
std::cerr << "Error: copying Arrays from different devices" << std::endl;
}
if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() ||
bytesPerElements() != dst->bytesPerElements())
if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() || itemSize() != dst->itemSize())
{
std::cerr << "Error: Arrays dimensions do not match" << std::endl;
}
Expand Down Expand Up @@ -161,7 +168,7 @@ Array::fill(const float & value) const -> void
}

auto
Array::nbElements() const -> size_t
Array::size() const -> size_t
{
return width_ * height_ * depth_;
}
Expand All @@ -181,7 +188,7 @@ Array::depth() const -> size_t
return depth_;
}
auto
Array::bytesPerElements() const -> size_t
Array::itemSize() const -> size_t
{
return toBytes(dataType_);
}
Expand Down Expand Up @@ -224,29 +231,9 @@ Array::c_get() const -> const void **
auto
Array::shortType() const -> std::string
{
switch (this->dataType_)
{
case dType::FLOAT:
return "f";
case dType::INT32:
return "i";
case dType::UINT32:
return "ui";
case dType::INT8:
return "c";
case dType::UINT8:
return "uc";
case dType::INT16:
return "s";
case dType::UINT16:
return "us";
case dType::INT64:
return "l";
case dType::UINT64:
return "ul";
default:
throw std::invalid_argument("Invalid Array::Type value");
}
const auto str_type = toString(dtype());
return (str_type[0] == 'u') ? str_type.substr(0, 2) : str_type.substr(0, 1);
}


} // namespace cle
92 changes: 57 additions & 35 deletions clic/src/cudabackend.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "backend.hpp"
#include "cle_preamble_cu.h"
#include <array>
#include <chrono>

namespace cle
{
Expand Down Expand Up @@ -654,14 +655,15 @@ CUDABackend::loadProgramFromCache(const Device::Pointer & device, const std::str
-> void
{
#if USE_CUDA
auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device);
CUmodule module = nullptr;
auto ite = cuda_device->getCache().find(hash);
if (ite != cuda_device->getCache().end())
if (auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device))
{
module = ite->second;
const auto & cache = cuda_device->getCache();
auto ite = cache.find(hash);
if (ite != cache.end())
{
*static_cast<CUmodule *>(program) = ite->second;
}
}
program = module;
#else
throw std::runtime_error("Error: CUDA is not enabled");
#endif
Expand All @@ -671,13 +673,16 @@ auto
CUDABackend::saveProgramToCache(const Device::Pointer & device, const std::string & hash, void * program) const -> void
{
#if USE_CUDA
auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device);
cuda_device->getCache().emplace_hint(cuda_device->getCache().end(), hash, reinterpret_cast<CUmodule>(program));
if (auto cuda_device = std::dynamic_pointer_cast<CUDADevice>(device))
{
cuda_device->getCache().emplace(hash, *reinterpret_cast<CUmodule *>(program));
}
#else
throw std::runtime_error("Error: CUDA is not enabled");
#endif
}


auto
CUDABackend::buildKernel(const Device::Pointer & device,
const std::string & kernel_source,
Expand All @@ -692,42 +697,59 @@ CUDABackend::buildKernel(const Device::Pointer & device,
throw std::runtime_error("Error (cuda): Failed to set CUDA device before memory allocation.");
}

nvrtcProgram prog;
auto res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr);
if (res != NVRTC_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to create program from source with error code " +
std::to_string(res));
}
res = nvrtcCompileProgram(prog, 0, nullptr);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
std::string log(log_size, '\0');
nvrtcGetProgramLog(prog, &log[0]);
std::cerr << "Build log: " << log << std::endl;
throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res));
}
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
std::vector<char> ptx(ptxSize);
nvrtcGetPTX(prog, ptx.data());
std::chrono::high_resolution_clock::time_point start_time, end_time;
std::chrono::microseconds duration;

CUmodule cuModule;
err = cuModuleLoadData(&cuModule, ptx.data());
if (err != CUDA_SUCCESS)
CUmodule cuModule = nullptr;
std::string hash = std::to_string(std::hash<std::string>{}(kernel_source));
loadProgramFromCache(device, hash, &cuModule);
if (cuModule == nullptr)
{
throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err));
}
nvrtcProgram prog;
auto res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr);
if (res != NVRTC_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to create program from source with error code " +
std::to_string(res));
}

const std::string arch_comp = "-arch=compute_" + cuda_device->getArch();
const std::array<const char *, 1> options = { arch_comp.c_str() };
res = nvrtcCompileProgram(prog, options.size(), options.data());
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
std::string log(log_size, '\0');
nvrtcGetProgramLog(prog, &log[0]);
std::cerr << "Build log: " << log << std::endl;
throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res));
}
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
std::vector<char> ptx(ptxSize);
nvrtcGetPTX(prog, ptx.data());
res = nvrtcDestroyProgram(&prog);
if (res != NVRTC_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to destroy program with error code " + std::to_string(res));
}

err = cuModuleLoadData(&cuModule, ptx.data());
if (err != CUDA_SUCCESS)
{
throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err));
}


saveProgramToCache(device, hash, &cuModule);
}
CUfunction cuFunction;
err = cuModuleGetFunction(&cuFunction, cuModule, kernel_name.c_str());
if (err != CUDA_SUCCESS)
{
throw std::runtime_error("Error (cuda): Getting function from module with error code " + std::to_string(err));
}

*(reinterpret_cast<CUfunction *>(kernel)) = cuFunction;
#else
throw std::runtime_error("Error: CUDA is not enabled");
Expand Down
9 changes: 9 additions & 0 deletions clic/src/cudadevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ CUDADevice::getName() const -> std::string
return std::string(device_name);
}

auto
CUDADevice::getArch() const -> std::string
{
int major = 0, minor = 0;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, this->getCUDADevice());
cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, this->getCUDADevice());
return std::to_string(major) + std::to_string(minor);
}

auto
CUDADevice::getInfo() const -> std::string
{
Expand Down
Loading

0 comments on commit 098a4d4

Please sign in to comment.