Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

ADD: block and grid functions #171

Merged
merged 3 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions clic/include/backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,6 @@ class CUDABackend : public Backend
const std::vector<size_t> & sizes) const -> void override;
[[nodiscard]] auto
getPreamble() const -> std::string override;
auto
toBlockDim(const std::array<size_t, 3> & global_size) const -> std::array<size_t, 3>;
};

class OpenCLBackend : public Backend
Expand Down
66 changes: 47 additions & 19 deletions clic/src/cudabackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -814,11 +814,54 @@ CUDABackend::executeKernel(const Device::Pointer & device,

std::vector<void *> argsValues(args.size());
argsValues = args;
std::array<size_t, 3> block_size = toBlockDim(global_size);

int maxThreads;
auto error = cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, cuda_device->getCUDADeviceIndex());
if (error != CUDA_SUCCESS)
{
throw std::runtime_error("Error (cuda): Failed to get CUDA Maximum Threads." + std::to_string(error));
}

size_t blockSize = maxThreads / 2;
std::array<size_t, 3> block_size = { 0, 0, 0 };
int dim = 0;

for (int i = 0; i < global_size.size(); ++i)
{
if (global_size[i] != 1)
{
dim++;
block_size[i] = 1;
}
}

switch (dim)
{
case 1:
// Warning: Ensure that the third dimension of the block size does not exceed 64.
std::transform(block_size.begin(), block_size.end(), block_size.begin(), [](size_t value) {
return (value == 0) ? (value + 1) : (value * 512);
});
break;
case 2:
std::transform(block_size.begin(), block_size.end(), block_size.begin(), [](size_t value) {
return (value == 0) ? (value + 1) : (value * 16);
});
break;
default:
std::transform(
block_size.begin(), block_size.end(), block_size.begin(), [](size_t value) { return (value * 8); });
break;
}

std::array<size_t, 3> grid_size = { (global_size.data()[0] + block_size.data()[0] - 1) / block_size.data()[0],
(global_size.data()[1] + block_size.data()[1] - 1) / block_size.data()[1],
(global_size.data()[2] + block_size.data()[2] - 1) / block_size.data()[2] };

err = cuLaunchKernel(cuFunction,
global_size.data()[0],
global_size.data()[1],
global_size.data()[2],
grid_size.data()[0],
grid_size.data()[1],
grid_size.data()[2],
block_size.data()[0],
block_size.data()[1],
block_size.data()[2],
Expand All @@ -843,19 +886,4 @@ CUDABackend::getPreamble() const -> std::string
return kernel::preamble_cu;
}

auto
CUDABackend::toBlockDim(const std::array<size_t, 3> & global_size) const -> std::array<size_t, 3>
{
  // Returns the block dimensions to use for a kernel launch where the grid
  // dimensions are taken directly from the problem shape (global_size):
  // one thread per block, i.e. {1, 1, 1}.
  //
  // Rationale: the previous formula computed (d + d - 1) / d per dimension,
  // which under integer division is exactly 1 for every d >= 1 — but it is
  // undefined behavior (division by zero) when a dimension of global_size
  // is 0. Returning the constant directly is equivalent for all valid
  // inputs and safe for empty dimensions.
  (void)global_size; // parameter kept for interface compatibility
  return { 1, 1, 1 };
}

} // namespace cle