Skip to content

Commit

Permalink
Merge branch 'bugfix_reduce_on_gpu_for_hip' into 'master'
Browse files Browse the repository at this point in the history
Bugfix: gpu::reduce was not running on the GPU when using HIP

See merge request npneq/inq!1147
  • Loading branch information
xavierandrade committed Oct 12, 2024
2 parents 2405385 + 6b12600 commit ef72606
Showing 1 changed file with 33 additions and 37 deletions.
70 changes: 33 additions & 37 deletions external_libs/gpurun/include/gpu/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@

#include <inq_config.h>

#ifdef ENABLE_CUDA
#include <cuda.h>
#endif

#include <cassert>

#include <gpu/run.hpp>
#include <gpu/array.hpp>
#include <gpu/host.hpp>

namespace gpu {

Expand All @@ -30,7 +27,7 @@ struct reduce {
};


#ifdef ENABLE_CUDA
#ifdef ENABLE_GPU
template <class kernel_type, class array_type>
__global__ void reduce_kernel_r(long size, kernel_type kernel, array_type odata) {

Expand Down Expand Up @@ -84,7 +81,7 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {

using type = decltype(kernel(0));

#ifndef ENABLE_CUDA
#ifndef ENABLE_GPU

type accumulator(0.0);
for(long ii = 0; ii < size; ii++){
Expand All @@ -100,10 +97,10 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
gpu::array<type, 1> result(nblock);

reduce_kernel_r<<<nblock, blocksize, blocksize*sizeof(type)>>>(size, kernel, begin(result));
check_error(cudaGetLastError());
check_error(last_error());

if(nblock == 1) {
cudaDeviceSynchronize();
gpu::sync();
return result[0];
} else {
return run(gpu::reduce(nblock), array_access<decltype(begin(result))>{begin(result)});
Expand All @@ -112,7 +109,7 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
#endif
}

#ifdef ENABLE_CUDA
#ifdef ENABLE_GPU
template <class kernel_type, class array_type>
__global__ void reduce_kernel_rr(long sizex, long sizey, kernel_type kernel, array_type odata) {

Expand Down Expand Up @@ -154,7 +151,7 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty

using type = decltype(kernel(0, 0));

#ifndef ENABLE_CUDA
#ifndef ENABLE_GPU

type accumulator(0.0);
for(long iy = 0; iy < sizey; iy++){
Expand All @@ -174,11 +171,14 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty

gpu::array<type, 2> result({nblockx, nblocky});

reduce_kernel_rr<<<{nblockx, nblocky}, {bsizex, bsizey}, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result));
check_error(cudaGetLastError());
struct dim3 dg{nblockx, nblocky};
struct dim3 db{bsizex, bsizey};

reduce_kernel_rr<<<dg, db, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result));
check_error(last_error());

if(nblockx*nblocky == 1) {
cudaDeviceSynchronize();
gpu::sync();
return result[0][0];
} else {
return run(gpu::reduce(nblockx*nblocky), array_access<decltype(begin(result.flatted()))>{begin(result.flatted())});
Expand All @@ -187,7 +187,7 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty
#endif
}

#ifdef ENABLE_CUDA
#ifdef ENABLE_GPU
template <class kernel_type, class array_type>
__global__ void reduce_kernel_rrr(long sizex, long sizey, long sizez, kernel_type kernel, array_type odata) {

Expand Down Expand Up @@ -233,7 +233,7 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t

if(sizex == 0 or sizey == 0 or sizez == 0) return initial_value;

#ifndef ENABLE_CUDA
#ifndef ENABLE_GPU

type accumulator = initial_value;
for(long iy = 0; iy < sizey; iy++){
Expand All @@ -247,9 +247,8 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t

#else

int mingridsize, blocksize;
check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_rrr<kernel_type, decltype(begin(std::declval<gpu::array<type, 3>&>()))>));

auto blocksize = max_blocksize(reduce_kernel_rrr<kernel_type, decltype(begin(std::declval<gpu::array<type, 3>&>()))>);

const unsigned bsizex = blocksize;
const unsigned bsizey = 1;
const unsigned bsizez = 1;
Expand All @@ -260,11 +259,14 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t

gpu::array<type, 3> result({nblockx, nblocky, nblockz});

reduce_kernel_rrr<<<{nblockx, nblocky, nblockz}, {bsizex, bsizey, bsizez}, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result));
check_error(cudaGetLastError());
struct dim3 dg{nblockx, nblocky, nblockz};
struct dim3 db{bsizex, bsizey, bsizez};

reduce_kernel_rrr<<<dg, db, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result));
check_error(last_error());

if(nblockx*nblocky*nblockz == 1) {
cudaDeviceSynchronize();
gpu::sync();
return initial_value + result[0][0][0];
} else {
return run(gpu::reduce(nblockx*nblocky*nblockz), array_access<decltype(begin(result.flatted().flatted()))>{begin(result.flatted().flatted())});
Expand All @@ -273,7 +275,7 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t
#endif
}

#ifdef ENABLE_CUDA
#ifdef ENABLE_GPU
template <class kernel_type, class array_type>
__global__ void reduce_kernel_vr(long sizex, long sizey, kernel_type kernel, array_type odata) {

Expand Down Expand Up @@ -317,7 +319,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl

using type = decltype(kernel(0, 0));

#ifndef ENABLE_CUDA
#ifndef ENABLE_GPU

gpu::array<type, 1> accumulator(sizex, 0.0);

Expand All @@ -333,10 +335,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl

gpu::array<type, 2> result;

int mingridsize = 0;
int blocksize = 0;

check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_vr<kernel_type, decltype(begin(result))>));
auto blocksize = max_blocksize(reduce_kernel_vr<kernel_type, decltype(begin(result))>);

unsigned bsizex = 4; //this seems to be the optimal value
if(sizex <= 2) bsizex = sizex;
Expand All @@ -357,10 +356,10 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl
assert(shared_mem_size <= 48*1024);

reduce_kernel_vr<<<dg, db, shared_mem_size>>>(sizex, sizey, kernel, begin(result));
check_error(cudaGetLastError());
check_error(last_error());

if(nblocky == 1) {
cudaDeviceSynchronize();
gpu::sync();

assert(result[0].size() == sizex);

Expand All @@ -373,7 +372,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl

}

#ifdef ENABLE_CUDA
#ifdef ENABLE_GPU
template <class kernel_type, class array_type>
__global__ void reduce_kernel_vrr(long sizex, long sizey,long sizez, kernel_type kernel, array_type odata) {

Expand Down Expand Up @@ -417,7 +416,7 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne

using type = decltype(kernel(0, 0, 0));

#ifndef ENABLE_CUDA
#ifndef ENABLE_GPU

gpu::array<type, 1> accumulator(sizex, 0.0);

Expand All @@ -435,10 +434,7 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne

gpu::array<type, 3> result;

int mingridsize = 0;
int blocksize = 0;

check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_vrr<kernel_type, decltype(begin(result))>));
auto blocksize = max_blocksize(reduce_kernel_vrr<kernel_type, decltype(begin(result))>);

unsigned bsizex = 4; //this seems to be the optimal value
if(sizex <= 2) bsizex = sizex;
Expand All @@ -462,10 +458,10 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne
assert(shared_mem_size <= 48*1024);

reduce_kernel_vrr<<<dg, db, shared_mem_size>>>(sizex, sizey, sizez, kernel, begin(result));
check_error(cudaGetLastError());
check_error(last_error());

if(nblocky*nblockz == 1) {
cudaDeviceSynchronize();
gpu::sync();

assert(result[0][0].size() == sizex);

Expand Down

0 comments on commit ef72606

Please sign in to comment.