From 3d2ac47a164e38f9437d09fd930ab7f88a824a21 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Fri, 11 Oct 2024 11:13:13 -0700 Subject: [PATCH 1/3] Bugfix: gpu::reduce was not running on the GPU when using HIP --- external_libs/gpurun/include/gpu/reduce.hpp | 25 +++++++++------------ 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp index 27cffa7ac..4db1edf36 100644 --- a/external_libs/gpurun/include/gpu/reduce.hpp +++ b/external_libs/gpurun/include/gpu/reduce.hpp @@ -11,14 +11,11 @@ #include -#ifdef ENABLE_CUDA -#include -#endif - #include #include #include +#include namespace gpu { @@ -30,7 +27,7 @@ struct reduce { }; -#ifdef ENABLE_CUDA +#ifdef ENABLE_GPU template __global__ void reduce_kernel_r(long size, kernel_type kernel, array_type odata) { @@ -84,7 +81,7 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) { using type = decltype(kernel(0)); -#ifndef ENABLE_CUDA +#ifndef ENABLE_GPU type accumulator(0.0); for(long ii = 0; ii < size; ii++){ @@ -112,7 +109,7 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) { #endif } -#ifdef ENABLE_CUDA +#ifdef ENABLE_GPU template __global__ void reduce_kernel_rr(long sizex, long sizey, kernel_type kernel, array_type odata) { @@ -154,7 +151,7 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty using type = decltype(kernel(0, 0)); -#ifndef ENABLE_CUDA +#ifndef ENABLE_GPU type accumulator(0.0); for(long iy = 0; iy < sizey; iy++){ @@ -187,7 +184,7 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty #endif } -#ifdef ENABLE_CUDA +#ifdef ENABLE_GPU template __global__ void reduce_kernel_rrr(long sizex, long sizey, long sizez, kernel_type kernel, array_type odata) { @@ -233,7 +230,7 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t if(sizex == 0 or sizey == 0 or sizez == 0) return initial_value; -#ifndef ENABLE_CUDA +#ifndef ENABLE_GPU type accumulator = initial_value; for(long iy = 0; iy < sizey; iy++){ @@ -273,7 +270,7 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t #endif } -#ifdef ENABLE_CUDA +#ifdef ENABLE_GPU template __global__ void reduce_kernel_vr(long sizex, long sizey, kernel_type kernel, array_type odata) { @@ -317,7 +314,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array accumulator(sizex, 0.0); @@ -373,7 +370,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array __global__ void reduce_kernel_vrr(long sizex, long sizey,long sizez, kernel_type kernel, array_type odata) { @@ -417,7 +414,7 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne using type = decltype(kernel(0, 0, 0)); -#ifndef ENABLE_CUDA +#ifndef ENABLE_GPU gpu::array accumulator(sizex, 0.0); From dd0fe52c54c478b869a20235a5dfbf1c6cf5da03 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Fri, 11 Oct 2024 11:59:15 -0700 Subject: [PATCH 2/3] Use generic versions of cuda functions. --- external_libs/gpurun/include/gpu/reduce.hpp | 35 +++++++++------------ 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp index 4db1edf36..7e51151af 100644 --- a/external_libs/gpurun/include/gpu/reduce.hpp +++ b/external_libs/gpurun/include/gpu/reduce.hpp @@ -97,10 +97,10 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) { gpu::array result(nblock); reduce_kernel_r<<>>(size, kernel, begin(result)); - check_error(cudaGetLastError()); + check_error(last_error()); if(nblock == 1) { - cudaDeviceSynchronize(); + gpu::sync(); return result[0]; } else { return run(gpu::reduce(nblock), array_access{begin(result)}); @@ -172,10 +172,10 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty gpu::array result({nblockx, nblocky}); reduce_kernel_rr<<<{nblockx, nblocky}, {bsizex, bsizey}, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result)); - check_error(cudaGetLastError()); + check_error(last_error()); if(nblockx*nblocky == 1) { - cudaDeviceSynchronize(); + gpu::sync(); return result[0][0]; } else { return run(gpu::reduce(nblockx*nblocky), array_access{begin(result.flatted())}); @@ -244,9 +244,8 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t #else - int mingridsize, blocksize; - check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_rrr&>()))>)); - + auto blocksize = max_blocksize(reduce_kernel_rrr&>()))>); + const unsigned bsizex = blocksize; const unsigned bsizey = 1; const unsigned bsizez = 1; @@ -258,10 +257,10 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t gpu::array result({nblockx, nblocky, nblockz}); reduce_kernel_rrr<<<{nblockx, nblocky, nblockz}, {bsizex, bsizey, bsizez}, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result)); - check_error(cudaGetLastError()); + check_error(last_error()); if(nblockx*nblocky*nblockz == 1) { - cudaDeviceSynchronize(); + gpu::sync(); return initial_value + result[0][0][0]; } else { return run(gpu::reduce(nblockx*nblocky*nblockz), array_access{begin(result.flatted().flatted())}); @@ -330,10 +329,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array result; - int mingridsize = 0; - int blocksize = 0; - - check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_vr)); + auto blocksize = max_blocksize(reduce_kernel_vr); unsigned bsizex = 4; //this seems to be the optimal value if(sizex <= 2) bsizex = sizex; @@ -354,10 +350,10 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array>>(sizex, sizey, kernel, begin(result)); - check_error(cudaGetLastError()); + check_error(last_error()); if(nblocky == 1) { - cudaDeviceSynchronize(); + gpu::sync(); assert(result[0].size() == sizex); @@ -432,10 +428,7 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne gpu::array result; - int mingridsize = 0; - int blocksize = 0; - - check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_vrr)); + auto blocksize = max_blocksize(reduce_kernel_vrr); unsigned bsizex = 4; //this seems to be the optimal value if(sizex <= 2) bsizex = sizex; @@ -459,10 +452,10 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne assert(shared_mem_size <= 48*1024); reduce_kernel_vrr<<>>(sizex, sizey, sizez, kernel, begin(result)); - check_error(cudaGetLastError()); + check_error(last_error()); if(nblocky*nblockz == 1) { - cudaDeviceSynchronize(); + gpu::sync(); assert(result[0][0].size() == sizex); From 6b126007af56409b951a4e7abadb0f7df2645ef5 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Fri, 11 Oct 2024 12:39:55 -0700 Subject: [PATCH 3/3] Explicitly build the dim3 objects for the kernel dimensions to keep HIP happy. --- external_libs/gpurun/include/gpu/reduce.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp index 7e51151af..3534f9a70 100644 --- a/external_libs/gpurun/include/gpu/reduce.hpp +++ b/external_libs/gpurun/include/gpu/reduce.hpp @@ -171,7 +171,10 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty gpu::array result({nblockx, nblocky}); - reduce_kernel_rr<<<{nblockx, nblocky}, {bsizex, bsizey}, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result)); + struct dim3 dg{nblockx, nblocky}; + struct dim3 db{bsizex, bsizey}; + + reduce_kernel_rr<<>>(sizex, sizey, kernel, begin(result)); check_error(last_error()); if(nblockx*nblocky == 1) { @@ -256,7 +259,10 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t gpu::array result({nblockx, nblocky, nblockz}); - reduce_kernel_rrr<<<{nblockx, nblocky, nblockz}, {bsizex, bsizey, bsizez}, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result)); + struct dim3 dg{nblockx, nblocky, nblockz}; + struct dim3 db{bsizex, bsizey, bsizez}; + + reduce_kernel_rrr<<>>(sizex, sizey, sizez, kernel, begin(result)); check_error(last_error()); if(nblockx*nblocky*nblockz == 1) {