From 3d2ac47a164e38f9437d09fd930ab7f88a824a21 Mon Sep 17 00:00:00 2001
From: Xavier Andrade <xavier@tddft.org>
Date: Fri, 11 Oct 2024 11:13:13 -0700
Subject: [PATCH 1/3] Bugfix: gpu::reduce was not running on the GPU when using
 HIP

---
 external_libs/gpurun/include/gpu/reduce.hpp | 25 +++++++++------------
 1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp
index 27cffa7ac..4db1edf36 100644
--- a/external_libs/gpurun/include/gpu/reduce.hpp
+++ b/external_libs/gpurun/include/gpu/reduce.hpp
@@ -11,14 +11,11 @@
 
 #include <inq_config.h>
 
-#ifdef ENABLE_CUDA
-#include <cuda.h>
-#endif
-
 #include <cassert>
 
 #include <gpu/run.hpp>
 #include <gpu/array.hpp>
+#include <gpu/host.hpp>
 
 namespace gpu {
 
@@ -30,7 +27,7 @@ struct reduce {
 };
 
 
-#ifdef ENABLE_CUDA
+#ifdef ENABLE_GPU
 template <class kernel_type, class array_type>
 __global__ void reduce_kernel_r(long size, kernel_type kernel, array_type odata) {
 
@@ -84,7 +81,7 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
 	
   using type = decltype(kernel(0));
   
-#ifndef ENABLE_CUDA
+#ifndef ENABLE_GPU
 
   type accumulator(0.0);
   for(long ii = 0; ii < size; ii++){
@@ -112,7 +109,7 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
 #endif
 }
 
-#ifdef ENABLE_CUDA
+#ifdef ENABLE_GPU
 template <class kernel_type, class array_type>
 __global__ void reduce_kernel_rr(long sizex, long sizey, kernel_type kernel, array_type odata) {
 
@@ -154,7 +151,7 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty
 	
   using type = decltype(kernel(0, 0));
   
-#ifndef ENABLE_CUDA
+#ifndef ENABLE_GPU
 
   type accumulator(0.0);
 	for(long iy = 0; iy < sizey; iy++){
@@ -187,7 +184,7 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty
 #endif
 }
 
-#ifdef ENABLE_CUDA
+#ifdef ENABLE_GPU
 template <class kernel_type, class array_type>
 __global__ void reduce_kernel_rrr(long sizex, long sizey, long sizez, kernel_type kernel, array_type odata) {
 
@@ -233,7 +230,7 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t
 
 	if(sizex == 0 or sizey == 0 or sizez == 0) return initial_value;
 	
-#ifndef ENABLE_CUDA
+#ifndef ENABLE_GPU
 
   type accumulator = initial_value;
 	for(long iy = 0; iy < sizey; iy++){
@@ -273,7 +270,7 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t
 #endif
 }
 
-#ifdef ENABLE_CUDA
+#ifdef ENABLE_GPU
 template <class kernel_type, class array_type>
 __global__ void reduce_kernel_vr(long sizex, long sizey, kernel_type kernel, array_type odata) {
 
@@ -317,7 +314,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl
 	
   using type = decltype(kernel(0, 0));
 
-#ifndef ENABLE_CUDA
+#ifndef ENABLE_GPU
 
   gpu::array<type, 1> accumulator(sizex, 0.0);
 
@@ -373,7 +370,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl
 
 }
 
-#ifdef ENABLE_CUDA
+#ifdef ENABLE_GPU
 template <class kernel_type, class array_type>
 __global__ void reduce_kernel_vrr(long sizex, long sizey,long sizez, kernel_type kernel, array_type odata) {
 
@@ -417,7 +414,7 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne
 	
   using type = decltype(kernel(0, 0, 0));
 
-#ifndef ENABLE_CUDA
+#ifndef ENABLE_GPU
 
   gpu::array<type, 1> accumulator(sizex, 0.0);
 

From dd0fe52c54c478b869a20235a5dfbf1c6cf5da03 Mon Sep 17 00:00:00 2001
From: Xavier Andrade <xavier@tddft.org>
Date: Fri, 11 Oct 2024 11:59:15 -0700
Subject: [PATCH 2/3] Use generic versions of cuda functions.

---
 external_libs/gpurun/include/gpu/reduce.hpp | 35 +++++++++------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp
index 4db1edf36..7e51151af 100644
--- a/external_libs/gpurun/include/gpu/reduce.hpp
+++ b/external_libs/gpurun/include/gpu/reduce.hpp
@@ -97,10 +97,10 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
 	gpu::array<type, 1> result(nblock);
 
   reduce_kernel_r<<<nblock, blocksize, blocksize*sizeof(type)>>>(size, kernel, begin(result));	
-  check_error(cudaGetLastError());
+  check_error(last_error());
 	
   if(nblock == 1) {
-    cudaDeviceSynchronize();
+    gpu::sync();
     return result[0];
   } else {
     return run(gpu::reduce(nblock), array_access<decltype(begin(result))>{begin(result)});
@@ -172,10 +172,10 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty
 	gpu::array<type, 2> result({nblockx, nblocky});
 
   reduce_kernel_rr<<<{nblockx, nblocky}, {bsizex, bsizey}, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result));	
-  check_error(cudaGetLastError());
+  check_error(last_error());
 	
   if(nblockx*nblocky == 1) {
-    cudaDeviceSynchronize();
+    gpu::sync();
     return result[0][0];
   } else {
     return run(gpu::reduce(nblockx*nblocky), array_access<decltype(begin(result.flatted()))>{begin(result.flatted())});
@@ -244,9 +244,8 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t
 	
 #else
 
-	int mingridsize, blocksize;
-	check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_rrr<kernel_type, decltype(begin(std::declval<gpu::array<type, 3>&>()))>));
-
+	auto blocksize = max_blocksize(reduce_kernel_rrr<kernel_type, decltype(begin(std::declval<gpu::array<type, 3>&>()))>);
+	
 	const unsigned bsizex = blocksize;
 	const unsigned bsizey = 1;
 	const unsigned bsizez = 1;
@@ -258,10 +257,10 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t
 	gpu::array<type, 3> result({nblockx, nblocky, nblockz});
 
 	reduce_kernel_rrr<<<{nblockx, nblocky, nblockz}, {bsizex, bsizey, bsizez}, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result));
-	check_error(cudaGetLastError());
+	check_error(last_error());
 
   if(nblockx*nblocky*nblockz == 1) {
-    cudaDeviceSynchronize();
+    gpu::sync();
     return initial_value + result[0][0][0];
   } else {
     return run(gpu::reduce(nblockx*nblocky*nblockz), array_access<decltype(begin(result.flatted().flatted()))>{begin(result.flatted().flatted())});
@@ -330,10 +329,7 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl
 
 	gpu::array<type, 2> result;
 	
-	int mingridsize = 0;
-	int blocksize = 0;
-
-	check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_vr<kernel_type, decltype(begin(result))>));
+	auto blocksize = max_blocksize(reduce_kernel_vr<kernel_type, decltype(begin(result))>);
 	
 	unsigned bsizex = 4; //this seems to be the optimal value
 	if(sizex <= 2) bsizex = sizex;
@@ -354,10 +350,10 @@ auto run(long sizex, reduce const & redy, kernel_type kernel) -> gpu::array<decl
   assert(shared_mem_size <= 48*1024);
   
   reduce_kernel_vr<<<dg, db, shared_mem_size>>>(sizex, sizey, kernel, begin(result));	
-  check_error(cudaGetLastError());
+  check_error(last_error());
 	
   if(nblocky == 1) {
-    cudaDeviceSynchronize();
+    gpu::sync();
 
 		assert(result[0].size() == sizex);
 		
@@ -432,10 +428,7 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne
 
 	gpu::array<type, 3> result;
 	
-	int mingridsize = 0;
-	int blocksize = 0;
-
-	check_error(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &blocksize, reduce_kernel_vrr<kernel_type, decltype(begin(result))>));
+	auto blocksize = max_blocksize(reduce_kernel_vrr<kernel_type, decltype(begin(result))>);
 	
 	unsigned bsizex = 4; //this seems to be the optimal value
 	if(sizex <= 2) bsizex = sizex;
@@ -459,10 +452,10 @@ auto run(long sizex, reduce const & redy, reduce const & redz, kernel_type kerne
   assert(shared_mem_size <= 48*1024);
   
   reduce_kernel_vrr<<<dg, db, shared_mem_size>>>(sizex, sizey, sizez, kernel, begin(result));	
-  check_error(cudaGetLastError());
+  check_error(last_error());
 	
   if(nblocky*nblockz == 1) {
-    cudaDeviceSynchronize();
+    gpu::sync();
 
 		assert(result[0][0].size() == sizex);
 		

From 6b126007af56409b951a4e7abadb0f7df2645ef5 Mon Sep 17 00:00:00 2001
From: Xavier Andrade <xavier@tddft.org>
Date: Fri, 11 Oct 2024 12:39:55 -0700
Subject: [PATCH 3/3] Explicitly build the dim3 objects for the kernel
 dimensions to keep HIP happy.

---
 external_libs/gpurun/include/gpu/reduce.hpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp
index 7e51151af..3534f9a70 100644
--- a/external_libs/gpurun/include/gpu/reduce.hpp
+++ b/external_libs/gpurun/include/gpu/reduce.hpp
@@ -171,7 +171,10 @@ auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> declty
 	
 	gpu::array<type, 2> result({nblockx, nblocky});
 
-  reduce_kernel_rr<<<{nblockx, nblocky}, {bsizex, bsizey}, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result));	
+	struct dim3 dg{nblockx, nblocky};
+	struct dim3 db{bsizex, bsizey};
+
+	reduce_kernel_rr<<<dg, db, bsizex*bsizey*sizeof(type)>>>(sizex, sizey, kernel, begin(result));
   check_error(last_error());
 	
   if(nblockx*nblocky == 1) {
@@ -256,7 +259,10 @@ auto run(reduce const & redx, reduce const & redy, reduce const & redz, kernel_t
 
 	gpu::array<type, 3> result({nblockx, nblocky, nblockz});
 
-	reduce_kernel_rrr<<<{nblockx, nblocky, nblockz}, {bsizex, bsizey, bsizez}, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result));
+	struct dim3 dg{nblockx, nblocky, nblockz};
+	struct dim3 db{bsizex, bsizey, bsizez};
+
+	reduce_kernel_rrr<<<dg, db, bsizex*bsizey*bsizez*sizeof(type)>>>(sizex, sizey, sizez, kernel, begin(result));
 	check_error(last_error());
 
   if(nblockx*nblocky*nblockz == 1) {