Merge branch 'use_transform_reduce_for_gpu_reduce' into 'master'

Use transform reduce for gpu::reduce See merge request npneq/inq!1148
LLNL · Oct 14, 2024 · 6391ba5 · 6391ba5
2 parents ef72606 + 92b1f94
commit 6391ba5
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 62 deletions.
diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp
@@ -13,6 +13,13 @@
 
 #include <cassert>
 
+#ifndef ENABLE_GPU
+#include <numeric>
+#else
+#include <thrust/execution_policy.h>
+#include <thrust/transform_reduce.h>
+#endif
+
 #include <gpu/run.hpp>
 #include <gpu/array.hpp>
 #include <gpu/host.hpp>
@@ -26,40 +33,6 @@ struct reduce {
 	long size;
 };
 
-
-#ifdef ENABLE_GPU
-template <class kernel_type, class array_type>
-__global__ void reduce_kernel_r(long size, kernel_type kernel, array_type odata) {
-
-	extern __shared__ char shared_mem[];
-	auto reduction_buffer = (typename array_type::element *) shared_mem;
-
-	// each thread loads one element from global to shared mem
-	unsigned int tid = threadIdx.x;
-	unsigned int ii = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if(ii < size){
-		reduction_buffer[tid] = kernel(ii);
-	} else {
-		reduction_buffer[tid] = (typename array_type::element) 0.0;
-	}
-
-	__syncthreads();
-
-	// do reduction in shared mem
-	for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
-		if (tid < s) {
-			reduction_buffer[tid] += reduction_buffer[tid + s];
-		}
-		__syncthreads();
-	}
-
-	// write result for this block to global mem
-	if (tid == 0) odata[blockIdx.x] = reduction_buffer[0];
-
-}
-#endif
-
 template <typename array_type>
 struct array_access {
   array_type array;
@@ -80,32 +53,12 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
 	auto const size = red.size;
 
   using type = decltype(kernel(0));
-
-#ifndef ENABLE_GPU
-
-  type accumulator(0.0);
-  for(long ii = 0; ii < size; ii++){
-    accumulator += kernel(ii);
-  }
-  return accumulator;
+	auto range = boost::multi::extension_t{0l, size};
 
+#ifndef ENABLE_GPU
+	return std::transform_reduce(range.begin(), range.end(), type{}, std::plus<>{}, kernel);
 #else
-
-	const int blocksize = 1024;
-
-	unsigned nblock = (size + blocksize - 1)/blocksize;
-	gpu::array<type, 1> result(nblock);
-
-  reduce_kernel_r<<<nblock, blocksize, blocksize*sizeof(type)>>>(size, kernel, begin(result));	
-  check_error(last_error());
-
-  if(nblock == 1) {
-    gpu::sync();
-    return result[0];
-  } else {
-    return run(gpu::reduce(nblock), array_access<decltype(begin(result))>{begin(result)});
-  }
-
+	return thrust::transform_reduce(thrust::device, range.begin(), range.end(), kernel, type{}, std::plus<>{});
 #endif
 }
 
@@ -144,7 +97,7 @@ __global__ void reduce_kernel_rr(long sizex, long sizey, kernel_type kernel, arr
 #endif
 
 template <class kernel_type>
-auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> decltype(kernel(0, 0)) {
+auto run(gpu::reduce const & redx, gpu::reduce const & redy, kernel_type kernel) -> decltype(kernel(0, 0)) {
 
 	auto const sizex = redx.size;	
 	auto const sizey = redy.size;	

diff --git a/src/hamiltonian/xc_term.hpp b/src/hamiltonian/xc_term.hpp
@@ -496,7 +496,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
 		ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
 		auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_unpolarized());
 		ground_state::initial_guess(ions, electrons);
-		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
+		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
 		auto nvxc = result.energy.nvxc();
 		auto exc = result.energy.xc();
 		Approx target = Approx(nvxc).epsilon(1.e-10);
@@ -517,7 +517,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
 		ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
 		auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_polarized());
 		ground_state::initial_guess(ions, electrons);
-		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
+		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
 		auto nvxc = result.energy.nvxc();
 		auto exc = result.energy.xc();
 		Approx target = Approx(nvxc).epsilon(1.e-10);
@@ -538,7 +538,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
 		ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
 		auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_non_collinear());
 		ground_state::initial_guess(ions, electrons);
-		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
+		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
 		auto nvxc = result.energy.nvxc();
 		auto exc = result.energy.xc();
 		Approx target = Approx(nvxc).epsilon(1.e-10);

diff --git a/src/math/vector3.hpp b/src/math/vector3.hpp
@@ -345,6 +345,14 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG) {
 		vector3<int> vv; (void)vv;
 	}
 
+	SECTION("Zero by default constructor"){
+		vector3<int> vv{};
+
+		CHECK(vv[0] == 0.0);
+		CHECK(vv[1] == 0.0);
+		CHECK(vv[2] == 0.0);
+	}
+
 	SECTION("Scalar"){
 		vector3<double> vv(-45.677);