Skip to content

Commit

Permalink
Merge branch 'use_transform_reduce_for_gpu_reduce' into 'master'
Browse files Browse the repository at this point in the history
Use transform reduce for gpu::reduce

See merge request npneq/inq!1148
  • Loading branch information
xavierandrade committed Oct 14, 2024
2 parents ef72606 + 92b1f94 commit 6391ba5
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 62 deletions.
71 changes: 12 additions & 59 deletions external_libs/gpurun/include/gpu/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@

#include <cassert>

#ifndef ENABLE_GPU
#include <numeric>
#else
#include <thrust/execution_policy.h>
#include <thrust/transform_reduce.h>
#endif

#include <gpu/run.hpp>
#include <gpu/array.hpp>
#include <gpu/host.hpp>
Expand All @@ -26,40 +33,6 @@ struct reduce {
long size;
};


#ifdef ENABLE_GPU
template <class kernel_type, class array_type>
__global__ void reduce_kernel_r(long size, kernel_type kernel, array_type odata) {

extern __shared__ char shared_mem[];
auto reduction_buffer = (typename array_type::element *) shared_mem;

// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int ii = blockIdx.x*blockDim.x + threadIdx.x;

if(ii < size){
reduction_buffer[tid] = kernel(ii);
} else {
reduction_buffer[tid] = (typename array_type::element) 0.0;
}

__syncthreads();

// do reduction in shared mem
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if (tid < s) {
reduction_buffer[tid] += reduction_buffer[tid + s];
}
__syncthreads();
}

// write result for this block to global mem
if (tid == 0) odata[blockIdx.x] = reduction_buffer[0];

}
#endif

template <typename array_type>
struct array_access {
array_type array;
Expand All @@ -80,32 +53,12 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
auto const size = red.size;

using type = decltype(kernel(0));

#ifndef ENABLE_GPU

type accumulator(0.0);
for(long ii = 0; ii < size; ii++){
accumulator += kernel(ii);
}
return accumulator;
auto range = boost::multi::extension_t{0l, size};

#ifndef ENABLE_GPU
return std::transform_reduce(range.begin(), range.end(), type{}, std::plus<>{}, kernel);
#else

const int blocksize = 1024;

unsigned nblock = (size + blocksize - 1)/blocksize;
gpu::array<type, 1> result(nblock);

reduce_kernel_r<<<nblock, blocksize, blocksize*sizeof(type)>>>(size, kernel, begin(result));
check_error(last_error());

if(nblock == 1) {
gpu::sync();
return result[0];
} else {
return run(gpu::reduce(nblock), array_access<decltype(begin(result))>{begin(result)});
}

return thrust::transform_reduce(thrust::device, range.begin(), range.end(), kernel, type{}, std::plus<>{});
#endif
}

Expand Down Expand Up @@ -144,7 +97,7 @@ __global__ void reduce_kernel_rr(long sizex, long sizey, kernel_type kernel, arr
#endif

template <class kernel_type>
auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> decltype(kernel(0, 0)) {
auto run(gpu::reduce const & redx, gpu::reduce const & redy, kernel_type kernel) -> decltype(kernel(0, 0)) {

auto const sizex = redx.size;
auto const sizey = redy.size;
Expand Down
6 changes: 3 additions & 3 deletions src/hamiltonian/xc_term.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_unpolarized());
ground_state::initial_guess(ions, electrons);
auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
auto nvxc = result.energy.nvxc();
auto exc = result.energy.xc();
Approx target = Approx(nvxc).epsilon(1.e-10);
Expand All @@ -517,7 +517,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_polarized());
ground_state::initial_guess(ions, electrons);
auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
auto nvxc = result.energy.nvxc();
auto exc = result.energy.xc();
Approx target = Approx(nvxc).epsilon(1.e-10);
Expand All @@ -538,7 +538,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_non_collinear());
ground_state::initial_guess(ions, electrons);
auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
auto nvxc = result.energy.nvxc();
auto exc = result.energy.xc();
Approx target = Approx(nvxc).epsilon(1.e-10);
Expand Down
8 changes: 8 additions & 0 deletions src/math/vector3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,14 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG) {
vector3<int> vv; (void)vv;
}

SECTION("Zero by default constructor"){
vector3<int> vv{};

CHECK(vv[0] == 0.0);
CHECK(vv[1] == 0.0);
CHECK(vv[2] == 0.0);
}

SECTION("Scalar"){
vector3<double> vv(-45.677);

Expand Down

0 comments on commit 6391ba5

Please sign in to comment.