Skip to content

Commit

Permalink
Merge branch 'simplify_projector_code' into 'master'
Browse files Browse the repository at this point in the history
Simplify projector code

See merge request npneq/inq!1180
  • Loading branch information
xavierandrade committed Dec 17, 2024
2 parents fbea34d + 17f247b commit d336905
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 115 deletions.
143 changes: 30 additions & 113 deletions src/hamiltonian/projector_all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ class projector_all {
////////////////////////////////////////////////////////////////////////////////////////////

template <typename KpointType>
gpu::array<complex, 3> project(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint) const {
gpu::array<complex, 3> calculate_projections(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint) const {

gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);

Expand Down Expand Up @@ -155,6 +155,21 @@ class projector_all {
}
#endif

if(phi.basis().comm().size() > 1) {
CALI_CXX_MARK_SCOPE("projector_all::project::reduce");
phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
}

return projections_all;
}

////////////////////////////////////////////////////////////////////////////////////////////

template <typename KpointType>
gpu::array<complex, 3> project(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint) const {

auto projections_all = calculate_projections(phi, kpoint);

{ CALI_CXX_MARK_SCOPE("projector_scal");

gpu::run(phi.local_set_size(), max_nlm_, nprojs_,
Expand All @@ -164,11 +179,8 @@ class projector_all {
});
}

if(phi.basis().comm().size() > 1) {
CALI_CXX_MARK_SCOPE("projector_all::project::reduce");
phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
}

gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});

#ifndef ENABLE_CUDA
for(auto iproj = 0; iproj < nprojs_; iproj++) {
CALI_CXX_MARK_SCOPE("projector_gemm_2");
Expand Down Expand Up @@ -235,68 +247,8 @@ class projector_all {

template <typename KpointType, typename Occupations>
double energy(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const {

gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);

{ CALI_CXX_MARK_SCOPE("projector::gather");

gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
[sgr = begin(sphere_phi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
if(poi[iproj][ipoint][0] >= 0){
auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint]));
sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
} else {
sgr[iproj][ipoint][ist] = complex(0.0, 0.0);
}
});
}

#ifndef ENABLE_CUDA
for(auto iproj = 0; iproj < nprojs_; iproj++){
CALI_CXX_MARK_SCOPE("projector_gemm_1");

if(locally_empty_[iproj]) continue;

namespace blas = boost::multi::blas;
blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));
}
#else
if(max_sphere_size_ > 0) {
CALI_CXX_MARK_SCOPE("projector_gemm_1");

const double zero = 0.0;
const double vol = phi.basis().volume_element();

auto status = cublasDgemmStridedBatched(/*cublasHandle_t handle = */ boost::multi::cuda::cublas::context::get_instance().get(),
/*cublasOperation_t transa = */ CUBLAS_OP_N,
/*cublasOperation_t transb = */ CUBLAS_OP_N,
/*int m = */ 2*phi.local_set_size(),
/*int n = */ max_nlm_,
/*int k = */ max_sphere_size_,
/*const double *alpha = */ &vol,
/*const double *A = */ reinterpret_cast<double const *>(raw_pointer_cast(sphere_phi_all.data_elements())),
/*int lda = */ 2*phi.local_set_size(),
/*long long int strideA = */ 2*max_sphere_size_*phi.local_set_size(),
/*const double *B = */ raw_pointer_cast(matrices_.data_elements()),
/*int ldb = */ max_sphere_size_,
/*long long int strideB =*/ max_nlm_*max_sphere_size_,
/*const double *beta = */ &zero,
/*double *C = */ reinterpret_cast<double *>(raw_pointer_cast(projections_all.data_elements())),
/*int ldc = */ 2*phi.local_set_size(),
/*long long int strideC = */ 2*max_nlm_*phi.local_set_size(),
/*int batchCount = */ nprojs_);
gpu::sync();

assert(status == CUBLAS_STATUS_SUCCESS);

}
#endif

if(phi.basis().comm().size() > 1) {
CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_basis");
phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
}
auto projections_all = calculate_projections(phi, kpoint);

auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0,
[proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations), spinor_size = phi.local_spinor_set_size()]
Expand All @@ -312,62 +264,36 @@ class projector_all {
}

return en;

}

////////////////////////////////////////////////////////////////////////////////////////////


template <typename PhiType, typename GPhiType, typename MetricType, typename OccsType>
template <typename PhiType, typename GPhiType, typename MetricType, typename OccsType, typename KPoint>
void force(PhiType & phi, GPhiType const & gphi, MetricType const & metric,
OccsType const & occs, vector3<double, covariant> const & vector_potential, gpu::array<vector3<double>, 1> & forces_non_local) const {
OccsType const & occs, vector3<double, covariant> const & vector_potential, KPoint const & kpoint, gpu::array<vector3<double>, 1> & forces_non_local) const {

CALI_CXX_MARK_FUNCTION;

namespace blas = boost::multi::blas;

gpu::array<typename PhiType::element_type, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
gpu::array<typename GPhiType::element_type, 3> sphere_gphi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);

{ CALI_CXX_MARK_SCOPE("projector_all::force::gather");

gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
[sgr = begin(sphere_phi_all), gsgr = begin(sphere_gphi_all), gr = begin(phi.hypercubic()), ggr = begin(gphi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint = phi.kpoint() + vector_potential]
[gsgr = begin(sphere_gphi_all), gr = begin(phi.hypercubic()), ggr = begin(gphi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint]
GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
if(poi[iproj][ipoint][0] >= 0){
auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint]));
sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
gsgr[iproj][ipoint][ist] = phase*ggr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
} else {
sgr[iproj][ipoint][ist] = 0.0;
gsgr[iproj][ipoint][ist] = {complex(0.0), complex(0.0), complex(0.0)};
}
});
}

for(auto iproj = 0; iproj < nprojs_; iproj++){
if(locally_empty_[iproj]) continue;

blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));
}

{ CALI_CXX_MARK_SCOPE("projector_force_scal");

gpu::run(phi.local_set_size(), max_nlm_, nprojs_,
[proj = begin(projections_all), coeff = begin(coeff_)] GPU_LAMBDA (auto ist, auto ipj, auto iproj){
proj[iproj][ipj][ist] *= coeff[iproj][ipj];
});
}

if(phi.basis().comm().size() > 1) {
phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
}

for(auto iproj = 0; iproj < nprojs_; iproj++){
blas::real_doubled(sphere_phi_all[iproj]) = blas::gemm(1.0, blas::T(matrices_[iproj]), blas::real_doubled(projections_all[iproj]));
}

auto sphere_phi_all = project(phi, kpoint);
gpu::array<vector3<double, covariant>, 1> force(nprojs_, {0.0, 0.0, 0.0});

for(auto iproj = 0; iproj < nprojs_; iproj++) {
Expand All @@ -393,24 +319,19 @@ class projector_all {
////////////////////////////////////////////////////////////////////////////////////////////
template <typename KpointType>
void position_commutator(states::orbital_set<basis::real_space, complex> const & phi, states::orbital_set<basis::real_space, vector3<complex, covariant>> & cphi, KpointType const & kpoint) const {

gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
gpu::array<vector3<complex, contravariant>, 3> sphere_rphi_all({nprojs_, max_sphere_size_, phi.local_set_size()});

gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);
gpu::array<vector3<complex, contravariant>, 3> sphere_rphi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
gpu::array<vector3<complex, contravariant>, 3> rprojections_all({nprojs_, max_nlm_, phi.local_set_size()}, zero<vector3<complex, contravariant>>());

{ CALI_CXX_MARK_SCOPE("position_commutator::gather");

gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
[sphi = begin(sphere_phi_all), srphi = begin(sphere_rphi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
[srphi = begin(sphere_rphi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
if(poi[iproj][ipoint][0] >= 0){
auto rr = static_cast<vector3<double, contravariant>>(pos[iproj][ipoint]);
auto phase = polar(1.0, dot(kpoint, rr));
sphi[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
srphi[iproj][ipoint][ist] = rr*sphi[iproj][ipoint][ist];
srphi[iproj][ipoint][ist] = rr*phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
} else {
sphi[iproj][ipoint][ist] = complex(0.0, 0.0);
srphi[iproj][ipoint][ist][0] = complex(0.0, 0.0);
srphi[iproj][ipoint][ist][1] = complex(0.0, 0.0);
srphi[iproj][ipoint][ist][2] = complex(0.0, 0.0);
Expand All @@ -424,8 +345,6 @@ class projector_all {
if(locally_empty_[iproj]) continue;

namespace blas = boost::multi::blas;
blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));

auto rpa = rprojections_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
auto sra = sphere_rphi_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();

Expand All @@ -435,15 +354,13 @@ class projector_all {
{ CALI_CXX_MARK_SCOPE("position_commutator_scal");

gpu::run(phi.local_set_size(), max_nlm_, nprojs_,
[proj = begin(projections_all), rproj = begin(rprojections_all), coe = begin(coeff_)]
[rproj = begin(rprojections_all), coe = begin(coeff_)]
GPU_LAMBDA (auto ist, auto ilm, auto iproj){
proj[iproj][ilm][ist] *= coe[iproj][ilm];
rproj[iproj][ilm][ist] *= coe[iproj][ilm];
});
}

if(phi.basis().comm().size() > 1) {
phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(rprojections_all.data_elements()), rprojections_all.num_elements(), std::plus<>{});
}

Expand All @@ -453,13 +370,13 @@ class projector_all {
if(locally_empty_[iproj]) continue;

namespace blas = boost::multi::blas;
blas::real_doubled(sphere_phi_all[iproj]) = blas::gemm(1., blas::T(matrices_[iproj]), blas::real_doubled(projections_all[iproj]));

auto rpa = rprojections_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
auto sra = sphere_rphi_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
blas::real_doubled(sra) = blas::gemm(1., blas::T(matrices_[iproj]), blas::real_doubled(rpa));
}

auto sphere_phi_all = project(phi, kpoint);

for(auto iproj = 0; iproj < nprojs_; iproj++){

if(locally_empty_[iproj]) continue;
Expand Down
4 changes: 2 additions & 2 deletions src/observables/forces_stress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ struct forces_stress {
auto iphi = 0;
for(auto & phi : electrons.kpin()){

auto gphi = operations::gradient(phi, /* factor = */ 1.0, /*shift = */ phi.kpoint());
auto gphi = operations::gradient(phi, /* factor = */ 1.0, /*shift = */ phi.kpoint() + ham.uniform_vector_potential());
observables::density::calculate_gradient_add(electrons.occupations()[iphi], phi, gphi, gdensity);

ham.projectors_all().force(phi, gphi, ions.cell().metric(), electrons.occupations()[iphi], ham.uniform_vector_potential(), forces_non_local);
ham.projectors_all().force(phi, gphi, ions.cell().metric(), electrons.occupations()[iphi], ham.uniform_vector_potential(), phi.kpoint() + ham.uniform_vector_potential(), forces_non_local);

stress += stress_kinetic(gphi, electrons.occupations()[iphi]);

Expand Down

0 comments on commit d336905

Please sign in to comment.