Merge branch 'simplify_projector_code' into 'master'

Simplify projector code See merge request npneq/inq!1180
LLNL · Dec 17, 2024 · d336905 · d336905
2 parents fbea34d + 17f247b
commit d336905
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 115 deletions.
diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp
@@ -96,8 +96,8 @@ class projector_all {
 	////////////////////////////////////////////////////////////////////////////////////////////		
 
 	template <typename KpointType>
-	gpu::array<complex, 3> project(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint) const {
-    
+	gpu::array<complex, 3> calculate_projections(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint) const {
+
 		gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
 		gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);
 
@@ -155,6 +155,21 @@ class projector_all {
 		}
 #endif
 
+		if(phi.basis().comm().size() > 1) {
+			CALI_CXX_MARK_SCOPE("projector_all::project::reduce");
+			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
+		}
+
+		return 	projections_all;
+	}
+
+	////////////////////////////////////////////////////////////////////////////////////////////
+
+	template <typename KpointType>
+	gpu::array<complex, 3> project(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint) const {
+
+		auto projections_all = calculate_projections(phi, kpoint);
+
     { CALI_CXX_MARK_SCOPE("projector_scal");
 
       gpu::run(phi.local_set_size(), max_nlm_, nprojs_,
@@ -164,11 +179,8 @@ class projector_all {
                });
 		}
 
-		if(phi.basis().comm().size() > 1) {
-			CALI_CXX_MARK_SCOPE("projector_all::project::reduce");
-			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
-		}
-
+		gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
+
 #ifndef ENABLE_CUDA
 		for(auto iproj = 0; iproj < nprojs_; iproj++) {
 			CALI_CXX_MARK_SCOPE("projector_gemm_2");
@@ -235,68 +247,8 @@ class projector_all {
 
 	template <typename KpointType, typename Occupations>
 	double energy(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const {
-
-		gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
-		gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);
-
-		{ CALI_CXX_MARK_SCOPE("projector::gather");
-
-			gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
-							 [sgr = begin(sphere_phi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
-								 if(poi[iproj][ipoint][0] >= 0){
-									 auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint]));
-									 sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
-								 } else {
-									 sgr[iproj][ipoint][ist] = complex(0.0, 0.0);
-								 }
-							 });
-		}
-
-#ifndef ENABLE_CUDA
-		for(auto iproj = 0; iproj < nprojs_; iproj++){
-			CALI_CXX_MARK_SCOPE("projector_gemm_1");
-
-			if(locally_empty_[iproj]) continue;
-
-			namespace blas = boost::multi::blas;
-			blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));
-		}
-#else
-		if(max_sphere_size_ > 0) {
-			CALI_CXX_MARK_SCOPE("projector_gemm_1");			
 
-			const double zero = 0.0;
-			const double vol = phi.basis().volume_element();
-
-			auto status = cublasDgemmStridedBatched(/*cublasHandle_t handle = */ boost::multi::cuda::cublas::context::get_instance().get(),
-																							/*cublasOperation_t transa = */ CUBLAS_OP_N,
-																							/*cublasOperation_t transb = */ CUBLAS_OP_N,
-																							/*int m = */ 2*phi.local_set_size(),
-																							/*int n = */ max_nlm_,
-																							/*int k = */ max_sphere_size_,
-																							/*const double *alpha = */ &vol,
-																							/*const double *A = */ reinterpret_cast<double const *>(raw_pointer_cast(sphere_phi_all.data_elements())),
-																							/*int lda = */ 2*phi.local_set_size(),
-																							/*long long int strideA = */ 2*max_sphere_size_*phi.local_set_size(),
-																							/*const double *B = */ raw_pointer_cast(matrices_.data_elements()),
-																							/*int ldb = */ max_sphere_size_,
-																							/*long long int strideB =*/ max_nlm_*max_sphere_size_,
-																							/*const double *beta = */ &zero,
-																							/*double *C = */ reinterpret_cast<double *>(raw_pointer_cast(projections_all.data_elements())),
-																							/*int ldc = */ 2*phi.local_set_size(),
-																							/*long long int strideC = */ 2*max_nlm_*phi.local_set_size(),
-																							/*int batchCount = */ nprojs_);
-			gpu::sync();
-
-			assert(status == CUBLAS_STATUS_SUCCESS);
-
-		}
-#endif
-
-		if(phi.basis().comm().size() > 1) {
-			CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_basis");
-			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
-		}
+		auto projections_all = calculate_projections(phi, kpoint);
 
 		auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0,
 											 [proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations), spinor_size = phi.local_spinor_set_size()]
@@ -312,62 +264,36 @@ class projector_all {
 		}
 
 		return en;
-
 	}
 
 	////////////////////////////////////////////////////////////////////////////////////////////
 
 
-	template <typename PhiType, typename GPhiType, typename MetricType, typename OccsType>
+	template <typename PhiType, typename GPhiType, typename MetricType, typename OccsType, typename KPoint>
 	void force(PhiType & phi, GPhiType const & gphi, MetricType const & metric,
-						 OccsType const & occs, vector3<double, covariant> const & vector_potential, gpu::array<vector3<double>, 1> & forces_non_local) const {
+						 OccsType const & occs, vector3<double, covariant> const & vector_potential, KPoint const & kpoint, gpu::array<vector3<double>, 1> & forces_non_local) const {
 
 		CALI_CXX_MARK_FUNCTION;
 
 		namespace blas = boost::multi::blas;
 
-		gpu::array<typename PhiType::element_type, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
 		gpu::array<typename GPhiType::element_type, 3> sphere_gphi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
-		gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);
 
 		{ CALI_CXX_MARK_SCOPE("projector_all::force::gather");
 
 			gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
-							 [sgr = begin(sphere_phi_all), gsgr = begin(sphere_gphi_all), gr = begin(phi.hypercubic()), ggr = begin(gphi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint = phi.kpoint() + vector_potential]
+							 [gsgr = begin(sphere_gphi_all), gr = begin(phi.hypercubic()), ggr = begin(gphi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint]
 							 GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
 								 if(poi[iproj][ipoint][0] >= 0){
 									 auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint]));
-									 sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
 									 gsgr[iproj][ipoint][ist] = phase*ggr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
 								 } else {
-									 sgr[iproj][ipoint][ist] = 0.0;
 									 gsgr[iproj][ipoint][ist] = {complex(0.0), complex(0.0), complex(0.0)};
 								 }
 							 });
 		}
 
-		for(auto iproj = 0; iproj < nprojs_; iproj++){
-			if(locally_empty_[iproj]) continue;
-
-			blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));
-		}
-
-		{ CALI_CXX_MARK_SCOPE("projector_force_scal"); 
-
-			gpu::run(phi.local_set_size(), max_nlm_, nprojs_,
-							 [proj = begin(projections_all), coeff = begin(coeff_)] GPU_LAMBDA (auto ist, auto ipj, auto iproj){
-								 proj[iproj][ipj][ist] *= coeff[iproj][ipj];
-							 });
-		}
-
-		if(phi.basis().comm().size() > 1) {
-			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
-		}
-
-		for(auto iproj = 0; iproj < nprojs_; iproj++){		
-			blas::real_doubled(sphere_phi_all[iproj]) = blas::gemm(1.0, blas::T(matrices_[iproj]), blas::real_doubled(projections_all[iproj]));
-		}
-
+		auto sphere_phi_all = project(phi, kpoint);
 		gpu::array<vector3<double, covariant>, 1> force(nprojs_, {0.0, 0.0, 0.0});
 
 		for(auto iproj = 0; iproj < nprojs_; iproj++) {
@@ -393,24 +319,19 @@ class projector_all {
 	////////////////////////////////////////////////////////////////////////////////////////////	
 	template <typename KpointType>
 	void position_commutator(states::orbital_set<basis::real_space, complex> const & phi, states::orbital_set<basis::real_space, vector3<complex, covariant>> & cphi, KpointType const & kpoint) const {
-
-		gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
-		gpu::array<vector3<complex, contravariant>, 3> sphere_rphi_all({nprojs_, max_sphere_size_, phi.local_set_size()});		
 
-		gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);
+		gpu::array<vector3<complex, contravariant>, 3> sphere_rphi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
 		gpu::array<vector3<complex, contravariant>, 3> rprojections_all({nprojs_, max_nlm_, phi.local_set_size()}, zero<vector3<complex, contravariant>>());
 
 		{ CALI_CXX_MARK_SCOPE("position_commutator::gather");
 
 			gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
-							 [sphi = begin(sphere_phi_all), srphi = begin(sphere_rphi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
+							 [srphi = begin(sphere_rphi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
 								 if(poi[iproj][ipoint][0] >= 0){
 									 auto rr = static_cast<vector3<double, contravariant>>(pos[iproj][ipoint]);
 									 auto phase = polar(1.0, dot(kpoint, rr));
-									 sphi[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
-									 srphi[iproj][ipoint][ist] = rr*sphi[iproj][ipoint][ist];
+									 srphi[iproj][ipoint][ist] = rr*phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
 								 } else {
-									 sphi[iproj][ipoint][ist]     = complex(0.0, 0.0);
 									 srphi[iproj][ipoint][ist][0] = complex(0.0, 0.0);
 									 srphi[iproj][ipoint][ist][1] = complex(0.0, 0.0);
 									 srphi[iproj][ipoint][ist][2] = complex(0.0, 0.0);
@@ -424,8 +345,6 @@ class projector_all {
 			if(locally_empty_[iproj]) continue;
 
 			namespace blas = boost::multi::blas;
-			blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));
-
 			auto rpa = rprojections_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
 			auto sra = sphere_rphi_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
 
@@ -435,15 +354,13 @@ class projector_all {
     { CALI_CXX_MARK_SCOPE("position_commutator_scal");
 
       gpu::run(phi.local_set_size(), max_nlm_, nprojs_,
-               [proj = begin(projections_all), rproj = begin(rprojections_all), coe = begin(coeff_)]
+               [rproj = begin(rprojections_all), coe = begin(coeff_)]
                GPU_LAMBDA (auto ist, auto ilm, auto iproj){
-                 proj[iproj][ilm][ist] *= coe[iproj][ilm];
                  rproj[iproj][ilm][ist] *= coe[iproj][ilm];
                });
 		}
 
 		if(phi.basis().comm().size() > 1) {
-			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
 			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(rprojections_all.data_elements()), rprojections_all.num_elements(), std::plus<>{});
 		}
 
@@ -453,13 +370,13 @@ class projector_all {
 			if(locally_empty_[iproj]) continue;
 
 			namespace blas = boost::multi::blas;
-			blas::real_doubled(sphere_phi_all[iproj]) = blas::gemm(1., blas::T(matrices_[iproj]), blas::real_doubled(projections_all[iproj]));
-
 			auto rpa = rprojections_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
 			auto sra = sphere_rphi_all[iproj].template reinterpret_array_cast<complex>(3).rotated().flatted().unrotated();
 			blas::real_doubled(sra) = blas::gemm(1., blas::T(matrices_[iproj]), blas::real_doubled(rpa));			
 		}
 
+		auto sphere_phi_all = project(phi, kpoint);
+
 		for(auto iproj = 0; iproj < nprojs_; iproj++){
 
 			if(locally_empty_[iproj]) continue;

diff --git a/src/observables/forces_stress.hpp b/src/observables/forces_stress.hpp
@@ -126,10 +126,10 @@ struct forces_stress {
 		auto iphi = 0;
 		for(auto & phi : electrons.kpin()){
 
-			auto gphi = operations::gradient(phi, /* factor = */ 1.0, /*shift = */ phi.kpoint());
+			auto gphi = operations::gradient(phi, /* factor = */ 1.0, /*shift = */ phi.kpoint() + ham.uniform_vector_potential());
 			observables::density::calculate_gradient_add(electrons.occupations()[iphi], phi, gphi, gdensity);
 
-			ham.projectors_all().force(phi, gphi, ions.cell().metric(), electrons.occupations()[iphi], ham.uniform_vector_potential(), forces_non_local);
+			ham.projectors_all().force(phi, gphi, ions.cell().metric(), electrons.occupations()[iphi], ham.uniform_vector_potential(), phi.kpoint() + ham.uniform_vector_potential(), forces_non_local);
 
 			stress += stress_kinetic(gphi, electrons.occupations()[iphi]);