Skip to content

feat(lapack): add CUDA support for SymMatEVD #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 132 additions & 1 deletion include/qlten/framework/hp_numeric/lapack.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ inline void MatQR(
// roughly estimate for complex number
#endif
}
#else // define USE GPU
#else // define USE_GPU

#ifndef NDEBUG
//row-major matrix A, m: rows, n: cols; lda: leading-dimension, usually be n
Expand Down Expand Up @@ -659,8 +659,139 @@ inline cusolverStatus_t MatSVD(
}


inline cusolverStatus_t SymMatEVD(
const QLTEN_Double *mat,
const size_t n,
QLTEN_Double *d,
QLTEN_Double *u
) {
cusolverDnHandle_t handle = CusolverHandleManager::GetHandle();
cublasHandle_t cublasHandle = CublasHandleManager::GetHandle();

// Device memory allocations
int *devInfo = nullptr;
QLTEN_Double *eigenvalues = nullptr;
QLTEN_Double *d_mat = nullptr;
QLTEN_Double *d_work = nullptr;

HANDLE_CUDA_ERROR(cudaMalloc(&devInfo, sizeof(int)));
HANDLE_CUDA_ERROR(cudaMalloc(&eigenvalues, n * sizeof(QLTEN_Double)));
HANDLE_CUDA_ERROR(cudaMalloc(&d_mat, n * n * sizeof(QLTEN_Double)));

// Copy input matrix directly (no transpose needed for symmetric matrix)
HANDLE_CUDA_ERROR(cudaMemcpy(d_mat, mat, n * n * sizeof(QLTEN_Double), cudaMemcpyDeviceToDevice));

// Query and allocate workspace
int lwork = 0;
HANDLE_CUSOLVER_ERROR(cusolverDnDsyevd_bufferSize(
handle, CUSOLVER_EIG_MODE_VECTOR,
CUBLAS_FILL_MODE_LOWER, n,
d_mat, n, eigenvalues, &lwork
));
HANDLE_CUDA_ERROR(cudaMalloc(&d_work, lwork * sizeof(QLTEN_Double)));

// Compute eigenvalues/vectors using lower fill mode
auto status = cusolverDnDsyevd(
handle, CUSOLVER_EIG_MODE_VECTOR,
CUBLAS_FILL_MODE_LOWER, n, d_mat, n,
eigenvalues, d_work, lwork, devInfo
);
HANDLE_CUSOLVER_ERROR(status);

// Check convergence
int info;
HANDLE_CUDA_ERROR(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (info != 0) {
cudaFree(d_mat); cudaFree(d_work); cudaFree(devInfo); cudaFree(eigenvalues);
return CUSOLVER_STATUS_INTERNAL_ERROR;
}

// Transpose eigenvectors to row-major (columns to rows)
const QLTEN_Double alpha = 1.0;
const QLTEN_Double beta = 0.0;
HANDLE_CUBLAS_ERROR(cublasDgeam(
cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
n, n, &alpha, d_mat, n, &beta, nullptr, n,
u, n
));

// Set diagonal matrix (device-to-device copy)
HANDLE_CUBLAS_ERROR(cublasDcopy(cublasHandle, n, eigenvalues, 1, d, n + 1));

// Cleanup
cudaFree(d_mat);
cudaFree(d_work);
cudaFree(devInfo);
cudaFree(eigenvalues);

return status;
}

inline cusolverStatus_t SymMatEVD(
const QLTEN_Complex *mat,
const size_t n,
QLTEN_Double *d,
QLTEN_Complex *u
) {
cusolverDnHandle_t handle = CusolverHandleManager::GetHandle();
cublasHandle_t cublasHandle = CublasHandleManager::GetHandle();

// Device memory allocations
int *devInfo = nullptr;
QLTEN_Double *eigenvalues = nullptr;
cuDoubleComplex *d_mat = nullptr;
cuDoubleComplex *d_work = nullptr;

HANDLE_CUDA_ERROR(cudaMalloc(&devInfo, sizeof(int)));
HANDLE_CUDA_ERROR(cudaMalloc(&eigenvalues, n * sizeof(QLTEN_Double)));
HANDLE_CUDA_ERROR(cudaMalloc(&d_mat, n * n * sizeof(cuDoubleComplex)));

// Direct copy input matrix (no initial transpose)
HANDLE_CUDA_ERROR(cudaMemcpy(d_mat, mat, n * n * sizeof(cuDoubleComplex),
cudaMemcpyDeviceToDevice));

// Query and allocate workspace
int lwork = 0;
HANDLE_CUSOLVER_ERROR(cusolverDnZheevd_bufferSize(
handle, CUSOLVER_EIG_MODE_VECTOR,
CUBLAS_FILL_MODE_LOWER, n, // Use lower triangle for column-major interpretation
d_mat, n, eigenvalues, &lwork));
HANDLE_CUDA_ERROR(cudaMalloc(&d_work, lwork * sizeof(cuDoubleComplex)));

// Compute eigenvalues/vectors
auto status = cusolverDnZheevd(
handle, CUSOLVER_EIG_MODE_VECTOR,
CUBLAS_FILL_MODE_LOWER, n,
d_mat, n, eigenvalues, d_work, lwork, devInfo);
HANDLE_CUSOLVER_ERROR(status);

// Check convergence
int info;
HANDLE_CUDA_ERROR(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (info != 0) {
cudaFree(d_mat); cudaFree(d_work); cudaFree(devInfo); cudaFree(eigenvalues);
return CUSOLVER_STATUS_INTERNAL_ERROR;
}

// Conjugate transpose eigenvectors
const cuDoubleComplex alpha = {1.0, 0.0};
const cuDoubleComplex beta = {0.0, 0.0};
HANDLE_CUBLAS_ERROR(cublasZgeam(
cublasHandle, CUBLAS_OP_C, CUBLAS_OP_N,
n, n, &alpha, d_mat, n, &beta, nullptr, n,
reinterpret_cast<cuDoubleComplex*>(u), n));

// Set diagonal matrix
HANDLE_CUBLAS_ERROR(cublasDcopy(cublasHandle, n, eigenvalues, 1, d, n+1));

// Cleanup
cudaFree(d_mat);
cudaFree(d_work);
cudaFree(devInfo);
cudaFree(eigenvalues);

return status;
}

#endif

Expand Down
4 changes: 2 additions & 2 deletions include/qlten/qltensor/blk_spar_data_ten/global_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ void BlockSparseDataTensor<ElemT, QNT>::CollectiveLinearCombine(
}
RawDataCopy_(source_pointers, dest_pointers, copy_size);
}
#ifndef USE_GPU
// #ifndef USE_GPU
template<typename ElemT, typename QNT>
void BlockSparseDataTensor<ElemT, QNT>::SymMatEVDRawDataDecomposition(
BlockSparseDataTensor<ElemT, QNT> &u,
Expand All @@ -1070,6 +1070,6 @@ void BlockSparseDataTensor<ElemT, QNT>::SymMatEVDRawDataDecomposition(
pu_start + task.data_offset);
}
}
#endif
// #endif
} /* qlten */
#endif /* ifndef QLTEN_QLTENSOR_BLK_SPAR_DATA_TEN_GLOBAL_OPERATIONS_H */
16 changes: 7 additions & 9 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: LGPL-3.0-only
#
#
# Author: Hao-Xin Wang <[email protected]>
# Creation Date: 2024-Jan-28
#
#
# Description: QuantumLiquids/tensor project. CMake file to control unittest.
#

Expand Down Expand Up @@ -219,13 +219,11 @@ add_unittest(test_ten_svd
"test_tensor_manipulation/test_ten_svd.cc"
"${BLAS_INCLUDE_DIR}" "" "${MATH_LIB_LINK_FLAGS}"
)
if (NOT QLTEN_USE_GPU) # temporary remove because it is not used in DMRG
# EVD
add_unittest(test_sym_mat_evd
"test_tensor_manipulation/test_sym_mat_evd.cc"
"${BLAS_INCLUDE_DIR}" "" "${MATH_LIB_LINK_FLAGS}"
)
endif ()
# EVD
add_unittest(test_sym_mat_evd
"test_tensor_manipulation/test_sym_mat_evd.cc"
"${BLAS_INCLUDE_DIR}" "" "${MATH_LIB_LINK_FLAGS}"
)
# Test tensor QR.
add_unittest(test_ten_qr
"test_tensor_manipulation/test_ten_qr.cc"
Expand Down