Skip to content

Commit

Permalink
[Feature] Add vector_mul_vector, vector_div_vector and vector_add_vec…
Browse files Browse the repository at this point in the history
…tor in blas_connector and added some GPU tests. (#5858)

* Added some other necessary kernels

* Fix compiling bug

* XX

* Finish CUDA kernel

* Fix macros

* Fix typename

* GPU implementation

* Fix bugs

* add vector_add_vector kernel

* Add blas_connector CPU tests

* Fix blas usage

* Add initializer and GPU tests
  • Loading branch information
Critsium-xy authored Jan 15, 2025
1 parent b809ce6 commit 0a0e19a
Show file tree
Hide file tree
Showing 5 changed files with 438 additions and 9 deletions.
121 changes: 116 additions & 5 deletions source/module_base/blas_connector.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "blas_connector.h"
#include "macros.h"

#ifdef __DSP
#include "module_base/kernels/dsp/dsp_connector.h"
Expand All @@ -8,12 +9,10 @@
#ifdef __CUDA
#include <base/macros/macros.h>
#include <cuda_runtime.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/inner_product.h>
#include "module_base/tool_quit.h"

#include "cublas_v2.h"
#include "module_hsolver/kernels/math_kernel_op.h"
#include "module_base/module_device/memory_op.h"


namespace BlasUtils{

Expand Down Expand Up @@ -652,4 +651,116 @@ void BlasConnector::copy(const long n, const std::complex<double> *a, const int
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
zcopy_(&n, a, &incx, b, &incy);
}
}


template <typename T>
void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
using Real = typename GetTypeReal<T>::type;
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] * vector2[i];
}
}
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
#ifdef __CUDA
hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
#endif
}
}


template <typename T>
void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
using Real = typename GetTypeReal<T>::type;
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] / vector2[i];
}
}
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
#ifdef __CUDA
hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
#endif
}
}

void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type)
{
if (device_type == base_device::CpuDevice){
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 8192 / sizeof(float))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
}
}
else if (device_type == base_device::GpuDevice){
#ifdef __CUDA
hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
#endif
}
}

void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type)
{
if (device_type == base_device::CpuDevice){
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 8192 / sizeof(double))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
}
}
else if (device_type == base_device::GpuDevice){
#ifdef __CUDA
hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
#endif
}
}

void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type)
{
if (device_type == base_device::CpuDevice){
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<float>))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
}
}
else if (device_type == base_device::GpuDevice){
#ifdef __CUDA
hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
#endif
}
}

void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type)
{
if (device_type == base_device::CpuDevice){
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<double>))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
}
}
else if (device_type == base_device::GpuDevice){
#ifdef __CUDA
hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
#endif
}
}
32 changes: 31 additions & 1 deletion source/module_base/blas_connector.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <complex>
#include "module_base/module_device/types.h"
#include "macros.h"

// These still need to be linked in the header file
// Because quite a lot of code will directly use the original cblas kernels.
Expand Down Expand Up @@ -303,9 +304,38 @@ class BlasConnector
static
void copy(const long n, const std::complex<double> *a, const int incx, std::complex<double> *b, const int incy, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);

// Element-wise vector helpers that plain BLAS does not provide directly,
// so they are implemented manually (CPU loop, or hsolver GPU kernels).

// result[i] = vector1[i] * vector2[i] for i in [0, dim)
template <typename T>
static
void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);

// result[i] = vector1[i] / vector2[i] for i in [0, dim)
// NOTE(review): no zero-divisor guard — caller must ensure vector2[i] != 0.
template <typename T>
static
void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);

// result[i] = constant1 * vector1[i] + constant2 * vector2[i] for i in [0, dim).
// Overloads: float / double / complex<float> / complex<double>; for the complex
// overloads the scale factors are real.
static
void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);

static
void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);

static
void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);

static
void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
};

#ifdef __CUDA

namespace BlasUtils{
	// Creates the cuBLAS handle used by the GPU code paths of BlasConnector.
	void createGpuBlasHandle();
	// Releases that handle. NOTE(review): the name is misspelled ("destory"),
	// but it must match the existing definition/callers; renaming would break them.
	void destoryBLAShandle();
}

#endif

// If GATHER_INFO is defined, the original function is replaced with a "i" suffix,
// preventing changes on the original code.
// The real function call is at gather_math_lib_info.cpp
Expand Down
2 changes: 1 addition & 1 deletion source/module_base/kernels/cuda/math_op.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "cuda_runtime.h"
#include <cuda_runtime.h>
#include "module_base/kernels/math_op.h"

#include <base/macros/macros.h>
Expand Down
Loading

0 comments on commit 0a0e19a

Please sign in to comment.