
Commit 0112267

[CHG] 1. Add CUDA-side test code; 2. the kernel latency measurements are taken without warmup.

l30001493 committed Mar 23, 2021
1 parent 1206162 commit 0112267
Showing 5 changed files with 184 additions and 30 deletions.
91 changes: 61 additions & 30 deletions gpu/cuda/reduce.cu
@@ -2,23 +2,13 @@
* author: dqliu
* date: 2020/03/18
*/

#include <cuda_runtime.h>

#ifdef USE_THRUST
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#endif

#ifdef USE_CUB
#include <cub/block/block_reduce.cuh>
#endif
#include "cuda_op.h"

// dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
// srcData[N], dstData[(N + BLOCK_SIZE - 1) / BLOCK_SIZE]
template <size_t BLOCK_SIZE, typename T>
__global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -35,10 +25,22 @@ __global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
}
}
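The hunk above collapses the body of reduce_sum; a minimal sketch of the standard shared-memory tree reduction it presumably contains (an illustration, not the exact file contents):

    __syncthreads();
    // Halve the active thread range each step; the lower half accumulates
    // the upper half's partial sums.
    for (size_t offset = BLOCK_SIZE >> 1; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
            shm[threadIdx.x] += shm[threadIdx.x + offset];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        dstData[blockIdx.x] = shm[0]; // one partial sum per block
    }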

template <>
void cudaCallReduceSUMSharedMem<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
{
const size_t BLOCK_SIZE = 1024;
reduce_sum<BLOCK_SIZE, unsigned int><<<
(nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE>>>(
nElements,
srcData,
dstData);
}

// srcData[N], dstData[1] (memset(0))
template <size_t BLOCK_SIZE, typename T>
__global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -56,8 +58,8 @@ __global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* d
}

template <size_t BLOCK_SIZE, typename T>
__global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_max(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -76,8 +78,8 @@ __global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {

// dstData[1] = -INF
template <size_t BLOCK_SIZE, typename T>
__global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -96,12 +98,13 @@ __global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstDat

// dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
// srcData[N], dstData[(N + WARP_SIZE - 1) / WARP_SIZE]
#if __CUDA_ARCH__ >= 900
template <size_t WARP_SIZE, typename T>
__global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData) {
// #if __CUDA_ARCH__ >= 900
template<size_t WARP_SIZE, typename T>
__global__ void reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t wid = gid % WARP_SIZE;
T sumVal = gidsrcData[gid] ? gid < nElements : 0;
T sumVal = gid < nElements ? srcData[gid] : 0;

for (size_t offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
sumVal += __shfl_xor_sync(0xffffffff, sumVal, offset, WARP_SIZE);
@@ -111,30 +114,58 @@ __global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstD
dstData[gid / WARP_SIZE] = sumVal;
}
}
#endif

template<>
void cudaCallReduceSUMWarpCom<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData) {
const size_t WARP_SIZE = 32;
const size_t BLOCK_SIZE = 1024;
reduce_sum_warp_com<
WARP_SIZE, unsigned int><<<
(nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE>>>(
nElements,
srcData,
dstData);
}
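Each warp writes a single partial, so the destination buffer for this launcher needs (nElements + WARP_SIZE - 1) / WARP_SIZE elements. A host-side usage sketch (the names src and nElements are assumed, mirroring the test program added in this commit):

    const size_t WARP_SIZE = 32;
    const size_t nPartials = (nElements + WARP_SIZE - 1) / WARP_SIZE;
    thrust::device_vector<unsigned int> partials(nPartials);
    cudaCallReduceSUMWarpCom(nElements,
                             thrust::raw_pointer_cast(src.data()),
                             thrust::raw_pointer_cast(partials.data()));
    // Second stage: fold the per-warp partials into the final sum.
    unsigned int total = thrust::reduce(partials.begin(), partials.end());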

// #endif

#ifdef USE_THRUST
template<typename T>
__global__ T reduce_sum_thrust(thrust::device_vector<T> src) {
return thrust::reduce(src.begin(), src.end());
T reduce_sum_thrust(thrust::device_vector<T> src)
{
return thrust::reduce(src.begin(), src.end());
}
#endif

#ifdef USE_CUB
template<size_t BLOCK_SIZE, typename T>
__global__ T void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
__global__ void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blocDim.x;
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
typedef cub::BlockReduce<T, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStroge TempStroge;
__shared__ typename BlockReduce::TempStorage TempStorage;

T sumVal = 0;
if (gid < nElements) {
sumVal = BlockReduce(TempStroge).Sum(srcData[gid]);
sumVal = BlockReduce(TempStorage).Sum(srcData[gid]);
}

if (threadIdx.x == 0) {
dstData[blockIdx.x] = sumVal;
}
}
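One caveat: cub::BlockReduce is a block-wide collective, so every thread of the block must call Sum(); guarding the call itself with gid < nElements, as above, leaves partially filled blocks undefined. A safer sketch guards the operand instead of the call:

    // All threads participate; out-of-range threads contribute zero.
    T val = (gid < nElements) ? srcData[gid] : T(0);
    T sumVal = BlockReduce(TempStorage).Sum(val);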

template<>
void cubCallReduceSUM<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
{
const size_t BLOCK_SIZE = 1024;
reduce_sum_cub<
BLOCK_SIZE, unsigned int><<<
(nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE>>>(
nElements,
srcData,
dstData);
}
#endif
27 changes: 27 additions & 0 deletions gpu/cuda_common.h
@@ -0,0 +1,27 @@
#ifndef CUDA_COMMON_H
#define CUDA_COMMON_H

#include <iostream>
#include <cuda_runtime.h>

#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)

// Defined in a header, so mark it inline to avoid duplicate symbols.
inline void SetGPUID(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}

#endif
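A minimal usage sketch for this header (a hypothetical translation unit, compiled with nvcc):

    #include "cuda_common.h"

    int main() {
        SetGPUID(0); // switch to device 0 unless it is already current
        float* d_buf = nullptr;
        CUDA_CHECK(cudaMalloc(&d_buf, 1024 * sizeof(float)));
        CUDA_CHECK(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
        CUDA_CHECK(cudaFree(d_buf));
        return 0;
    }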
36 changes: 36 additions & 0 deletions gpu/cuda_op.h
@@ -0,0 +1,36 @@
#ifndef CUDA_OP_H
#define CUDA_OP_H

#include <cuda_runtime.h>

#ifdef USE_THRUST
#include <thrust/device_vector.h> // memory
#include <thrust/reduce.h> // op::reduce
#endif

#ifdef USE_CUB
#include <cub/block/block_reduce.cuh> // op::reduce
#endif

// Reduce
template <typename T>
void cudaCallReduceSUMSharedMem(const size_t nElements, const T* srcData, T* dstData);

template <typename T>
void cudaCallReduceSUMWarpCom(const size_t nElements, const T* srcData, T* dstData);


#ifdef USE_THRUST
template<typename T>
T thrustCallReduceSUM(thrust::device_vector<T> src);
#endif

#ifdef USE_CUB
template <typename T>
void cubCallReduceSUM(const size_t nElements, const T* srcData, T* dstData);
#endif


// Eltwise

#endif
60 changes: 60 additions & 0 deletions gpu/cuda_reduce_test.cu
@@ -0,0 +1,60 @@
#include "cuda_common.h"
#include "cuda_op.h"

int main(int argc, char** argv)
{
if (argc < 2) {
printf("Usage: %s GPU_ID\n", argv[0]);
return -1;
}
const int gpu_id = atoi(argv[1]);
SetGPUID(gpu_id);

const size_t
n = 1 << 30,
BLOCK_SIZE = 1 << 10,
WARP_SIZE = 1 << 5,
REDUCE_SIZE = (n + WARP_SIZE - 1) / WARP_SIZE;
thrust::device_vector<unsigned> src(n, 1), tmp(REDUCE_SIZE);
const char opDesc[4][128] = {
"======thrust::reduce=======",
"======shared_sum_kernel=======",
"======warp_primitive_sum_kernel=======",
"======cub::BlockReduce reduce_sum_cub======="};
for (int op = 0; op < 4; ++op) {
unsigned sum;
cudaEvent_t beg, end;
cudaEventCreate(&beg);
cudaEventCreate(&end);
cudaEventRecord(beg, 0);
if (op == 0) {
sum = thrust::reduce(src.begin(), src.begin() + n);
}
if (op == 1) {
cudaCallReduceSUMSharedMem(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
}
if (op == 2) {
cudaCallReduceSUMWarpCom(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + WARP_SIZE - 1) / WARP_SIZE);
}
if (op == 3) {
cubCallReduceSUM(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
}
cudaEventRecord(end, 0);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, beg, end);
cudaEventDestroy(beg); // events are recreated every iteration, so release them
cudaEventDestroy(end);
std::cout << opDesc[op] << std::endl;
std::cout << sum << ": " << elapsed_time << " ms elapsed." << std::endl;
std::cout << std::endl;
// printf("%u : %fms elapsed.\n", sum, elapsed_time);
}

return 0;
}
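As the commit message notes, these latency figures are taken without warmup, so the first timed launch also pays one-time costs (context creation, module load, cold caches). A sketch of a warmup pass that could precede the timed region, using the same calls as above:

    // Untimed warmup launch: same call as the timed one, result discarded.
    cudaCallReduceSUMSharedMem(n, thrust::raw_pointer_cast(src.data()),
                               thrust::raw_pointer_cast(tmp.data()));
    CUDA_CHECK(cudaDeviceSynchronize()); // make sure warmup has finished

    cudaEventRecord(beg, 0);
    // ... timed reduction as in the loop above ...
    cudaEventRecord(end, 0);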
Binary file added gpu/tesla-v100-14TFLOPS-reduce-1G-lantency.PNG
