Commit 0112267
Author: l30001493 (committed)
[CHG] 1. Add CUDA-side test code; 2. the kernel latency measurement has no warmup
1 parent 1206162, commit 0112267

File tree: 5 files changed, +184 -30 lines


gpu/cuda/reduce.cu

Lines changed: 61 additions & 30 deletions
@@ -2,23 +2,13 @@
  * author: dqliu
  * date: 2020/03/18
  */
-
-#include <cuda_runtime.h>
-
-#ifdef USE_THRUST
-#include <thrust/device_vector.h>
-#include <thrust/reduce.h>
-#endif
-
-#ifdef USE_CUB
-#include <cub/block/block_reduce.cuh>
-#endif
+#include "cuda_op.h"
 
 // dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
 // srcData[N], dstData[(N + BLOCK_SIZE - 1) / BLOCK_SIZE]
 template <size_t BLOCK_SIZE, typename T>
-__global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData) {
-
+__global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
+{
     const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
     T __shared__ shm[BLOCK_SIZE];
     shm[threadIdx.x] = srcData[gid] ? gid < nElements : 0;
@@ -35,10 +25,22 @@ __global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
     }
 }
 
+template <>
+void cudaCallReduceSUMSharedMem<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
+{
+    const size_t BLOCK_SIZE = 1024;
+    reduce_sum<BLOCK_SIZE, unsigned int><<<
+        (nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
+        BLOCK_SIZE>>>(
+            nElements,
+            srcData,
+            dstData);
+}
+
 // srcData[N], dstData[1] (memset(0))
 template <size_t BLOCK_SIZE, typename T>
-__global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData) {
-
+__global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData)
+{
     const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
     T __shared__ shm[BLOCK_SIZE];
     shm[threadIdx.x] = srcData[gid] ? gid < nElements : 0;
@@ -56,8 +58,8 @@ __global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData)
 }
 
 template <size_t BLOCK_SIZE, typename T>
-__global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {
-
+__global__ void reduce_max(const size_t nElements, const T* srcData, T* dstData)
+{
     const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
     T __shared__ shm[BLOCK_SIZE];
     shm[threadIdx.x] = srcData[gid] ? gid < nElements : 0;
@@ -76,8 +78,8 @@ __global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {
 
 // dstData[1] = -INF
 template <size_t BLOCK_SIZE, typename T>
-__global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData) {
-
+__global__ void reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData)
+{
     const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
     T __shared__ shm[BLOCK_SIZE];
     shm[threadIdx.x] = srcData[gid] ? gid < nElements : 0;
@@ -96,12 +98,13 @@ __global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData) {
 
 // dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
 // srcData[N], dstData[(N + WARP_SIZE - 1) / WARP_SIZE]
-#if __CUDA_ARCH__ >= 900
-template <size_t WARP_SIZE, typename T>
-__global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData) {
+// #if __CUDA_ARCH__ >= 900
+template<size_t WARP_SIZE, typename T>
+__global__ void reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData)
+{
     const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
     const size_t wid = gid % WARP_SIZE;
-    T sumVal = gidsrcData[gid] ? gid < nElements : 0;
+    T sumVal = gid < nElements ? srcData[gid] : 0;
 
     for (size_t offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
         sumVal += __shfl_xor_sync(0xffffffff, sumVal, offset, WARP_SIZE);
@@ -111,30 +114,58 @@ __global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData) {
         dstData[gid / WARP_SIZE] = sumVal;
     }
 }
-#endif
+
+template<>
+void cudaCallReduceSUMWarpCom<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData) {
+    const size_t WARP_SIZE = 32;
+    const size_t BLOCK_SIZE = 1024;
+    reduce_sum_warp_com<
+        WARP_SIZE, unsigned int><<<
+        (nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
+        BLOCK_SIZE>>>(
+            nElements,
+            srcData,
+            dstData);
+}
+
+// #endif
 
 #ifdef USE_THRUST
 template<typename T>
-__global__ T reduce_sum_thrust(thrust::device_vector<T> src) {
-    return thrust::reduce(src.begin(), src.end());
+T reduce_sum_thrust(thrust::device_vector<T> src)
+{
+    return thrust::reduce(src.begin(), src.end());
 }
 #endif
 
 #ifdef USE_CUB
 template<size_t BLOCK_SIZE, typename T>
-__global__ T void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
+__global__ void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
 {
-    const size_t gid = threadIdx.x + blockIdx.x * blocDim.x;
+    const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
     typedef cub::BlockReduce<T, BLOCK_SIZE> BlockReduce;
-    __shared__ typename BlockReduce::TempStroge TempStroge;
+    __shared__ typename BlockReduce::TempStorage TempStorage;
 
     T sumVal = 0;
     if (gid < nElements) {
-        sumVal = BlockReduce(TempStroge).Sum(srcData[gid]);
+        sumVal = BlockReduce(TempStorage).Sum(srcData[gid]);
     }
 
     if (threadIdx.x == 0) {
         dstData[blockIdx.x] = sumVal;
     }
 }
+
+template<>
+void cubCallReduceSUM(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
+{
+    const size_t BLOCK_SIZE = 1024;
+    reduce_sum_cub<
+        BLOCK_SIZE, unsigned int><<<
+        (nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
+        BLOCK_SIZE>>>(
+            nElements,
+            srcData,
+            dstData);
+}
 #endif
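
As the launch-configuration comments in reduce.cu describe, reduce_sum writes one partial sum per block into dstData[(N + BLOCK_SIZE - 1) / BLOCK_SIZE], so a full reduction needs a second pass over those partial sums. Below is a minimal host-side sketch of that two-pass pattern, assuming the cuda_op.h header and the unsigned int specialization added by this commit; the buffer names and the final std::accumulate pass are illustrative and not part of the repository.

// Sketch only: two-pass sum using the wrapper added in this commit.
// First pass:  cudaCallReduceSUMSharedMem -> one partial sum per block.
// Second pass: copy the per-block sums back and finish on the host.
#include <cstdio>
#include <numeric>
#include <vector>
#include <cuda_runtime.h>
#include "cuda_op.h"   // declares cudaCallReduceSUMSharedMem (assumes this commit is built)

int main() {
    const size_t N = 1 << 20, BLOCK_SIZE = 1024;            // BLOCK_SIZE matches the specialization
    const size_t nBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    unsigned int *dSrc = nullptr, *dPartial = nullptr;
    cudaMalloc(&dSrc, N * sizeof(unsigned int));
    cudaMalloc(&dPartial, nBlocks * sizeof(unsigned int));

    std::vector<unsigned int> hSrc(N, 1);                    // all ones, easy to check
    cudaMemcpy(dSrc, hSrc.data(), N * sizeof(unsigned int), cudaMemcpyHostToDevice);

    // srcData[N] -> dstData[(N + BLOCK_SIZE - 1) / BLOCK_SIZE], per the comment in reduce.cu
    cudaCallReduceSUMSharedMem<unsigned int>(N, dSrc, dPartial);

    std::vector<unsigned int> hPartial(nBlocks);
    cudaMemcpy(hPartial.data(), dPartial, nBlocks * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    const unsigned int total = std::accumulate(hPartial.begin(), hPartial.end(), 0u);
    printf("reduced %zu elements, total = %u\n", N, total);

    cudaFree(dSrc);
    cudaFree(dPartial);
    return 0;
}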

gpu/cuda_common.h

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#ifndef CUDA_COMMON_H
+#define CUDA_COMMON_H
+
+#include <iostream>
+#include <cuda_runtime.h>
+
+#define CUDA_CHECK(condition) \
+    /* Code block avoids redefinition of cudaError_t error */ \
+    do { \
+        cudaError_t error = condition; \
+        if (error != cudaSuccess) { \
+            std::cout << cudaGetErrorString(error) << std::endl; \
+        } \
+    } while (0)
+
+void SetGPUID(int device_id) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+    if (current_device == device_id) {
+        return;
+    }
+    // The call to cudaSetDevice must come before any calls to Get, which
+    // may perform initialization using the GPU.
+    CUDA_CHECK(cudaSetDevice(device_id));
+}
+
+#endif

gpu/cuda_op.h

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+#ifndef CUDA_OP_H
+#define CUDA_OP_H
+
+#include <cuda_runtime.h>
+
+#ifdef USE_THRUST
+#include <thrust/device_vector.h> // memory
+#include <thrust/reduce.h>        // op::reduce
+#endif
+
+#ifdef USE_CUB
+#include <cub/block/block_reduce.cuh> // op::reduce
+#endif
+
+// Reduce
+template <typename T>
+void cudaCallReduceSUMSharedMem(const size_t nElements, const T* srcData, T* dstData);
+
+template <typename T>
+void cudaCallReduceSUMWarpCom(const size_t nElements, const T* srcData, T* dstData);
+
+
+#ifdef USE_THRUST
+template<typename T>
+T thrustCallReduceSUM(thrust::device_vector<T> src);
+#endif
+
+#ifdef USE_CUB
+template <typename T>
+void cubCallReduceSUM(const size_t nElements, const T* srcData, T* dstData);
+#endif
+
+
+// Eltwise
+
+#endif

gpu/cuda_reduce_test.cu

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+#include "cuda_common.h"
+#include "cuda_op.h"
+
+int main(int argc, char** argv)
+{
+    if (argc < 2) {
+        printf("Usage: %s GPU_ID\n", argv[0]);
+        return -1;
+    }
+    const int gpu_id = atoi(argv[1]);
+    SetGPUID(gpu_id);
+
+    const size_t
+        n = 1 << 30,
+        BLOCK_SIZE = 1 << 10,
+        WARP_SIZE = 1 << 5,
+        REDUCE_SIZE = (n + WARP_SIZE - 1) / WARP_SIZE;
+    thrust::device_vector<unsigned> src(n, 1), tmp(REDUCE_SIZE);
+    const unsigned char opDesc[4][128] = {
+        "======thrust::reduce=======",
+        "======shared_sum_kernel=======",
+        "======warp_primitive_sum_kernel=======",
+        "======cub::BlockReduce reduce_sum_cub======="};
+    for (int op = 0; op < 4; ++op) {
+        unsigned sum;
+        cudaEvent_t beg, end;
+        cudaEventCreate(&beg);
+        cudaEventCreate(&end);
+        cudaEventRecord(beg, 0);
+        if (op == 0) {
+            sum = thrust::reduce(src.begin(), src.begin() + n);
+        }
+        if (op == 1) {
+            cudaCallReduceSUMSharedMem(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
+            sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+        }
+        if (op == 2) {
+            cudaCallReduceSUMWarpCom(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
+            sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + WARP_SIZE - 1) / WARP_SIZE);
+        }
+        if (op == 3) {
+            cubCallReduceSUM(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
+            sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+        }
+        cudaEventRecord(end, 0);
+        cudaEventSynchronize(beg);
+        cudaEventSynchronize(end);
+        float elapsed_time;
+        cudaEventElapsedTime(
+            &elapsed_time,
+            beg,
+            end);
+        std::cout << opDesc[op] << std::endl;
+        std::cout << sum << ": " << elapsed_time << " ms elapsed." << std::endl;
+        std::cout << std::endl;
+        // printf("%u : %fms elapsed.\n", sum, elapsed_time);
+    }
+
+    return 0;
+}
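
The second point in the commit message, that the kernel latency measurement has no warmup, means each timed region above also pays one-time costs (context and module initialization, first-touch effects) on its first launch. Below is a minimal sketch of how a warmup launch could be inserted before the timed launch; it reuses the wrappers and event calls already present in cuda_reduce_test.cu, while the helper name timeSharedMemReduce and the cudaDeviceSynchronize after the warmup are assumptions for illustration, not code from this commit.

// Sketch: warm up once (untimed), then measure a steady-state launch.
#include <thrust/device_vector.h>
#include "cuda_op.h"   // cudaCallReduceSUMSharedMem, as added by this commit

// Hypothetical helper; not part of the repository.
static float timeSharedMemReduce(const thrust::device_vector<unsigned int>& src,
                                 thrust::device_vector<unsigned int>& tmp)
{
    const size_t n = src.size();
    const unsigned int* pSrc = thrust::raw_pointer_cast(src.data());
    unsigned int* pTmp = thrust::raw_pointer_cast(tmp.data());

    // Warmup launch: absorbs one-time setup cost so it does not pollute the timing.
    cudaCallReduceSUMSharedMem(n, pSrc, pTmp);
    cudaDeviceSynchronize();

    cudaEvent_t beg, end;
    cudaEventCreate(&beg);
    cudaEventCreate(&end);
    cudaEventRecord(beg, 0);
    cudaCallReduceSUMSharedMem(n, pSrc, pTmp);   // the launch actually being measured
    cudaEventRecord(end, 0);
    cudaEventSynchronize(end);

    float elapsed_ms = 0.0f;
    cudaEventElapsedTime(&elapsed_ms, beg, end);
    cudaEventDestroy(beg);
    cudaEventDestroy(end);
    return elapsed_ms;
}

Averaging several timed launches after the warmup would further reduce run-to-run jitter in the reported latency.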
