 * author: dqliu
 * date: 2020/03/18
 */
-
- #include <cuda_runtime.h>
-
- #ifdef USE_THRUST
- #include <thrust/device_vector.h>
- #include <thrust/reduce.h>
- #endif
-
- #ifdef USE_CUB
- #include <cub/block/block_reduce.cuh>
- #endif
+ #include "cuda_op.h"

// dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
// srcData[N], dstData[(N + BLOCK_SIZE - 1) / BLOCK_SIZE]
template <size_t BLOCK_SIZE, typename T>
- __global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData) {
-
+ __global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
+ {
    const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
    T __shared__ shm[BLOCK_SIZE];
    shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -35,10 +25,22 @@ __global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
    }
}

+ template <>
+ void cudaCallReduceSUMSharedMem<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
+ {
+     const size_t BLOCK_SIZE = 1024;
+     reduce_sum<BLOCK_SIZE, unsigned int><<<
+         (nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
+         BLOCK_SIZE>>>(
+         nElements,
+         srcData,
+         dstData);
+ }
+
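// Usage sketch (illustrative, not introduced by this commit): reduce_sum leaves one
// partial sum per 1024-thread block in dstData, so a caller still has to combine the
// (nElements + 1023) / 1024 partial results, e.g. by copying them back and summing on
// the host:
//
//     const size_t nBlocks = (nElements + 1023) / 1024;
//     std::vector<unsigned int> partial(nBlocks);            // needs <vector>
//     cudaMemcpy(partial.data(), dstData, nBlocks * sizeof(unsigned int),
//                cudaMemcpyDeviceToHost);
//     unsigned int total = std::accumulate(partial.begin(), partial.end(), 0u);  // <numeric>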

// srcData[N], dstData[1] (memset(0))
template <size_t BLOCK_SIZE, typename T>
- __global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData) {
-
+ __global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData)
+ {
    const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
    T __shared__ shm[BLOCK_SIZE];
    shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -56,8 +58,8 @@ __global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData)
}
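
// Launch sketch for the atomic variant (illustrative; this commit adds no host wrapper
// for it): per the comment above, dstData is a single accumulator that must be zeroed
// before the launch, e.g.
//
//     cudaMemset(dstData, 0, sizeof(unsigned int));
//     reduce_sum_atomic<1024, unsigned int>
//         <<<(nElements + 1023) / 1024, 1024>>>(nElements, srcData, dstData);
//
// Each block is expected to fold its tile in shared memory and atomically add the
// block's partial sum into dstData[0], so no second reduction pass is needed.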

template <size_t BLOCK_SIZE, typename T>
- __global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {
-
+ __global__ void reduce_max(const size_t nElements, const T* srcData, T* dstData)
+ {
    const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
    T __shared__ shm[BLOCK_SIZE];
    shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -76,8 +78,8 @@ __global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {

// dstData[1] = -INF
template <size_t BLOCK_SIZE, typename T>
- __global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData) {
-
+ __global__ void reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData)
+ {
    const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
    T __shared__ shm[BLOCK_SIZE];
    shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -96,12 +98,13 @@ __global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData)

// dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
// srcData[N], dstData[(N + WARP_SIZE - 1) / WARP_SIZE]
- #if __CUDA_ARCH__ >= 900
- template <size_t WARP_SIZE, typename T>
- __global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData) {
+ // #if __CUDA_ARCH__ >= 900
+ template <size_t WARP_SIZE, typename T>
+ __global__ void reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData)
+ {
    const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t wid = gid % WARP_SIZE;
-     T sumVal = srcData[gid] ? gid < nElements : 0;
+     T sumVal = gid < nElements ? srcData[gid] : 0;

    for (size_t offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
        sumVal += __shfl_xor_sync(0xffffffff, sumVal, offset, WARP_SIZE);
@@ -111,30 +114,58 @@ __global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData)
        dstData[gid / WARP_SIZE] = sumVal;
    }
}
- #endif
+
+ template <>
+ void cudaCallReduceSUMWarpCom<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData) {
+     const size_t WARP_SIZE = 32;
+     const size_t BLOCK_SIZE = 1024;
+     reduce_sum_warp_com<
+         WARP_SIZE, unsigned int><<<
+         (nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
+         BLOCK_SIZE>>>(
+         nElements,
+         srcData,
+         dstData);
+ }
+
+ // #endif
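
// How the warp reduction above works (explanatory note, not part of the commit):
// __shfl_xor_sync(0xffffffff, sumVal, offset, WARP_SIZE) lets lane i read sumVal from
// lane i ^ offset of the same warp, so with offset stepping through 16, 8, 4, 2, 1 the
// loop performs a butterfly reduction and every lane ends up holding the full
// 32-element sum; a single lane per warp (hence wid = gid % WARP_SIZE) then writes it
// to dstData[gid / WARP_SIZE]. With the wrapper above (BLOCK_SIZE = 1024,
// WARP_SIZE = 32), dstData therefore needs (nElements + 31) / 32 slots, and those
// per-warp partial sums still have to be reduced further.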

#ifdef USE_THRUST
template <typename T>
- __global__ T reduce_sum_thrust(thrust::device_vector<T> src) {
-     return thrust::reduce(src.begin(), src.end());
+ T reduce_sum_thrust(thrust::device_vector<T> src)
+ {
+     return thrust::reduce(src.begin(), src.end());
}
#endif
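
// Usage sketch for the Thrust path (illustrative, assuming USE_THRUST is defined and
// h_src is some host container):
//
//     thrust::device_vector<unsigned int> d_src(h_src.begin(), h_src.end());
//     unsigned int total = reduce_sum_thrust(d_src);
//
// Note that reduce_sum_thrust takes the device_vector by value, so every call copies
// the whole vector on the device; taking a const reference would avoid that copy.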

#ifdef USE_CUB
template <size_t BLOCK_SIZE, typename T>
- __global__ T void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
+ __global__ void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
{
-     const size_t gid = threadIdx.x + blockIdx.x * blocDim.x;
+     const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
    typedef cub::BlockReduce<T, BLOCK_SIZE> BlockReduce;
-     __shared__ typename BlockReduce::TempStroge TempStroge;
+     __shared__ typename BlockReduce::TempStorage TempStorage;

    T sumVal = 0;
    if (gid < nElements) {
-         sumVal = BlockReduce(TempStroge).Sum(srcData[gid]);
+         sumVal = BlockReduce(TempStorage).Sum(srcData[gid]);
    }

    if (threadIdx.x == 0) {
        dstData[blockIdx.x] = sumVal;
    }
}
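
// Note on the kernel above (editorial caveat, not a change made by this commit):
// cub::BlockReduce's Sum() is a block-wide collective that every thread of the block
// should invoke, so calling it only when gid < nElements means the threads of a final,
// partially filled block skip the collective. A safer pattern is to have out-of-range
// threads contribute 0 but still call Sum(), or to use the Sum(input, num_valid)
// overload. Also, only thread 0 receives the valid block aggregate, which is why the
// result is written under the threadIdx.x == 0 guard.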
+
+ template <>
+ void cubCallReduceSUM(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
+ {
+     const size_t BLOCK_SIZE = 1024;
+     reduce_sum_cub<
+         BLOCK_SIZE, unsigned int><<<
+         (nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
+         BLOCK_SIZE>>>(
+         nElements,
+         srcData,
+         dstData);
+ }
#endif