
Commit 0112267

[CHG] 1. Add CUDA-side test code; 2. the kernel latency measurements are taken without warmup.

l30001493 committed Mar 23, 2021
1 parent 1206162 commit 0112267
Showing 5 changed files with 184 additions and 30 deletions.
91 changes: 61 additions & 30 deletions gpu/cuda/reduce.cu
@@ -2,23 +2,13 @@
* author: dqliu
* date: 2020/03/18
*/

#include <cuda_runtime.h>

#ifdef USE_THRUST
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#endif

#ifdef USE_CUB
#include <cub/block/block_reduce.cuh>
#endif
#include "cuda_op.h"

// dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
// srcData[N], dstData[(N + BLOCK_SIZE - 1) / BLOCK_SIZE]
template <size_t BLOCK_SIZE, typename T>
__global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -35,10 +25,22 @@ __global__ void reduce_sum(const size_t nElements, const T* srcData, T* dstData)
}
}
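The hunk above collapses the body of reduce_sum; a minimal sketch of the standard shared-memory tree reduction it presumably contains (an illustration, not the exact file contents):

    __syncthreads();
    // Halve the active thread range each step; the lower half accumulates
    // the upper half's partial sums.
    for (size_t offset = BLOCK_SIZE >> 1; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
            shm[threadIdx.x] += shm[threadIdx.x + offset];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        dstData[blockIdx.x] = shm[0]; // one partial sum per block
    }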

template <>
void cudaCallReduceSUMSharedMem<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
{
const size_t BLOCK_SIZE = 1024;
reduce_sum<BLOCK_SIZE, unsigned int><<<
(nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE>>>(
nElements,
srcData,
dstData);
}

// srcData[N], dstData[1] (memset(0))
template <size_t BLOCK_SIZE, typename T>
__global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -56,8 +58,8 @@ __global__ void reduce_sum_atomic(const size_t nElements, const T* srcData, T* d
}

template <size_t BLOCK_SIZE, typename T>
__global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_max(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -76,8 +78,8 @@ __global__ reduce_max(const size_t nElements, const T* srcData, T* dstData) {

// dstData[1] = -INF
template <size_t BLOCK_SIZE, typename T>
__global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData) {

__global__ void reduce_max_atomic(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
T __shared__ shm[BLOCK_SIZE];
shm[threadIdx.x] = gid < nElements ? srcData[gid] : 0;
@@ -96,12 +98,13 @@ __global__ reduce_max_atomic(const size_t nElements, const T* srcData, T* dstDat

// dim3 block(BLOCK_SIZE, 1, 1), grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1)
// srcData[N], dstData[(N + WARP_SIZE - 1) / WARP_SIZE]
#if __CUDA_ARCH__ >= 900
template <size_t WARP_SIZE, typename T>
__global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData) {
// #if __CUDA_ARCH__ >= 900
template<size_t WARP_SIZE, typename T>
__global__ void reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t wid = gid % WARP_SIZE;
T sumVal = gidsrcData[gid] ? gid < nElements : 0;
T sumVal = gid < nElements ? srcData[gid] : 0;

for (size_t offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
sumVal += __shfl_xor_sync(0xffffffff, sumVal, offset, WARP_SIZE);
@@ -111,30 +114,58 @@ __global__ reduce_sum_warp_com(const size_t nElements, const T* srcData, T* dstD
dstData[gid / WARP_SIZE] = sumVal;
}
}
#endif

template<>
void cudaCallReduceSUMWarpCom<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData) {
const size_t WARP_SIZE = 32;
const size_t BLOCK_SIZE = 1024;
reduce_sum_warp_com<
WARP_SIZE, unsigned int><<<
(nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE>>>(
nElements,
srcData,
dstData);
}
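Each warp writes a single partial, so the destination buffer for this launcher needs (nElements + WARP_SIZE - 1) / WARP_SIZE elements. A host-side usage sketch (the names src and nElements are assumed, mirroring the test program added in this commit):

    const size_t WARP_SIZE = 32;
    const size_t nPartials = (nElements + WARP_SIZE - 1) / WARP_SIZE;
    thrust::device_vector<unsigned int> partials(nPartials);
    cudaCallReduceSUMWarpCom(nElements,
                             thrust::raw_pointer_cast(src.data()),
                             thrust::raw_pointer_cast(partials.data()));
    // Second stage: fold the per-warp partials into the final sum.
    unsigned int total = thrust::reduce(partials.begin(), partials.end());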

// #endif

#ifdef USE_THRUST
template<typename T>
__global__ T reduce_sum_thrust(thrust::device_vector<T> src) {
return thrust::reduce(src.begin(), src.end());
T reduce_sum_thrust(thrust::device_vector<T> src)
{
return thrust::reduce(src.begin(), src.end());
}
#endif

#ifdef USE_CUB
template<size_t BLOCK_SIZE, typename T>
__global__ T void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
__global__ void reduce_sum_cub(const size_t nElements, const T* srcData, T* dstData)
{
const size_t gid = threadIdx.x + blockIdx.x * blocDim.x;
const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
typedef cub::BlockReduce<T, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStroge TempStroge;
__shared__ typename BlockReduce::TempStorage TempStorage;

T sumVal = 0;
if (gid < nElements) {
sumVal = BlockReduce(TempStroge).Sum(srcData[gid]);
sumVal = BlockReduce(TempStorage).Sum(srcData[gid]);
}

if (threadIdx.x == 0) {
dstData[blockIdx.x] = sumVal;
}
}
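One caveat: cub::BlockReduce is a block-wide collective, so every thread of the block must call Sum(); guarding the call itself with gid < nElements, as above, leaves partially filled blocks undefined. A safer sketch guards the operand instead of the call:

    // All threads participate; out-of-range threads contribute zero.
    T val = (gid < nElements) ? srcData[gid] : T(0);
    T sumVal = BlockReduce(TempStorage).Sum(val);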

template<>
void cubCallReduceSUM<unsigned int>(const size_t nElements, const unsigned int* srcData, unsigned int* dstData)
{
const size_t BLOCK_SIZE = 1024;
reduce_sum_cub<
BLOCK_SIZE, unsigned int><<<
(nElements + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE>>>(
nElements,
srcData,
dstData);
}
#endif
27 changes: 27 additions & 0 deletions gpu/cuda_common.h
@@ -0,0 +1,27 @@
#ifndef CUDA_COMMON_H
#define CUDA_COMMON_H

#include <iostream>
#include <cuda_runtime.h>

#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)

// Defined in a header, so mark it inline to avoid duplicate symbols.
inline void SetGPUID(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}

#endif
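A minimal usage sketch for this header (a hypothetical translation unit, compiled with nvcc):

    #include "cuda_common.h"

    int main() {
        SetGPUID(0); // switch to device 0 unless it is already current
        float* d_buf = nullptr;
        CUDA_CHECK(cudaMalloc(&d_buf, 1024 * sizeof(float)));
        CUDA_CHECK(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
        CUDA_CHECK(cudaFree(d_buf));
        return 0;
    }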
36 changes: 36 additions & 0 deletions gpu/cuda_op.h
@@ -0,0 +1,36 @@
#ifndef CUDA_OP_H
#define CUDA_OP_H

#include <cuda_runtime.h>

#ifdef USE_THRUST
#include <thrust/device_vector.h> // memory
#include <thrust/reduce.h> // op::reduce
#endif

#ifdef USE_CUB
#include <cub/block/block_reduce.cuh> // op::reduce
#endif

// Reduce
template <typename T>
void cudaCallReduceSUMSharedMem(const size_t nElements, const T* srcData, T* dstData);

template <typename T>
void cudaCallReduceSUMWarpCom(const size_t nElements, const T* srcData, T* dstData);


#ifdef USE_THRUST
template<typename T>
T thrustCallReduceSUM(thrust::device_vector<T> src);
#endif

#ifdef USE_CUB
template <typename T>
void cubCallReduceSUM(const size_t nElements, const T* srcData, T* dstData);
#endif


// Eltwise

#endif
60 changes: 60 additions & 0 deletions gpu/cuda_reduce_test.cu
@@ -0,0 +1,60 @@
#include "cuda_common.h"
#include "cuda_op.h"

int main(int argc, char** argv)
{
if (argc < 2) {
printf("Usage: %s GPU_ID\n", argv[0]);
return -1;
}
const int gpu_id = atoi(argv[1]);
SetGPUID(gpu_id);

const size_t
n = 1 << 30,
BLOCK_SIZE = 1 << 10,
WARP_SIZE = 1 << 5,
REDUCE_SIZE = (n + WARP_SIZE - 1) / WARP_SIZE;
thrust::device_vector<unsigned> src(n, 1), tmp(REDUCE_SIZE);
const char opDesc[4][128] = {
"======thrust::reduce=======",
"======shared_sum_kernel=======",
"======warp_primitive_sum_kernel=======",
"======cub::BlockReduce reduce_sum_cub======="};
for (int op = 0; op < 4; ++op) {
unsigned sum;
cudaEvent_t beg, end;
cudaEventCreate(&beg);
cudaEventCreate(&end);
cudaEventRecord(beg, 0);
if (op == 0) {
sum = thrust::reduce(src.begin(), src.begin() + n);
}
if (op == 1) {
cudaCallReduceSUMSharedMem(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
}
if (op == 2) {
cudaCallReduceSUMWarpCom(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + WARP_SIZE - 1) / WARP_SIZE);
}
if (op == 3) {
cubCallReduceSUM(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
}
cudaEventRecord(end, 0);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, beg, end);
cudaEventDestroy(beg); // events are recreated every iteration, so release them
cudaEventDestroy(end);
std::cout << opDesc[op] << std::endl;
std::cout << sum << ": " << elapsed_time << " ms elapsed." << std::endl;
std::cout << std::endl;
// printf("%u : %fms elapsed.\n", sum, elapsed_time);
}

return 0;
}
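As the commit message notes, these latency figures are taken without warmup, so the first timed launch also pays one-time costs (context creation, module load, cold caches). A sketch of a warmup pass that could precede the timed region, using the same calls as above:

    // Untimed warmup launch: same call as the timed one, result discarded.
    cudaCallReduceSUMSharedMem(n, thrust::raw_pointer_cast(src.data()),
                               thrust::raw_pointer_cast(tmp.data()));
    CUDA_CHECK(cudaDeviceSynchronize()); // make sure warmup has finished

    cudaEventRecord(beg, 0);
    // ... timed reduction as in the loop above ...
    cudaEventRecord(end, 0);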
Binary file added gpu/tesla-v100-14TFLOPS-reduce-1G-lantency.PNG
