[CHG] 1. Add CUDA-side test code; 2. kernel latency measurement has no warmup
l30001493 committed on Mar 23, 2021
1 parent 1206162, commit 0112267
Showing 5 changed files with 184 additions and 30 deletions.
@@ -0,0 +1,27 @@
#ifndef CUDA_COMMON_H
#define CUDA_COMMON_H

#include <iostream>
#include <cuda_runtime.h>

#define CUDA_CHECK(condition) \
    /* Code block avoids redefinition of cudaError_t error */ \
    do { \
        cudaError_t error = condition; \
        if (error != cudaSuccess) { \
            std::cout << cudaGetErrorString(error) << std::endl; \
        } \
    } while (0)

void SetGPUID(int device_id) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));
    if (current_device == device_id) {
        return;
    }
    // The call to cudaSetDevice must come before any calls to Get, which
    // may perform initialization using the GPU.
    CUDA_CHECK(cudaSetDevice(device_id));
}

#endif
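For reference, a minimal usage sketch of this header (illustrative only; the buffer, sizes, and device id here are hypothetical, not part of the commit). One behavior worth noting: CUDA_CHECK prints the error string but does not abort or propagate the error.

// Hypothetical usage sketch (not part of this commit). CUDA_CHECK only
// logs the error string via std::cout and continues execution.
#include "cuda_common.h"

int main() {
    SetGPUID(0);                                            // no-op if device 0 is already current

    float* d_buf = nullptr;
    CUDA_CHECK(cudaMalloc(&d_buf, 1024 * sizeof(float)));   // logs on failure, keeps going
    CUDA_CHECK(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
    CUDA_CHECK(cudaFree(d_buf));
    return 0;
}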
@@ -0,0 +1,36 @@
#ifndef CUDA_OP_H
#define CUDA_OP_H

#include <cuda_runtime.h>

#ifdef USE_THRUST
#include <thrust/device_vector.h>      // memory
#include <thrust/reduce.h>             // op::reduce
#endif

#ifdef USE_CUB
#include <cub/block/block_reduce.cuh>  // op::reduce
#endif

// Reduce
template <typename T>
void cudaCallReduceSUMSharedMem(const size_t nElements, const T* srcData, T* dstData);

template <typename T>
void cudaCallReduceSUMWarpCom(const size_t nElements, const T* srcData, T* dstData);

#ifdef USE_THRUST
template <typename T>
T thrustCallReduceSUM(thrust::device_vector<T> src);
#endif

#ifdef USE_CUB
template <typename T>
void cubCallReduceSUM(const size_t nElements, const T* srcData, T* dstData);
#endif

// Eltwise

#endif
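The .cu implementations behind these declarations are not shown on this page. Based on the cub/block/block_reduce.cuh include and the "cub::BlockReduce reduce_sum_cub" label in the test driver below, a minimal sketch of what cubCallReduceSUM might look like is given here; the kernel name, the 1024-thread block size, and the grid math are assumptions, not the commit's actual code. The kernel writes one partial sum per block, and the caller reduces those partials afterwards, as the test driver does.

// Sketch only (assumed implementation): one partial sum per block via
// cub::BlockReduce; the caller reduces the dstData partials afterwards.
#include <cstddef>
#include <cub/block/block_reduce.cuh>

template <typename T, int BLOCK_THREADS>
__global__ void reduce_sum_cub_kernel(const size_t n, const T* src, T* dst) {
    using BlockReduce = cub::BlockReduce<T, BLOCK_THREADS>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const size_t i = static_cast<size_t>(blockIdx.x) * BLOCK_THREADS + threadIdx.x;
    T val = (i < n) ? src[i] : T(0);

    T block_sum = BlockReduce(temp_storage).Sum(val);  // block-wide reduction
    if (threadIdx.x == 0) {
        dst[blockIdx.x] = block_sum;                   // one partial per block
    }
}

template <typename T>
void cubCallReduceSUM(const size_t nElements, const T* srcData, T* dstData) {
    constexpr int kBlock = 1024;                       // matches BLOCK_SIZE = 1 << 10 in the test
    const int grid = static_cast<int>((nElements + kBlock - 1) / kBlock);
    reduce_sum_cub_kernel<T, kBlock><<<grid, kBlock>>>(nElements, srcData, dstData);
}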
@@ -0,0 +1,60 @@
#include "cuda_common.h"
#include "cuda_op.h"

#include <cstdio>   // printf
#include <cstdlib>  // atoi

// NOTE: thrust::device_vector / thrust::reduce come in through cuda_op.h
// when the test is built with -DUSE_THRUST.
int main(int argc, char** argv)
{
    if (argc < 2) {
        printf("Usage: %s GPU_ID\n", argv[0]);
        return -1;
    }
    const int gpu_id = atoi(argv[1]);
    SetGPUID(gpu_id);

    const size_t
        n = 1 << 30,
        BLOCK_SIZE = 1 << 10,
        WARP_SIZE = 1 << 5,
        REDUCE_SIZE = (n + WARP_SIZE - 1) / WARP_SIZE;
    thrust::device_vector<unsigned> src(n, 1), tmp(REDUCE_SIZE);
    const unsigned char opDesc[4][128] = {
        "======thrust::reduce=======",
        "======shared_sum_kernel=======",
        "======warp_primitive_sum_kernel=======",
        "======cub::BlockReduce reduce_sum_cub======="};
    for (int op = 0; op < 4; ++op) {
        unsigned sum;
        cudaEvent_t beg, end;
        cudaEventCreate(&beg);
        cudaEventCreate(&end);
        cudaEventRecord(beg, 0);
        if (op == 0) {
            sum = thrust::reduce(src.begin(), src.begin() + n);
        }
        if (op == 1) {
            cudaCallReduceSUMSharedMem(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
            sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
        }
        if (op == 2) {
            cudaCallReduceSUMWarpCom(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
            sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + WARP_SIZE - 1) / WARP_SIZE);
        }
        if (op == 3) {
            cubCallReduceSUM(n, thrust::raw_pointer_cast(src.data()), thrust::raw_pointer_cast(tmp.data()));
            sum = thrust::reduce(tmp.begin(), tmp.begin() + (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
        }
        cudaEventRecord(end, 0);
        cudaEventSynchronize(beg);
        cudaEventSynchronize(end);
        float elapsed_time;
        cudaEventElapsedTime(&elapsed_time, beg, end);
        std::cout << opDesc[op] << std::endl;
        std::cout << sum << ": " << elapsed_time << " ms elapsed." << std::endl;
        std::cout << std::endl;
        // printf("%u : %fms elapsed.\n", sum, elapsed_time);
    }

    return 0;
}
(The remaining changed files in this commit could not be displayed on this page.)
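As the commit message notes, the latency measurement above has no warmup: the first timed run of each op also pays one-time costs (context initialization, allocator warmup), which inflates its reported time. A possible adjustment, shown for the thrust::reduce case only, is to run the measured operation once before recording the events. This is an illustrative sketch, not part of the commit; the smaller problem size and lambda wrapper are assumptions.

// Illustrative warmup sketch (not in this commit): time thrust::reduce after a
// warmup pass so one-time initialization is excluded from the measurement.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <cuda_runtime.h>
#include <iostream>

int main() {
    const size_t n = 1 << 20;                        // smaller than the test's 1 << 30
    thrust::device_vector<unsigned> src(n, 1);

    auto run = [&] { return thrust::reduce(src.begin(), src.end()); };

    run();                                           // warmup: absorbs init/allocator costs
    cudaDeviceSynchronize();                         // defensive: all warmup work done before timing

    cudaEvent_t beg, end;
    cudaEventCreate(&beg);
    cudaEventCreate(&end);
    cudaEventRecord(beg, 0);
    const unsigned sum = run();                      // timed run
    cudaEventRecord(end, 0);
    cudaEventSynchronize(end);

    float elapsed_time = 0.f;
    cudaEventElapsedTime(&elapsed_time, beg, end);
    cudaEventDestroy(beg);
    cudaEventDestroy(end);

    std::cout << sum << ": " << elapsed_time << " ms elapsed." << std::endl;
    return 0;
}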