From 6f50313cdbf98994243887c0d608224ecc93bba7 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 16 Aug 2024 16:20:33 -0600 Subject: [PATCH 01/10] reformated device_api and host_api benchmark to include engine and distribution column --- benchmark/benchmark_rocrand_device_api.cpp | 1600 +++++++++----------- benchmark/benchmark_rocrand_host_api.cpp | 733 ++++----- benchmark/custom_csv_formater.hpp | 182 +++ 3 files changed, 1211 insertions(+), 1304 deletions(-) create mode 100644 benchmark/custom_csv_formater.hpp diff --git a/benchmark/benchmark_rocrand_device_api.cpp b/benchmark/benchmark_rocrand_device_api.cpp index 44ddec8fb..59f087a95 100644 --- a/benchmark/benchmark_rocrand_device_api.cpp +++ b/benchmark/benchmark_rocrand_device_api.cpp @@ -34,1020 +34,812 @@ #include #include #include +#include +#include "custom_csv_formater.hpp" #ifndef DEFAULT_RAND_N - #define DEFAULT_RAND_N (1024 * 1024 * 128) +#define DEFAULT_RAND_N (1024 * 1024 * 128) #endif -template +template __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_kernel( - EngineState* states, const unsigned long long seed, const unsigned long long offset) -{ - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(seed, state_id, offset, &state); - states[state_id] = state; + EngineState *states, const unsigned long long seed, + const unsigned long long offset) { + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(seed, state_id, offset, &state); + states[state_id] = state; } -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( - EngineState* states, T* data, const size_t size, Generator generator) -{ - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int stride = gridDim.x * blockDim.x; - - EngineState state = states[state_id]; - unsigned int index = state_id; - while(index < size) - { - data[index] = 
generator(&state); - index += stride; - } - states[state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( + EngineState *states, T *data, const size_t size, Generator generator) { + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = gridDim.x * blockDim.x; + + EngineState state = states[state_id]; + unsigned int index = state_id; + while (index < size) { + data[index] = generator(&state); + index += stride; + } + states[state_id] = state; } -template -struct runner -{ - EngineState* states; - - runner(const size_t /* dimensions */, - const size_t blocks, - const size_t threads, - const unsigned long long seed, - const unsigned long long offset) - { - const size_t states_size = blocks * threads; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState))); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), - dim3(blocks), - dim3(threads), - 0, - 0, - states, - seed, - offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } +template struct runner { + EngineState *states; - ~runner() - { - HIP_CHECK(hipFree(states)); - } + runner(const size_t /* dimensions */, const size_t blocks, + const size_t threads, const unsigned long long seed, + const unsigned long long offset) { + const size_t states_size = blocks * threads; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState))); - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - dim3(blocks), - dim3(threads), - 0, - stream, - states, - data, - size, - generator); - } -}; + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(blocks), + dim3(threads), 0, 0, states, seed, offset); -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( - rocrand_state_mtgp32* states, T* 
data, const size_t size, Generator generator) -{ - const unsigned int state_id = blockIdx.x; - unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int stride = gridDim.x * blockDim.x; - - __shared__ rocrand_state_mtgp32 state; - rocrand_mtgp32_block_copy(&states[state_id], &state); - - const size_t r = size % blockDim.x; - const size_t size_rounded_down = size - r; - const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x; - while(index < size_rounded_down) - { - data[index] = generator(&state); - index += stride; - } - while(index < size_rounded_up) - { - auto value = generator(&state); - if(index < size) - data[index] = value; - index += stride; - } + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } - rocrand_mtgp32_block_copy(&state, &states[state_id]); -} + ~runner() { HIP_CHECK(hipFree(states)); } -template<> -struct runner -{ - rocrand_state_mtgp32* states; - - runner(const size_t /* dimensions */, - const size_t blocks, - const size_t /* threads */, - const unsigned long long seed, - const unsigned long long /* offset */) - { - const size_t states_size = std::min((size_t)200, blocks); - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_mtgp32))); - - ROCRAND_CHECK( - rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, states_size, seed)); - } + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), + dim3(threads), 0, stream, states, data, size, generator); + } +}; - ~runner() - { - HIP_CHECK(hipFree(states)); - } +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( + rocrand_state_mtgp32 *states, T *data, const size_t size, + Generator generator) { + const unsigned int state_id = blockIdx.x; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int 
stride = gridDim.x * blockDim.x; + + __shared__ rocrand_state_mtgp32 state; + rocrand_mtgp32_block_copy(&states[state_id], &state); + + const size_t r = size % blockDim.x; + const size_t size_rounded_down = size - r; + const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x; + while (index < size_rounded_down) { + data[index] = generator(&state); + index += stride; + } + while (index < size_rounded_up) { + auto value = generator(&state); + if (index < size) + data[index] = value; + index += stride; + } + + rocrand_mtgp32_block_copy(&state, &states[state_id]); +} - template - void generate(const size_t blocks, - const size_t /* threads */, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - dim3(std::min((size_t)200, blocks)), - dim3(256), - 0, - stream, - states, - data, - size, - generator); - } +template <> struct runner { + rocrand_state_mtgp32 *states; + + runner(const size_t /* dimensions */, const size_t blocks, + const size_t /* threads */, const unsigned long long seed, + const unsigned long long /* offset */) { + const size_t states_size = std::min((size_t)200, blocks); + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_mtgp32))); + + ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, + states_size, seed)); + } + + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t /* threads */, + hipStream_t stream, T *data, const size_t size, + const Generator &generator) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), + dim3(std::min((size_t)200, blocks)), dim3(256), 0, + stream, states, data, size, generator); + } }; __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_kernel( - rocrand_state_lfsr113* states, const uint4 seed) -{ - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - rocrand_state_lfsr113 state; - 
rocrand_init(seed, state_id, &state); - states[state_id] = state; + rocrand_state_lfsr113 *states, const uint4 seed) { + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + rocrand_state_lfsr113 state; + rocrand_init(seed, state_id, &state); + states[state_id] = state; } -template<> -struct runner -{ - rocrand_state_lfsr113* states; - - runner(const size_t /* dimensions */, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long /* offset */) - { - const size_t states_size = blocks * threads; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113))); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), - dim3(blocks), - dim3(threads), - 0, - 0, - states, - uint4{ROCRAND_LFSR113_DEFAULT_SEED_X, - ROCRAND_LFSR113_DEFAULT_SEED_Y, - ROCRAND_LFSR113_DEFAULT_SEED_Z, - ROCRAND_LFSR113_DEFAULT_SEED_W}); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } +template <> struct runner { + rocrand_state_lfsr113 *states; - ~runner() - { - HIP_CHECK(hipFree(states)); - } + runner(const size_t /* dimensions */, const size_t blocks, + const size_t threads, const unsigned long long /* seed */, + const unsigned long long /* offset */) { + const size_t states_size = blocks * threads; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113))); - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - dim3(blocks), - dim3(threads), - 0, - stream, - states, - data, - size, - generator); - } + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_kernel), dim3(blocks), dim3(threads), 0, 0, states, + uint4{ROCRAND_LFSR113_DEFAULT_SEED_X, ROCRAND_LFSR113_DEFAULT_SEED_Y, + ROCRAND_LFSR113_DEFAULT_SEED_Z, ROCRAND_LFSR113_DEFAULT_SEED_W}); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } + 
+ ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), + dim3(threads), 0, stream, states, data, size, generator); + } }; -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel( - EngineState* states, SobolType* directions, SobolType offset) -{ - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(&directions[dimension * sizeof(SobolType) * 8], offset + state_id, &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel( + EngineState *states, SobolType *directions, SobolType offset) { + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(&directions[dimension * sizeof(SobolType) * 8], + offset + state_id, &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel( - EngineState* states, SobolType* directions, SobolType* scramble_constants, SobolType offset) -{ - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(&directions[dimension * sizeof(SobolType) * 8], - scramble_constants[dimension], - offset + state_id, - &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel( + EngineState *states, SobolType *directions, SobolType *scramble_constants, + SobolType offset) { + const 
unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(&directions[dimension * sizeof(SobolType) * 8], + scramble_constants[dimension], offset + state_id, &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } // generate_kernel for the normal and scrambled sobol generators -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel( - EngineState* states, T* data, const size_t size, Generator generator) -{ - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int stride = gridDim.x * blockDim.x; - - EngineState state = states[gridDim.x * blockDim.x * dimension + state_id]; - const size_t offset = dimension * size; - unsigned int index = state_id; - while(index < size) - { - data[offset + index] = generator(&state); - skipahead(stride - 1, &state); - index += stride; - } - state = states[gridDim.x * blockDim.x * dimension + state_id]; - skipahead(static_cast(size), &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel( + EngineState *states, T *data, const size_t size, Generator generator) { + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = gridDim.x * blockDim.x; + + EngineState state = states[gridDim.x * blockDim.x * dimension + state_id]; + const size_t offset = dimension * size; + unsigned int index = state_id; + while (index < size) { + data[offset + index] = generator(&state); + skipahead(stride - 1, &state); + index += stride; + } + state = states[gridDim.x * blockDim.x * dimension + state_id]; + skipahead(static_cast(size), &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } -template<> -struct 
runner -{ - rocrand_state_sobol32* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned int* h_directions; - ROCRAND_CHECK( - rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol32))); - - unsigned int* directions; - const size_t size = dimensions * 32 * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&directions, size)); - HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - static_cast(offset)); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - } +template <> struct runner { + rocrand_state_sobol32 *states; + size_t dimensions; - ~runner() - { - HIP_CHECK(hipFree(states)); - } + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } -}; + const unsigned int *h_directions; + ROCRAND_CHECK(rocrand_get_direction_vectors32( + &h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); 
-template<> -struct runner -{ - rocrand_state_scrambled_sobol32* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned int* h_directions; - const unsigned int* h_constants; - - ROCRAND_CHECK( - rocrand_get_direction_vectors32(&h_directions, - ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)); - ROCRAND_CHECK(rocrand_get_scramble_constants32(&h_constants)); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol32))); - - unsigned int* directions; - const size_t directions_size = dimensions * 32 * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&directions, directions_size)); - HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, hipMemcpyHostToDevice)); - - unsigned int* scramble_constants; - const size_t constants_size = dimensions * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); - HIP_CHECK( - hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - scramble_constants, - static_cast(offset)); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - HIP_CHECK(hipFree(scramble_constants)); - } + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol32))); - ~runner() - { - HIP_CHECK(hipFree(states)); - } + unsigned int *directions; + const size_t size = dimensions * 32 * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&directions, size)); + HIP_CHECK(hipMemcpy(directions, 
h_directions, size, hipMemcpyHostToDevice)); - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } -}; + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, static_cast(offset)); -template<> -struct runner -{ - rocrand_state_sobol64* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned long long* h_directions; - rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64))); - - unsigned long long int* directions; - const size_t size = dimensions * 64 * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&directions, size)); - HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - } + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); - ~runner() - { - HIP_CHECK(hipFree(states)); - } + HIP_CHECK(hipFree(directions)); + } - template - void 
generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, generator); + } }; -template<> -struct runner -{ - rocrand_state_scrambled_sobol64* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned long long* h_directions; - const unsigned long long* h_constants; - - rocrand_get_direction_vectors64(&h_directions, - ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6); - rocrand_get_scramble_constants64(&h_constants); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol64))); - - unsigned long long int* directions; - const size_t directions_size = dimensions * 64 * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&directions, directions_size)); - HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, hipMemcpyHostToDevice)); - - unsigned long long int* scramble_constants; - const size_t constants_size = dimensions * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); - HIP_CHECK( - 
hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - scramble_constants, - offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - HIP_CHECK(hipFree(scramble_constants)); - } +template <> struct runner { + rocrand_state_scrambled_sobol32 *states; + size_t dimensions; + + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; + + const unsigned int *h_directions; + const unsigned int *h_constants; + + ROCRAND_CHECK(rocrand_get_direction_vectors32( + &h_directions, ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)); + ROCRAND_CHECK(rocrand_get_scramble_constants32(&h_constants)); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, + states_size * sizeof(rocrand_state_scrambled_sobol32))); + + unsigned int *directions; + const size_t directions_size = dimensions * 32 * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&directions, directions_size)); + HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, + hipMemcpyHostToDevice)); + + unsigned int *scramble_constants; + const size_t constants_size = dimensions * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); + HIP_CHECK(hipMemcpy(scramble_constants, h_constants, constants_size, + hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, scramble_constants, + static_cast(offset)); + + HIP_CHECK(hipGetLastError()); + 
HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + HIP_CHECK(hipFree(scramble_constants)); + } + + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, generator); + } +}; - ~runner() - { - HIP_CHECK(hipFree(states)); - } +template <> struct runner { + rocrand_state_sobol64 *states; + size_t dimensions; - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } -}; + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; -// Provide optional create and destroy functions for the generators. 
-struct generator_type -{ - static void create() {} + const unsigned long long *h_directions; + rocrand_get_direction_vectors64(&h_directions, + ROCRAND_DIRECTION_VECTORS_64_JOEKUO6); - static void destroy() {} -}; + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64))); -template -struct generator_uint : public generator_type -{ - typedef unsigned int data_type; + unsigned long long int *directions; + const size_t size = dimensions * 64 * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&directions, size)); + HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - std::string name() - { - return "uniform-uint"; - } + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, offset); - __device__ data_type operator()(Engine* state) const - { - return rocrand(state); - } -}; + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); -template -struct generator_ullong : public generator_type -{ - typedef unsigned long long int data_type; + HIP_CHECK(hipFree(directions)); + } - std::string name() - { - return "uniform-ullong"; - } + ~runner() { HIP_CHECK(hipFree(states)); } - __device__ data_type operator()(Engine* state) const - { - return rocrand(state); - } + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, generator); + } }; -template -struct generator_uniform : public generator_type -{ - typedef float data_type; +template <> struct runner { + rocrand_state_scrambled_sobol64 
*states; + size_t dimensions; + + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; + + const unsigned long long *h_directions; + const unsigned long long *h_constants; + + rocrand_get_direction_vectors64( + &h_directions, ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6); + rocrand_get_scramble_constants64(&h_constants); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, + states_size * sizeof(rocrand_state_scrambled_sobol64))); + + unsigned long long int *directions; + const size_t directions_size = + dimensions * 64 * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&directions, directions_size)); + HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, + hipMemcpyHostToDevice)); + + unsigned long long int *scramble_constants; + const size_t constants_size = dimensions * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); + HIP_CHECK(hipMemcpy(scramble_constants, h_constants, constants_size, + hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, scramble_constants, offset); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + HIP_CHECK(hipFree(scramble_constants)); + } + + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, 
generator); + } +}; - std::string name() - { - return "uniform-float"; - } +// Provide optional create and destroy functions for the generators. +struct generator_type { + static void create() {} - __device__ data_type operator()(Engine* state) const - { - return rocrand_uniform(state); - } + static void destroy() {} }; -template -struct generator_uniform_double : public generator_type -{ - typedef double data_type; +template struct generator_uint : public generator_type { + typedef unsigned int data_type; - std::string name() - { - return "uniform-double"; - } + std::string name() { return "uniform-uint"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_uniform_double(state); - } + __device__ data_type operator()(Engine *state) const { + return rocrand(state); + } }; -template -struct generator_normal : public generator_type -{ - typedef float data_type; +template struct generator_ullong : public generator_type { + typedef unsigned long long int data_type; - std::string name() - { - return "normal-float"; - } + std::string name() { return "uniform-ullong"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_normal(state); - } + __device__ data_type operator()(Engine *state) const { + return rocrand(state); + } }; -template -struct generator_normal_double : public generator_type -{ - typedef double data_type; +template struct generator_uniform : public generator_type { + typedef float data_type; - std::string name() - { - return "normal-double"; - } + std::string name() { return "uniform-float"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_normal_double(state); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_uniform(state); + } }; -template -struct generator_log_normal : public generator_type -{ - typedef float data_type; +template +struct generator_uniform_double : public generator_type { + typedef double data_type; - std::string name() - { - 
return "log-normal-float"; - } + std::string name() { return "uniform-double"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_log_normal(state, 0.f, 1.f); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_uniform_double(state); + } }; -template -struct generator_log_normal_double : public generator_type -{ - typedef double data_type; +template struct generator_normal : public generator_type { + typedef float data_type; - std::string name() - { - return "log-normal-double"; - } + std::string name() { return "normal-float"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_log_normal_double(state, 0., 1.); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_normal(state); + } }; -template -struct generator_poisson : public generator_type -{ - typedef unsigned int data_type; +template +struct generator_normal_double : public generator_type { + typedef double data_type; - std::string name() - { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << lambda; - return "poisson(lambda=" + stream.str() + ")"; - } + std::string name() { return "normal-double"; } - __device__ data_type operator()(Engine* state) - { - return rocrand_poisson(state, lambda); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_normal_double(state); + } +}; - double lambda; +template struct generator_log_normal : public generator_type { + typedef float data_type; + + std::string name() { return "log-normal-float"; } + + __device__ data_type operator()(Engine *state) const { + return rocrand_log_normal(state, 0.f, 1.f); + } }; -template -struct generator_discrete_poisson : public generator_type -{ - typedef unsigned int data_type; +template +struct generator_log_normal_double : public generator_type { + typedef double data_type; - std::string name() - { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << lambda; - 
return "discrete-poisson(lambda=" + stream.str() + ")"; - } + std::string name() { return "log-normal-double"; } - void create() - { - ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution)); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_log_normal_double(state, 0., 1.); + } +}; - void destroy() - { - ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); - } +template struct generator_poisson : public generator_type { + typedef unsigned int data_type; - __device__ data_type operator()(Engine* state) - { - return rocrand_discrete(state, discrete_distribution); - } + std::string name() { + std::stringstream stream; + stream << std::fixed << std::setprecision(1) << lambda; + return "poisson(lambda=" + stream.str() + ")"; + } + + __device__ data_type operator()(Engine *state) { + return rocrand_poisson(state, lambda); + } - rocrand_discrete_distribution discrete_distribution; - double lambda; + double lambda; }; -template -struct generator_discrete_custom : public generator_type -{ - typedef unsigned int data_type; +template +struct generator_discrete_poisson : public generator_type { + typedef unsigned int data_type; - std::string name() - { - return "discrete-custom"; - } + std::string name() { + std::stringstream stream; + stream << std::fixed << std::setprecision(1) << lambda; + return "discrete-poisson(lambda=" + stream.str() + ")"; + } - void create() - { - const unsigned int offset = 1234; - std::vector probabilities = {10, 10, 1, 120, 8, 6, 140, 2, 150, 150, 10, 80}; - - double sum = std::accumulate(probabilities.begin(), probabilities.end(), 0.); - std::transform(probabilities.begin(), - probabilities.end(), - probabilities.begin(), - [=](double p) { return p / sum; }); - ROCRAND_CHECK(rocrand_create_discrete_distribution(probabilities.data(), - probabilities.size(), - offset, - &discrete_distribution)); - } + void create() { + ROCRAND_CHECK( + 
rocrand_create_poisson_distribution(lambda, &discrete_distribution)); + } - void destroy() - { - ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); - } + void destroy() { + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } - __device__ data_type operator()(Engine* state) - { - return rocrand_discrete(state, discrete_distribution); - } + __device__ data_type operator()(Engine *state) { + return rocrand_discrete(state, discrete_distribution); + } - rocrand_discrete_distribution discrete_distribution; + rocrand_discrete_distribution discrete_distribution; + double lambda; }; -struct benchmark_context -{ - size_t size; - size_t dimensions; - size_t trials; - size_t blocks; - size_t threads; - std::vector lambdas; -}; +template +struct generator_discrete_custom : public generator_type { + typedef unsigned int data_type; -template -void run_benchmark(benchmark::State& state, - const hipStream_t stream, - const benchmark_context& context, - Generator generator) -{ - typedef typename Generator::data_type data_type; - - const size_t size = context.size; - const size_t dimensions = context.dimensions; - const size_t trials = context.trials; - const size_t blocks = context.blocks; - const size_t threads = context.threads; - - // Optional initialization of the generator - generator.create(); - - data_type* data; - HIP_CHECK(hipMalloc(&data, size * sizeof(data_type))); - - constexpr unsigned long long int seed = 12345ULL; - constexpr unsigned long long int offset = 6789ULL; - - runner r(dimensions, blocks, threads, seed, offset); - - // Warm-up - for(size_t i = 0; i < 5; i++) - { - r.generate(blocks, threads, stream, data, size, generator); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } + std::string name() { return "discrete-custom"; } - // Measurement - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - for(auto _ : state) - { - 
HIP_CHECK(hipEventRecord(start, stream)); - for(size_t i = 0; i < trials; i++) - { - r.generate(blocks, threads, stream, data, size, generator); - } - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed; - HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); - - state.SetIterationTime(elapsed / 1000.f); - } - state.SetBytesProcessed(trials * state.iterations() * size * sizeof(data_type)); - state.SetItemsProcessed(trials * state.iterations() * size); + void create() { + const unsigned int offset = 1234; + std::vector probabilities = {10, 10, 1, 120, 8, 6, + 140, 2, 150, 150, 10, 80}; - // Optional de-initialization of the generator - generator.destroy(); + double sum = + std::accumulate(probabilities.begin(), probabilities.end(), 0.); + std::transform(probabilities.begin(), probabilities.end(), + probabilities.begin(), [=](double p) { return p / sum; }); + ROCRAND_CHECK(rocrand_create_discrete_distribution( + probabilities.data(), probabilities.size(), offset, + &discrete_distribution)); + } - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipFree(data)); -} + void destroy() { + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } -template -void add_benchmark(const benchmark_context& context, - const hipStream_t stream, - std::vector& benchmarks, - const std::string& name, - Generator generator) -{ - static_assert(std::is_trivially_copyable::value - && std::is_trivially_destructible::value, - "Generator gets copied to device at kernel launch."); - const std::string benchmark_name = "device_kernel<" + name + "," + generator.name() + ">"; - benchmarks.emplace_back(benchmark::RegisterBenchmark(benchmark_name.c_str(), - &run_benchmark, - stream, - context, - generator)); -} + __device__ data_type operator()(Engine *state) { + return rocrand_discrete(state, discrete_distribution); + } -template -void add_benchmarks(const benchmark_context& ctx, - const 
hipStream_t stream, - std::vector& benchmarks, - const rocrand_rng_type engine_type) -{ - constexpr bool is_64_bits = std::is_same::value - || std::is_same::value - || std::is_same::value - || std::is_same::value; - - const std::string name = engine_name(engine_type); - - if(is_64_bits) - { - add_benchmark(ctx, stream, benchmarks, name, generator_ullong()); - } - else - { - add_benchmark(ctx, stream, benchmarks, name, generator_uint()); - } + rocrand_discrete_distribution discrete_distribution; +}; - add_benchmark(ctx, stream, benchmarks, name, generator_uniform()); - add_benchmark(ctx, stream, benchmarks, name, generator_uniform_double()); - add_benchmark(ctx, stream, benchmarks, name, generator_normal()); - add_benchmark(ctx, stream, benchmarks, name, generator_normal_double()); - add_benchmark(ctx, stream, benchmarks, name, generator_log_normal()); - add_benchmark(ctx, stream, benchmarks, name, generator_log_normal_double()); - - for(size_t i = 0; i < ctx.lambdas.size(); i++) - { - generator_poisson gen_poisson; - gen_poisson.lambda = ctx.lambdas[i]; - add_benchmark(ctx, stream, benchmarks, name, gen_poisson); - } +struct benchmark_context { + size_t size; + size_t dimensions; + size_t trials; + size_t blocks; + size_t threads; + std::vector lambdas; +}; - for(size_t i = 0; i < ctx.lambdas.size(); i++) - { - generator_discrete_poisson gen_discrete_poisson; - gen_discrete_poisson.lambda = ctx.lambdas[i]; - add_benchmark(ctx, stream, benchmarks, name, gen_discrete_poisson); - } +template +void run_benchmark(benchmark::State &state, const hipStream_t stream, + const benchmark_context &context, Generator generator) { + typedef typename Generator::data_type data_type; + + const size_t size = context.size; + const size_t dimensions = context.dimensions; + const size_t trials = context.trials; + const size_t blocks = context.blocks; + const size_t threads = context.threads; + + // Optional initialization of the generator + generator.create(); + + data_type *data; + 
HIP_CHECK(hipMalloc(&data, size * sizeof(data_type))); + + constexpr unsigned long long int seed = 12345ULL; + constexpr unsigned long long int offset = 6789ULL; + + runner r(dimensions, blocks, threads, seed, offset); + + // Warm-up + for (size_t i = 0; i < 5; i++) { + r.generate(blocks, threads, stream, data, size, generator); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } + + // Measurement + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + for (auto _ : state) { + HIP_CHECK(hipEventRecord(start, stream)); + for (size_t i = 0; i < trials; i++) { + r.generate(blocks, threads, stream, data, size, generator); + } + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed; + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); + + state.SetIterationTime(elapsed / 1000.f); + } + state.SetBytesProcessed(trials * state.iterations() * size * + sizeof(data_type)); + state.SetItemsProcessed(trials * state.iterations() * size); + + // Optional de-initialization of the generator + generator.destroy(); + + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipFree(data)); +} - add_benchmark(ctx, stream, benchmarks, name, generator_discrete_custom()); +template +void add_benchmark(const benchmark_context &context, const hipStream_t stream, + std::vector &benchmarks, + const std::string &name, Generator generator) { + static_assert(std::is_trivially_copyable::value && + std::is_trivially_destructible::value, + "Generator gets copied to device at kernel launch."); + const std::string benchmark_name = + "device_kernel<" + name + "," + generator.name() + ">"; + benchmarks.emplace_back(benchmark::RegisterBenchmark( + benchmark_name.c_str(), &run_benchmark, stream, + context, generator)); } -int main(int argc, char* argv[]) -{ - benchmark::Initialize(&argc, argv); - - cli::Parser parser(argc, argv); - parser.set_optional("size", 
"size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("dimensions", - "dimensions", - 1, - "number of dimensions of quasi-random values"); - parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional("blocks", "blocks", 256, "number of blocks"); - parser.set_optional("threads", "threads", 256, "number of threads in each block"); - parser.set_optional>( - "lambda", - "lambda", - {10.0}, - "space-separated list of lambdas of Poisson distribution"); - parser.run_and_exit_if_error(); - - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - add_common_benchmark_rocrand_info(); - - benchmark_context ctx{}; - - ctx.size = parser.get("size"); - ctx.dimensions = parser.get("dimensions"); - ctx.trials = parser.get("trials"); - ctx.blocks = parser.get("blocks"); - ctx.threads = parser.get("threads"); - ctx.lambdas = parser.get>("lambda"); - - benchmark::AddCustomContext("size", std::to_string(ctx.size)); - benchmark::AddCustomContext("dimensions", std::to_string(ctx.dimensions)); - benchmark::AddCustomContext("trials", std::to_string(ctx.trials)); - benchmark::AddCustomContext("blocks", std::to_string(ctx.blocks)); - benchmark::AddCustomContext("threads", std::to_string(ctx.threads)); - - std::vector benchmarks = {}; - - // MT19937 has no kernel implementation - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_LFSR113); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MRG31K3P); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MRG32K3A); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MTGP32); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_PHILOX4_32_10); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SOBOL32); - add_benchmarks(ctx, stream, benchmarks, 
ROCRAND_RNG_QUASI_SOBOL64); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY2_64_20); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY4_64_20); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_XORWOW); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } +template +void add_benchmarks(const benchmark_context &ctx, const hipStream_t stream, + std::vector &benchmarks, + const rocrand_rng_type engine_type) { + constexpr bool is_64_bits = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; + + const std::string name = engine_name(engine_type); + + if (is_64_bits) { + add_benchmark(ctx, stream, benchmarks, name, + generator_ullong()); + } else { + add_benchmark(ctx, stream, benchmarks, name, + generator_uint()); + } + + add_benchmark(ctx, stream, benchmarks, name, + generator_uniform()); + add_benchmark(ctx, stream, benchmarks, name, + generator_uniform_double()); + add_benchmark(ctx, stream, benchmarks, name, + generator_normal()); + add_benchmark(ctx, stream, benchmarks, name, + generator_normal_double()); + add_benchmark(ctx, stream, benchmarks, name, + generator_log_normal()); + add_benchmark(ctx, stream, benchmarks, name, + generator_log_normal_double()); + + for (size_t i = 0; i < ctx.lambdas.size(); i++) { + generator_poisson gen_poisson; + gen_poisson.lambda = ctx.lambdas[i]; + add_benchmark(ctx, stream, benchmarks, name, gen_poisson); + } + + for (size_t i = 0; i < ctx.lambdas.size(); i++) { + generator_discrete_poisson gen_discrete_poisson; + gen_discrete_poisson.lambda = ctx.lambdas[i]; + add_benchmark(ctx, stream, benchmarks, name, gen_discrete_poisson); + } + + add_benchmark(ctx, stream, benchmarks, name, + 
generator_discrete_custom()); +} +int main(int argc, char *argv[]) { + + // get the out format and out file name thats being passed into + // benchmark::Initialize() + std::string outFormat = ""; + std::string outFile = ""; + std::string filter = ""; + for (int i = 1; i < argc; i++) { + std::string input(argv[i]); + + int equalPos = input.find("="); + std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); + std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); + + if (arg == "benchmark_out_format") + outFormat = argVal; + else if (arg == "benchmark_out") + outFile = argVal; + else if (arg == "benchmark_filter") + filter = argVal; + } + + benchmark::Initialize(&argc, argv); + + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_RAND_N, + "number of values"); + parser.set_optional("dimensions", "dimensions", 1, + "number of dimensions of quasi-random values"); + parser.set_optional("trials", "trials", 20, "number of trials"); + parser.set_optional("blocks", "blocks", 256, "number of blocks"); + parser.set_optional("threads", "threads", 256, + "number of threads in each block"); + parser.set_optional>( + "lambda", "lambda", {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.run_and_exit_if_error(); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + add_common_benchmark_rocrand_info(); + + benchmark_context ctx{}; + + ctx.size = parser.get("size"); + ctx.dimensions = parser.get("dimensions"); + ctx.trials = parser.get("trials"); + ctx.blocks = parser.get("blocks"); + ctx.threads = parser.get("threads"); + ctx.lambdas = parser.get>("lambda"); + + benchmark::AddCustomContext("size", std::to_string(ctx.size)); + benchmark::AddCustomContext("dimensions", std::to_string(ctx.dimensions)); + benchmark::AddCustomContext("trials", std::to_string(ctx.trials)); + benchmark::AddCustomContext("blocks", std::to_string(ctx.blocks)); + 
benchmark::AddCustomContext("threads", std::to_string(ctx.threads)); + + std::vector benchmarks = {}; + + // MT19937 has no kernel implementation + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_LFSR113); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_MRG31K3P); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_MRG32K3A); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_MTGP32); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_PHILOX4_32_10); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_QUASI_SOBOL32); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_QUASI_SOBOL64); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY2_32_20); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY4_32_20); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY4_64_20); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_XORWOW); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + if (outFormat == "csv") { + std::string spec = (filter == "" || filter == "all") ? "." 
: filter; + std::ofstream output_file; + + benchmark::ConsoleReporter console_reporter; + benchmark::customCSVReporter csv_reporter; + + auto &Err = console_reporter.GetErrorStream(); + + csv_reporter.SetOutputStream(&output_file); + csv_reporter.SetErrorStream(&Err); + + benchmark::BenchmarkReporter *console_ptr = &console_reporter; + benchmark::BenchmarkReporter *csv_ptr = &csv_reporter; + + benchmark::RunSpecifiedBenchmarks(console_ptr, csv_ptr, spec); + + } else { // Run benchmarks benchmark::RunSpecifiedBenchmarks(); - HIP_CHECK(hipStreamDestroy(stream)); + } + HIP_CHECK(hipStreamDestroy(stream)); - return 0; + return 0; } diff --git a/benchmark/benchmark_rocrand_host_api.cpp b/benchmark/benchmark_rocrand_host_api.cpp index c29b21b91..8cac13d30 100644 --- a/benchmark/benchmark_rocrand_host_api.cpp +++ b/benchmark/benchmark_rocrand_host_api.cpp @@ -23,10 +23,11 @@ #include +#include "custom_csv_formater.hpp" +#include #include -#include - #include +#include #include #include @@ -34,175 +35,175 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; #endif +typedef std::unique_ptr PtrType; typedef rocrand_rng_type rng_type_t; -template -using generate_func_type = std::function; - -template -void run_benchmark(benchmark::State& state, - generate_func_type generate_func, - const size_t size, - const bool byte_size, - const size_t trials, - const size_t dimensions, - const size_t offset, - const rng_type_t rng_type, - const rocrand_ordering ordering, - const bool benchmark_host, - hipStream_t stream) -{ - const size_t binary_div = byte_size ? 
sizeof(T) : 1; - const size_t rounded_size = (size / binary_div / dimensions) * dimensions; - - T* data; - rocrand_generator generator; - - if(benchmark_host) - { - data = new T[rounded_size]; - ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); - } - else - { - HIP_CHECK(hipMalloc(&data, rounded_size * sizeof(T))); - ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); - } - - ROCRAND_CHECK(rocrand_set_ordering(generator, ordering)); - - rocrand_status status = rocrand_set_quasi_random_generator_dimensions(generator, dimensions); - if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random - { - ROCRAND_CHECK(status); - } - - ROCRAND_CHECK(rocrand_set_stream(generator, stream)); - - status = rocrand_set_offset(generator, offset); - if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random - { - ROCRAND_CHECK(status); - } - - // Warm-up - for(size_t i = 0; i < 15; i++) - { - ROCRAND_CHECK(generate_func(generator, data, rounded_size)); - } - HIP_CHECK(hipDeviceSynchronize()); - - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - for(auto _ : state) - { - HIP_CHECK(hipEventRecord(start, stream)); - for(size_t i = 0; i < trials; i++) - { - ROCRAND_CHECK(generate_func(generator, data, rounded_size)); - } - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed = 0.0f; - HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); - - state.SetIterationTime(elapsed / 1000.f); - } - state.SetBytesProcessed(trials * state.iterations() * rounded_size * sizeof(T)); - state.SetItemsProcessed(trials * state.iterations() * rounded_size); - - HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipEventDestroy(start)); - ROCRAND_CHECK(rocrand_destroy_generator(generator)); - if(benchmark_host) - { - delete[] data; - } - else - { - HIP_CHECK(hipFree(data)); +template +using generate_func_type = + std::function; + +template +void 
run_benchmark(benchmark::State &state, generate_func_type generate_func, + const size_t size, const bool byte_size, const size_t trials, + const size_t dimensions, const size_t offset, + const rng_type_t rng_type, const rocrand_ordering ordering, + const bool benchmark_host, hipStream_t stream) { + const size_t binary_div = byte_size ? sizeof(T) : 1; + const size_t rounded_size = (size / binary_div / dimensions) * dimensions; + + T *data; + rocrand_generator generator; + + if (benchmark_host) { + data = new T[rounded_size]; + ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); + } else { + HIP_CHECK(hipMalloc(&data, rounded_size * sizeof(T))); + ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); + } + + ROCRAND_CHECK(rocrand_set_ordering(generator, ordering)); + + rocrand_status status = + rocrand_set_quasi_random_generator_dimensions(generator, dimensions); + if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random + { + ROCRAND_CHECK(status); + } + + ROCRAND_CHECK(rocrand_set_stream(generator, stream)); + + status = rocrand_set_offset(generator, offset); + if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random + { + ROCRAND_CHECK(status); + } + + // Warm-up + for (size_t i = 0; i < 15; i++) { + ROCRAND_CHECK(generate_func(generator, data, rounded_size)); + } + HIP_CHECK(hipDeviceSynchronize()); + + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + for (auto _ : state) { + HIP_CHECK(hipEventRecord(start, stream)); + for (size_t i = 0; i < trials; i++) { + ROCRAND_CHECK(generate_func(generator, data, rounded_size)); } + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed = 0.0f; + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); + + state.SetIterationTime(elapsed / 1000.f); + } + state.SetBytesProcessed(trials * state.iterations() * rounded_size * + sizeof(T)); + state.SetItemsProcessed(trials * 
state.iterations() * rounded_size); + + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipEventDestroy(start)); + ROCRAND_CHECK(rocrand_destroy_generator(generator)); + + if (benchmark_host) { + delete[] data; + } else { + HIP_CHECK(hipFree(data)); + } } -int main(int argc, char* argv[]) -{ - - // Parse argv - benchmark::Initialize(&argc, argv); - - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("byte-size", - "byte-size", - false, - "--size is interpreted as the number of generated bytes"); - parser.set_optional("dimensions", - "dimensions", - 1, - "number of dimensions of quasi-random values"); - parser.set_optional("offset", "offset", 0, "offset of generated pseudo-random values"); - parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional>( - "lambda", - "lambda", - {10.0}, - "space-separated list of lambdas of Poisson distribution"); - parser.set_optional("host", - "host", - false, - "run benchmarks on the host instead of on the device"); - parser.run_and_exit_if_error(); - - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - // Benchmark info - add_common_benchmark_rocrand_info(); - - const size_t size = parser.get("size"); - const bool byte_size = parser.get("byte-size"); - const size_t trials = parser.get("trials"); - const size_t dimensions = parser.get("dimensions"); - const size_t offset = parser.get("offset"); - const std::vector poisson_lambdas = parser.get>("lambda"); - const bool benchmark_host = parser.get("host"); - - benchmark::AddCustomContext("size", std::to_string(size)); - benchmark::AddCustomContext("byte-size", std::to_string(byte_size)); - benchmark::AddCustomContext("trials", std::to_string(trials)); - benchmark::AddCustomContext("dimensions", std::to_string(dimensions)); - benchmark::AddCustomContext("offset", std::to_string(offset)); - benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host)); - - 
std::vector benchmarked_engine_types{ROCRAND_RNG_PSEUDO_LFSR113, - ROCRAND_RNG_PSEUDO_MRG31K3P, - ROCRAND_RNG_PSEUDO_MRG32K3A, - ROCRAND_RNG_PSEUDO_MTGP32, - ROCRAND_RNG_PSEUDO_MT19937, - ROCRAND_RNG_PSEUDO_PHILOX4_32_10, - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, - ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, - ROCRAND_RNG_PSEUDO_XORWOW, - ROCRAND_RNG_QUASI_SOBOL32, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, - ROCRAND_RNG_QUASI_SOBOL64, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; - - const std::map ordering_name_map{ - {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, - { ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, - { ROCRAND_ORDERING_PSEUDO_BEST, "best"}, - {ROCRAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"}, - { ROCRAND_ORDERING_PSEUDO_SEEDED, "seeded"}, - { ROCRAND_ORDERING_QUASI_DEFAULT, "default"}, - }; - - const std::map> benchmarked_orderings{ - // clang-format off +int main(int argc, char *argv[]) { + + // get the out format and out file name thats being passed into + // benchmark::Initialize() + std::string outFormat = ""; + std::string outFile = ""; + std::string filter = ""; + for (int i = 1; i < argc; i++) { + std::string input(argv[i]); + + int equalPos = input.find("="); + std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); + std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); + + if (arg == "benchmark_out_format") + outFormat = argVal; + else if (arg == "benchmark_out") + outFile = argVal; + else if (arg == "benchmark_filter") + filter = argVal; + } + + // Parse argv + benchmark::Initialize(&argc, argv); + + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_RAND_N, + "number of values"); + parser.set_optional( + "byte-size", "byte-size", false, + "--size is interpreted as the number of generated bytes"); + parser.set_optional("dimensions", "dimensions", 1, + "number of dimensions of quasi-random values"); + 
parser.set_optional("offset", "offset", 0, + "offset of generated pseudo-random values"); + parser.set_optional("trials", "trials", 20, "number of trials"); + parser.set_optional>( + "lambda", "lambda", {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.set_optional( + "host", "host", false, + "run benchmarks on the host instead of on the device"); + parser.run_and_exit_if_error(); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + // Benchmark info + add_common_benchmark_rocrand_info(); + + const size_t size = parser.get("size"); + const bool byte_size = parser.get("byte-size"); + const size_t trials = parser.get("trials"); + const size_t dimensions = parser.get("dimensions"); + const size_t offset = parser.get("offset"); + const std::vector poisson_lambdas = + parser.get>("lambda"); + const bool benchmark_host = parser.get("host"); + + benchmark::AddCustomContext("size", std::to_string(size)); + benchmark::AddCustomContext("byte-size", std::to_string(byte_size)); + benchmark::AddCustomContext("trials", std::to_string(trials)); + benchmark::AddCustomContext("dimensions", std::to_string(dimensions)); + benchmark::AddCustomContext("offset", std::to_string(offset)); + benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host)); + + std::vector benchmarked_engine_types{ + ROCRAND_RNG_PSEUDO_LFSR113, ROCRAND_RNG_PSEUDO_MRG31K3P, + ROCRAND_RNG_PSEUDO_MRG32K3A, ROCRAND_RNG_PSEUDO_MTGP32, + ROCRAND_RNG_PSEUDO_MT19937, ROCRAND_RNG_PSEUDO_PHILOX4_32_10, + ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, + ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, + ROCRAND_RNG_PSEUDO_XORWOW, ROCRAND_RNG_QUASI_SOBOL32, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, ROCRAND_RNG_QUASI_SOBOL64, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; + + const std::map ordering_name_map{ + {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, + {ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, + 
{ROCRAND_ORDERING_PSEUDO_BEST, "best"}, + {ROCRAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"}, + {ROCRAND_ORDERING_PSEUDO_SEEDED, "seeded"}, + {ROCRAND_ORDERING_QUASI_DEFAULT, "default"}, + }; + + const std::map> + benchmarked_orderings{ + // clang-format off { ROCRAND_RNG_PSEUDO_MTGP32, {ROCRAND_ORDERING_PSEUDO_DEFAULT, ROCRAND_ORDERING_PSEUDO_DYNAMIC}}, { ROCRAND_RNG_PSEUDO_MT19937, {ROCRAND_ORDERING_PSEUDO_DEFAULT}}, @@ -228,242 +229,174 @@ int main(int argc, char* argv[]) {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, {ROCRAND_ORDERING_QUASI_DEFAULT}}, { ROCRAND_RNG_QUASI_SOBOL64, {ROCRAND_ORDERING_QUASI_DEFAULT}}, {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64, {ROCRAND_ORDERING_QUASI_DEFAULT}}, - // clang-format on - }; - - const std::string benchmark_name_prefix = "device_generate"; - // Add benchmarks - std::vector benchmarks = {}; - for(const rocrand_rng_type engine_type : benchmarked_engine_types) - { - const std::string name = engine_name(engine_type); - for(const rocrand_ordering ordering : benchmarked_orderings.at(engine_type)) - { - const std::string name_engine_prefix - = benchmark_name_prefix + "<" + name + "," + ordering_name_map.at(ordering) + ","; - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-uint>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned int* data, size_t size_gen) - { return rocrand_generate(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-uchar>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned char* data, size_t size_gen) - { return rocrand_generate_char(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-ushort>").c_str(), - 
&run_benchmark, - [](rocrand_generator gen, unsigned short* data, size_t size_gen) - { return rocrand_generate_short(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half* data, size_t size_gen) - { return rocrand_generate_uniform_half(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float* data, size_t size_gen) - { return rocrand_generate_uniform(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double* data, size_t size_gen) - { return rocrand_generate_uniform_double(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half* data, size_t size_gen) - { - return rocrand_generate_normal_half(gen, - data, - size_gen, - __float2half(0.0f), - __float2half(1.0f)); - }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float* data, size_t size_gen) - { return rocrand_generate_normal(gen, 
data, size_gen, 0.0f, 1.0f); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double* data, size_t size_gen) - { return rocrand_generate_normal_double(gen, data, size_gen, 0.0, 1.0); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half* data, size_t size_gen) - { - return rocrand_generate_log_normal_half(gen, - data, - size_gen, - __float2half(0.0f), - __float2half(1.0f)); - }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float* data, size_t size_gen) - { return rocrand_generate_log_normal(gen, data, size_gen, 0.0f, 1.0f); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double* data, size_t size_gen) - { return rocrand_generate_log_normal_double(gen, data, size_gen, 0.0, 1.0); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - for(auto lambda : poisson_lambdas) - { - const std::string poisson_dis_name - = std::string("poisson(lambda=") + std::to_string(lambda) + ")>"; - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + poisson_dis_name).c_str(), - &run_benchmark, - 
[lambda](rocrand_generator gen, unsigned int* data, size_t size_gen) - { return rocrand_generate_poisson(gen, data, size_gen, lambda); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - } - } - } - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); + // clang-format on + }; + + const std::string benchmark_name_prefix = "device_generate"; + // Add benchmarks + std::vector benchmarks = {}; + for (const rocrand_rng_type engine_type : benchmarked_engine_types) { + const std::string name = engine_name(engine_type); + for (const rocrand_ordering ordering : + benchmarked_orderings.at(engine_type)) { + const std::string name_engine_prefix = + benchmark_name_prefix + "<" + name + "," + + ordering_name_map.at(ordering) + ","; + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-uint>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned int *data, size_t size_gen) { + return rocrand_generate(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-uchar>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned char *data, size_t size_gen) { + return rocrand_generate_char(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-ushort>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned short *data, size_t size_gen) { + return rocrand_generate_short(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + 
"uniform-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half *data, size_t size_gen) { + return rocrand_generate_uniform_half(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float *data, size_t size_gen) { + return rocrand_generate_uniform(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double *data, size_t size_gen) { + return rocrand_generate_uniform_double(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-half>").c_str(), &run_benchmark<__half>, + [](rocrand_generator gen, __half *data, size_t size_gen) { + return rocrand_generate_normal_half( + gen, data, size_gen, __float2half(0.0f), __float2half(1.0f)); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-float>").c_str(), &run_benchmark, + [](rocrand_generator gen, float *data, size_t size_gen) { + return rocrand_generate_normal(gen, data, size_gen, 0.0f, 1.0f); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double *data, size_t size_gen) { + return rocrand_generate_normal_double(gen, data, size_gen, 0.0, + 1.0); + }, + 
size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half *data, size_t size_gen) { + return rocrand_generate_log_normal_half( + gen, data, size_gen, __float2half(0.0f), __float2half(1.0f)); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float *data, size_t size_gen) { + return rocrand_generate_log_normal(gen, data, size_gen, 0.0f, 1.0f); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double *data, size_t size_gen) { + return rocrand_generate_log_normal_double(gen, data, size_gen, 0.0, + 1.0); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + for (auto lambda : poisson_lambdas) { + const std::string poisson_dis_name = + std::string("poisson(lambda=") + std::to_string(lambda) + ")>"; + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + poisson_dis_name).c_str(), + &run_benchmark, + [lambda](rocrand_generator gen, unsigned int *data, + size_t size_gen) { + return rocrand_generate_poisson(gen, data, size_gen, lambda); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + } } + } + + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + if (outFormat == "csv") { + std::string spec = (filter == "" || filter == "all") ? "." 
: filter; + std::ofstream output_file; + + benchmark::ConsoleReporter console_reporter; + benchmark::customCSVReporter csv_reporter; + + auto &Err = console_reporter.GetErrorStream(); + + csv_reporter.SetOutputStream(&output_file); + csv_reporter.SetErrorStream(&Err); + + benchmark::BenchmarkReporter *console_ptr = &console_reporter; + benchmark::BenchmarkReporter *csv_ptr = &csv_reporter; + + benchmark::RunSpecifiedBenchmarks(console_ptr, csv_ptr, spec); + + } else { // Run benchmarks benchmark::RunSpecifiedBenchmarks(); - HIP_CHECK(hipStreamDestroy(stream)); + } + + HIP_CHECK(hipStreamDestroy(stream)); - return 0; + return 0; } diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp new file mode 100644 index 000000000..c5ecb04e9 --- /dev/null +++ b/benchmark/custom_csv_formater.hpp @@ -0,0 +1,182 @@ +#pragma once + +#include +#include +namespace benchmark { + +class customCSVReporter : public BenchmarkReporter { +public: + customCSVReporter() : printed_header_(false) {} + bool ReportContext(const Context &context) override; + void ReportRuns(const std::vector &reports) override; + +private: + std::string CsvEscape(const std::string &s) { + std::string tmp; + tmp.reserve(s.size() + 2); + for (char c : s) { + switch (c) { + case '"': + tmp += "\"\""; + break; + default: + tmp += c; + break; + } + } + return '"' + tmp + '"'; + } + + // Function to return an string for the calculated complexity + std::string GetBigOString(BigO complexity) { + switch (complexity) { + case oN: + return "N"; + case oNSquared: + return "N^2"; + case oNCubed: + return "N^3"; + case oLogN: + return "lgN"; + case oNLogN: + return "NlgN"; + case o1: + return "(1)"; + default: + return "f(N)"; + } + } + + void PrintRunData(const Run &report); + bool printed_header_; + std::set user_counter_names_; + + std::ostream *nullLog = nullptr; + + std::vector elements = { + "engine", "distribution", "name", "iterations", + "real_time", "cpu_time", "time_unit", 
"bytes_per_second", + "items_per_second", "label", "error_occurred", "error_message"}; +}; + +bool customCSVReporter::ReportContext(const Context &context) { + PrintBasicContext(&GetErrorStream(), context); + return true; +} + +void customCSVReporter::ReportRuns(const std::vector &reports) { + std::ostream &Out = GetOutputStream(); + + if (!printed_header_) { + // save the names of all the user counters + for (const auto &run : reports) { + for (const auto &cnt : run.counters) { + if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") + continue; + user_counter_names_.insert(cnt.first); + } + } + + // print the header + for (auto B = elements.begin(); B != elements.end();) { + Out << *B++; + if (B != elements.end()) + Out << ","; + } + for (auto B = user_counter_names_.begin(); + B != user_counter_names_.end();) { + Out << ",\"" << *B++ << "\""; + } + Out << "\n"; + + printed_header_ = true; + } else { + // check that all the current counters are saved in the name set + for (const auto &run : reports) { + for (const auto &cnt : run.counters) { + if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") + continue; + + // benchmark::internal::GetNullLogInstance() + *nullLog << "All counters must be present in each run. 
" + << "Counter named \"" << cnt.first + << "\" was not in a run after being added to the header"; + } + } + } + + // print results for each run + for (const auto &run : reports) { + PrintRunData(run); + } +} + +void customCSVReporter::PrintRunData(const Run &run) { + std::ostream &Out = GetOutputStream(); + std::ostream &Err = GetErrorStream(); + + //get the name of the engine and distribution: + + std::string temp = run.benchmark_name(); + temp.erase(0, temp.find("<") + 1); + + std::string engineName = std::string(temp.begin(), temp.begin() + temp.find(",")); + + temp.erase(0, engineName.size() + 1); + temp.erase(0, temp.find(",") + 1); + std::string disName = std::string(temp.begin(), temp.begin() + temp.find(">")); + + + Out << engineName << ","; + Out << disName << ","; + Out << CsvEscape(run.benchmark_name()) << ","; + if (run.error_occurred) { + Err << std::string(elements.size() - 3, ','); + Err << "true,"; + Err << CsvEscape(run.error_message) << "\n"; + return; + } + + // Do not print iteration on bigO and RMS report + if (!run.report_big_o && !run.report_rms) { + Out << run.iterations; + } + Out << ","; + + Out << run.GetAdjustedRealTime() << ","; + Out << run.GetAdjustedCPUTime() << ","; + + // Do not print timeLabel on bigO and RMS report + if (run.report_big_o) { + Out << GetBigOString(run.complexity); + } else if (!run.report_rms) { + Out << GetTimeUnitString(run.time_unit); + } + Out << ","; + + if (run.counters.find("bytes_per_second") != run.counters.end()) { + Out << run.counters.at("bytes_per_second"); + } + Out << ","; + if (run.counters.find("items_per_second") != run.counters.end()) { + Out << run.counters.at("items_per_second"); + } + Out << ","; + if (!run.report_label.empty()) { + Out << CsvEscape(run.report_label); + } + Out << ",,"; // for error_occurred and error_message + + // Print user counters + for (const auto &ucn : user_counter_names_) { + auto it = run.counters.find(ucn); + if (it == run.counters.end()) { + Out << ","; + } else 
{ + Out << "," << it->second; + } + } + Out << '\n'; +} + +} // namespace benchmark From ac1a1678d7447d28adc8cbfaaff65c62a8ee1590 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 19 Aug 2024 11:08:07 -0600 Subject: [PATCH 02/10] implemented suggested changes --- benchmark/benchmark_rocrand_host_api.cpp | 6 ++++-- benchmark/custom_csv_formater.hpp | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark_rocrand_host_api.cpp b/benchmark/benchmark_rocrand_host_api.cpp index 8cac13d30..fde451ae5 100644 --- a/benchmark/benchmark_rocrand_host_api.cpp +++ b/benchmark/benchmark_rocrand_host_api.cpp @@ -35,7 +35,6 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; #endif -typedef std::unique_ptr PtrType; typedef rocrand_rng_type rng_type_t; template @@ -125,8 +124,11 @@ int main(int argc, char *argv[]) { std::string filter = ""; for (int i = 1; i < argc; i++) { std::string input(argv[i]); - int equalPos = input.find("="); + + if(equalPos < 0) + continue; + std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp index c5ecb04e9..25cac2477 100644 --- a/benchmark/custom_csv_formater.hpp +++ b/benchmark/custom_csv_formater.hpp @@ -28,7 +28,7 @@ class customCSVReporter : public BenchmarkReporter { } // Function to return an string for the calculated complexity - std::string GetBigOString(BigO complexity) { + std::string GetBigOString(const BigO complexity) { switch (complexity) { case oN: return "N"; @@ -53,7 +53,7 @@ class customCSVReporter : public BenchmarkReporter { std::ostream *nullLog = nullptr; - std::vector elements = { + std::array elements = { "engine", "distribution", "name", "iterations", "real_time", "cpu_time", "time_unit", "bytes_per_second", "items_per_second", "label", "error_occurred", "error_message"}; @@ -126,7 +126,6 @@ void 
customCSVReporter::PrintRunData(const Run &run) { temp.erase(0, temp.find(",") + 1); std::string disName = std::string(temp.begin(), temp.begin() + temp.find(">")); - Out << engineName << ","; Out << disName << ","; Out << CsvEscape(run.benchmark_name()) << ","; From b56cb562ce46f3c4244fc80754f2ce6b544feff2 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 19 Aug 2024 14:28:18 -0600 Subject: [PATCH 03/10] added the new columns for --benchmark_format=csv as well --- benchmark/benchmark_rocrand_device_api.cpp | 53 +--- benchmark/benchmark_rocrand_host_api.cpp | 53 +--- benchmark/benchmark_utils.hpp | 328 +++++++++++++-------- 3 files changed, 231 insertions(+), 203 deletions(-) diff --git a/benchmark/benchmark_rocrand_device_api.cpp b/benchmark/benchmark_rocrand_device_api.cpp index 59f087a95..fbe44bcb2 100644 --- a/benchmark/benchmark_rocrand_device_api.cpp +++ b/benchmark/benchmark_rocrand_device_api.cpp @@ -28,14 +28,14 @@ #include #include +#include "custom_csv_formater.hpp" #include +#include #include #include #include #include #include -#include -#include "custom_csv_formater.hpp" #ifndef DEFAULT_RAND_N #define DEFAULT_RAND_N (1024 * 1024 * 128) @@ -723,26 +723,13 @@ void add_benchmarks(const benchmark_context &ctx, const hipStream_t stream, } int main(int argc, char *argv[]) { - - // get the out format and out file name thats being passed into + // get paramaters before they are passed into // benchmark::Initialize() std::string outFormat = ""; - std::string outFile = ""; std::string filter = ""; - for (int i = 1; i < argc; i++) { - std::string input(argv[i]); - - int equalPos = input.find("="); - std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); - std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); + std::string consoleFormat = ""; - if (arg == "benchmark_out_format") - outFormat = argVal; - else if (arg == "benchmark_out") - outFile = argVal; - else if (arg == "benchmark_filter") - filter = argVal; - } + 
getFormats(argc, argv, outFormat, filter, consoleFormat); benchmark::Initialize(&argc, argv); @@ -818,27 +805,19 @@ int main(int argc, char *argv[]) { b->Unit(benchmark::kMillisecond); } - if (outFormat == "csv") { - std::string spec = (filter == "" || filter == "all") ? "." : filter; - std::ofstream output_file; + benchmark::BenchmarkReporter *console_reporter = + getConsoleReporter(consoleFormat); + benchmark::BenchmarkReporter *out_file_reporter = + getOutFileReporter(outFormat); - benchmark::ConsoleReporter console_reporter; - benchmark::customCSVReporter csv_reporter; + std::string spec = (filter == "" || filter == "all") ? "." : filter; - auto &Err = console_reporter.GetErrorStream(); - - csv_reporter.SetOutputStream(&output_file); - csv_reporter.SetErrorStream(&Err); - - benchmark::BenchmarkReporter *console_ptr = &console_reporter; - benchmark::BenchmarkReporter *csv_ptr = &csv_reporter; - - benchmark::RunSpecifiedBenchmarks(console_ptr, csv_ptr, spec); - - } else { - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - } + // Run benchmarks + if (outFormat == "") // default case + benchmark::RunSpecifiedBenchmarks(console_reporter, spec); + else + benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, + spec); HIP_CHECK(hipStreamDestroy(stream)); return 0; diff --git a/benchmark/benchmark_rocrand_host_api.cpp b/benchmark/benchmark_rocrand_host_api.cpp index fde451ae5..58cc41845 100644 --- a/benchmark/benchmark_rocrand_host_api.cpp +++ b/benchmark/benchmark_rocrand_host_api.cpp @@ -117,28 +117,13 @@ void run_benchmark(benchmark::State &state, generate_func_type generate_func, int main(int argc, char *argv[]) { - // get the out format and out file name thats being passed into + // get paramaters before they are passed into // benchmark::Initialize() std::string outFormat = ""; - std::string outFile = ""; std::string filter = ""; - for (int i = 1; i < argc; i++) { - std::string input(argv[i]); - int equalPos = input.find("="); - - 
if(equalPos < 0) - continue; - - std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); - std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); - - if (arg == "benchmark_out_format") - outFormat = argVal; - else if (arg == "benchmark_out") - outFile = argVal; - else if (arg == "benchmark_filter") - filter = argVal; - } + std::string consoleFormat = ""; + + getFormats(argc, argv, outFormat, filter, consoleFormat); // Parse argv benchmark::Initialize(&argc, argv); @@ -376,27 +361,19 @@ int main(int argc, char *argv[]) { b->Unit(benchmark::kMillisecond); } - if (outFormat == "csv") { - std::string spec = (filter == "" || filter == "all") ? "." : filter; - std::ofstream output_file; - - benchmark::ConsoleReporter console_reporter; - benchmark::customCSVReporter csv_reporter; - - auto &Err = console_reporter.GetErrorStream(); + benchmark::BenchmarkReporter *console_reporter = + getConsoleReporter(consoleFormat); + benchmark::BenchmarkReporter *out_file_reporter = + getOutFileReporter(outFormat); - csv_reporter.SetOutputStream(&output_file); - csv_reporter.SetErrorStream(&Err); + std::string spec = (filter == "" || filter == "all") ? "." 
: filter; - benchmark::BenchmarkReporter *console_ptr = &console_reporter; - benchmark::BenchmarkReporter *csv_ptr = &csv_reporter; - - benchmark::RunSpecifiedBenchmarks(console_ptr, csv_ptr, spec); - - } else { - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - } + // Run benchmarks + if (outFormat == "") // default case + benchmark::RunSpecifiedBenchmarks(console_reporter, spec); + else + benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, + spec); HIP_CHECK(hipStreamDestroy(stream)); diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index aa4007610..50d36ad10 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -27,136 +27,208 @@ #include #include -#define HIP_CHECK(condition) \ - do \ - { \ - hipError_t error_ = condition; \ - if(error_ != hipSuccess) \ - { \ - std::cout << "HIP error: " << error_ << " line: " << __LINE__ << std::endl; \ - exit(error_); \ - } \ - } \ - while(0) - -inline void add_common_benchmark_info() -{ - auto str = [](const std::string& name, const std::string& val) - { benchmark::AddCustomContext(name, val); }; - - auto num = [](const std::string& name, const int& value) - { benchmark::AddCustomContext(name, std::to_string(value)); }; - - auto dim2 = [num](const std::string& name, const int* values) - { - num(name + "_x", values[0]); - num(name + "_y", values[1]); - }; - - auto dim3 = [num, dim2](const std::string& name, const int* values) - { - dim2(name, values); - num(name + "_z", values[2]); - }; - - auto num_size_t = [](const std::string& name, const size_t& value) - { benchmark::AddCustomContext(name, std::to_string(value)); }; - - int runtime_version; - HIP_CHECK(hipRuntimeGetVersion(&runtime_version)); - num("hip_runtime_version", runtime_version); - - // On the NVIDIA platform not all members of this struct will be written to - // Zero-initialize to avoid referencing dangling memory - hipDeviceProp_t devProp{}; - int device_id = 0; - 
HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - str("hdp_name", devProp.name); - num_size_t("hdp_total_global_mem", devProp.totalGlobalMem); - num_size_t("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); - num("hdp_regs_per_block", devProp.regsPerBlock); - num("hdp_warp_size", devProp.warpSize); - num("hdp_max_threads_per_block", devProp.maxThreadsPerBlock); - dim3("hdp_max_threads_dim", devProp.maxThreadsDim); - dim3("hdp_max_grid_size", devProp.maxGridSize); - num("hdp_clock_rate", devProp.clockRate); - num("hdp_memory_clock_rate", devProp.memoryClockRate); - num("hdp_memory_bus_width", devProp.memoryBusWidth); - num_size_t("hdp_total_const_mem", devProp.totalConstMem); - num("hdp_major", devProp.major); - num("hdp_minor", devProp.minor); - num("hdp_multi_processor_count", devProp.multiProcessorCount); - num("hdp_l2_cache_size", devProp.l2CacheSize); - num_size_t("hdp_max_threads_per_multiprocessor", devProp.maxThreadsPerMultiProcessor); - num("hdp_compute_mode", devProp.computeMode); - num("hdp_clock_instruction_rate", devProp.clockInstructionRate); - num("hdp_concurrent_kernels", devProp.concurrentKernels); - num("hdp_pci_domain_id", devProp.pciDomainID); - num("hdp_pci_bus_id", devProp.pciBusID); - num("hdp_pci_device_id", devProp.pciDeviceID); - num("hdp_max_shared_memory_per_multi_processor", devProp.maxSharedMemoryPerMultiProcessor); - num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); - num("hdp_can_map_host_memory", devProp.canMapHostMemory); - str("hdp_gcn_arch_name", devProp.gcnArchName); - num("hdp_integrated", devProp.integrated); - num("hdp_cooperative_launch", devProp.cooperativeLaunch); - num("hdp_cooperative_multi_device_launch", devProp.cooperativeMultiDeviceLaunch); - num_size_t("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); - num("hdp_max_texture_1d", devProp.maxTexture1D); - dim2("hdp_max_texture_2d", devProp.maxTexture2D); - dim3("hdp_max_texture_3d", 
devProp.maxTexture3D); - num_size_t("hdp_mem_pitch", devProp.memPitch); - num("hdp_texture_alignment", devProp.textureAlignment); - num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); - num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); - num("hdp_ecc_enabled", devProp.ECCEnabled); - num("hdp_tcc_driver", devProp.tccDriver); - num("hdp_cooperative_multi_device_unmatched_func", devProp.cooperativeMultiDeviceUnmatchedFunc); - num("hdp_cooperative_multi_device_unmatched_grid_dim", - devProp.cooperativeMultiDeviceUnmatchedGridDim); - num("hdp_cooperative_multi_device_unmatched_block_dim", - devProp.cooperativeMultiDeviceUnmatchedBlockDim); - num("hdp_cooperative_multi_device_unmatched_shared_mem", - devProp.cooperativeMultiDeviceUnmatchedSharedMem); - num("hdp_is_large_bar", devProp.isLargeBar); - num("hdp_asic_revision", devProp.asicRevision); - num("hdp_managed_memory", devProp.managedMemory); - num("hdp_direct_managed_mem_access_from_host", devProp.directManagedMemAccessFromHost); - num("hdp_concurrent_managed_access", devProp.concurrentManagedAccess); - num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); - num("hdp_pageable_memory_access_uses_host_page_tables", - devProp.pageableMemoryAccessUsesHostPageTables); - - const auto arch = devProp.arch; - num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); - num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); - num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); - num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); - num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); - num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); - num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); - num("hdp_arch_has_doubles", arch.hasDoubles); - num("hdp_arch_has_warp_vote", arch.hasWarpVote); - num("hdp_arch_has_warp_ballot", arch.hasWarpBallot); - 
num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); - num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); - num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); - num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); - num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); - num("hdp_arch_has_3d_grid", arch.has3dGrid); - num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism); +#include "custom_csv_formater.hpp" +#include + +#define HIP_CHECK(condition) \ + do { \ + hipError_t error_ = condition; \ + if (error_ != hipSuccess) { \ + std::cout << "HIP error: " << error_ << " line: " << __LINE__ \ + << std::endl; \ + exit(error_); \ + } \ + } while (0) + +inline void add_common_benchmark_info() { + auto str = [](const std::string &name, const std::string &val) { + benchmark::AddCustomContext(name, val); + }; + + auto num = [](const std::string &name, const int &value) { + benchmark::AddCustomContext(name, std::to_string(value)); + }; + + auto dim2 = [num](const std::string &name, const int *values) { + num(name + "_x", values[0]); + num(name + "_y", values[1]); + }; + + auto dim3 = [num, dim2](const std::string &name, const int *values) { + dim2(name, values); + num(name + "_z", values[2]); + }; + + auto num_size_t = [](const std::string &name, const size_t &value) { + benchmark::AddCustomContext(name, std::to_string(value)); + }; + + int runtime_version; + HIP_CHECK(hipRuntimeGetVersion(&runtime_version)); + num("hip_runtime_version", runtime_version); + + // On the NVIDIA platform not all members of this struct will be written to + // Zero-initialize to avoid referencing dangling memory + hipDeviceProp_t devProp{}; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + str("hdp_name", devProp.name); + num_size_t("hdp_total_global_mem", devProp.totalGlobalMem); + num_size_t("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); + num("hdp_regs_per_block", 
devProp.regsPerBlock); + num("hdp_warp_size", devProp.warpSize); + num("hdp_max_threads_per_block", devProp.maxThreadsPerBlock); + dim3("hdp_max_threads_dim", devProp.maxThreadsDim); + dim3("hdp_max_grid_size", devProp.maxGridSize); + num("hdp_clock_rate", devProp.clockRate); + num("hdp_memory_clock_rate", devProp.memoryClockRate); + num("hdp_memory_bus_width", devProp.memoryBusWidth); + num_size_t("hdp_total_const_mem", devProp.totalConstMem); + num("hdp_major", devProp.major); + num("hdp_minor", devProp.minor); + num("hdp_multi_processor_count", devProp.multiProcessorCount); + num("hdp_l2_cache_size", devProp.l2CacheSize); + num_size_t("hdp_max_threads_per_multiprocessor", + devProp.maxThreadsPerMultiProcessor); + num("hdp_compute_mode", devProp.computeMode); + num("hdp_clock_instruction_rate", devProp.clockInstructionRate); + num("hdp_concurrent_kernels", devProp.concurrentKernels); + num("hdp_pci_domain_id", devProp.pciDomainID); + num("hdp_pci_bus_id", devProp.pciBusID); + num("hdp_pci_device_id", devProp.pciDeviceID); + num("hdp_max_shared_memory_per_multi_processor", + devProp.maxSharedMemoryPerMultiProcessor); + num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); + num("hdp_can_map_host_memory", devProp.canMapHostMemory); + str("hdp_gcn_arch_name", devProp.gcnArchName); + num("hdp_integrated", devProp.integrated); + num("hdp_cooperative_launch", devProp.cooperativeLaunch); + num("hdp_cooperative_multi_device_launch", + devProp.cooperativeMultiDeviceLaunch); + num_size_t("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); + num("hdp_max_texture_1d", devProp.maxTexture1D); + dim2("hdp_max_texture_2d", devProp.maxTexture2D); + dim3("hdp_max_texture_3d", devProp.maxTexture3D); + num_size_t("hdp_mem_pitch", devProp.memPitch); + num("hdp_texture_alignment", devProp.textureAlignment); + num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); + num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); + num("hdp_ecc_enabled", 
devProp.ECCEnabled); + num("hdp_tcc_driver", devProp.tccDriver); + num("hdp_cooperative_multi_device_unmatched_func", + devProp.cooperativeMultiDeviceUnmatchedFunc); + num("hdp_cooperative_multi_device_unmatched_grid_dim", + devProp.cooperativeMultiDeviceUnmatchedGridDim); + num("hdp_cooperative_multi_device_unmatched_block_dim", + devProp.cooperativeMultiDeviceUnmatchedBlockDim); + num("hdp_cooperative_multi_device_unmatched_shared_mem", + devProp.cooperativeMultiDeviceUnmatchedSharedMem); + num("hdp_is_large_bar", devProp.isLargeBar); + num("hdp_asic_revision", devProp.asicRevision); + num("hdp_managed_memory", devProp.managedMemory); + num("hdp_direct_managed_mem_access_from_host", + devProp.directManagedMemAccessFromHost); + num("hdp_concurrent_managed_access", devProp.concurrentManagedAccess); + num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); + num("hdp_pageable_memory_access_uses_host_page_tables", + devProp.pageableMemoryAccessUsesHostPageTables); + + const auto arch = devProp.arch; + num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); + num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); + num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); + num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); + num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); + num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); + num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); + num("hdp_arch_has_doubles", arch.hasDoubles); + num("hdp_arch_has_warp_vote", arch.hasWarpVote); + num("hdp_arch_has_warp_ballot", arch.hasWarpBallot); + num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); + num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); + num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); + num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); + num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); + 
num("hdp_arch_has_3d_grid", arch.has3dGrid);
+  num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism);
 }
 
-inline size_t next_power2(size_t x)
-{
-    size_t power = 1;
-    while(power < x)
-    {
-        power *= 2;
-    }
-    return power;
+/// Returns the smallest power of two that is greater than or equal to x
+/// (1 when x is 0 or 1).
+/// NOTE(review): if x is larger than the biggest power of two that fits in
+/// size_t, `power` wraps around to 0 and the loop does not terminate.
+inline size_t next_power2(size_t x) {
+  size_t power = 1;
+  while (power < x) {
+    power *= 2;
+  }
+  return power;
+}
+
+/// Selects the reporter that prints benchmark results to the terminal.
+///
+/// @param format "csv" selects the custom CSV reporter, "json" selects the
+///               JSON reporter, and any other value (including an empty
+///               string) selects the default console reporter.
+/// @return Pointer to a function-static reporter whose output and error
+///         streams are both bound to std::cout. The pointer is never null
+///         and must not be deleted by the caller.
+inline benchmark::BenchmarkReporter *
+getConsoleReporter(const std::string format) {
+  benchmark::BenchmarkReporter *reporter;
+  if (format == "csv") {
+    static benchmark::customCSVReporter csv_reporter;
+    reporter = &csv_reporter;
+  } else if (format == "json") {
+    // Must be the JSON reporter: a CSV reporter here would make
+    // --benchmark_format=json silently emit CSV.
+    static benchmark::JSONReporter json_reporter;
+    reporter = &json_reporter;
+  } else {
+    static benchmark::ConsoleReporter terminal_reporter;
+    reporter = &terminal_reporter;
+  }
+
+  // Everything, including errors, goes to stdout for console reporting.
+  reporter->SetErrorStream(&std::cout);
+  reporter->SetOutputStream(&std::cout);
+
+  return reporter;
+}
+
+/// Selects the reporter that writes benchmark results to an output file.
+///
+/// @param format "csv", "json" or "console"; any other value (including an
+///               empty string) selects no reporter.
+/// @return Pointer to a function-static reporter, or nullptr when the
+///         format is not recognised. The caller must not delete it.
+///
+/// The reporter is bound to a placeholder stream that is never opened here.
+/// NOTE(review): this relies on the benchmark library re-pointing the file
+/// reporter at the file named by --benchmark_out before anything is
+/// written - confirm against the Google Benchmark version in use.
+inline benchmark::BenchmarkReporter *
+getOutFileReporter(const std::string format) {
+  // Static on purpose: the reporters below are static and keep a pointer to
+  // this stream, so it has to outlive this call.
+  static std::ofstream output_file;
+
+  benchmark::BenchmarkReporter *reporter = nullptr;
+  if (format == "csv") {
+    static benchmark::customCSVReporter csv_reporter;
+    reporter = &csv_reporter;
+  } else if (format == "json") {
+    static benchmark::JSONReporter json_reporter;
+    reporter = &json_reporter;
+  } else if (format == "console") {
+    static benchmark::ConsoleReporter console_reporter;
+    reporter = &console_reporter;
+  }
+
+  if (reporter != nullptr) {
+    reporter->SetOutputStream(&output_file);
+    reporter->SetErrorStream(&output_file);
+  }
+
+  return reporter;
+}
+
+/// Scans the command line for the benchmark options this harness needs.
+///
+/// Only arguments of the form "--name=value" are considered; all other
+/// arguments are ignored. Recognised names:
+///   benchmark_out_format -> outFormat
+///   benchmark_filter     -> filter
+///   benchmark_format     -> consoleFormat
+/// An output parameter is left untouched when its option is absent. When an
+/// option is given several times, the last occurrence wins.
+///
+/// @param argc          Number of entries in argv.
+/// @param argv          Program arguments; argv[0] is skipped.
+/// @param outFormat     Receives the value of --benchmark_out_format.
+/// @param filter        Receives the value of --benchmark_filter.
+/// @param consoleFormat Receives the value of --benchmark_format.
+inline void getFormats(const int argc, char *argv[], std::string &outFormat,
+                       std::string &filter, std::string &consoleFormat) {
+  for (int i = 1; i < argc; i++) {
+    const std::string input(argv[i]);
+
+    // Keep the position as size_type: narrowing npos to int is
+    // implementation-defined. Requiring the "--" prefix also guarantees
+    // that '=' sits at index 2 or later, so the slices below stay in range.
+    const std::string::size_type equalPos = input.find('=');
+    if (equalPos == std::string::npos || input.compare(0, 2, "--") != 0)
+      continue;
+
+    const std::string arg = input.substr(2, equalPos - 2);
+    const std::string argVal = input.substr(equalPos + 1);
+
+    if (arg == "benchmark_out_format")
+      outFormat = argVal;
+    else if (arg == "benchmark_filter")
+      filter = argVal;
+    else if (arg == "benchmark_format")
+      consoleFormat = argVal;
+  }
 }
 
 #endif // ROCRAND_BENCHMARK_UTILS_HPP_
From 1ffdc8142df5392a77aae1ffce97c49580c35dd0 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 19 Aug 2024 14:30:01 -0600 Subject: [PATCH 04/10] updated change log --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cdb1cdf4..5b538c35c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ Documentation for rocRAND is available at * Added host generator for MT19937 * Support for `rocrand_generate_poisson` in hipGraphs +* Added engine and distribution columns for csv format in benchmark_rocrand_host_api and + benchmark_rocrand_device_api. To see these new columns set --benchmark_format=csv or + --benchmark_out_format=csv --benchmark_out="outName.csv" ### Changes From c5d59b498de5b6be21c2f40ecd4c81b21429f14a Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 20 Aug 2024 09:15:42 -0600 Subject: [PATCH 05/10] updated copyright --- benchmark/benchmark_utils.hpp | 2 +- benchmark/custom_csv_formater.hpp | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 50d36ad10..1f3a6f205 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp index 25cac2477..c48a068e2 100644 --- a/benchmark/custom_csv_formater.hpp +++ b/benchmark/custom_csv_formater.hpp @@ -1,3 +1,23 @@ +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ #pragma once #include From 82e975c85aa8a961244b5d3369f304ffd86b295d Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 20 Aug 2024 10:13:02 -0600 Subject: [PATCH 06/10] reformated with .clang-format file --- benchmark/benchmark_rocrand_device_api.cpp | 1627 ++++++++++++-------- benchmark/benchmark_rocrand_generate.cpp | 492 +++--- benchmark/benchmark_rocrand_host_api.cpp | 720 +++++---- benchmark/benchmark_utils.hpp | 396 ++--- benchmark/cmdparser.hpp | 892 ++++++----- benchmark/custom_csv_formater.hpp | 338 ++-- 6 files changed, 2527 insertions(+), 1938 deletions(-) diff --git a/benchmark/benchmark_rocrand_device_api.cpp b/benchmark/benchmark_rocrand_device_api.cpp index fbe44bcb2..b38430efc 100644 --- a/benchmark/benchmark_rocrand_device_api.cpp +++ b/benchmark/benchmark_rocrand_device_api.cpp @@ -38,787 +38,1066 @@ #include #ifndef DEFAULT_RAND_N -#define DEFAULT_RAND_N (1024 * 1024 * 128) + #define DEFAULT_RAND_N (1024 * 1024 * 128) #endif -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_kernel( - EngineState *states, const unsigned long long seed, - const unsigned long long offset) { - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(seed, state_id, offset, &state); - states[state_id] = state; +template +__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void init_kernel(EngineState* states, + const unsigned long long seed, + const unsigned long long offset) +{ + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(seed, state_id, offset, &state); + states[state_id] = state; } -template +template __global__ -__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( - EngineState *states, T *data, const size_t size, Generator generator) { - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int stride = gridDim.x * blockDim.x; - - EngineState state = 
states[state_id]; - unsigned int index = state_id; - while (index < size) { - data[index] = generator(&state); - index += stride; - } - states[state_id] = state; +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void generate_kernel(EngineState* states, T* data, const size_t size, Generator generator) +{ + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = gridDim.x * blockDim.x; + + EngineState state = states[state_id]; + unsigned int index = state_id; + while(index < size) + { + data[index] = generator(&state); + index += stride; + } + states[state_id] = state; } -template struct runner { - EngineState *states; - - runner(const size_t /* dimensions */, const size_t blocks, - const size_t threads, const unsigned long long seed, - const unsigned long long offset) { - const size_t states_size = blocks * threads; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState))); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(blocks), - dim3(threads), 0, 0, states, seed, offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } +template +struct runner +{ + EngineState* states; + + runner(const size_t /* dimensions */, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { + const size_t states_size = blocks * threads; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState))); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + seed, + offset); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } - ~runner() { HIP_CHECK(hipFree(states)); } + ~runner() + { + HIP_CHECK(hipFree(states)); + } - template - void generate(const size_t blocks, const size_t threads, hipStream_t stream, - T *data, const size_t size, const Generator &generator) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), - dim3(threads), 0, stream, 
states, data, size, generator); - } + template + void generate(const size_t blocks, + const size_t threads, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), + dim3(blocks), + dim3(threads), + 0, + stream, + states, + data, + size, + generator); + } }; -template +template __global__ -__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( - rocrand_state_mtgp32 *states, T *data, const size_t size, - Generator generator) { - const unsigned int state_id = blockIdx.x; - unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int stride = gridDim.x * blockDim.x; - - __shared__ rocrand_state_mtgp32 state; - rocrand_mtgp32_block_copy(&states[state_id], &state); - - const size_t r = size % blockDim.x; - const size_t size_rounded_down = size - r; - const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x; - while (index < size_rounded_down) { - data[index] = generator(&state); - index += stride; - } - while (index < size_rounded_up) { - auto value = generator(&state); - if (index < size) - data[index] = value; - index += stride; - } - - rocrand_mtgp32_block_copy(&state, &states[state_id]); -} - -template <> struct runner { - rocrand_state_mtgp32 *states; - - runner(const size_t /* dimensions */, const size_t blocks, - const size_t /* threads */, const unsigned long long seed, - const unsigned long long /* offset */) { - const size_t states_size = std::min((size_t)200, blocks); - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_mtgp32))); - - ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, - states_size, seed)); - } - - ~runner() { HIP_CHECK(hipFree(states)); } - - template - void generate(const size_t blocks, const size_t /* threads */, - hipStream_t stream, T *data, const size_t size, - const Generator &generator) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - 
dim3(std::min((size_t)200, blocks)), dim3(256), 0, - stream, states, data, size, generator); - } -}; +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void generate_kernel(rocrand_state_mtgp32* states, T* data, const size_t size, Generator generator) +{ + const unsigned int state_id = blockIdx.x; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int stride = gridDim.x * blockDim.x; + + __shared__ rocrand_state_mtgp32 state; + rocrand_mtgp32_block_copy(&states[state_id], &state); + + const size_t r = size % blockDim.x; + const size_t size_rounded_down = size - r; + const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x; + while(index < size_rounded_down) + { + data[index] = generator(&state); + index += stride; + } + while(index < size_rounded_up) + { + auto value = generator(&state); + if(index < size) + data[index] = value; + index += stride; + } -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_kernel( - rocrand_state_lfsr113 *states, const uint4 seed) { - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - rocrand_state_lfsr113 state; - rocrand_init(seed, state_id, &state); - states[state_id] = state; + rocrand_mtgp32_block_copy(&state, &states[state_id]); } -template <> struct runner { - rocrand_state_lfsr113 *states; +template<> +struct runner +{ + rocrand_state_mtgp32* states; + + runner(const size_t /* dimensions */, + const size_t blocks, + const size_t /* threads */, + const unsigned long long seed, + const unsigned long long /* offset */) + { + const size_t states_size = std::min((size_t)200, blocks); + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_mtgp32))); + + ROCRAND_CHECK( + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, states_size, seed)); + } - runner(const size_t /* dimensions */, const size_t blocks, - const size_t threads, const unsigned long long /* seed */, - const unsigned long long /* offset */) { - const size_t 
states_size = blocks * threads; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113))); + ~runner() + { + HIP_CHECK(hipFree(states)); + } - hipLaunchKernelGGL( - HIP_KERNEL_NAME(init_kernel), dim3(blocks), dim3(threads), 0, 0, states, - uint4{ROCRAND_LFSR113_DEFAULT_SEED_X, ROCRAND_LFSR113_DEFAULT_SEED_Y, - ROCRAND_LFSR113_DEFAULT_SEED_Z, ROCRAND_LFSR113_DEFAULT_SEED_W}); + template + void generate(const size_t blocks, + const size_t /* threads */, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), + dim3(std::min((size_t)200, blocks)), + dim3(256), + 0, + stream, + states, + data, + size, + generator); + } +}; - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } +__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void init_kernel(rocrand_state_lfsr113* states, const uint4 seed) +{ + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + rocrand_state_lfsr113 state; + rocrand_init(seed, state_id, &state); + states[state_id] = state; +} - ~runner() { HIP_CHECK(hipFree(states)); } +template<> +struct runner +{ + rocrand_state_lfsr113* states; + + runner(const size_t /* dimensions */, + const size_t blocks, + const size_t threads, + const unsigned long long /* seed */, + const unsigned long long /* offset */) + { + const size_t states_size = blocks * threads; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113))); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + uint4{ROCRAND_LFSR113_DEFAULT_SEED_X, + ROCRAND_LFSR113_DEFAULT_SEED_Y, + ROCRAND_LFSR113_DEFAULT_SEED_Z, + ROCRAND_LFSR113_DEFAULT_SEED_W}); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } - template - void generate(const size_t blocks, const size_t threads, hipStream_t stream, - T *data, const size_t size, const Generator &generator) { - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), - dim3(threads), 0, stream, states, data, size, generator); - } + ~runner() + { + HIP_CHECK(hipFree(states)); + } + + template + void generate(const size_t blocks, + const size_t threads, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), + dim3(blocks), + dim3(threads), + 0, + stream, + states, + data, + size, + generator); + } }; -template +template __global__ -__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel( - EngineState *states, SobolType *directions, SobolType offset) { - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(&directions[dimension * sizeof(SobolType) * 8], - offset + state_id, &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void init_sobol_kernel(EngineState* states, SobolType* directions, SobolType offset) +{ + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(&directions[dimension * sizeof(SobolType) * 8], offset + state_id, &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } -template +template __global__ -__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel( - EngineState *states, SobolType *directions, SobolType *scramble_constants, - SobolType offset) { - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(&directions[dimension * sizeof(SobolType) * 8], - scramble_constants[dimension], offset + state_id, &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void 
init_scrambled_sobol_kernel(EngineState* states, + SobolType* directions, + SobolType* scramble_constants, + SobolType offset) +{ + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(&directions[dimension * sizeof(SobolType) * 8], + scramble_constants[dimension], + offset + state_id, + &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } // generate_kernel for the normal and scrambled sobol generators -template +template __global__ -__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel( - EngineState *states, T *data, const size_t size, Generator generator) { - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int stride = gridDim.x * blockDim.x; - - EngineState state = states[gridDim.x * blockDim.x * dimension + state_id]; - const size_t offset = dimension * size; - unsigned int index = state_id; - while (index < size) { - data[offset + index] = generator(&state); - skipahead(stride - 1, &state); - index += stride; - } - state = states[gridDim.x * blockDim.x * dimension + state_id]; - skipahead(static_cast(size), &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) +void generate_sobol_kernel(EngineState* states, T* data, const size_t size, Generator generator) +{ + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = gridDim.x * blockDim.x; + + EngineState state = states[gridDim.x * blockDim.x * dimension + state_id]; + const size_t offset = dimension * size; + unsigned int index = state_id; + while(index < size) + { + data[offset + index] = generator(&state); + skipahead(stride - 1, &state); + index += stride; + } + state = states[gridDim.x * blockDim.x * dimension + state_id]; + 
skipahead(static_cast(size), &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } -template <> struct runner { - rocrand_state_sobol32 *states; - size_t dimensions; - - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long /* seed */, const unsigned long long offset) { - this->dimensions = dimensions; - - const unsigned int *h_directions; - ROCRAND_CHECK(rocrand_get_direction_vectors32( - &h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol32))); - - unsigned int *directions; - const size_t size = dimensions * 32 * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&directions, size)); - HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, - directions, static_cast(offset)); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - } - - ~runner() { HIP_CHECK(hipFree(states)); } +template<> +struct runner +{ + rocrand_state_sobol32* states; + size_t dimensions; + + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long /* seed */, + const unsigned long long offset) + { + this->dimensions = dimensions; + + const unsigned int* h_directions; + ROCRAND_CHECK( + rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol32))); + + unsigned int* directions; + const size_t size = dimensions * 32 * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&directions, size)); + HIP_CHECK(hipMemcpy(directions, 
h_directions, size, hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + 0, + states, + directions, + static_cast(offset)); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + } - template - void generate(const size_t blocks, const size_t threads, hipStream_t stream, - T *data, const size_t size, const Generator &generator) { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, stream, - states, data, size / dimensions, generator); - } -}; + ~runner() + { + HIP_CHECK(hipFree(states)); + } -template <> struct runner { - rocrand_state_scrambled_sobol32 *states; - size_t dimensions; - - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long /* seed */, const unsigned long long offset) { - this->dimensions = dimensions; - - const unsigned int *h_directions; - const unsigned int *h_constants; - - ROCRAND_CHECK(rocrand_get_direction_vectors32( - &h_directions, ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)); - ROCRAND_CHECK(rocrand_get_scramble_constants32(&h_constants)); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, - states_size * sizeof(rocrand_state_scrambled_sobol32))); - - unsigned int *directions; - const size_t directions_size = dimensions * 32 * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&directions, directions_size)); - HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, - hipMemcpyHostToDevice)); - - unsigned int *scramble_constants; - const size_t constants_size = dimensions * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); - HIP_CHECK(hipMemcpy(scramble_constants, 
h_constants, constants_size, - hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, - directions, scramble_constants, - static_cast(offset)); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - HIP_CHECK(hipFree(scramble_constants)); - } - - ~runner() { HIP_CHECK(hipFree(states)); } - - template - void generate(const size_t blocks, const size_t threads, hipStream_t stream, - T *data, const size_t size, const Generator &generator) { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, stream, - states, data, size / dimensions, generator); - } + template + void generate(const size_t blocks, + const size_t threads, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + stream, + states, + data, + size / dimensions, + generator); + } }; -template <> struct runner { - rocrand_state_sobol64 *states; - size_t dimensions; - - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long /* seed */, const unsigned long long offset) { - this->dimensions = dimensions; - - const unsigned long long *h_directions; - rocrand_get_direction_vectors64(&h_directions, - ROCRAND_DIRECTION_VECTORS_64_JOEKUO6); +template<> +struct runner +{ + rocrand_state_scrambled_sobol32* states; + size_t dimensions; + + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long /* seed */, + const unsigned long long offset) + { 
+ this->dimensions = dimensions; + + const unsigned int* h_directions; + const unsigned int* h_constants; + + ROCRAND_CHECK( + rocrand_get_direction_vectors32(&h_directions, + ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)); + ROCRAND_CHECK(rocrand_get_scramble_constants32(&h_constants)); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol32))); + + unsigned int* directions; + const size_t directions_size = dimensions * 32 * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&directions, directions_size)); + HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, hipMemcpyHostToDevice)); + + unsigned int* scramble_constants; + const size_t constants_size = dimensions * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); + HIP_CHECK( + hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + 0, + states, + directions, + scramble_constants, + static_cast(offset)); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + HIP_CHECK(hipFree(scramble_constants)); + } - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64))); + ~runner() + { + HIP_CHECK(hipFree(states)); + } - unsigned long long int *directions; - const size_t size = dimensions * 64 * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&directions, size)); - HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); + template + void generate(const size_t blocks, + const size_t threads, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + const size_t blocks_x = 
next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + stream, + states, + data, + size / dimensions, + generator); + } +}; - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, - directions, offset); +template<> +struct runner +{ + rocrand_state_sobol64* states; + size_t dimensions; + + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long /* seed */, + const unsigned long long offset) + { + this->dimensions = dimensions; + + const unsigned long long* h_directions; + rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64))); + + unsigned long long int* directions; + const size_t size = dimensions * 64 * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&directions, size)); + HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + 0, + states, + directions, + offset); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + } - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); + ~runner() + { + HIP_CHECK(hipFree(states)); + } - HIP_CHECK(hipFree(directions)); - } + template + void generate(const size_t blocks, + const size_t threads, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + stream, + states, + data, + size / dimensions, + generator); + } +}; - ~runner() { HIP_CHECK(hipFree(states)); } +template<> +struct runner +{ + rocrand_state_scrambled_sobol64* states; + size_t dimensions; + + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long /* seed */, + const unsigned long long offset) + { + this->dimensions = dimensions; + + const unsigned long long* h_directions; + const unsigned long long* h_constants; + + rocrand_get_direction_vectors64(&h_directions, + ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6); + rocrand_get_scramble_constants64(&h_constants); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol64))); + + unsigned long long int* directions; + const size_t directions_size = dimensions * 64 * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&directions, directions_size)); + HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, hipMemcpyHostToDevice)); + + unsigned long long int* scramble_constants; + const size_t constants_size = dimensions * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); + HIP_CHECK( + hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + 0, + states, + directions, + scramble_constants, + offset); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + HIP_CHECK(hipFree(scramble_constants)); + } - template - void generate(const size_t blocks, const size_t threads, hipStream_t stream, - T *data, const size_t size, const 
Generator &generator) { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, stream, - states, data, size / dimensions, generator); - } -}; + ~runner() + { + HIP_CHECK(hipFree(states)); + } -template <> struct runner { - rocrand_state_scrambled_sobol64 *states; - size_t dimensions; - - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long /* seed */, const unsigned long long offset) { - this->dimensions = dimensions; - - const unsigned long long *h_directions; - const unsigned long long *h_constants; - - rocrand_get_direction_vectors64( - &h_directions, ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6); - rocrand_get_scramble_constants64(&h_constants); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, - states_size * sizeof(rocrand_state_scrambled_sobol64))); - - unsigned long long int *directions; - const size_t directions_size = - dimensions * 64 * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&directions, directions_size)); - HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, - hipMemcpyHostToDevice)); - - unsigned long long int *scramble_constants; - const size_t constants_size = dimensions * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); - HIP_CHECK(hipMemcpy(scramble_constants, h_constants, constants_size, - hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, - directions, scramble_constants, offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - HIP_CHECK(hipFree(scramble_constants)); - } - - ~runner() { HIP_CHECK(hipFree(states)); } - 
- template - void generate(const size_t blocks, const size_t threads, hipStream_t stream, - T *data, const size_t size, const Generator &generator) { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), dim3(threads), 0, stream, - states, data, size / dimensions, generator); - } + template + void generate(const size_t blocks, + const size_t threads, + hipStream_t stream, + T* data, + const size_t size, + const Generator& generator) + { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), + dim3(threads), + 0, + stream, + states, + data, + size / dimensions, + generator); + } }; // Provide optional create and destroy functions for the generators. -struct generator_type { - static void create() {} +struct generator_type +{ + static void create() {} - static void destroy() {} + static void destroy() {} }; -template struct generator_uint : public generator_type { - typedef unsigned int data_type; +template +struct generator_uint : public generator_type +{ + typedef unsigned int data_type; - std::string name() { return "uniform-uint"; } + std::string name() + { + return "uniform-uint"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand(state); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand(state); + } }; -template struct generator_ullong : public generator_type { - typedef unsigned long long int data_type; +template +struct generator_ullong : public generator_type +{ + typedef unsigned long long int data_type; - std::string name() { return "uniform-ullong"; } + std::string name() + { + return "uniform-ullong"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand(state); - } + __device__ + data_type + operator()(Engine* state) const + { + return 
rocrand(state); + } }; -template struct generator_uniform : public generator_type { - typedef float data_type; +template +struct generator_uniform : public generator_type +{ + typedef float data_type; - std::string name() { return "uniform-float"; } + std::string name() + { + return "uniform-float"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand_uniform(state); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand_uniform(state); + } }; -template -struct generator_uniform_double : public generator_type { - typedef double data_type; +template +struct generator_uniform_double : public generator_type +{ + typedef double data_type; - std::string name() { return "uniform-double"; } + std::string name() + { + return "uniform-double"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand_uniform_double(state); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand_uniform_double(state); + } }; -template struct generator_normal : public generator_type { - typedef float data_type; +template +struct generator_normal : public generator_type +{ + typedef float data_type; - std::string name() { return "normal-float"; } + std::string name() + { + return "normal-float"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand_normal(state); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand_normal(state); + } }; -template -struct generator_normal_double : public generator_type { - typedef double data_type; +template +struct generator_normal_double : public generator_type +{ + typedef double data_type; - std::string name() { return "normal-double"; } + std::string name() + { + return "normal-double"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand_normal_double(state); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand_normal_double(state); + } }; 
-template struct generator_log_normal : public generator_type { - typedef float data_type; +template +struct generator_log_normal : public generator_type +{ + typedef float data_type; - std::string name() { return "log-normal-float"; } + std::string name() + { + return "log-normal-float"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand_log_normal(state, 0.f, 1.f); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand_log_normal(state, 0.f, 1.f); + } }; -template -struct generator_log_normal_double : public generator_type { - typedef double data_type; +template +struct generator_log_normal_double : public generator_type +{ + typedef double data_type; - std::string name() { return "log-normal-double"; } + std::string name() + { + return "log-normal-double"; + } - __device__ data_type operator()(Engine *state) const { - return rocrand_log_normal_double(state, 0., 1.); - } + __device__ + data_type + operator()(Engine* state) const + { + return rocrand_log_normal_double(state, 0., 1.); + } }; -template struct generator_poisson : public generator_type { - typedef unsigned int data_type; +template +struct generator_poisson : public generator_type +{ + typedef unsigned int data_type; - std::string name() { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << lambda; - return "poisson(lambda=" + stream.str() + ")"; - } + std::string name() + { + std::stringstream stream; + stream << std::fixed << std::setprecision(1) << lambda; + return "poisson(lambda=" + stream.str() + ")"; + } - __device__ data_type operator()(Engine *state) { - return rocrand_poisson(state, lambda); - } + __device__ + data_type + operator()(Engine* state) + { + return rocrand_poisson(state, lambda); + } - double lambda; + double lambda; }; -template -struct generator_discrete_poisson : public generator_type { - typedef unsigned int data_type; +template +struct generator_discrete_poisson : public generator_type +{ + 
typedef unsigned int data_type; - std::string name() { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << lambda; - return "discrete-poisson(lambda=" + stream.str() + ")"; - } + std::string name() + { + std::stringstream stream; + stream << std::fixed << std::setprecision(1) << lambda; + return "discrete-poisson(lambda=" + stream.str() + ")"; + } - void create() { - ROCRAND_CHECK( - rocrand_create_poisson_distribution(lambda, &discrete_distribution)); - } + void create() + { + ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution)); + } - void destroy() { - ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); - } + void destroy() + { + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } - __device__ data_type operator()(Engine *state) { - return rocrand_discrete(state, discrete_distribution); - } + __device__ + data_type + operator()(Engine* state) + { + return rocrand_discrete(state, discrete_distribution); + } - rocrand_discrete_distribution discrete_distribution; - double lambda; + rocrand_discrete_distribution discrete_distribution; + double lambda; }; -template -struct generator_discrete_custom : public generator_type { - typedef unsigned int data_type; - - std::string name() { return "discrete-custom"; } +template +struct generator_discrete_custom : public generator_type +{ + typedef unsigned int data_type; - void create() { - const unsigned int offset = 1234; - std::vector probabilities = {10, 10, 1, 120, 8, 6, - 140, 2, 150, 150, 10, 80}; + std::string name() + { + return "discrete-custom"; + } - double sum = - std::accumulate(probabilities.begin(), probabilities.end(), 0.); - std::transform(probabilities.begin(), probabilities.end(), - probabilities.begin(), [=](double p) { return p / sum; }); - ROCRAND_CHECK(rocrand_create_discrete_distribution( - probabilities.data(), probabilities.size(), offset, - &discrete_distribution)); - } + void create() + { 
+ const unsigned int offset = 1234; + std::vector probabilities = {10, 10, 1, 120, 8, 6, 140, 2, 150, 150, 10, 80}; + + double sum = std::accumulate(probabilities.begin(), probabilities.end(), 0.); + std::transform(probabilities.begin(), + probabilities.end(), + probabilities.begin(), + [=](double p) { return p / sum; }); + ROCRAND_CHECK(rocrand_create_discrete_distribution(probabilities.data(), + probabilities.size(), + offset, + &discrete_distribution)); + } - void destroy() { - ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); - } + void destroy() + { + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } - __device__ data_type operator()(Engine *state) { - return rocrand_discrete(state, discrete_distribution); - } + __device__ + data_type + operator()(Engine* state) + { + return rocrand_discrete(state, discrete_distribution); + } - rocrand_discrete_distribution discrete_distribution; + rocrand_discrete_distribution discrete_distribution; }; -struct benchmark_context { - size_t size; - size_t dimensions; - size_t trials; - size_t blocks; - size_t threads; - std::vector lambdas; +struct benchmark_context +{ + size_t size; + size_t dimensions; + size_t trials; + size_t blocks; + size_t threads; + std::vector lambdas; }; -template -void run_benchmark(benchmark::State &state, const hipStream_t stream, - const benchmark_context &context, Generator generator) { - typedef typename Generator::data_type data_type; - - const size_t size = context.size; - const size_t dimensions = context.dimensions; - const size_t trials = context.trials; - const size_t blocks = context.blocks; - const size_t threads = context.threads; - - // Optional initialization of the generator - generator.create(); - - data_type *data; - HIP_CHECK(hipMalloc(&data, size * sizeof(data_type))); - - constexpr unsigned long long int seed = 12345ULL; - constexpr unsigned long long int offset = 6789ULL; - - runner r(dimensions, blocks, threads, seed, 
offset); - - // Warm-up - for (size_t i = 0; i < 5; i++) { - r.generate(blocks, threads, stream, data, size, generator); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } - - // Measurement - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - for (auto _ : state) { - HIP_CHECK(hipEventRecord(start, stream)); - for (size_t i = 0; i < trials; i++) { - r.generate(blocks, threads, stream, data, size, generator); - } - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed; - HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); - - state.SetIterationTime(elapsed / 1000.f); - } - state.SetBytesProcessed(trials * state.iterations() * size * - sizeof(data_type)); - state.SetItemsProcessed(trials * state.iterations() * size); - - // Optional de-initialization of the generator - generator.destroy(); - - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipFree(data)); +template +void run_benchmark(benchmark::State& state, + const hipStream_t stream, + const benchmark_context& context, + Generator generator) +{ + typedef typename Generator::data_type data_type; + + const size_t size = context.size; + const size_t dimensions = context.dimensions; + const size_t trials = context.trials; + const size_t blocks = context.blocks; + const size_t threads = context.threads; + + // Optional initialization of the generator + generator.create(); + + data_type* data; + HIP_CHECK(hipMalloc(&data, size * sizeof(data_type))); + + constexpr unsigned long long int seed = 12345ULL; + constexpr unsigned long long int offset = 6789ULL; + + runner r(dimensions, blocks, threads, seed, offset); + + // Warm-up + for(size_t i = 0; i < 5; i++) + { + r.generate(blocks, threads, stream, data, size, generator); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } + + // Measurement + hipEvent_t start, stop; + 
HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + for(auto _ : state) + { + HIP_CHECK(hipEventRecord(start, stream)); + for(size_t i = 0; i < trials; i++) + { + r.generate(blocks, threads, stream, data, size, generator); + } + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed; + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); + + state.SetIterationTime(elapsed / 1000.f); + } + state.SetBytesProcessed(trials * state.iterations() * size * sizeof(data_type)); + state.SetItemsProcessed(trials * state.iterations() * size); + + // Optional de-initialization of the generator + generator.destroy(); + + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipFree(data)); } -template -void add_benchmark(const benchmark_context &context, const hipStream_t stream, - std::vector &benchmarks, - const std::string &name, Generator generator) { - static_assert(std::is_trivially_copyable::value && - std::is_trivially_destructible::value, - "Generator gets copied to device at kernel launch."); - const std::string benchmark_name = - "device_kernel<" + name + "," + generator.name() + ">"; - benchmarks.emplace_back(benchmark::RegisterBenchmark( - benchmark_name.c_str(), &run_benchmark, stream, - context, generator)); +template +void add_benchmark(const benchmark_context& context, + const hipStream_t stream, + std::vector& benchmarks, + const std::string& name, + Generator generator) +{ + static_assert(std::is_trivially_copyable::value + && std::is_trivially_destructible::value, + "Generator gets copied to device at kernel launch."); + const std::string benchmark_name = "device_kernel<" + name + "," + generator.name() + ">"; + benchmarks.emplace_back(benchmark::RegisterBenchmark(benchmark_name.c_str(), + &run_benchmark, + stream, + context, + generator)); } -template -void add_benchmarks(const benchmark_context &ctx, const hipStream_t stream, - std::vector &benchmarks, - const 
rocrand_rng_type engine_type) { - constexpr bool is_64_bits = - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value; - - const std::string name = engine_name(engine_type); - - if (is_64_bits) { - add_benchmark(ctx, stream, benchmarks, name, - generator_ullong()); - } else { - add_benchmark(ctx, stream, benchmarks, name, - generator_uint()); - } - - add_benchmark(ctx, stream, benchmarks, name, - generator_uniform()); - add_benchmark(ctx, stream, benchmarks, name, - generator_uniform_double()); - add_benchmark(ctx, stream, benchmarks, name, - generator_normal()); - add_benchmark(ctx, stream, benchmarks, name, - generator_normal_double()); - add_benchmark(ctx, stream, benchmarks, name, - generator_log_normal()); - add_benchmark(ctx, stream, benchmarks, name, - generator_log_normal_double()); - - for (size_t i = 0; i < ctx.lambdas.size(); i++) { - generator_poisson gen_poisson; - gen_poisson.lambda = ctx.lambdas[i]; - add_benchmark(ctx, stream, benchmarks, name, gen_poisson); - } - - for (size_t i = 0; i < ctx.lambdas.size(); i++) { - generator_discrete_poisson gen_discrete_poisson; - gen_discrete_poisson.lambda = ctx.lambdas[i]; - add_benchmark(ctx, stream, benchmarks, name, gen_discrete_poisson); - } - - add_benchmark(ctx, stream, benchmarks, name, - generator_discrete_custom()); +template +void add_benchmarks(const benchmark_context& ctx, + const hipStream_t stream, + std::vector& benchmarks, + const rocrand_rng_type engine_type) +{ + constexpr bool is_64_bits = std::is_same::value + || std::is_same::value + || std::is_same::value + || std::is_same::value; + + const std::string name = engine_name(engine_type); + + if(is_64_bits) + { + add_benchmark(ctx, stream, benchmarks, name, generator_ullong()); + } + else + { + add_benchmark(ctx, stream, benchmarks, name, generator_uint()); + } + + add_benchmark(ctx, stream, benchmarks, name, generator_uniform()); + add_benchmark(ctx, stream, benchmarks, name, 
generator_uniform_double()); + add_benchmark(ctx, stream, benchmarks, name, generator_normal()); + add_benchmark(ctx, stream, benchmarks, name, generator_normal_double()); + add_benchmark(ctx, stream, benchmarks, name, generator_log_normal()); + add_benchmark(ctx, stream, benchmarks, name, generator_log_normal_double()); + + for(size_t i = 0; i < ctx.lambdas.size(); i++) + { + generator_poisson gen_poisson; + gen_poisson.lambda = ctx.lambdas[i]; + add_benchmark(ctx, stream, benchmarks, name, gen_poisson); + } + + for(size_t i = 0; i < ctx.lambdas.size(); i++) + { + generator_discrete_poisson gen_discrete_poisson; + gen_discrete_poisson.lambda = ctx.lambdas[i]; + add_benchmark(ctx, stream, benchmarks, name, gen_discrete_poisson); + } + + add_benchmark(ctx, stream, benchmarks, name, generator_discrete_custom()); } -int main(int argc, char *argv[]) { - // get paramaters before they are passed into - // benchmark::Initialize() - std::string outFormat = ""; - std::string filter = ""; - std::string consoleFormat = ""; - - getFormats(argc, argv, outFormat, filter, consoleFormat); - - benchmark::Initialize(&argc, argv); - - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_RAND_N, - "number of values"); - parser.set_optional("dimensions", "dimensions", 1, - "number of dimensions of quasi-random values"); - parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional("blocks", "blocks", 256, "number of blocks"); - parser.set_optional("threads", "threads", 256, - "number of threads in each block"); - parser.set_optional>( - "lambda", "lambda", {10.0}, - "space-separated list of lambdas of Poisson distribution"); - parser.run_and_exit_if_error(); - - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - add_common_benchmark_rocrand_info(); - - benchmark_context ctx{}; - - ctx.size = parser.get("size"); - ctx.dimensions = parser.get("dimensions"); - ctx.trials = parser.get("trials"); - ctx.blocks = 
parser.get("blocks"); - ctx.threads = parser.get("threads"); - ctx.lambdas = parser.get>("lambda"); - - benchmark::AddCustomContext("size", std::to_string(ctx.size)); - benchmark::AddCustomContext("dimensions", std::to_string(ctx.dimensions)); - benchmark::AddCustomContext("trials", std::to_string(ctx.trials)); - benchmark::AddCustomContext("blocks", std::to_string(ctx.blocks)); - benchmark::AddCustomContext("threads", std::to_string(ctx.threads)); - - std::vector benchmarks = {}; - - // MT19937 has no kernel implementation - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_PSEUDO_LFSR113); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_PSEUDO_MRG31K3P); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_PSEUDO_MRG32K3A); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_PSEUDO_MTGP32); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_PSEUDO_PHILOX4_32_10); - add_benchmarks( - ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32); - add_benchmarks( - ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_QUASI_SOBOL32); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_QUASI_SOBOL64); - add_benchmarks( - ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY2_32_20); - add_benchmarks( - ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY4_32_20); - add_benchmarks( - ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20); - add_benchmarks( - ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY4_64_20); - add_benchmarks(ctx, stream, benchmarks, - ROCRAND_RNG_PSEUDO_XORWOW); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - benchmark::BenchmarkReporter *console_reporter = - getConsoleReporter(consoleFormat); - benchmark::BenchmarkReporter *out_file_reporter = - getOutFileReporter(outFormat); - - std::string spec = (filter == "" || filter == "all") ? "." 
: filter; - - // Run benchmarks - if (outFormat == "") // default case - benchmark::RunSpecifiedBenchmarks(console_reporter, spec); - else - benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, - spec); - HIP_CHECK(hipStreamDestroy(stream)); - - return 0; +int main(int argc, char* argv[]) +{ + // get paramaters before they are passed into + // benchmark::Initialize() + std::string outFormat = ""; + std::string filter = ""; + std::string consoleFormat = ""; + + getFormats(argc, argv, outFormat, filter, consoleFormat); + + benchmark::Initialize(&argc, argv); + + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); + parser.set_optional("dimensions", + "dimensions", + 1, + "number of dimensions of quasi-random values"); + parser.set_optional("trials", "trials", 20, "number of trials"); + parser.set_optional("blocks", "blocks", 256, "number of blocks"); + parser.set_optional("threads", "threads", 256, "number of threads in each block"); + parser.set_optional>( + "lambda", + "lambda", + {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.run_and_exit_if_error(); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + add_common_benchmark_rocrand_info(); + + benchmark_context ctx{}; + + ctx.size = parser.get("size"); + ctx.dimensions = parser.get("dimensions"); + ctx.trials = parser.get("trials"); + ctx.blocks = parser.get("blocks"); + ctx.threads = parser.get("threads"); + ctx.lambdas = parser.get>("lambda"); + + benchmark::AddCustomContext("size", std::to_string(ctx.size)); + benchmark::AddCustomContext("dimensions", std::to_string(ctx.dimensions)); + benchmark::AddCustomContext("trials", std::to_string(ctx.trials)); + benchmark::AddCustomContext("blocks", std::to_string(ctx.blocks)); + benchmark::AddCustomContext("threads", std::to_string(ctx.threads)); + + std::vector benchmarks = {}; + + // MT19937 has no kernel implementation + add_benchmarks(ctx, stream, 
benchmarks, ROCRAND_RNG_PSEUDO_LFSR113); + add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MRG31K3P); + add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MRG32K3A); + add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MTGP32); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_PSEUDO_PHILOX4_32_10); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64); + add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SOBOL32); + add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SOBOL64); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_PSEUDO_THREEFRY2_32_20); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_PSEUDO_THREEFRY4_32_20); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_PSEUDO_THREEFRY2_64_20); + add_benchmarks(ctx, + stream, + benchmarks, + ROCRAND_RNG_PSEUDO_THREEFRY4_64_20); + add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_XORWOW); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + benchmark::BenchmarkReporter* console_reporter = getConsoleReporter(consoleFormat); + benchmark::BenchmarkReporter* out_file_reporter = getOutFileReporter(outFormat); + + std::string spec = (filter == "" || filter == "all") ? "." 
: filter; + + // Run benchmarks + if(outFormat == "") // default case + benchmark::RunSpecifiedBenchmarks(console_reporter, spec); + else + benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, spec); + HIP_CHECK(hipStreamDestroy(stream)); + + return 0; } diff --git a/benchmark/benchmark_rocrand_generate.cpp b/benchmark/benchmark_rocrand_generate.cpp index a6bbabe12..c2fa286ab 100644 --- a/benchmark/benchmark_rocrand_generate.cpp +++ b/benchmark/benchmark_rocrand_generate.cpp @@ -18,36 +18,38 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include +#include #include -#include -#include +#include #include +#include #include -#include +#include #include "cmdparser.hpp" #include #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } - -#define ROCRAND_CHECK(condition) \ - { \ - rocrand_status _status = condition; \ - if(_status != ROCRAND_STATUS_SUCCESS) { \ - std::cout << "ROCRAND error: " << _status << " line: " << __LINE__ << std::endl; \ - exit(_status); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } + +#define ROCRAND_CHECK(condition) \ + { \ + rocrand_status _status = condition; \ + if(_status != ROCRAND_STATUS_SUCCESS) \ + { \ + std::cout << "ROCRAND error: " << _status << " line: " << __LINE__ << std::endl; \ + exit(_status); \ + } \ + } #ifndef DEFAULT_RAND_N const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; @@ -56,32 +58,32 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; typedef rocrand_rng_type rng_type_t; template -using generate_func_type = std::function; +using generate_func_type = std::function; template -void run_benchmark(const cli::Parser& parser, - 
const rng_type_t rng_type, - hipStream_t stream, +void run_benchmark(const cli::Parser& parser, + const rng_type_t rng_type, + hipStream_t stream, generate_func_type generate_func, - const std::string& distribution, - const std::string& engine, - const double lambda = 0.f) + const std::string& distribution, + const std::string& engine, + const double lambda = 0.f) { - const size_t size0 = parser.get("size"); - const size_t trials = parser.get("trials"); - const size_t dimensions = parser.get("dimensions"); - const size_t offset = parser.get("offset"); - const size_t size = (size0 / dimensions) * dimensions; - const std::string format = parser.get("format"); - - T * data; + const size_t size0 = parser.get("size"); + const size_t trials = parser.get("trials"); + const size_t dimensions = parser.get("dimensions"); + const size_t offset = parser.get("offset"); + const size_t size = (size0 / dimensions) * dimensions; + const std::string format = parser.get("format"); + + T* data; HIP_CHECK(hipMalloc(&data, size * sizeof(T))); rocrand_generator generator; ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); rocrand_status status = rocrand_set_quasi_random_generator_dimensions(generator, dimensions); - if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random + if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random { ROCRAND_CHECK(status); } @@ -89,13 +91,13 @@ void run_benchmark(const cli::Parser& parser, ROCRAND_CHECK(rocrand_set_stream(generator, stream)); status = rocrand_set_offset(generator, offset); - if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random + if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random { ROCRAND_CHECK(status); } // Warm-up - for (size_t i = 0; i < 15; i++) + for(size_t i = 0; i < 15; i++) { ROCRAND_CHECK(generate_func(generator, data, size)); } @@ -106,7 +108,7 @@ void run_benchmark(const cli::Parser& parser, HIP_CHECK(hipEventCreate(&start)); 
HIP_CHECK(hipEventCreate(&stop)); HIP_CHECK(hipEventRecord(start, stream)); - for (size_t i = 0; i < trials; i++) + for(size_t i = 0; i < trials; i++) { ROCRAND_CHECK(generate_func(generator, data, size)); } @@ -117,43 +119,33 @@ void run_benchmark(const cli::Parser& parser, HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); - if (format.compare("csv") == 0) + if(format.compare("csv") == 0) { - std::cout << std::fixed << std::setprecision(3) - << engine << "," - << distribution << "," - << (trials * size * sizeof(T)) / - (elapsed / 1e3 * (1 << 30)) << "," - << (trials * size) / - (elapsed / 1e3 * (1 << 30)) << "," - << elapsed / trials << "," - << elapsed << "," - << size << ","; - if (distribution.compare("poisson") == 0 || distribution.compare("discrete-poisson") == 0) + std::cout << std::fixed << std::setprecision(3) << engine << "," << distribution << "," + << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30)) << "," + << (trials * size) / (elapsed / 1e3 * (1 << 30)) << "," << elapsed / trials << "," + << elapsed << "," << size << ","; + if(distribution.compare("poisson") == 0 || distribution.compare("discrete-poisson") == 0) { - std::cout << lambda; + std::cout << lambda; } std::cout << std::endl; } else { - if (format.compare("console") != 0) + if(format.compare("console") != 0) { - std::cout << "Unknown format specified (must be either console or csv). Defaulting to console output." << std::endl; + std::cout << "Unknown format specified (must be either console or csv). Defaulting to " + "console output." 
+ << std::endl; } - std::cout << std::fixed << std::setprecision(3) - << " " - << "Throughput = " - << std::setw(8) << (trials * size * sizeof(T)) / - (elapsed / 1e3 * (1 << 30)) - << " GB/s, Samples = " - << std::setw(8) << (trials * size) / - (elapsed / 1e3 * (1 << 30)) - << " GSample/s, AvgTime (1 trial) = " - << std::setw(8) << elapsed / trials - << " ms, Time (all) = " - << std::setw(8) << elapsed - << " ms, Size = " << size + std::cout << std::fixed << std::setprecision(3) << " " + << "Throughput = " << std::setw(8) + << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30)) + << " GB/s, Samples = " << std::setw(8) + << (trials * size) / (elapsed / 1e3 * (1 << 30)) + << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials + << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size << std::endl; } @@ -162,142 +154,164 @@ void run_benchmark(const cli::Parser& parser, } void run_benchmarks(const cli::Parser& parser, - const rng_type_t rng_type, + const rng_type_t rng_type, const std::string& distribution, const std::string& engine, - hipStream_t stream) + hipStream_t stream) { const std::string format = parser.get("format"); - if (distribution == "uniform-uint") - { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, unsigned int * data, size_t size) { - return rocrand_generate(gen, data, size); - }, - distribution, engine - ); - } - if (distribution == "uniform-uchar") + if(distribution == "uniform-uint") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, unsigned char * data, size_t size) { - return rocrand_generate_char(gen, data, size); - }, - distribution, engine - ); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, unsigned int* data, size_t size) + { return rocrand_generate(gen, data, size); }, + distribution, + engine); } - if (distribution == "uniform-ushort") + if(distribution == "uniform-uchar") { - run_benchmark(parser, rng_type, stream, - 
[](rocrand_generator gen, unsigned short * data, size_t size) { - return rocrand_generate_short(gen, data, size); - }, - distribution, engine - ); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, unsigned char* data, size_t size) + { return rocrand_generate_char(gen, data, size); }, + distribution, + engine); } - if (distribution == "uniform-half") + if(distribution == "uniform-ushort") { - run_benchmark<__half>(parser, rng_type, stream, - [](rocrand_generator gen, __half * data, size_t size) { - return rocrand_generate_uniform_half(gen, data, size); - }, - distribution, engine - ); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, unsigned short* data, size_t size) + { return rocrand_generate_short(gen, data, size); }, + distribution, + engine); } - if (distribution == "uniform-float") + if(distribution == "uniform-half") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, float * data, size_t size) { - return rocrand_generate_uniform(gen, data, size); - }, - distribution, engine - ); + run_benchmark<__half>( + parser, + rng_type, + stream, + [](rocrand_generator gen, __half* data, size_t size) + { return rocrand_generate_uniform_half(gen, data, size); }, + distribution, + engine); } - if (distribution == "uniform-double") + if(distribution == "uniform-float") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, double * data, size_t size) { - return rocrand_generate_uniform_double(gen, data, size); - }, - distribution, engine - ); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, float* data, size_t size) + { return rocrand_generate_uniform(gen, data, size); }, + distribution, + engine); } - if (distribution == "normal-half") + if(distribution == "uniform-double") { - run_benchmark<__half>(parser, - rng_type, - stream, - [](rocrand_generator gen, __half* data, size_t size) { - return rocrand_generate_normal_half(gen, - data, - size, - 
__float2half(0.0f), - __float2half(1.0f)); - }, - distribution, - engine); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, double* data, size_t size) + { return rocrand_generate_uniform_double(gen, data, size); }, + distribution, + engine); } - if (distribution == "normal-float") + if(distribution == "normal-half") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, float * data, size_t size) { - return rocrand_generate_normal(gen, data, size, 0.0f, 1.0f); + run_benchmark<__half>( + parser, + rng_type, + stream, + [](rocrand_generator gen, __half* data, size_t size) { + return rocrand_generate_normal_half(gen, + data, + size, + __float2half(0.0f), + __float2half(1.0f)); }, - distribution, engine - ); + distribution, + engine); } - if (distribution == "normal-double") + if(distribution == "normal-float") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, double * data, size_t size) { - return rocrand_generate_normal_double(gen, data, size, 0.0, 1.0); - }, - distribution, engine - ); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, float* data, size_t size) + { return rocrand_generate_normal(gen, data, size, 0.0f, 1.0f); }, + distribution, + engine); } - if (distribution == "log-normal-half") + if(distribution == "normal-double") { - run_benchmark<__half>(parser, - rng_type, - stream, - [](rocrand_generator gen, __half* data, size_t size) - { - return rocrand_generate_log_normal_half(gen, - data, - size, - __float2half(0.0f), - __float2half(1.0f)); - }, - distribution, - engine); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, double* data, size_t size) + { return rocrand_generate_normal_double(gen, data, size, 0.0, 1.0); }, + distribution, + engine); } - if (distribution == "log-normal-float") + if(distribution == "log-normal-half") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, float * data, size_t size) { - return 
rocrand_generate_log_normal(gen, data, size, 0.0f, 1.0f); + run_benchmark<__half>( + parser, + rng_type, + stream, + [](rocrand_generator gen, __half* data, size_t size) + { + return rocrand_generate_log_normal_half(gen, + data, + size, + __float2half(0.0f), + __float2half(1.0f)); }, - distribution, engine - ); + distribution, + engine); } - if (distribution == "log-normal-double") + if(distribution == "log-normal-float") { - run_benchmark(parser, rng_type, stream, - [](rocrand_generator gen, double * data, size_t size) { - return rocrand_generate_log_normal_double(gen, data, size, 0.0, 1.0); - }, - distribution, engine - ); + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, float* data, size_t size) + { return rocrand_generate_log_normal(gen, data, size, 0.0f, 1.0f); }, + distribution, + engine); } - if (distribution == "poisson") + if(distribution == "log-normal-double") + { + run_benchmark( + parser, + rng_type, + stream, + [](rocrand_generator gen, double* data, size_t size) + { return rocrand_generate_log_normal_double(gen, data, size, 0.0, 1.0); }, + distribution, + engine); + } + if(distribution == "poisson") { const auto lambdas = parser.get>("lambda"); - for (double lambda : lambdas) + for(double lambda : lambdas) { - if (format.compare("console") == 0) + if(format.compare("console") == 0) { - std::cout << " " << "lambda " - << std::fixed << std::setprecision(1) << lambda << std::endl; + std::cout << " " + << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; } run_benchmark( parser, @@ -330,66 +344,76 @@ const std::vector all_engines = { "scrambled_sobol64", }; -const std::vector all_distributions = { - "uniform-uint", - "uniform-uchar", - "uniform-ushort", - "uniform-half", - // "uniform-long-long", - "uniform-float", - "uniform-double", - "normal-half", - "normal-float", - "normal-double", - "log-normal-half", - "log-normal-float", - "log-normal-double", - "poisson" -}; - -int main(int argc, char *argv[]) 
+const std::vector all_distributions = {"uniform-uint", + "uniform-uchar", + "uniform-ushort", + "uniform-half", + // "uniform-long-long", + "uniform-float", + "uniform-double", + "normal-half", + "normal-float", + "normal-double", + "log-normal-half", + "log-normal-float", + "log-normal-double", + "poisson"}; + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); - const std::string distribution_desc = - "space-separated list of distributions:" + - std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(), - [](const std::string& a, const std::string& b) { - return a + "\n " + b; - } - ) + - "\n or all"; - const std::string engine_desc = - "space-separated list of random number engines:" + - std::accumulate(all_engines.begin(), all_engines.end(), std::string(), - [](const std::string& a, const std::string& b) { - return a + "\n " + b; - } - ) + - "\n or all"; + const std::string distribution_desc + = "space-separated list of distributions:" + + std::accumulate(all_distributions.begin(), + all_distributions.end(), + std::string(), + [](const std::string& a, const std::string& b) + { return a + "\n " + b; }) + + "\n or all"; + const std::string engine_desc = "space-separated list of random number engines:" + + std::accumulate(all_engines.begin(), + all_engines.end(), + std::string(), + [](const std::string& a, const std::string& b) + { return a + "\n " + b; }) + + "\n or all"; parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("dimensions", "dimensions", 1, "number of dimensions of quasi-random values"); + parser.set_optional("dimensions", + "dimensions", + 1, + "number of dimensions of quasi-random values"); parser.set_optional("offset", "offset", 0, "offset of generated pseudo-random values"); parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional>("dis", "dis", {"uniform-uint"}, distribution_desc); + parser.set_optional>("dis", + "dis", + 
{"uniform-uint"}, + distribution_desc); parser.set_optional>("engine", "engine", {"philox"}, engine_desc); - parser.set_optional>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution"); - parser.set_optional("format", "format", {"console"}, "output format: console or csv"); + parser.set_optional>( + "lambda", + "lambda", + {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.set_optional("format", + "format", + {"console"}, + "output format: console or csv"); parser.run_and_exit_if_error(); std::vector engines; { auto es = parser.get>("engine"); - if (std::find(es.begin(), es.end(), "all") != es.end()) + if(std::find(es.begin(), es.end(), "all") != es.end()) { engines = all_engines; } else { - for (auto e : all_engines) + for(auto e : all_engines) { - if (std::find(es.begin(), es.end(), e) != es.end()) + if(std::find(es.begin(), es.end(), e) != es.end()) engines.push_back(e); } } @@ -398,15 +422,15 @@ int main(int argc, char *argv[]) std::vector distributions; { auto ds = parser.get>("dis"); - if (std::find(ds.begin(), ds.end(), "all") != ds.end()) + if(std::find(ds.begin(), ds.end(), "all") != ds.end()) { distributions = all_distributions; } else { - for (auto d : all_distributions) + for(auto d : all_distributions) { - if (std::find(ds.begin(), ds.end(), d) != ds.end()) + if(std::find(ds.begin(), ds.end(), d) != ds.end()) distributions.push_back(d); } } @@ -430,27 +454,27 @@ int main(int argc, char *argv[]) hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); - std::string format = parser.get("format"); - bool console_output = format.compare("console") == 0 ? true : false; + std::string format = parser.get("format"); + bool console_output = format.compare("console") == 0 ? 
true : false; - if (!console_output) + if(!console_output) { - std::cout << "Engine,Distribution,Throughput,Samples,AvgTime (1 Trial),Time(all),Size,Lambda" - << std::endl; - std::cout << ",,GB/s,GSample/s,ms),ms),values," - << std::endl; + std::cout + << "Engine,Distribution,Throughput,Samples,AvgTime (1 Trial),Time(all),Size,Lambda" + << std::endl; + std::cout << ",,GB/s,GSample/s,ms),ms),values," << std::endl; } - for (auto engine : engines) + for(auto engine : engines) { rng_type_t rng_type = ROCRAND_RNG_PSEUDO_XORWOW; - if (engine == "xorwow") + if(engine == "xorwow") rng_type = ROCRAND_RNG_PSEUDO_XORWOW; else if(engine == "mrg31k3p") rng_type = ROCRAND_RNG_PSEUDO_MRG31K3P; - else if (engine == "mrg32k3a") + else if(engine == "mrg32k3a") rng_type = ROCRAND_RNG_PSEUDO_MRG32K3A; - else if (engine == "philox") + else if(engine == "philox") rng_type = ROCRAND_RNG_PSEUDO_PHILOX4_32_10; else if(engine == "threefry2x32") rng_type = ROCRAND_RNG_PSEUDO_THREEFRY2_32_20; @@ -460,15 +484,15 @@ int main(int argc, char *argv[]) rng_type = ROCRAND_RNG_PSEUDO_THREEFRY4_32_20; else if(engine == "threefry4x64") rng_type = ROCRAND_RNG_PSEUDO_THREEFRY4_64_20; - else if (engine == "sobol32") + else if(engine == "sobol32") rng_type = ROCRAND_RNG_QUASI_SOBOL32; else if(engine == "scrambled_sobol32") rng_type = ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32; - else if (engine == "sobol64") + else if(engine == "sobol64") rng_type = ROCRAND_RNG_QUASI_SOBOL64; else if(engine == "scrambled_sobol64") rng_type = ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64; - else if (engine == "mtgp32") + else if(engine == "mtgp32") rng_type = ROCRAND_RNG_PSEUDO_MTGP32; else if(engine == "lfsr113") rng_type = ROCRAND_RNG_PSEUDO_LFSR113; @@ -480,11 +504,13 @@ int main(int argc, char *argv[]) exit(1); } - if (console_output) std::cout << engine << ":" << std::endl; + if(console_output) + std::cout << engine << ":" << std::endl; - for (auto distribution : distributions) + for(auto distribution : distributions) { - if 
(console_output) std::cout << " " << distribution << ":" << std::endl; + if(console_output) + std::cout << " " << distribution << ":" << std::endl; run_benchmarks(parser, rng_type, distribution, engine, stream); } std::cout << std::endl; diff --git a/benchmark/benchmark_rocrand_host_api.cpp b/benchmark/benchmark_rocrand_host_api.cpp index 58cc41845..b354ad6a3 100644 --- a/benchmark/benchmark_rocrand_host_api.cpp +++ b/benchmark/benchmark_rocrand_host_api.cpp @@ -37,160 +37,182 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; typedef rocrand_rng_type rng_type_t; -template -using generate_func_type = - std::function; - -template -void run_benchmark(benchmark::State &state, generate_func_type generate_func, - const size_t size, const bool byte_size, const size_t trials, - const size_t dimensions, const size_t offset, - const rng_type_t rng_type, const rocrand_ordering ordering, - const bool benchmark_host, hipStream_t stream) { - const size_t binary_div = byte_size ? sizeof(T) : 1; - const size_t rounded_size = (size / binary_div / dimensions) * dimensions; - - T *data; - rocrand_generator generator; - - if (benchmark_host) { - data = new T[rounded_size]; - ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); - } else { - HIP_CHECK(hipMalloc(&data, rounded_size * sizeof(T))); - ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); - } - - ROCRAND_CHECK(rocrand_set_ordering(generator, ordering)); - - rocrand_status status = - rocrand_set_quasi_random_generator_dimensions(generator, dimensions); - if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random - { - ROCRAND_CHECK(status); - } - - ROCRAND_CHECK(rocrand_set_stream(generator, stream)); - - status = rocrand_set_offset(generator, offset); - if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random - { - ROCRAND_CHECK(status); - } - - // Warm-up - for (size_t i = 0; i < 15; i++) { - ROCRAND_CHECK(generate_func(generator, data, rounded_size)); - } - 
HIP_CHECK(hipDeviceSynchronize()); - - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - for (auto _ : state) { - HIP_CHECK(hipEventRecord(start, stream)); - for (size_t i = 0; i < trials; i++) { - ROCRAND_CHECK(generate_func(generator, data, rounded_size)); +template +using generate_func_type = std::function; + +template +void run_benchmark(benchmark::State& state, + generate_func_type generate_func, + const size_t size, + const bool byte_size, + const size_t trials, + const size_t dimensions, + const size_t offset, + const rng_type_t rng_type, + const rocrand_ordering ordering, + const bool benchmark_host, + hipStream_t stream) +{ + const size_t binary_div = byte_size ? sizeof(T) : 1; + const size_t rounded_size = (size / binary_div / dimensions) * dimensions; + + T* data; + rocrand_generator generator; + + if(benchmark_host) + { + data = new T[rounded_size]; + ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); + } + else + { + HIP_CHECK(hipMalloc(&data, rounded_size * sizeof(T))); + ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); + } + + ROCRAND_CHECK(rocrand_set_ordering(generator, ordering)); + + rocrand_status status = rocrand_set_quasi_random_generator_dimensions(generator, dimensions); + if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random + { + ROCRAND_CHECK(status); + } + + ROCRAND_CHECK(rocrand_set_stream(generator, stream)); + + status = rocrand_set_offset(generator, offset); + if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random + { + ROCRAND_CHECK(status); + } + + // Warm-up + for(size_t i = 0; i < 15; i++) + { + ROCRAND_CHECK(generate_func(generator, data, rounded_size)); + } + HIP_CHECK(hipDeviceSynchronize()); + + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + for(auto _ : state) + { + HIP_CHECK(hipEventRecord(start, stream)); + for(size_t i = 0; i < trials; i++) + { + 
ROCRAND_CHECK(generate_func(generator, data, rounded_size)); + } + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed = 0.0f; + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); + + state.SetIterationTime(elapsed / 1000.f); + } + state.SetBytesProcessed(trials * state.iterations() * rounded_size * sizeof(T)); + state.SetItemsProcessed(trials * state.iterations() * rounded_size); + + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipEventDestroy(start)); + ROCRAND_CHECK(rocrand_destroy_generator(generator)); + + if(benchmark_host) + { + delete[] data; + } + else + { + HIP_CHECK(hipFree(data)); } - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed = 0.0f; - HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); - - state.SetIterationTime(elapsed / 1000.f); - } - state.SetBytesProcessed(trials * state.iterations() * rounded_size * - sizeof(T)); - state.SetItemsProcessed(trials * state.iterations() * rounded_size); - - HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipEventDestroy(start)); - ROCRAND_CHECK(rocrand_destroy_generator(generator)); - - if (benchmark_host) { - delete[] data; - } else { - HIP_CHECK(hipFree(data)); - } } -int main(int argc, char *argv[]) { - - // get paramaters before they are passed into - // benchmark::Initialize() - std::string outFormat = ""; - std::string filter = ""; - std::string consoleFormat = ""; - - getFormats(argc, argv, outFormat, filter, consoleFormat); - - // Parse argv - benchmark::Initialize(&argc, argv); - - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_RAND_N, - "number of values"); - parser.set_optional( - "byte-size", "byte-size", false, - "--size is interpreted as the number of generated bytes"); - parser.set_optional("dimensions", "dimensions", 1, - "number of dimensions of quasi-random values"); - parser.set_optional("offset", "offset", 0, - "offset of generated pseudo-random values"); - 
parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional>( - "lambda", "lambda", {10.0}, - "space-separated list of lambdas of Poisson distribution"); - parser.set_optional( - "host", "host", false, - "run benchmarks on the host instead of on the device"); - parser.run_and_exit_if_error(); - - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - // Benchmark info - add_common_benchmark_rocrand_info(); - - const size_t size = parser.get("size"); - const bool byte_size = parser.get("byte-size"); - const size_t trials = parser.get("trials"); - const size_t dimensions = parser.get("dimensions"); - const size_t offset = parser.get("offset"); - const std::vector poisson_lambdas = - parser.get>("lambda"); - const bool benchmark_host = parser.get("host"); - - benchmark::AddCustomContext("size", std::to_string(size)); - benchmark::AddCustomContext("byte-size", std::to_string(byte_size)); - benchmark::AddCustomContext("trials", std::to_string(trials)); - benchmark::AddCustomContext("dimensions", std::to_string(dimensions)); - benchmark::AddCustomContext("offset", std::to_string(offset)); - benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host)); - - std::vector benchmarked_engine_types{ - ROCRAND_RNG_PSEUDO_LFSR113, ROCRAND_RNG_PSEUDO_MRG31K3P, - ROCRAND_RNG_PSEUDO_MRG32K3A, ROCRAND_RNG_PSEUDO_MTGP32, - ROCRAND_RNG_PSEUDO_MT19937, ROCRAND_RNG_PSEUDO_PHILOX4_32_10, - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, - ROCRAND_RNG_PSEUDO_XORWOW, ROCRAND_RNG_QUASI_SOBOL32, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, ROCRAND_RNG_QUASI_SOBOL64, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; - - const std::map ordering_name_map{ - {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, - {ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, - {ROCRAND_ORDERING_PSEUDO_BEST, "best"}, - {ROCRAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"}, - 
{ROCRAND_ORDERING_PSEUDO_SEEDED, "seeded"}, - {ROCRAND_ORDERING_QUASI_DEFAULT, "default"}, - }; - - const std::map> - benchmarked_orderings{ - // clang-format off +int main(int argc, char* argv[]) +{ + + // get paramaters before they are passed into + // benchmark::Initialize() + std::string outFormat = ""; + std::string filter = ""; + std::string consoleFormat = ""; + + getFormats(argc, argv, outFormat, filter, consoleFormat); + + // Parse argv + benchmark::Initialize(&argc, argv); + + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); + parser.set_optional("byte-size", + "byte-size", + false, + "--size is interpreted as the number of generated bytes"); + parser.set_optional("dimensions", + "dimensions", + 1, + "number of dimensions of quasi-random values"); + parser.set_optional("offset", "offset", 0, "offset of generated pseudo-random values"); + parser.set_optional("trials", "trials", 20, "number of trials"); + parser.set_optional>( + "lambda", + "lambda", + {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.set_optional("host", + "host", + false, + "run benchmarks on the host instead of on the device"); + parser.run_and_exit_if_error(); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + // Benchmark info + add_common_benchmark_rocrand_info(); + + const size_t size = parser.get("size"); + const bool byte_size = parser.get("byte-size"); + const size_t trials = parser.get("trials"); + const size_t dimensions = parser.get("dimensions"); + const size_t offset = parser.get("offset"); + const std::vector poisson_lambdas = parser.get>("lambda"); + const bool benchmark_host = parser.get("host"); + + benchmark::AddCustomContext("size", std::to_string(size)); + benchmark::AddCustomContext("byte-size", std::to_string(byte_size)); + benchmark::AddCustomContext("trials", std::to_string(trials)); + benchmark::AddCustomContext("dimensions", std::to_string(dimensions)); + 
benchmark::AddCustomContext("offset", std::to_string(offset)); + benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host)); + + std::vector benchmarked_engine_types{ROCRAND_RNG_PSEUDO_LFSR113, + ROCRAND_RNG_PSEUDO_MRG31K3P, + ROCRAND_RNG_PSEUDO_MRG32K3A, + ROCRAND_RNG_PSEUDO_MTGP32, + ROCRAND_RNG_PSEUDO_MT19937, + ROCRAND_RNG_PSEUDO_PHILOX4_32_10, + ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, + ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, + ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, + ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, + ROCRAND_RNG_PSEUDO_XORWOW, + ROCRAND_RNG_QUASI_SOBOL32, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, + ROCRAND_RNG_QUASI_SOBOL64, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; + + const std::map ordering_name_map{ + {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, + { ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, + { ROCRAND_ORDERING_PSEUDO_BEST, "best"}, + {ROCRAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"}, + { ROCRAND_ORDERING_PSEUDO_SEEDED, "seeded"}, + { ROCRAND_ORDERING_QUASI_DEFAULT, "default"}, + }; + + const std::map> benchmarked_orderings{ + // clang-format off { ROCRAND_RNG_PSEUDO_MTGP32, {ROCRAND_ORDERING_PSEUDO_DEFAULT, ROCRAND_ORDERING_PSEUDO_DYNAMIC}}, { ROCRAND_RNG_PSEUDO_MT19937, {ROCRAND_ORDERING_PSEUDO_DEFAULT}}, @@ -216,166 +238,252 @@ int main(int argc, char *argv[]) { {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, {ROCRAND_ORDERING_QUASI_DEFAULT}}, { ROCRAND_RNG_QUASI_SOBOL64, {ROCRAND_ORDERING_QUASI_DEFAULT}}, {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64, {ROCRAND_ORDERING_QUASI_DEFAULT}}, - // clang-format on - }; - - const std::string benchmark_name_prefix = "device_generate"; - // Add benchmarks - std::vector benchmarks = {}; - for (const rocrand_rng_type engine_type : benchmarked_engine_types) { - const std::string name = engine_name(engine_type); - for (const rocrand_ordering ordering : - benchmarked_orderings.at(engine_type)) { - const std::string name_engine_prefix = - benchmark_name_prefix + "<" + name + "," + - ordering_name_map.at(ordering) + ","; - - 
benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-uint>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned int *data, size_t size_gen) { - return rocrand_generate(gen, data, size_gen); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-uchar>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned char *data, size_t size_gen) { - return rocrand_generate_char(gen, data, size_gen); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-ushort>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned short *data, size_t size_gen) { - return rocrand_generate_short(gen, data, size_gen); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half *data, size_t size_gen) { - return rocrand_generate_uniform_half(gen, data, size_gen); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float *data, size_t size_gen) { - return rocrand_generate_uniform(gen, data, size_gen); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double *data, size_t size_gen) { - return rocrand_generate_uniform_double(gen, data, 
size_gen); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-half>").c_str(), &run_benchmark<__half>, - [](rocrand_generator gen, __half *data, size_t size_gen) { - return rocrand_generate_normal_half( - gen, data, size_gen, __float2half(0.0f), __float2half(1.0f)); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-float>").c_str(), &run_benchmark, - [](rocrand_generator gen, float *data, size_t size_gen) { - return rocrand_generate_normal(gen, data, size_gen, 0.0f, 1.0f); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double *data, size_t size_gen) { - return rocrand_generate_normal_double(gen, data, size_gen, 0.0, - 1.0); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half *data, size_t size_gen) { - return rocrand_generate_log_normal_half( - gen, data, size_gen, __float2half(0.0f), __float2half(1.0f)); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float *data, size_t size_gen) { - return rocrand_generate_log_normal(gen, data, size_gen, 0.0f, 1.0f); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, 
stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double *data, size_t size_gen) { - return rocrand_generate_log_normal_double(gen, data, size_gen, 0.0, - 1.0); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - - for (auto lambda : poisson_lambdas) { - const std::string poisson_dis_name = - std::string("poisson(lambda=") + std::to_string(lambda) + ")>"; - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + poisson_dis_name).c_str(), - &run_benchmark, - [lambda](rocrand_generator gen, unsigned int *data, - size_t size_gen) { - return rocrand_generate_poisson(gen, data, size_gen, lambda); - }, - size, byte_size, trials, dimensions, offset, engine_type, ordering, - benchmark_host, stream)); - } + // clang-format on + }; + + const std::string benchmark_name_prefix = "device_generate"; + // Add benchmarks + std::vector benchmarks = {}; + for(const rocrand_rng_type engine_type : benchmarked_engine_types) + { + const std::string name = engine_name(engine_type); + for(const rocrand_ordering ordering : benchmarked_orderings.at(engine_type)) + { + const std::string name_engine_prefix + = benchmark_name_prefix + "<" + name + "," + ordering_name_map.at(ordering) + ","; + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-uint>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned int* data, size_t size_gen) + { return rocrand_generate(gen, data, size_gen); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-uchar>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned char* data, size_t size_gen) + { return rocrand_generate_char(gen, data, size_gen); }, + size, + 
byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-ushort>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned short* data, size_t size_gen) + { return rocrand_generate_short(gen, data, size_gen); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half* data, size_t size_gen) + { return rocrand_generate_uniform_half(gen, data, size_gen); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float* data, size_t size_gen) + { return rocrand_generate_uniform(gen, data, size_gen); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double* data, size_t size_gen) + { return rocrand_generate_uniform_double(gen, data, size_gen); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half* data, size_t size_gen) + { + return rocrand_generate_normal_half(gen, + data, + size_gen, + __float2half(0.0f), + __float2half(1.0f)); + }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + 
benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float* data, size_t size_gen) + { return rocrand_generate_normal(gen, data, size_gen, 0.0f, 1.0f); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double* data, size_t size_gen) + { return rocrand_generate_normal_double(gen, data, size_gen, 0.0, 1.0); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half* data, size_t size_gen) + { + return rocrand_generate_log_normal_half(gen, + data, + size_gen, + __float2half(0.0f), + __float2half(1.0f)); + }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float* data, size_t size_gen) + { return rocrand_generate_log_normal(gen, data, size_gen, 0.0f, 1.0f); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double* data, size_t size_gen) + { return rocrand_generate_log_normal_double(gen, data, size_gen, 0.0, 1.0); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + + for(auto lambda : poisson_lambdas) + { + const std::string 
poisson_dis_name + = std::string("poisson(lambda=") + std::to_string(lambda) + ")>"; + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + poisson_dis_name).c_str(), + &run_benchmark, + [lambda](rocrand_generator gen, unsigned int* data, size_t size_gen) + { return rocrand_generate_poisson(gen, data, size_gen, lambda); }, + size, + byte_size, + trials, + dimensions, + offset, + engine_type, + ordering, + benchmark_host, + stream)); + } + } } - } - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } - benchmark::BenchmarkReporter *console_reporter = - getConsoleReporter(consoleFormat); - benchmark::BenchmarkReporter *out_file_reporter = - getOutFileReporter(outFormat); + benchmark::BenchmarkReporter* console_reporter = getConsoleReporter(consoleFormat); + benchmark::BenchmarkReporter* out_file_reporter = getOutFileReporter(outFormat); - std::string spec = (filter == "" || filter == "all") ? "." : filter; + std::string spec = (filter == "" || filter == "all") ? "." 
: filter; - // Run benchmarks - if (outFormat == "") // default case - benchmark::RunSpecifiedBenchmarks(console_reporter, spec); - else - benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, - spec); + // Run benchmarks + if(outFormat == "") // default case + benchmark::RunSpecifiedBenchmarks(console_reporter, spec); + else + benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, spec); - HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK(hipStreamDestroy(stream)); - return 0; + return 0; } diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 1f3a6f205..1ee5a4c2a 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -30,205 +30,219 @@ #include "custom_csv_formater.hpp" #include -#define HIP_CHECK(condition) \ - do { \ - hipError_t error_ = condition; \ - if (error_ != hipSuccess) { \ - std::cout << "HIP error: " << error_ << " line: " << __LINE__ \ - << std::endl; \ - exit(error_); \ - } \ - } while (0) - -inline void add_common_benchmark_info() { - auto str = [](const std::string &name, const std::string &val) { - benchmark::AddCustomContext(name, val); - }; - - auto num = [](const std::string &name, const int &value) { - benchmark::AddCustomContext(name, std::to_string(value)); - }; - - auto dim2 = [num](const std::string &name, const int *values) { - num(name + "_x", values[0]); - num(name + "_y", values[1]); - }; - - auto dim3 = [num, dim2](const std::string &name, const int *values) { - dim2(name, values); - num(name + "_z", values[2]); - }; - - auto num_size_t = [](const std::string &name, const size_t &value) { - benchmark::AddCustomContext(name, std::to_string(value)); - }; - - int runtime_version; - HIP_CHECK(hipRuntimeGetVersion(&runtime_version)); - num("hip_runtime_version", runtime_version); - - // On the NVIDIA platform not all members of this struct will be written to - // Zero-initialize to avoid referencing dangling memory - hipDeviceProp_t devProp{}; - int 
device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - str("hdp_name", devProp.name); - num_size_t("hdp_total_global_mem", devProp.totalGlobalMem); - num_size_t("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); - num("hdp_regs_per_block", devProp.regsPerBlock); - num("hdp_warp_size", devProp.warpSize); - num("hdp_max_threads_per_block", devProp.maxThreadsPerBlock); - dim3("hdp_max_threads_dim", devProp.maxThreadsDim); - dim3("hdp_max_grid_size", devProp.maxGridSize); - num("hdp_clock_rate", devProp.clockRate); - num("hdp_memory_clock_rate", devProp.memoryClockRate); - num("hdp_memory_bus_width", devProp.memoryBusWidth); - num_size_t("hdp_total_const_mem", devProp.totalConstMem); - num("hdp_major", devProp.major); - num("hdp_minor", devProp.minor); - num("hdp_multi_processor_count", devProp.multiProcessorCount); - num("hdp_l2_cache_size", devProp.l2CacheSize); - num_size_t("hdp_max_threads_per_multiprocessor", - devProp.maxThreadsPerMultiProcessor); - num("hdp_compute_mode", devProp.computeMode); - num("hdp_clock_instruction_rate", devProp.clockInstructionRate); - num("hdp_concurrent_kernels", devProp.concurrentKernels); - num("hdp_pci_domain_id", devProp.pciDomainID); - num("hdp_pci_bus_id", devProp.pciBusID); - num("hdp_pci_device_id", devProp.pciDeviceID); - num("hdp_max_shared_memory_per_multi_processor", - devProp.maxSharedMemoryPerMultiProcessor); - num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); - num("hdp_can_map_host_memory", devProp.canMapHostMemory); - str("hdp_gcn_arch_name", devProp.gcnArchName); - num("hdp_integrated", devProp.integrated); - num("hdp_cooperative_launch", devProp.cooperativeLaunch); - num("hdp_cooperative_multi_device_launch", - devProp.cooperativeMultiDeviceLaunch); - num_size_t("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); - num("hdp_max_texture_1d", devProp.maxTexture1D); - dim2("hdp_max_texture_2d", devProp.maxTexture2D); - 
dim3("hdp_max_texture_3d", devProp.maxTexture3D); - num_size_t("hdp_mem_pitch", devProp.memPitch); - num("hdp_texture_alignment", devProp.textureAlignment); - num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); - num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); - num("hdp_ecc_enabled", devProp.ECCEnabled); - num("hdp_tcc_driver", devProp.tccDriver); - num("hdp_cooperative_multi_device_unmatched_func", - devProp.cooperativeMultiDeviceUnmatchedFunc); - num("hdp_cooperative_multi_device_unmatched_grid_dim", - devProp.cooperativeMultiDeviceUnmatchedGridDim); - num("hdp_cooperative_multi_device_unmatched_block_dim", - devProp.cooperativeMultiDeviceUnmatchedBlockDim); - num("hdp_cooperative_multi_device_unmatched_shared_mem", - devProp.cooperativeMultiDeviceUnmatchedSharedMem); - num("hdp_is_large_bar", devProp.isLargeBar); - num("hdp_asic_revision", devProp.asicRevision); - num("hdp_managed_memory", devProp.managedMemory); - num("hdp_direct_managed_mem_access_from_host", - devProp.directManagedMemAccessFromHost); - num("hdp_concurrent_managed_access", devProp.concurrentManagedAccess); - num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); - num("hdp_pageable_memory_access_uses_host_page_tables", - devProp.pageableMemoryAccessUsesHostPageTables); - - const auto arch = devProp.arch; - num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); - num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); - num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); - num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); - num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); - num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); - num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); - num("hdp_arch_has_doubles", arch.hasDoubles); - num("hdp_arch_has_warp_vote", arch.hasWarpVote); - num("hdp_arch_has_warp_ballot", 
arch.hasWarpBallot); - num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); - num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); - num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); - num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); - num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); - num("hdp_arch_has_3d_grid", arch.has3dGrid); - num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism); +#define HIP_CHECK(condition) \ + do \ + { \ + hipError_t error_ = condition; \ + if(error_ != hipSuccess) \ + { \ + std::cout << "HIP error: " << error_ << " line: " << __LINE__ << std::endl; \ + exit(error_); \ + } \ + } \ + while(0) + +inline void add_common_benchmark_info() +{ + auto str = [](const std::string& name, const std::string& val) + { benchmark::AddCustomContext(name, val); }; + + auto num = [](const std::string& name, const int& value) + { benchmark::AddCustomContext(name, std::to_string(value)); }; + + auto dim2 = [num](const std::string& name, const int* values) + { + num(name + "_x", values[0]); + num(name + "_y", values[1]); + }; + + auto dim3 = [num, dim2](const std::string& name, const int* values) + { + dim2(name, values); + num(name + "_z", values[2]); + }; + + auto num_size_t = [](const std::string& name, const size_t& value) + { benchmark::AddCustomContext(name, std::to_string(value)); }; + + int runtime_version; + HIP_CHECK(hipRuntimeGetVersion(&runtime_version)); + num("hip_runtime_version", runtime_version); + + // On the NVIDIA platform not all members of this struct will be written to + // Zero-initialize to avoid referencing dangling memory + hipDeviceProp_t devProp{}; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + str("hdp_name", devProp.name); + num_size_t("hdp_total_global_mem", devProp.totalGlobalMem); + num_size_t("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); + num("hdp_regs_per_block", devProp.regsPerBlock); 
+ num("hdp_warp_size", devProp.warpSize); + num("hdp_max_threads_per_block", devProp.maxThreadsPerBlock); + dim3("hdp_max_threads_dim", devProp.maxThreadsDim); + dim3("hdp_max_grid_size", devProp.maxGridSize); + num("hdp_clock_rate", devProp.clockRate); + num("hdp_memory_clock_rate", devProp.memoryClockRate); + num("hdp_memory_bus_width", devProp.memoryBusWidth); + num_size_t("hdp_total_const_mem", devProp.totalConstMem); + num("hdp_major", devProp.major); + num("hdp_minor", devProp.minor); + num("hdp_multi_processor_count", devProp.multiProcessorCount); + num("hdp_l2_cache_size", devProp.l2CacheSize); + num_size_t("hdp_max_threads_per_multiprocessor", devProp.maxThreadsPerMultiProcessor); + num("hdp_compute_mode", devProp.computeMode); + num("hdp_clock_instruction_rate", devProp.clockInstructionRate); + num("hdp_concurrent_kernels", devProp.concurrentKernels); + num("hdp_pci_domain_id", devProp.pciDomainID); + num("hdp_pci_bus_id", devProp.pciBusID); + num("hdp_pci_device_id", devProp.pciDeviceID); + num("hdp_max_shared_memory_per_multi_processor", devProp.maxSharedMemoryPerMultiProcessor); + num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); + num("hdp_can_map_host_memory", devProp.canMapHostMemory); + str("hdp_gcn_arch_name", devProp.gcnArchName); + num("hdp_integrated", devProp.integrated); + num("hdp_cooperative_launch", devProp.cooperativeLaunch); + num("hdp_cooperative_multi_device_launch", devProp.cooperativeMultiDeviceLaunch); + num_size_t("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); + num("hdp_max_texture_1d", devProp.maxTexture1D); + dim2("hdp_max_texture_2d", devProp.maxTexture2D); + dim3("hdp_max_texture_3d", devProp.maxTexture3D); + num_size_t("hdp_mem_pitch", devProp.memPitch); + num("hdp_texture_alignment", devProp.textureAlignment); + num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); + num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); + num("hdp_ecc_enabled", devProp.ECCEnabled); + 
num("hdp_tcc_driver", devProp.tccDriver); + num("hdp_cooperative_multi_device_unmatched_func", devProp.cooperativeMultiDeviceUnmatchedFunc); + num("hdp_cooperative_multi_device_unmatched_grid_dim", + devProp.cooperativeMultiDeviceUnmatchedGridDim); + num("hdp_cooperative_multi_device_unmatched_block_dim", + devProp.cooperativeMultiDeviceUnmatchedBlockDim); + num("hdp_cooperative_multi_device_unmatched_shared_mem", + devProp.cooperativeMultiDeviceUnmatchedSharedMem); + num("hdp_is_large_bar", devProp.isLargeBar); + num("hdp_asic_revision", devProp.asicRevision); + num("hdp_managed_memory", devProp.managedMemory); + num("hdp_direct_managed_mem_access_from_host", devProp.directManagedMemAccessFromHost); + num("hdp_concurrent_managed_access", devProp.concurrentManagedAccess); + num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); + num("hdp_pageable_memory_access_uses_host_page_tables", + devProp.pageableMemoryAccessUsesHostPageTables); + + const auto arch = devProp.arch; + num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); + num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); + num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); + num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); + num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); + num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); + num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); + num("hdp_arch_has_doubles", arch.hasDoubles); + num("hdp_arch_has_warp_vote", arch.hasWarpVote); + num("hdp_arch_has_warp_ballot", arch.hasWarpBallot); + num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); + num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); + num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); + num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); + num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); + num("hdp_arch_has_3d_grid", 
arch.has3dGrid); + num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism); } -inline size_t next_power2(size_t x) { - size_t power = 1; - while (power < x) { - power *= 2; - } - return power; +inline size_t next_power2(size_t x) +{ + size_t power = 1; + while(power < x) + { + power *= 2; + } + return power; } -inline benchmark::BenchmarkReporter * -getConsoleReporter(const std::string format) { - benchmark::BenchmarkReporter *reporter; - if (format == "csv") { - static benchmark::customCSVReporter csv_reporter; - csv_reporter.SetErrorStream(&std::cout); - csv_reporter.SetOutputStream(&std::cout); - reporter = &csv_reporter; - } else if (format == "json") { - static benchmark::customCSVReporter json_reporter; - json_reporter.SetErrorStream(&std::cout); - json_reporter.SetOutputStream(&std::cout); - reporter = &json_reporter; - } else { - static benchmark::ConsoleReporter terminal_reporter; - terminal_reporter.SetErrorStream(&std::cout); - terminal_reporter.SetOutputStream(&std::cout); - reporter = &terminal_reporter; - } - - return reporter; +inline benchmark::BenchmarkReporter* getConsoleReporter(const std::string format) +{ + benchmark::BenchmarkReporter* reporter; + if(format == "csv") + { + static benchmark::customCSVReporter csv_reporter; + csv_reporter.SetErrorStream(&std::cout); + csv_reporter.SetOutputStream(&std::cout); + reporter = &csv_reporter; + } + else if(format == "json") + { + static benchmark::customCSVReporter json_reporter; + json_reporter.SetErrorStream(&std::cout); + json_reporter.SetOutputStream(&std::cout); + reporter = &json_reporter; + } + else + { + static benchmark::ConsoleReporter terminal_reporter; + terminal_reporter.SetErrorStream(&std::cout); + terminal_reporter.SetOutputStream(&std::cout); + reporter = &terminal_reporter; + } + + return reporter; } -inline benchmark::BenchmarkReporter * -getOutFileReporter(const std::string format) { - benchmark::BenchmarkReporter *reporter = nullptr; - std::ofstream output_file; - if 
(format == "csv") { - static benchmark::customCSVReporter csv_reporter; - csv_reporter.SetOutputStream(&output_file); - csv_reporter.SetErrorStream(&output_file); - reporter = &csv_reporter; - } else if (format == "json") { - static benchmark::JSONReporter json_reporter; - json_reporter.SetOutputStream(&output_file); - json_reporter.SetErrorStream(&output_file); - reporter = &json_reporter; - } else if (format == "console") { - static benchmark::ConsoleReporter console_reporter; - console_reporter.SetOutputStream(&output_file); - console_reporter.SetErrorStream(&output_file); - reporter = &console_reporter; - } - - return reporter; +inline benchmark::BenchmarkReporter* getOutFileReporter(const std::string format) +{ + benchmark::BenchmarkReporter* reporter = nullptr; + std::ofstream output_file; + if(format == "csv") + { + static benchmark::customCSVReporter csv_reporter; + csv_reporter.SetOutputStream(&output_file); + csv_reporter.SetErrorStream(&output_file); + reporter = &csv_reporter; + } + else if(format == "json") + { + static benchmark::JSONReporter json_reporter; + json_reporter.SetOutputStream(&output_file); + json_reporter.SetErrorStream(&output_file); + reporter = &json_reporter; + } + else if(format == "console") + { + static benchmark::ConsoleReporter console_reporter; + console_reporter.SetOutputStream(&output_file); + console_reporter.SetErrorStream(&output_file); + reporter = &console_reporter; + } + + return reporter; } -inline void getFormats(const int argc, char *argv[], std::string &outFormat, - std::string &filter, std::string &consoleFormat) { - for (int i = 1; i < argc; i++) { - std::string input(argv[i]); - int equalPos = input.find("="); - - if (equalPos < 0) - continue; - - std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); - std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); - - if (arg == "benchmark_out_format") - outFormat = argVal; - else if (arg == "benchmark_filter") - filter = 
argVal; - else if (arg == "benchmark_format") - consoleFormat = argVal; - } +inline void getFormats(const int argc, + char* argv[], + std::string& outFormat, + std::string& filter, + std::string& consoleFormat) +{ + for(int i = 1; i < argc; i++) + { + std::string input(argv[i]); + int equalPos = input.find("="); + + if(equalPos < 0) + continue; + + std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); + std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); + + if(arg == "benchmark_out_format") + outFormat = argVal; + else if(arg == "benchmark_filter") + filter = argVal; + else if(arg == "benchmark_format") + consoleFormat = argVal; + } } #endif // ROCRAND_BENCHMARK_UTILS_HPP_ diff --git a/benchmark/cmdparser.hpp b/benchmark/cmdparser.hpp index 1ae024715..27264f837 100644 --- a/benchmark/cmdparser.hpp +++ b/benchmark/cmdparser.hpp @@ -26,488 +26,622 @@ */ #pragma once +#include #include +#include #include #include #include -#include -#include -namespace cli { - struct CallbackArgs { - const std::vector& arguments; - std::ostream& output; - std::ostream& error; +namespace cli +{ +struct CallbackArgs +{ + const std::vector& arguments; + std::ostream& output; + std::ostream& error; +}; +class Parser +{ +private: + class CmdBase + { + public: + explicit CmdBase(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant, + bool variadic) + : name(name) + , command(name.size() > 0 ? "-" + name : "") + , alternative(alternative.size() > 0 ? 
"--" + alternative : "") + , description(description) + , required(required) + , handled(false) + , arguments({}) + , dominant(dominant) + , variadic(variadic) + {} + + virtual ~CmdBase() {} + + std::string name; + std::string command; + std::string alternative; + std::string description; + bool required; + bool handled; + std::vector arguments; + bool const dominant; + bool const variadic; + + virtual std::string print_value() const = 0; + virtual bool parse(std::ostream& output, std::ostream& error) = 0; + + bool is(const std::string& given) const + { + return given == command || given == alternative; + } }; - class Parser { - private: - class CmdBase { - public: - explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : - name(name), - command(name.size() > 0 ? "-" + name : ""), - alternative(alternative.size() > 0 ? "--" + alternative : ""), - description(description), - required(required), - handled(false), - arguments({}), - dominant(dominant), - variadic(variadic) { - } - virtual ~CmdBase() { - } - - std::string name; - std::string command; - std::string alternative; - std::string description; - bool required; - bool handled; - std::vector arguments; - bool const dominant; - bool const variadic; - - virtual std::string print_value() const = 0; - virtual bool parse(std::ostream& output, std::ostream& error) = 0; - - bool is(const std::string& given) const { - return given == command || given == alternative; - } - }; + template + struct ArgumentCountChecker + { + static constexpr bool Variadic = false; + }; - template - struct ArgumentCountChecker - { - static constexpr bool Variadic = false; - }; + template + struct ArgumentCountChecker> + { + static constexpr bool Variadic = true; + }; - template - struct ArgumentCountChecker> + template + class CmdFunction final : public CmdBase + { + public: + explicit CmdFunction(const std::string& name, + const std::string& 
alternative, + const std::string& description, + bool required, + bool dominant) + : CmdBase(name, + alternative, + description, + required, + dominant, + ArgumentCountChecker::Variadic) + {} + + bool parse(std::ostream& output, std::ostream& error) override { - static constexpr bool Variadic = true; - }; - - template - class CmdFunction final : public CmdBase { - public: - explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : - CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { + try + { + CallbackArgs args{arguments, output, error}; + value = callback(args); + return true; } - - bool parse(std::ostream& output, std::ostream& error) override { - try { - CallbackArgs args { arguments, output, error }; - value = callback(args); - return true; - } catch (...) { - return false; - } - } - - std::string print_value() const override { - return ""; + catch(...) + { + return false; } + } - std::function callback; - T value; - }; + std::string print_value() const override + { + return ""; + } - template - class CmdArgument final : public CmdBase { - public: - explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : - CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { - } + std::function callback; + T value; + }; - bool parse(std::ostream&, std::ostream&) override { - try { - value = Parser::parse(arguments, value); - return true; - } catch (...) 
{ - return false; - } + template + class CmdArgument final : public CmdBase + { + public: + explicit CmdArgument(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant) + : CmdBase(name, + alternative, + description, + required, + dominant, + ArgumentCountChecker::Variadic) + {} + + bool parse(std::ostream&, std::ostream&) override + { + try + { + value = Parser::parse(arguments, value); + return true; } - - std::string print_value() const override { - return stringify(value); + catch(...) + { + return false; } + } - T value {}; - }; + std::string print_value() const override + { + return stringify(value); + } - static int parse(const std::vector& elements, const int&) { - if (elements.size() != 1) - throw std::bad_cast(); + T value{}; + }; - return std::stoi(elements[0]); - } + static int parse(const std::vector& elements, const int&) + { + if(elements.size() != 1) + throw std::bad_cast(); - static bool parse(const std::vector& elements, const bool& defval) { - if (elements.size() != 0) - throw std::runtime_error("A boolean command line parameter cannot have any arguments."); + return std::stoi(elements[0]); + } - return !defval; - } + static bool parse(const std::vector& elements, const bool& defval) + { + if(elements.size() != 0) + throw std::runtime_error("A boolean command line parameter cannot have any arguments."); - static double parse(const std::vector& elements, const double&) { - if (elements.size() != 1) - throw std::bad_cast(); + return !defval; + } - return std::stod(elements[0]); - } + static double parse(const std::vector& elements, const double&) + { + if(elements.size() != 1) + throw std::bad_cast(); - static float parse(const std::vector& elements, const float&) { - if (elements.size() != 1) - throw std::bad_cast(); + return std::stod(elements[0]); + } - return std::stof(elements[0]); - } + static float parse(const std::vector& elements, const float&) + { + if(elements.size() 
!= 1) + throw std::bad_cast(); - static long double parse(const std::vector& elements, const long double&) { - if (elements.size() != 1) - throw std::bad_cast(); + return std::stof(elements[0]); + } - return std::stold(elements[0]); - } + static long double parse(const std::vector& elements, const long double&) + { + if(elements.size() != 1) + throw std::bad_cast(); - static unsigned int parse(const std::vector& elements, const unsigned int&) { - if (elements.size() != 1) - throw std::bad_cast(); + return std::stold(elements[0]); + } - return static_cast(std::stoul(elements[0])); - } + static unsigned int parse(const std::vector& elements, const unsigned int&) + { + if(elements.size() != 1) + throw std::bad_cast(); - static unsigned long parse(const std::vector& elements, const unsigned long&) { - if (elements.size() != 1) - throw std::bad_cast(); + return static_cast(std::stoul(elements[0])); + } - return std::stoul(elements[0]); - } - - static unsigned long long parse(const std::vector& elements, const unsigned long long&) { - if (elements.size() != 1) - throw std::bad_cast(); + static unsigned long parse(const std::vector& elements, const unsigned long&) + { + if(elements.size() != 1) + throw std::bad_cast(); - return std::stoull(elements[0]); - } + return std::stoul(elements[0]); + } - static long parse(const std::vector& elements, const long&) { - if (elements.size() != 1) - throw std::bad_cast(); + static unsigned long long parse(const std::vector& elements, + const unsigned long long&) + { + if(elements.size() != 1) + throw std::bad_cast(); - return std::stol(elements[0]); - } + return std::stoull(elements[0]); + } - static std::string parse(const std::vector& elements, const std::string&) { - if (elements.size() != 1) - throw std::bad_cast(); + static long parse(const std::vector& elements, const long&) + { + if(elements.size() != 1) + throw std::bad_cast(); - return elements[0]; - } + return std::stol(elements[0]); + } - template - static std::vector 
parse(const std::vector& elements, const std::vector&) { - const T defval = T(); - std::vector values { }; - std::vector buffer(1); + static std::string parse(const std::vector& elements, const std::string&) + { + if(elements.size() != 1) + throw std::bad_cast(); - for (const auto& element : elements) { - buffer[0] = element; - values.push_back(parse(buffer, defval)); - } + return elements[0]; + } - return values; - } + template + static std::vector parse(const std::vector& elements, const std::vector&) + { + const T defval = T(); + std::vector values{}; + std::vector buffer(1); - template - static std::string stringify(const T& value) { - return std::to_string(value); + for(const auto& element : elements) + { + buffer[0] = element; + values.push_back(parse(buffer, defval)); } - template - static std::string stringify(const std::vector& values) { - std::stringstream ss { }; - ss << "[ "; + return values; + } - for (const auto& value : values) { - ss << stringify(value) << " "; - } + template + static std::string stringify(const T& value) + { + return std::to_string(value); + } - ss << "]"; - return ss.str(); - } + template + static std::string stringify(const std::vector& values) + { + std::stringstream ss{}; + ss << "[ "; - static std::string stringify(const std::string& str) { - return str; + for(const auto& value : values) + { + ss << stringify(value) << " "; } - public: - explicit Parser(int argc, const char** argv) : - _appname(argv[0]) { - for (int i = 1; i < argc; ++i) { - _arguments.push_back(argv[i]); - } - enable_help(); - } + ss << "]"; + return ss.str(); + } - explicit Parser(int argc, char** argv) : - _appname(argv[0]) { - for (int i = 1; i < argc; ++i) { - _arguments.push_back(argv[i]); - } - enable_help(); - } + static std::string stringify(const std::string& str) + { + return str; + } - ~Parser() { - for (int i = 0, n = _commands.size(); i < n; ++i) { - delete _commands[i]; - } +public: + explicit Parser(int argc, const char** argv) : 
_appname(argv[0]) + { + for(int i = 1; i < argc; ++i) + { + _arguments.push_back(argv[i]); } + enable_help(); + } - bool has_help() const { - for (const auto command : _commands) { - if (command->name == "h" && command->alternative == "--help") { - return true; - } - } - - return false; + explicit Parser(int argc, char** argv) : _appname(argv[0]) + { + for(int i = 1; i < argc; ++i) + { + _arguments.push_back(argv[i]); } + enable_help(); + } - void enable_help() { - set_callback("h", "help", std::function([this](CallbackArgs& args){ - args.output << this->usage(); - exit(0); - return false; - }), "", true); + ~Parser() + { + for(int i = 0, n = _commands.size(); i < n; ++i) + { + delete _commands[i]; } + } - void disable_help() { - for (auto command = _commands.begin(); command != _commands.end(); ++command) { - if ((*command)->name == "h" && (*command)->alternative == "--help") { - _commands.erase(command); - break; - } + bool has_help() const + { + for(const auto command : _commands) + { + if(command->name == "h" && command->alternative == "--help") + { + return true; } } - template - void set_default(bool is_required, const std::string& description = "") { - auto command = new CmdArgument { "", "", description, is_required, false }; - _commands.push_back(command); - } - - template - void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) { - auto command = new CmdArgument { name, alternative, description, true, dominant }; - _commands.push_back(command); + return false; + } + + void enable_help() + { + set_callback("h", + "help", + std::function( + [this](CallbackArgs& args) + { + args.output << this->usage(); + exit(0); + return false; + }), + "", + true); + } + + void disable_help() + { + for(auto command = _commands.begin(); command != _commands.end(); ++command) + { + if((*command)->name == "h" && (*command)->alternative == "--help") + { + _commands.erase(command); + break; + } } - 
- template - void set_optional(const std::string& name, const std::string& alternative, const T& defaultValue, const std::string& description = "", bool dominant = false) { - auto command = new CmdArgument { name, alternative, description, false, dominant }; - command->value = defaultValue; - _commands.push_back(command); + } + + template + void set_default(bool is_required, const std::string& description = "") + { + auto command = new CmdArgument{"", "", description, is_required, false}; + _commands.push_back(command); + } + + template + void set_required(const std::string& name, + const std::string& alternative, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdArgument{name, alternative, description, true, dominant}; + _commands.push_back(command); + } + + template + void set_optional(const std::string& name, + const std::string& alternative, + const T& defaultValue, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdArgument{name, alternative, description, false, dominant}; + command->value = defaultValue; + _commands.push_back(command); + } + + template + void set_callback(const std::string& name, + const std::string& alternative, + std::function callback, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdFunction{name, alternative, description, false, dominant}; + command->callback = callback; + _commands.push_back(command); + } + + inline void run_and_exit_if_error() + { + if(run() == false) + { + exit(1); } + } - template - void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { - auto command = new CmdFunction { name, alternative, description, false, dominant }; - command->callback = callback; - _commands.push_back(command); - } + inline bool run() + { + return run(std::cout, std::cerr); + } - inline void run_and_exit_if_error() { 
- if (run() == false) { - exit(1); - } - } + inline bool run(std::ostream& output) + { + return run(output, std::cerr); + } - inline bool run() { - return run(std::cout, std::cerr); - } + bool run(std::ostream& output, std::ostream& error) + { + if(_arguments.size() > 0) + { + auto current = find_default(); - inline bool run(std::ostream& output) { - return run(output, std::cerr); - } + for(int i = 0, n = _arguments.size(); i < n; ++i) + { + auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; + auto associated = isarg ? find(_arguments[i]) : nullptr; - bool run(std::ostream& output, std::ostream& error) { - if (_arguments.size() > 0) { - auto current = find_default(); - - for (int i = 0, n = _arguments.size(); i < n; ++i) { - auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; - auto associated = isarg ? find(_arguments[i]) : nullptr; - - if (associated != nullptr) { - current = associated; - associated->handled = true; - } else if (current == nullptr) { - error << no_default(); - return false; - } else { - current->arguments.push_back(_arguments[i]); - current->handled = true; - if (!current->variadic) - { - // If the current command is not variadic, then no more arguments - // should be added to it. In this case, switch back to the default - // command. - current = find_default(); - } - } + if(associated != nullptr) + { + current = associated; + associated->handled = true; } - } - - // First, parse dominant arguments since they succeed even if required - // arguments are missing. - for (auto command : _commands) { - if (command->handled && command->dominant && !command->parse(output, error)) { - error << howto_use(command); + else if(current == nullptr) + { + error << no_default(); return false; } - } - - // Next, check for any missing arguments. 
- for (auto command : _commands) { - if (command->required && !command->handled) { - error << howto_required(command); - return false; + else + { + current->arguments.push_back(_arguments[i]); + current->handled = true; + if(!current->variadic) + { + // If the current command is not variadic, then no more arguments + // should be added to it. In this case, switch back to the default + // command. + current = find_default(); + } } } + } - // Finally, parse all remaining arguments. - for (auto command : _commands) { - if (command->handled && !command->dominant && !command->parse(output, error)) { - error << howto_use(command); - return false; - } + // First, parse dominant arguments since they succeed even if required + // arguments are missing. + for(auto command : _commands) + { + if(command->handled && command->dominant && !command->parse(output, error)) + { + error << howto_use(command); + return false; } - - return true; } - template - T get(const std::string& name) const { - for (const auto& command : _commands) { - if (command->name == name) { - auto cmd = dynamic_cast*>(command); - - if (cmd == nullptr) { - throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); - } - - return cmd->value; - } + // Next, check for any missing arguments. + for(auto command : _commands) + { + if(command->required && !command->handled) + { + error << howto_required(command); + return false; } - - throw std::runtime_error("The parameter " + name + " could not be found."); } - template - T get_if(const std::string& name, std::function callback) const { - auto value = get(name); - return callback(value); + // Finally, parse all remaining arguments. 
+ for(auto command : _commands) + { + if(command->handled && !command->dominant && !command->parse(output, error)) + { + error << howto_use(command); + return false; + } } - int requirements() const { - int count = 0; + return true; + } - for (const auto& command : _commands) { - if (command->required) { - ++count; + template + T get(const std::string& name) const + { + for(const auto& command : _commands) + { + if(command->name == name) + { + auto cmd = dynamic_cast*>(command); + + if(cmd == nullptr) + { + throw std::runtime_error("Invalid usage of the parameter " + name + + " detected."); } - } - return count; + return cmd->value; + } } - int commands() const { - return static_cast(_commands.size()); - } + throw std::runtime_error("The parameter " + name + " could not be found."); + } - inline const std::string& app_name() const { - return _appname; - } + template + T get_if(const std::string& name, std::function callback) const + { + auto value = get(name); + return callback(value); + } - protected: - CmdBase* find(const std::string& name) { - for (auto command : _commands) { - if (command->is(name)) { - return command; - } - } + int requirements() const + { + int count = 0; - return nullptr; + for(const auto& command : _commands) + { + if(command->required) + { + ++count; + } } - CmdBase* find_default() { - for (auto command : _commands) { - if (command->name == "") { - return command; - } - } + return count; + } + + int commands() const + { + return static_cast(_commands.size()); + } - return nullptr; + inline const std::string& app_name() const + { + return _appname; + } + +protected: + CmdBase* find(const std::string& name) + { + for(auto command : _commands) + { + if(command->is(name)) + { + return command; + } } - std::string usage() const { - std::stringstream ss { }; - ss << "Available parameters:\n\n"; + return nullptr; + } - for (const auto& command : _commands) { - ss << " " << command->command << "\t" << command->alternative; + CmdBase* 
find_default() + { + for(auto command : _commands) + { + if(command->name == "") + { + return command; + } + } - if (command->required == true) { - ss << "\t(required)"; - } + return nullptr; + } - ss << "\n " << command->description; + std::string usage() const + { + std::stringstream ss{}; + ss << "Available parameters:\n\n"; - if (command->required == false) { - ss << "\n " << "This parameter is optional. The default value is '" + command->print_value() << "'."; - } + for(const auto& command : _commands) + { + ss << " " << command->command << "\t" << command->alternative; - ss << "\n\n"; + if(command->required == true) + { + ss << "\t(required)"; } - return ss.str(); - } + ss << "\n " << command->description; - void print_help(std::stringstream& ss) const { - if (has_help()) { - ss << "For more help use --help or -h.\n"; + if(command->required == false) + { + ss << "\n " + << "This parameter is optional. The default value is '" + command->print_value() + << "'."; } - } - - std::string howto_required(CmdBase* command) const { - std::stringstream ss { }; - ss << "The parameter " << command->name << " is required.\n"; - ss << command->description << '\n'; - print_help(ss); - return ss.str(); - } - std::string howto_use(CmdBase* command) const { - std::stringstream ss { }; - ss << "The parameter " << command->name << " has invalid arguments.\n"; - ss << command->description << '\n'; - print_help(ss); - return ss.str(); + ss << "\n\n"; } - std::string no_default() const { - std::stringstream ss { }; - ss << "No default parameter has been specified.\n"; - ss << "The given argument must be used with a parameter.\n"; - print_help(ss); - return ss.str(); - } + return ss.str(); + } - private: - const std::string _appname; - std::vector _arguments; - std::vector _commands; - }; -} + void print_help(std::stringstream& ss) const + { + if(has_help()) + { + ss << "For more help use --help or -h.\n"; + } + } + + std::string howto_required(CmdBase* command) const + { + 
std::stringstream ss{}; + ss << "The parameter " << command->name << " is required.\n"; + ss << command->description << '\n'; + print_help(ss); + return ss.str(); + } + + std::string howto_use(CmdBase* command) const + { + std::stringstream ss{}; + ss << "The parameter " << command->name << " has invalid arguments.\n"; + ss << command->description << '\n'; + print_help(ss); + return ss.str(); + } + + std::string no_default() const + { + std::stringstream ss{}; + ss << "No default parameter has been specified.\n"; + ss << "The given argument must be used with a parameter.\n"; + print_help(ss); + return ss.str(); + } + +private: + const std::string _appname; + std::vector _arguments; + std::vector _commands; +}; +} // namespace cli diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp index c48a068e2..aea2aed19 100644 --- a/benchmark/custom_csv_formater.hpp +++ b/benchmark/custom_csv_formater.hpp @@ -22,180 +22,208 @@ #include #include -namespace benchmark { +namespace benchmark +{ -class customCSVReporter : public BenchmarkReporter { +class customCSVReporter : public BenchmarkReporter +{ public: - customCSVReporter() : printed_header_(false) {} - bool ReportContext(const Context &context) override; - void ReportRuns(const std::vector &reports) override; + customCSVReporter() : printed_header_(false) {} + bool ReportContext(const Context& context) override; + void ReportRuns(const std::vector& reports) override; private: - std::string CsvEscape(const std::string &s) { - std::string tmp; - tmp.reserve(s.size() + 2); - for (char c : s) { - switch (c) { - case '"': - tmp += "\"\""; - break; - default: - tmp += c; - break; - } + std::string CsvEscape(const std::string& s) + { + std::string tmp; + tmp.reserve(s.size() + 2); + for(char c : s) + { + switch(c) + { + case '"': tmp += "\"\""; break; + default: tmp += c; break; + } + } + return '"' + tmp + '"'; } - return '"' + tmp + '"'; - } - - // Function to return an string for the calculated 
complexity - std::string GetBigOString(const BigO complexity) { - switch (complexity) { - case oN: - return "N"; - case oNSquared: - return "N^2"; - case oNCubed: - return "N^3"; - case oLogN: - return "lgN"; - case oNLogN: - return "NlgN"; - case o1: - return "(1)"; - default: - return "f(N)"; - } - } - - void PrintRunData(const Run &report); - bool printed_header_; - std::set user_counter_names_; - std::ostream *nullLog = nullptr; + // Function to return an string for the calculated complexity + std::string GetBigOString(const BigO complexity) + { + switch(complexity) + { + case oN: return "N"; + case oNSquared: return "N^2"; + case oNCubed: return "N^3"; + case oLogN: return "lgN"; + case oNLogN: return "NlgN"; + case o1: return "(1)"; + default: return "f(N)"; + } + } - std::array elements = { - "engine", "distribution", "name", "iterations", - "real_time", "cpu_time", "time_unit", "bytes_per_second", - "items_per_second", "label", "error_occurred", "error_message"}; + void PrintRunData(const Run& report); + bool printed_header_; + std::set user_counter_names_; + + std::ostream* nullLog = nullptr; + + std::array elements = {"engine", + "distribution", + "name", + "iterations", + "real_time", + "cpu_time", + "time_unit", + "bytes_per_second", + "items_per_second", + "label", + "error_occurred", + "error_message"}; }; -bool customCSVReporter::ReportContext(const Context &context) { - PrintBasicContext(&GetErrorStream(), context); - return true; +bool customCSVReporter::ReportContext(const Context& context) +{ + PrintBasicContext(&GetErrorStream(), context); + return true; } -void customCSVReporter::ReportRuns(const std::vector &reports) { - std::ostream &Out = GetOutputStream(); - - if (!printed_header_) { - // save the names of all the user counters - for (const auto &run : reports) { - for (const auto &cnt : run.counters) { - if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") - continue; - user_counter_names_.insert(cnt.first); - } +void 
customCSVReporter::ReportRuns(const std::vector& reports) +{ + std::ostream& Out = GetOutputStream(); + + if(!printed_header_) + { + // save the names of all the user counters + for(const auto& run : reports) + { + for(const auto& cnt : run.counters) + { + if(cnt.first == "bytes_per_second" || cnt.first == "items_per_second") + continue; + user_counter_names_.insert(cnt.first); + } + } + + // print the header + for(auto B = elements.begin(); B != elements.end();) + { + Out << *B++; + if(B != elements.end()) + Out << ","; + } + for(auto B = user_counter_names_.begin(); B != user_counter_names_.end();) + { + Out << ",\"" << *B++ << "\""; + } + Out << "\n"; + + printed_header_ = true; + } + else + { + // check that all the current counters are saved in the name set + for(const auto& run : reports) + { + for(const auto& cnt : run.counters) + { + if(cnt.first == "bytes_per_second" || cnt.first == "items_per_second") + continue; + + // benchmark::internal::GetNullLogInstance() + *nullLog << "All counters must be present in each run. 
" + << "Counter named \"" << cnt.first + << "\" was not in a run after being added to the header"; + } + } } - // print the header - for (auto B = elements.begin(); B != elements.end();) { - Out << *B++; - if (B != elements.end()) - Out << ","; + // print results for each run + for(const auto& run : reports) + { + PrintRunData(run); } - for (auto B = user_counter_names_.begin(); - B != user_counter_names_.end();) { - Out << ",\"" << *B++ << "\""; +} + +void customCSVReporter::PrintRunData(const Run& run) +{ + std::ostream& Out = GetOutputStream(); + std::ostream& Err = GetErrorStream(); + + //get the name of the engine and distribution: + + std::string temp = run.benchmark_name(); + temp.erase(0, temp.find("<") + 1); + + std::string engineName = std::string(temp.begin(), temp.begin() + temp.find(",")); + + temp.erase(0, engineName.size() + 1); + temp.erase(0, temp.find(",") + 1); + std::string disName = std::string(temp.begin(), temp.begin() + temp.find(">")); + + Out << engineName << ","; + Out << disName << ","; + Out << CsvEscape(run.benchmark_name()) << ","; + if(run.error_occurred) + { + Err << std::string(elements.size() - 3, ','); + Err << "true,"; + Err << CsvEscape(run.error_message) << "\n"; + return; } - Out << "\n"; - - printed_header_ = true; - } else { - // check that all the current counters are saved in the name set - for (const auto &run : reports) { - for (const auto &cnt : run.counters) { - if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") - continue; - - // benchmark::internal::GetNullLogInstance() - *nullLog << "All counters must be present in each run. 
" - << "Counter named \"" << cnt.first - << "\" was not in a run after being added to the header"; - } + + // Do not print iteration on bigO and RMS report + if(!run.report_big_o && !run.report_rms) + { + Out << run.iterations; } - } + Out << ","; - // print results for each run - for (const auto &run : reports) { - PrintRunData(run); - } -} + Out << run.GetAdjustedRealTime() << ","; + Out << run.GetAdjustedCPUTime() << ","; -void customCSVReporter::PrintRunData(const Run &run) { - std::ostream &Out = GetOutputStream(); - std::ostream &Err = GetErrorStream(); - - //get the name of the engine and distribution: - - std::string temp = run.benchmark_name(); - temp.erase(0, temp.find("<") + 1); - - std::string engineName = std::string(temp.begin(), temp.begin() + temp.find(",")); - - temp.erase(0, engineName.size() + 1); - temp.erase(0, temp.find(",") + 1); - std::string disName = std::string(temp.begin(), temp.begin() + temp.find(">")); - - Out << engineName << ","; - Out << disName << ","; - Out << CsvEscape(run.benchmark_name()) << ","; - if (run.error_occurred) { - Err << std::string(elements.size() - 3, ','); - Err << "true,"; - Err << CsvEscape(run.error_message) << "\n"; - return; - } - - // Do not print iteration on bigO and RMS report - if (!run.report_big_o && !run.report_rms) { - Out << run.iterations; - } - Out << ","; - - Out << run.GetAdjustedRealTime() << ","; - Out << run.GetAdjustedCPUTime() << ","; - - // Do not print timeLabel on bigO and RMS report - if (run.report_big_o) { - Out << GetBigOString(run.complexity); - } else if (!run.report_rms) { - Out << GetTimeUnitString(run.time_unit); - } - Out << ","; - - if (run.counters.find("bytes_per_second") != run.counters.end()) { - Out << run.counters.at("bytes_per_second"); - } - Out << ","; - if (run.counters.find("items_per_second") != run.counters.end()) { - Out << run.counters.at("items_per_second"); - } - Out << ","; - if (!run.report_label.empty()) { - Out << CsvEscape(run.report_label); - } - Out 
<< ",,"; // for error_occurred and error_message - - // Print user counters - for (const auto &ucn : user_counter_names_) { - auto it = run.counters.find(ucn); - if (it == run.counters.end()) { - Out << ","; - } else { - Out << "," << it->second; + // Do not print timeLabel on bigO and RMS report + if(run.report_big_o) + { + Out << GetBigOString(run.complexity); + } + else if(!run.report_rms) + { + Out << GetTimeUnitString(run.time_unit); + } + Out << ","; + + if(run.counters.find("bytes_per_second") != run.counters.end()) + { + Out << run.counters.at("bytes_per_second"); + } + Out << ","; + if(run.counters.find("items_per_second") != run.counters.end()) + { + Out << run.counters.at("items_per_second"); + } + Out << ","; + if(!run.report_label.empty()) + { + Out << CsvEscape(run.report_label); + } + Out << ",,"; // for error_occurred and error_message + + // Print user counters + for(const auto& ucn : user_counter_names_) + { + auto it = run.counters.find(ucn); + if(it == run.counters.end()) + { + Out << ","; + } + else + { + Out << "," << it->second; + } } - } - Out << '\n'; + Out << '\n'; } } // namespace benchmark From 89aacd90be6ad39fa5aa757442f23824ac667a15 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 20 Aug 2024 15:41:48 -0600 Subject: [PATCH 07/10] added lambda,mode,gbps columns --- benchmark/custom_csv_formater.hpp | 34 +++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp index aea2aed19..0e43abe64 100644 --- a/benchmark/custom_csv_formater.hpp +++ b/benchmark/custom_csv_formater.hpp @@ -69,14 +69,17 @@ class customCSVReporter : public BenchmarkReporter std::ostream* nullLog = nullptr; - std::array elements = {"engine", + std::array elements = {"engine", "distribution", + "mode", "name", "iterations", "real_time", "cpu_time", "time_unit", "bytes_per_second", + "gigabytes_per_second", + "lambda", "items_per_second", "label", 
"error_occurred", @@ -154,16 +157,34 @@ void customCSVReporter::PrintRunData(const Run& run) //get the name of the engine and distribution: std::string temp = run.benchmark_name(); + + std::string deviceName = std::string(temp.begin(), temp.begin() + temp.find("<")); + temp.erase(0, temp.find("<") + 1); std::string engineName = std::string(temp.begin(), temp.begin() + temp.find(",")); temp.erase(0, engineName.size() + 1); - temp.erase(0, temp.find(",") + 1); + + std::string mode = "default"; + + if(deviceName != "device_kernel") + { + mode = std::string(temp.begin(), temp.begin() + temp.find(',')); + temp.erase(0, temp.find(",") + 1); + } std::string disName = std::string(temp.begin(), temp.begin() + temp.find(">")); - Out << engineName << ","; - Out << disName << ","; + std::string lambda = ""; + + size_t ePos = disName.find("="); + if(ePos <= disName.size()) + { + lambda = std::string(disName.begin() + (ePos + 1), disName.end() - 1); + disName.erase(disName.begin() + disName.find("("), disName.end()); + } + + Out << engineName << "," << disName << "," << mode << ","; Out << CsvEscape(run.benchmark_name()) << ","; if(run.error_occurred) { @@ -199,6 +220,11 @@ void customCSVReporter::PrintRunData(const Run& run) Out << run.counters.at("bytes_per_second"); } Out << ","; + + double gbps = run.counters.at("bytes_per_second") / std::pow(1024, 3); + + Out << gbps << "," << lambda << ","; + if(run.counters.find("items_per_second") != run.counters.end()) { Out << run.counters.at("items_per_second"); From 0d9aa5b97bc07b66e42aac72e362862d69e1a639 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 20 Aug 2024 15:58:45 -0600 Subject: [PATCH 08/10] replaced gigabytes_per_second with throughput_gigabytes_per_second --- benchmark/custom_csv_formater.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp index 0e43abe64..5f89053c9 100644 --- a/benchmark/custom_csv_formater.hpp +++ 
b/benchmark/custom_csv_formater.hpp @@ -78,7 +78,7 @@ class customCSVReporter : public BenchmarkReporter "cpu_time", "time_unit", "bytes_per_second", - "gigabytes_per_second", + "throughput_gigabytes_per_second", "lambda", "items_per_second", "label", From 41cd2fcd9769bb38b671bc8783a8c5c4809016d2 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 20 Aug 2024 15:59:55 -0600 Subject: [PATCH 09/10] updated change log --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b538c35c..e028c3e36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,9 @@ Documentation for rocRAND is available at * Added host generator for MT19937 * Support for `rocrand_generate_poisson` in hipGraphs -* Added engine and distribution columns for csv format in benchmark_rocrand_host_api and - benchmark_rocrand_device_api. To see these new columns set --benchmark_format=csv or - --benchmark_out_format=csv --benchmark_out="outName.csv" +* Added engine, distribution, mpde, throughput_gigabytes_per_second, and lambda columns for csv format in + benchmark_rocrand_host_api and benchmark_rocrand_device_api. 
To see these new columns set --benchmark_format=csv + or --benchmark_out_format=csv --benchmark_out="outName.csv" ### Changes From 1a2396dd76b6a6e5c523b841183ed7c3b7090467 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 20 Aug 2024 16:01:18 -0600 Subject: [PATCH 10/10] fixed typo in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e028c3e36..01475ffee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ Documentation for rocRAND is available at * Added host generator for MT19937 * Support for `rocrand_generate_poisson` in hipGraphs -* Added engine, distribution, mpde, throughput_gigabytes_per_second, and lambda columns for csv format in +* Added engine, distribution, mode, throughput_gigabytes_per_second, and lambda columns for csv format in benchmark_rocrand_host_api and benchmark_rocrand_device_api. To see these new columns set --benchmark_format=csv or --benchmark_out_format=csv --benchmark_out="outName.csv"