diff --git a/.clang-format b/.clang-format index ebf71708..bf4a9927 100644 --- a/.clang-format +++ b/.clang-format @@ -58,11 +58,7 @@ BraceWrapping: AfterNamespace: true AfterStruct: true AfterUnion: true - BeforeCatch: true - BeforeElse: true AfterExternBlock: false - BeforeCatch: true - BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 68be9cde..b370708c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,7 +45,7 @@ clang-format: stage: lint needs: [] tags: - - rocm-build + - build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" @@ -62,7 +62,7 @@ copyright-date: stage: lint needs: [] tags: - - rocm-build + - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: @@ -107,7 +107,7 @@ build:rocm: - .rules:build stage: build tags: - - rocm-build + - build needs: [] script: - cmake @@ -145,7 +145,7 @@ build:rocm-benchmark: - .rules:build stage: build tags: - - rocm-build + - build needs: [] script: - cmake @@ -304,7 +304,7 @@ build:nvcc: - .nvcc - .rules:build tags: - - nvcc-build + - build needs: [] script: - cmake @@ -339,7 +339,7 @@ build:nvcc-benchmark: - .nvcc - .rules:build tags: - - nvcc-build + - build needs: [] script: - cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index 177ae0d4..950525f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ See README.md on how to build the hipCUB documentation using Doxygen. +## (Unreleased) hipCUB-2.13.1 for ROCm 6.1.0 +### Changed +- CUB backend references CUB and Thrust version 2.1.0. +- Updated `HIPCUB_HOST_WARP_THREADS` macro definition to match `host_warp_size` changes from rocPRIM 3.0. +- Implemented `__int128_t` and `__uint128_t` support for radix_sort. +### Fixed +- Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage. + +### Added +- Added interface `DeviceMemcpy::Batched` for batched memcpy from rocPRIM and CUB. + ## (Unreleased) hipCUB-2.13.1 for ROCm 5.7.0 ### Changed - CUB backend references CUB and Thrust version 2.0.1. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5fff6007..2d5e2188 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -80,6 +80,7 @@ add_hipcub_benchmark(benchmark_block_run_length_decode.cpp) add_hipcub_benchmark(benchmark_block_scan.cpp) add_hipcub_benchmark(benchmark_block_shuffle.cpp) add_hipcub_benchmark(benchmark_device_adjacent_difference.cpp) +add_hipcub_benchmark(benchmark_device_batch_memcpy.cpp) add_hipcub_benchmark(benchmark_device_histogram.cpp) add_hipcub_benchmark(benchmark_device_memory.cpp) add_hipcub_benchmark(benchmark_device_merge_sort.cpp) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp new file mode 100644 index 00000000..3d72e349 --- /dev/null +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -0,0 +1,410 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark/benchmark.h" +#include "cmdparser.hpp" +#include "common_benchmark_header.hpp" + +#include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_store.hpp" +#include "hipcub/device/device_memcpy.hpp" +#include "hipcub/hipcub.hpp" + +#ifdef __HIP_PLATFORM_AMD__ + // Only include this on AMD as it contains specialized config information + #include +#endif + +#include + +#include +#include +#include +#include +#include + +#include + +constexpr uint32_t warmup_size = 5; +constexpr int32_t max_size = 1024 * 1024; + +constexpr int32_t wlev_min_size = 128; +constexpr int32_t blev_min_size = 1024; + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +using offset_type = size_t; + +template +struct BatchMemcpyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData&) = delete; + + BatchMemcpyData(BatchMemcpyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchMemcpyData& operator=(BatchMemcpyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; + +template +BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, + num_tlev_buffers, + 1, + wlev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_wlev_buffers, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_blev_buffers, + blev_min_elems, + max_elems, + rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; +} + +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); +} + +#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ + ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ + ",num_blev:" #num_blev ",cfg:default_config}", \ + [=](benchmark::State& state) \ + { \ + run_benchmark, \ + size_type>(state, stream, num_tlev, num_wlev, num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index f96d2d42..149d5ada 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -119,7 +119,7 @@ template struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { - run_benchmark(state, size, stream, hipcub::DeviceReduce::Sum); + run_benchmark(state, size, stream, hipcub::DeviceReduce::Sum); } }; @@ -127,7 +127,7 @@ template struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { - run_benchmark(state, size, stream, hipcub::DeviceReduce::Min); + run_benchmark(state, size, stream, hipcub::DeviceReduce::Min); } }; @@ -139,7 +139,10 @@ struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { - run_benchmark(state, size, stream, hipcub::DeviceReduce::ArgMin); + run_benchmark(state, + size, + stream, + hipcub::DeviceReduce::ArgMin); } }; diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index b8b08957..deadd3dd 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -357,10 +357,68 @@ template struct DeviceSelectWarpSize { static constexpr unsigned value = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize - ? LogicalWarpSize - : HIPCUB_DEVICE_WARP_THREADS; + ? LogicalWarpSize + : HIPCUB_DEVICE_WARP_THREADS; }; +template +using it_value_t = typename std::iterator_traits::value_type; + +using engine_type = std::default_random_engine; + +// generate_random_data_n() generates only part of sequence and replicates it, +// because benchmarks usually do not need "true" random sequence. +template +inline auto generate_random_data_n( + OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) + -> typename std::enable_if_t>::value, OutputIter> +{ + using T = it_value_t; + + using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; +} + +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, OutputIterator> +{ + using T = typename std::iterator_traits::value_type; + + std::uniform_real_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; +} + +template +struct alignas(Alignment) custom_aligned_type +{ + unsigned char data[Size]; +}; + +template::value && std::is_unsigned::value, int> = 0> +inline constexpr auto ceiling_div(const T a, const U b) +{ + return a / b + (a % b > 0 ? 1 : 0); +} + } // end benchmark_util namespace // Need for hipcub::DeviceReduce::Min/Max etc. diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 3653f492..c1738491 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -147,54 +147,54 @@ if(HIP_COMPILER STREQUAL "nvcc") if(NOT DEFINED CUB_INCLUDE_DIR) file( - DOWNLOAD https://github.com/NVIDIA/cub/archive/2.0.1.zip - ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip + DOWNLOAD https://github.com/NVIDIA/cub/archive/2.1.0.zip + ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip STATUS cub_download_status LOG cub_download_log ) list(GET cub_download_status 0 cub_download_error_code) if(cub_download_error_code) message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/cub/archive/2.0.1.zip failed " + "https://github.com/NVIDIA/cub/archive/2.1.0.zip failed " "error_code: ${cub_download_error_code} " "log: ${cub_download_log} " ) endif() execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip + COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE cub_unpack_error_code ) if(cub_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip failed") + message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip failed") endif() - set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1/ CACHE PATH "") + set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0/ CACHE PATH "") endif() if(NOT DEFINED THRUST_INCLUDE_DIR) file( - DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.0.1.zip - ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip + DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.1.0.zip + ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip STATUS thrust_download_status LOG thrust_download_log ) list(GET thrust_download_status 0 thrust_download_error_code) if(thrust_download_error_code) message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/thrust/archive/2.0.1.zip failed " + "https://github.com/NVIDIA/thrust/archive/2.1.0.zip failed " "error_code: ${thrust_download_error_code} " "log: ${thrust_download_log} " ) endif() execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip + COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE thrust_unpack_error_code ) if(thrust_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip failed") + message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip failed") endif() - set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1/ CACHE PATH "") + set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0/ CACHE PATH "") endif() else() # rocPRIM (only for ROCm platform) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 202e3454..2a9654bd 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -31,7 +31,7 @@ click==8.1.3 # sphinx-external-toc click-log==0.4.0 # via doxysphinx -cryptography==41.0.4 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub diff --git a/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp b/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp new file mode 100644 index 00000000..835e5dd0 --- /dev/null +++ b/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp @@ -0,0 +1,64 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_ + +#include "../../../config.hpp" + +#include + +#include + +BEGIN_HIPCUB_NAMESPACE + +struct DeviceMemcpy +{ + template + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + uint32_t num_buffers, + hipStream_t stream = 0, + bool /* debug_synchronous */ = false) + { + return hipCUDAErrorTohipError(::cub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + input_buffer_it, + output_buffer_it, + buffer_sizes, + num_buffers, + stream)); + } +}; + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp b/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp index 3a55ce1e..09977015 100644 --- a/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp +++ b/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp @@ -166,6 +166,29 @@ struct DeviceMergeSort compare_op, stream)); } + + template + HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeysCopy(void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0, + bool debug_synchronous = false) + { + (void)debug_synchronous; + return hipCUDAErrorTohipError(::cub::DeviceMergeSort::StableSortKeysCopy(d_temp_storage, + temp_storage_bytes, + d_input_keys, + d_output_keys, + num_items, + compare_op, + stream)); + } }; END_HIPCUB_NAMESPACE diff --git a/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp b/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 0d949bd8..0847ff68 100644 --- a/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -39,22 +39,20 @@ BEGIN_HIPCUB_NAMESPACE class DeviceReduce { public: - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ReduceOpT, - typename T - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Reduce(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - ReduceOpT reduction_op, - T init, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Reduce(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + ReduceOpT reduction_op, + T init, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::Reduce(d_temp_storage, @@ -67,18 +65,14 @@ class DeviceReduce stream)); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Sum(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::Sum(d_temp_storage, @@ -89,18 +83,14 @@ class DeviceReduce stream)); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Min(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::Min(d_temp_storage, @@ -111,18 +101,14 @@ class DeviceReduce stream)); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t ArgMin(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMin(d_temp_storage, @@ -133,18 +119,14 @@ class DeviceReduce stream)); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Max(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::Max(d_temp_storage, @@ -155,18 +137,14 @@ class DeviceReduce stream)); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t ArgMax(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage, @@ -177,26 +155,25 @@ class DeviceReduce stream)); } - template< - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename ReductionOpT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t ReduceByKey(void * d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - ReductionOpT reduction_op, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t + ReduceByKey(void* d_temp_storage, + size_t& temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + ReductionOpT reduction_op, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { (void)debug_synchronous; return hipCUDAErrorTohipError(::cub::DeviceReduce::ReduceByKey(d_temp_storage, diff --git a/hipcub/include/hipcub/backend/cub/hipcub.hpp b/hipcub/include/hipcub/backend/cub/hipcub.hpp index 1424e201..8b70cc1a 100644 --- a/hipcub/include/hipcub/backend/cub/hipcub.hpp +++ b/hipcub/include/hipcub/backend/cub/hipcub.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2022, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,6 +53,7 @@ // hipError_t instead of cudaError_t #include "device/device_adjacent_difference.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp new file mode 100644 index 00000000..93e6fd11 --- /dev/null +++ b/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp @@ -0,0 +1,65 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ + +#include "../../../config.hpp" + +#include + +#include + +BEGIN_HIPCUB_NAMESPACE + +struct DeviceMemcpy +{ + template + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + uint32_t num_buffers, + hipStream_t stream = 0, + bool debug_synchronous = false) + { + return rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + input_buffer_it, + output_buffer_it, + buffer_sizes, + num_buffers, + stream, + debug_synchronous); + } +}; + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp index 6782b907..df48f95a 100644 --- a/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp +++ b/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -162,64 +162,65 @@ inline hip_bfloat16 get_max_special_value() class DeviceReduce { public: - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ReduceOpT, - typename T - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Reduce(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - ReduceOpT reduction_op, - T init, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Reduce(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + ReduceOpT reduction_op, + T init, + hipStream_t stream = 0, + bool debug_synchronous = false) { return ::rocprim::reduce( - d_temp_storage, temp_storage_bytes, - d_in, d_out, init, num_items, - ::hipcub::detail::convert_result_type(reduction_op), - stream, debug_synchronous - ); + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + init, + num_items, + ::hipcub::detail::convert_binary_result_type( + reduction_op), + stream, + debug_synchronous); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Sum(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { - using T = typename std::iterator_traits::value_type; - return Reduce( - d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, ::hipcub::Sum(), T(0), - stream, debug_synchronous - ); + using InputT = typename std::iterator_traits::value_type; + using OutputT = typename std::iterator_traits::value_type; + using InitT = hipcub::detail::non_void_value_t; + return Reduce(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + ::hipcub::Sum(), + InitT(0), + stream, + debug_synchronous); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Min(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { using T = typename std::iterator_traits::value_type; return Reduce( @@ -229,28 +230,19 @@ class DeviceReduce ); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t ArgMin(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { - using OffsetT = int; + using OffsetT = NumItemsT; using T = typename std::iterator_traits::value_type; using O = typename std::iterator_traits::value_type; - using OutputTupleT = - typename std::conditional< - std::is_same::value, - KeyValuePair, - O - >::type; + using OutputTupleT = hipcub::detail::non_void_value_t>; using OutputValueT = typename OutputTupleT::Value; using IteratorT = ArgIndexInputIterator; @@ -269,18 +261,14 @@ class DeviceReduce ); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t Max(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { using T = typename std::iterator_traits::value_type; return Reduce( @@ -290,28 +278,19 @@ class DeviceReduce ); } - template < - typename InputIteratorT, - typename OutputIteratorT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t ArgMax(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { - using OffsetT = int; + using OffsetT = NumItemsT; using T = typename std::iterator_traits::value_type; using O = typename std::iterator_traits::value_type; - using OutputTupleT = - typename std::conditional< - std::is_same::value, - KeyValuePair, - O - >::type; + using OutputTupleT = hipcub::detail::non_void_value_t>; using OutputValueT = typename OutputTupleT::Value; using IteratorT = ArgIndexInputIterator; @@ -330,37 +309,40 @@ class DeviceReduce ); } - template< - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename ReductionOpT - > - HIPCUB_RUNTIME_FUNCTION static - hipError_t ReduceByKey(void * d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - ReductionOpT reduction_op, - int num_items, - hipStream_t stream = 0, - bool debug_synchronous = false) + template + HIPCUB_RUNTIME_FUNCTION static hipError_t + ReduceByKey(void* d_temp_storage, + size_t& temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + ReductionOpT reduction_op, + NumItemsT num_items, + hipStream_t stream = 0, + bool debug_synchronous = false) { using key_compare_op = ::rocprim::equal_to::value_type>; - return ::rocprim::reduce_by_key( - d_temp_storage, temp_storage_bytes, - d_keys_in, d_values_in, num_items, - d_unique_out, d_aggregates_out, d_num_runs_out, - ::hipcub::detail::convert_result_type(reduction_op), - key_compare_op(), - stream, debug_synchronous - ); + return ::rocprim::reduce_by_key(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + num_items, + d_unique_out, + d_aggregates_out, + d_num_runs_out, + reduction_op, + key_compare_op(), + stream, + debug_synchronous); } }; diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp index ce0d3246..de404b67 100644 --- a/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp +++ b/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -127,6 +127,7 @@ template spmv_params.num_nonzeros = num_nonzeros; spmv_params.alpha = 1.0; spmv_params.beta = 0.0; + (void)debug_synchronous; hipError_t status; if(d_temp_storage == nullptr) diff --git a/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/hipcub/include/hipcub/backend/rocprim/hipcub.hpp index 236bc9c1..6de82c45 100644 --- a/hipcub/include/hipcub/backend/rocprim/hipcub.hpp +++ b/hipcub/include/hipcub/backend/rocprim/hipcub.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2022, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +52,7 @@ // Device #include "device/device_adjacent_difference.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" diff --git a/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 1fda5711..5297770a 100644 --- a/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,25 +34,25 @@ #include "../util_type.hpp" +#include + BEGIN_HIPCUB_NAMESPACE struct Equality { - template - HIPCUB_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + template + HIPCUB_HOST_DEVICE inline constexpr bool operator()(T&& t, U&& u) const { - return a == b; + return std::forward(t) == std::forward(u); } }; struct Inequality { - template - HIPCUB_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + template + HIPCUB_HOST_DEVICE inline constexpr bool operator()(T&& t, U&& u) const { - return a != b; + return std::forward(t) != std::forward(u); } }; @@ -64,61 +64,57 @@ struct InequalityWrapper HIPCUB_HOST_DEVICE inline InequalityWrapper(EqualityOp op) : op(op) {} - template - HIPCUB_HOST_DEVICE inline - bool operator()(const T &a, const T &b) + template + HIPCUB_HOST_DEVICE inline bool operator()(T&& t, U&& u) { - return !op(a, b); + return !op(std::forward(t), std::forward(u)); } }; struct Sum { - template - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T &a, const T &b) const + template + HIPCUB_HOST_DEVICE inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) { - return a + b; + return std::forward(t) + std::forward(u); } }; struct Difference { - template - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T &a, const T &b) const + template + HIPCUB_HOST_DEVICE inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) { - return a - b; + return std::forward(t) - std::forward(u); } }; struct Division { - template - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T &a, const T &b) const + template + HIPCUB_HOST_DEVICE inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) { - return a / b; + return std::forward(t) / std::forward(u); } }; struct Max { - template - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T &a, const T &b) const + template + HIPCUB_HOST_DEVICE inline constexpr typename std::common_type::type + operator()(T&& t, U&& u) const { - return a < b ? b : a; + return t < u ? u : t; } }; struct Min { - template - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T &a, const T &b) const + template + HIPCUB_HOST_DEVICE inline constexpr typename std::common_type::type + operator()(T&& t, U&& u) const { - return a < b ? a : b; + return t < u ? t : u; } }; @@ -155,9 +151,8 @@ struct ArgMin template struct CastOp { - template - HIPCUB_HOST_DEVICE inline - B operator()(const A &a) const + template + HIPCUB_HOST_DEVICE inline B operator()(A&& a) const { return (B)a; } @@ -257,10 +252,8 @@ struct BinaryFlip { } - template - HIPCUB_DEVICE auto - operator()(T &&t, U &&u) -> decltype(binary_op(std::forward(u), - std::forward(t))) + template + HIPCUB_DEVICE auto operator()(T&& t, U&& u) -> decltype(auto) { return binary_op(std::forward(u), std::forward(t)); } @@ -276,7 +269,20 @@ BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) namespace detail { -// CUB uses value_type of OutputIteratorT (if not void) as a type of intermediate results in reduce, +// Non-void value type. +template +using non_void_value_t = + typename std::conditional::value, FallbackT, IteratorT>::type; + +// Invoke result type. +template +using invoke_result_t = typename ::rocprim::detail::invoke_result::type; + +/// Intermediate accumulator type. +template +using accumulator_t = std::decay_t>; + +// CUB uses value_type of OutputIteratorT (if not void) as a type of intermediate results in segmented reduce, // for example: // // /// The output value type @@ -303,12 +309,9 @@ template< > struct convert_result_type_wrapper { - using input_type = typename std::iterator_traits::value_type; + using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; + using result_type = non_void_value_t; convert_result_type_wrapper(BinaryFunction op) : op(op) {} @@ -334,6 +337,46 @@ convert_result_type(BinaryFunction op) return convert_result_type_wrapper(op); } +// CUB now uses as intermediate result type the return type of BinaryFunction in reduce, scan +// and reduce_by_key. +// +// // The accumulator type +// using AccumT = typename std::decay>::type; +// +// rocPRIM was being passed the value_type of OutputIteratorT (if not void) as intermediate +// result type, following the previous behaviour of CUB. +// +// This wrapper allows to have compatibility with CUB in hipCUB. + +template +struct convert_binary_result_type_wrapper +{ + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using init_type = InitT; + using accum_type = accumulator_t; + + convert_binary_result_type_wrapper(BinaryFunction op) : op(op) {} + + template + HIPCUB_HOST_DEVICE inline constexpr accum_type operator()(const T& a, const T& b) const + { + return static_cast(op(a, b)); + } + + BinaryFunction op; +}; + +template +inline convert_binary_result_type_wrapper + convert_binary_result_type(BinaryFunction op) +{ + return convert_binary_result_type_wrapper(op); +} + } // end detail namespace END_HIPCUB_NAMESPACE diff --git a/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp b/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp index 85510c23..7f66cf80 100644 --- a/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp +++ b/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp @@ -34,6 +34,7 @@ #include #include "../../config.hpp" +#include "util_type.hpp" #include @@ -242,6 +243,21 @@ unsigned int BFE(UnsignedBits source, return detail::unsigned_bit_extract(source, bit_start, num_bits); } +#if HIPCUB_IS_INT128_ENABLED +/** + * Bitfield-extract for 128-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE(UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<16> /*byte_len*/) +{ + const __uint128_t MASK = (__uint128_t{1} << num_bits) - 1; + return (source >> bit_start) & MASK; +} +#endif + // Bitfield insert. // Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. HIPCUB_DEVICE inline diff --git a/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 48937272..494994aa 100644 --- a/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,6 +49,22 @@ using NullType = ::rocprim::empty_type; #endif +#ifndef HIPCUB_IS_INT128_ENABLED + #if defined(__HIPCC_RTC__) + #if defined(__HIPCC_RTC_INT128__) + #define HIPCUB_IS_INT128_ENABLED 1 + #endif // !defined(__HIPCC_RTC_INT128__) + #else // !defined(__HIPCC_RTC__) + #if HIP_VERSION >= 50400000 + #if(HIPCUB_HOST_COMPILER == HIPCUB_HOST_COMPILER_GCC) \ + || (HIPCUB_HOST_COMPILER == HIPCUB_HOST_COMPILER_CLANG) || defined(__ICC) \ + || defined(__GNUC__) || defined(__clang__) + #define HIPCUB_IS_INT128_ENABLED 1 + #endif // GCC || CLANG || ICC + #endif // VER >= 5.4 + #endif // !defined(__HIPCC_RTC__) +#endif // !defined(HIPCUB_IS_INT128_ENABLED) + template struct [[deprecated("[Since 1.16] If is deprecated use std::conditional instead.")]] If { @@ -625,6 +641,79 @@ template <> struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; + #if HIPCUB_IS_INT128_ENABLED +template<> +struct NumericTraits<__uint128_t> +{ + using T = __uint128_t; + using UnsignedBits = __uint128_t; + + static constexpr Category CATEGORY = UNSIGNED_INTEGER; + static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); + static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); + + static constexpr bool PRIMITIVE = false; + static constexpr bool NULL_TYPE = false; + + static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ T Max() + { + return MAX_KEY; + } + + static __host__ __device__ __forceinline__ T Lowest() + { + return LOWEST_KEY; + } +}; + +template<> +struct NumericTraits<__int128_t> +{ + using T = __int128_t; + using UnsignedBits = __uint128_t; + + static constexpr Category CATEGORY = SIGNED_INTEGER; + static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; + static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + static constexpr bool PRIMITIVE = false; + static constexpr bool NULL_TYPE = false; + + static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + #endif + template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits<__half> : BaseTraits {}; diff --git a/hipcub/include/hipcub/config.hpp b/hipcub/include/hipcub/config.hpp index b804d6be..5f3a07e6 100644 --- a/hipcub/include/hipcub/config.hpp +++ b/hipcub/include/hipcub/config.hpp @@ -44,10 +44,31 @@ #define HIPCUB_ROCPRIM_API 1 #define HIPCUB_RUNTIME_FUNCTION __host__ - #include + #include + +namespace detail +{ +inline unsigned int host_warp_size_wrapper() +{ + int device_id = 0; + unsigned int host_warp_size = 0; + hipError_t error = hipGetDevice(&device_id); + if(error != hipSuccess) + { + fprintf(stderr, "HIP error: %d line: %d: %s\n", error, __LINE__, hipGetErrorString(error)); + fflush(stderr); + } + if(::rocprim::host_warp_size(device_id, host_warp_size) != hipSuccess) + { + return 0u; + } + return host_warp_size; +} +} // namespace detail + #define HIPCUB_WARP_THREADS ::rocprim::warp_size() #define HIPCUB_DEVICE_WARP_THREADS ::rocprim::device_warp_size() - #define HIPCUB_HOST_WARP_THREADS ::rocprim::host_warp_size() + #define HIPCUB_HOST_WARP_THREADS detail::host_warp_size_wrapper() #define HIPCUB_ARCH 1 // ignored with rocPRIM backend #elif defined(__HIP_PLATFORM_NVIDIA__) #define HIPCUB_CUB_API 1 diff --git a/hipcub/include/hipcub/device/device_memcpy.hpp b/hipcub/include/hipcub/device/device_memcpy.hpp new file mode 100644 index 00000000..2a7864c7 --- /dev/null +++ b/hipcub/include/hipcub/device/device_memcpy.hpp @@ -0,0 +1,38 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ + +#ifdef __HIP_PLATFORM_AMD__ + #include "../backend/rocprim/device/device_memcpy.hpp" +#elif defined(__HIP_PLATFORM_NVIDIA__) + #include "../backend/cub/device/device_memcpy.hpp" +#endif + +#endif // HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/rmake.py b/rmake.py index 9a1dce38..53bcf269 100644 --- a/rmake.py +++ b/rmake.py @@ -98,7 +98,13 @@ def config_cmd(): toolchain = os.path.join( src_path, "toolchain-windows.cmake" ) #set CPACK_PACKAGING_INSTALL_PREFIX= defined as blank as it is appended to end of path for archive creation cmake_platform_opts.append( f"-DWIN32=ON -DCPACK_PACKAGING_INSTALL_PREFIX=") #" -DCPACK_PACKAGING_INSTALL_PREFIX={rocm_path}" - cmake_platform_opts.append( f"-DCMAKE_INSTALL_PREFIX=\"C:/hipSDK\"" ) + cmake_platform_opts.append( f"-DCMAKE_INSTALL_PREFIX=\"C:/hipSDK\"" ) + + # MSVC requires acknowledgement of using extended aligned storage. + # Before VS 2017 15.8, has non-conforming alignment. VS 2017 15.8 fixes this, but inherently changes layouts of + # aligned storage with extended alignment, and thus binary compatibility with such types. + cmake_platform_opts.append( "-DCMAKE_CXX_FLAGS=\"-D_ENABLE_EXTENDED_ALIGNED_STORAGE\"") + generator = f"-G Ninja" # "-G \"Visual Studio 16 2019\" -A x64" # -G NMake ") # cmake_options.append( generator ) diff --git a/test/extra/CMakeLists.txt b/test/extra/CMakeLists.txt index 076ce74a..93329abd 100644 --- a/test/extra/CMakeLists.txt +++ b/test/extra/CMakeLists.txt @@ -50,54 +50,54 @@ if(HIP_COMPILER STREQUAL "nvcc") if(NOT DEFINED CUB_INCLUDE_DIR) file( - DOWNLOAD https://github.com/NVIDIA/cub/archive/2.0.1.zip - ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip + DOWNLOAD https://github.com/NVIDIA/cub/archive/2.1.0.zip + ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip STATUS cub_download_status LOG cub_download_log ) list(GET cub_download_status 0 cub_download_error_code) if(cub_download_error_code) message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/cub/archive/2.0.1.zip failed " + "https://github.com/NVIDIA/cub/archive/2.1.0.zip failed " "error_code: ${cub_download_error_code} " "log: ${cub_download_log} " ) endif() execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip + COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE cub_unpack_error_code ) if(cub_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip failed") + message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip failed") endif() - set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1/ CACHE PATH "") + set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0/ CACHE PATH "") endif() if(NOT DEFINED THRUST_INCLUDE_DIR) file( - DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.0.1.zip - ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip + DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.1.0.zip + ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip STATUS thrust_download_status LOG thrust_download_log ) list(GET thrust_download_status 0 thrust_download_error_code) if(thrust_download_error_code) message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/thrust/archive/2.0.1.zip failed " + "https://github.com/NVIDIA/thrust/archive/2.1.0.zip failed " "error_code: ${thrust_download_error_code} " "log: ${thrust_download_log} " ) endif() execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip + COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE thrust_unpack_error_code ) if(thrust_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip failed") + message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip failed") endif() - set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1/ CACHE PATH "") + set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0/ CACHE PATH "") endif() else() # rocPRIM (only for ROCm platform) diff --git a/test/hipcub/CMakeLists.txt b/test/hipcub/CMakeLists.txt index 52382cbb..e1d67d85 100644 --- a/test/hipcub/CMakeLists.txt +++ b/test/hipcub/CMakeLists.txt @@ -1,5 +1,5 @@ # MIT License # -# Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -206,6 +206,7 @@ add_hipcub_test("hipcub.BlockScan" test_hipcub_block_scan.cpp) add_hipcub_test("hipcub.BlockShuffle" test_hipcub_block_shuffle.cpp) add_hipcub_test("hipcub.DeviceAdjacentDifference" test_hipcub_device_adjacent_difference.cpp) add_hipcub_test("hipcub.DeviceHistogram" test_hipcub_device_histogram.cpp) +add_hipcub_test("hipcub.DeviceMemcpy" test_hipcub_device_memcpy.cpp) add_hipcub_test("hipcub.DeviceMergeSort" test_hipcub_device_merge_sort.cpp) add_hipcub_test_parallel("hipcub.DeviceRadixSort" test_hipcub_device_radix_sort.cpp.in) add_hipcub_test("hipcub.DeviceReduce" test_hipcub_device_reduce.cpp) @@ -225,6 +226,6 @@ add_hipcub_test("hipcub.WarpMergeSort" test_hipcub_warp_merge_sort.cpp) add_hipcub_test("hipcub.WarpReduce" test_hipcub_warp_reduce.cpp) add_hipcub_test("hipcub.WarpScan" test_hipcub_warp_scan.cpp) add_hipcub_test("hipcub.WarpStore" test_hipcub_warp_store.cpp) -add_hipcub_test("hipcub.Iterator" test_hipcub_iterators.cpp) -add_hipcub_test("hipcub.ThreadOperations" test_hipcub_thread.cpp) +add_hipcub_test("hipcub.Iterators" test_hipcub_iterators.cpp) +add_hipcub_test("hipcub.ThreadOperators" test_hipcub_thread_operators.cpp) add_hipcub_test("hipcub.ThreadSort" test_hipcub_thread_sort.cpp) diff --git a/test/hipcub/half.hpp b/test/hipcub/half.hpp index e986c38a..f0fe6dc4 100644 --- a/test/hipcub/half.hpp +++ b/test/hipcub/half.hpp @@ -40,7 +40,8 @@ #include #endif - #include +#include +#include #ifdef __GNUC__ // There's a ton of type-punning going on in this file. @@ -181,7 +182,10 @@ struct half_t f = (0xff << 23) | (sign << 31); // inf } } - return *reinterpret_cast(&f); + static_assert(sizeof(float) == sizeof(std::uint32_t), "4-byte size check"); + float ret{}; + std::memcpy(&ret, &f, sizeof(float)); + return ret; } diff --git a/test/hipcub/test_hipcub_block_radix_sort.cpp b/test/hipcub/test_hipcub_block_radix_sort.cpp index e3d93a22..5c6f0460 100644 --- a/test/hipcub/test_hipcub_block_radix_sort.cpp +++ b/test/hipcub/test_hipcub_block_radix_sort.cpp @@ -58,7 +58,11 @@ class HipcubBlockRadixSort : public ::testing::Test { }; typedef ::testing::Types< - // Power of 2 BlockSize +// Power of 2 BlockSize +#if HIPCUB_IS_INT128_ENABLED + params<__int128_t, __int128_t, 64U, 1>, + params<__uint128_t, __uint128_t, 64U, 1>, +#endif params, params, params, diff --git a/test/hipcub/test_hipcub_device_memcpy.cpp b/test/hipcub/test_hipcub_device_memcpy.cpp new file mode 100644 index 00000000..46e54cad --- /dev/null +++ b/test/hipcub/test_hipcub_device_memcpy.cpp @@ -0,0 +1,327 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "common_test_header.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "hipcub/device/device_memcpy.hpp" +#include "hipcub/thread/thread_operators.hpp" + +#include +#include + +#include +#include +#include +#include + +#include + +template +struct DeviceBatchMemcpyParams +{ + using value_type = ValueType; + using size_type = SizeType; + static constexpr bool shuffled = Shuffled; + static constexpr uint32_t num_buffers = NumBuffers; + static constexpr uint32_t max_size = MaxSize; +}; + +template +struct DeviceBatchMemcpyTests : public ::testing::Test +{ + using value_type = typename Params::value_type; + using size_type = typename Params::size_type; + static constexpr bool shuffled = Params::shuffled; + static constexpr uint32_t num_buffers = Params::num_buffers; + static constexpr uint32_t max_size = Params::max_size; +}; + +typedef ::testing::Types< + // Ignore copy/move + + // Unshuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // weird amount of buffers + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // Shuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams> + DeviceBatchMemcpyTestsParams; + +TYPED_TEST_SUITE(DeviceBatchMemcpyTests, DeviceBatchMemcpyTestsParams); + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const size_t n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + T sum = 0; + for(size_t i = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) +{ + // While on rocPRIM these can be variable via the config. CUB does not allow this. + // Therefore we assume fixed size. Otherwise we would use: + // - rocprim::batch_memcpy_config<>::wlev_size_threshold + // - rocprim::batch_memcpy_config<>::blev_size_threshold; + constexpr int32_t wlev_min_size = 128; + constexpr int32_t blev_min_size = 1024; + + constexpr int32_t num_buffers = TestFixture::num_buffers; + constexpr int32_t max_size = TestFixture::max_size; + constexpr bool shuffled = TestFixture::shuffled; + + constexpr int32_t num_tlev_buffers = num_buffers / 3; + constexpr int32_t num_wlev_buffers = num_buffers / 3; + + using value_type = typename TestFixture::value_type; + using buffer_size_type = typename TestFixture::size_type; + using buffer_offset_type = uint32_t; + using byte_offset_type = size_t; + + using value_alias = + typename std::conditional::value, + typename test_utils::inner_type::type, + value_type>::type; + + // Get random buffer sizes + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + // Total number of bytes. + byte_offset_type total_num_bytes = 0; + byte_offset_type total_num_elements = 0; + + uint32_t seed = 0; + SCOPED_TRACE(testing::Message() << "with seed= " << seed); + std::default_random_engine rng{seed}; + + for(buffer_offset_type i = 0; i < num_buffers; ++i) + { + buffer_size_type size; + if(i < num_tlev_buffers) + { + size = test_utils::get_random_value(1, wlev_min_size - 1, rng()); + } else if(i < num_tlev_buffers + num_wlev_buffers) + { + size = test_utils::get_random_value(wlev_min_size, + blev_min_size - 1, + rng()); + } else + { + size = test_utils::get_random_value(blev_min_size, max_size, rng()); + } + + // convert from number of bytes to number of elements + size = test_utils::max(1, size / sizeof(value_type)); + size = test_utils::min(size, max_size); + + h_buffer_num_elements[i] = size; + total_num_elements += size; + } + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(value_type); + } + + // And the total byte size + total_num_bytes = total_num_elements * sizeof(value_type); + + // Device pointers + value_type* d_input{}; + value_type* d_output{}; + value_type** d_buffer_srcs{}; + value_type** d_buffer_dsts{}; + buffer_size_type* d_buffer_sizes{}; + + // Calculate temporary storage + + size_t temp_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers)); + + void* d_temp_storage{}; + + // Allocate memory. + HIP_CHECK(hipMalloc(&d_input, total_num_bytes)); + HIP_CHECK(hipMalloc(&d_output, total_num_bytes)); + + HIP_CHECK(hipMalloc(&d_buffer_srcs, num_buffers * sizeof(*d_buffer_srcs))); + HIP_CHECK(hipMalloc(&d_buffer_dsts, num_buffers * sizeof(*d_buffer_dsts))); + HIP_CHECK(hipMalloc(&d_buffer_sizes, num_buffers * sizeof(*d_buffer_sizes))); + + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Generate data. + std::vector h_input + = test_utils::get_random_data(total_num_elements, + test_utils::numeric_limits::min(), + test_utils::numeric_limits::max(), + rng()); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffled) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + 0, + src_offsets.begin(), + hipcub::Sum{}); + test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + 0, + dst_offsets.begin(), + hipcub::Sum{}); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(int32_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = d_input + src_offsets[i]; + h_buffer_dsts[i] = d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK(hipMemcpy(d_input, h_input.data(), total_num_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(*d_buffer_srcs), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(*d_buffer_dsts), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(*d_buffer_sizes), + hipMemcpyHostToDevice)); + + // Run batched memcpy. + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers, + hipStreamDefault)); + // Verify results. + std::vector h_output(total_num_elements); + HIP_CHECK(hipMemcpy(h_output.data(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); + + for(int32_t i = 0; i < num_buffers; ++i) + { + for(buffer_size_type j = 0; j < h_buffer_num_elements[i]; ++j) + { + auto input_index = src_offsets[i] + j; + auto output_index = dst_offsets[i] + j; + + ASSERT_TRUE(test_utils::bit_equal(h_input[input_index], h_output[output_index])); + } + } +} diff --git a/test/hipcub/test_hipcub_device_merge_sort.cpp b/test/hipcub/test_hipcub_device_merge_sort.cpp index 04141405..21ed0462 100644 --- a/test/hipcub/test_hipcub_device_merge_sort.cpp +++ b/test/hipcub/test_hipcub_device_merge_sort.cpp @@ -317,8 +317,6 @@ TYPED_TEST(HipcubDeviceMergeSort, StableSortKeys) } } -// hipCUB currently provides the CUB 1.x interface, StableSortKeysCopy is part of CUB 2.x -#ifdef __HIP_PLATFORM_AMD__ TYPED_TEST(HipcubDeviceMergeSort, StableSortKeysCopy) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -402,7 +400,6 @@ TYPED_TEST(HipcubDeviceMergeSort, StableSortKeysCopy) } } } -#endif TYPED_TEST(HipcubDeviceMergeSort, SortPairs) { diff --git a/test/hipcub/test_hipcub_device_radix_sort.cpp.in b/test/hipcub/test_hipcub_device_radix_sort.cpp.in index b28b502b..8485ca3f 100644 --- a/test/hipcub/test_hipcub_device_radix_sort.cpp.in +++ b/test/hipcub/test_hipcub_device_radix_sort.cpp.in @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -49,17 +49,21 @@ #endif #if HIPCUB_TEST_TYPE_SLICE == 0 - INSTANTIATE(params) +#if HIPCUB_IS_INT128_ENABLED + INSTANTIATE(params<__uint128_t, __uint128_t, true >) + INSTANTIATE(params<__int128_t, __int128_t, true >) +#endif + INSTANTIATE(params) INSTANTIATE(params) - INSTANTIATE(params) + INSTANTIATE(params) INSTANTIATE(params) INSTANTIATE(params) - INSTANTIATE(params) + INSTANTIATE(params) INSTANTIATE(params) INSTANTIATE(params) - INSTANTIATE(params) + INSTANTIATE(params) INSTANTIATE(params) - INSTANTIATE(params) + INSTANTIATE(params) INSTANTIATE(params >) #elif HIPCUB_TEST_TYPE_SLICE == 1 // start_bit and end_bit diff --git a/test/hipcub/test_hipcub_device_reduce.cpp b/test/hipcub/test_hipcub_device_reduce.cpp index 121d5167..17f71f8a 100644 --- a/test/hipcub/test_hipcub_device_reduce.cpp +++ b/test/hipcub/test_hipcub_device_reduce.cpp @@ -21,10 +21,14 @@ // SOFTWARE. #include "common_test_header.hpp" -#include "test_utils_argminmax.hpp" + +// Thread operators fixes for extended float types +#include "test_utils_thread_operators.hpp" // hipcub API #include "hipcub/device/device_reduce.hpp" +#include "hipcub/iterator/constant_input_iterator.hpp" + #include // Params for tests @@ -57,18 +61,15 @@ typedef ::testing::Types< DeviceReduceParams, DeviceReduceParams, DeviceReduceParams, - DeviceReduceParams + DeviceReduceParams, + DeviceReduceParams, + DeviceReduceParams #ifdef __HIP_PLATFORM_AMD__ , - DeviceReduceParams, // Doesn't compile in CUB 2.0.1 - DeviceReduceParams, // Doesn't compile in CUB 2.0.1 - DeviceReduceParams< - test_utils::bfloat16, - test_utils:: - bfloat16> // Kernel crash on NVIDIA / CUB, failing Reduce::Sum test on AMD due to rounding. -#endif -#ifdef HIPCUB_ROCPRIM_API - , + DeviceReduceParams, // Doesn't work on NVIDIA / CUB + DeviceReduceParams, // Doesn't work on NVIDIA / CUB DeviceReduceParams, test_utils::custom_test_type>, DeviceReduceParams, test_utils::custom_test_type> #endif @@ -100,9 +101,6 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) using U = typename TestFixture::output_type; const bool debug_synchronous = TestFixture::debug_synchronous; - if(std::is_same::value) - GTEST_SKIP(); - const std::vector sizes = get_sizes(); for(auto size : sizes) { @@ -136,24 +134,30 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) ); HIP_CHECK(hipDeviceSynchronize()); - // Calculate expected results on host - U expected = U(0.0f); + // Calculate expected results on host using the same accumulator type than on device + using Sum = + typename AlgebraicSelector::type; // For custom_type_test tests + using AccumT = hipcub::detail::accumulator_t; + Sum sum_op; + AccumT tmp_result = U(0.0f); // hipcub::Sum uses as initial type the output type for(unsigned int i = 0; i < input.size(); i++) { - expected = expected + (U) input[i]; + tmp_result = sum_op(tmp_result, input[i]); } + const U expected = static_cast(tmp_result); // temp storage size_t temp_storage_size_bytes; void * d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceReduce::Sum( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, input.size(), - stream, debug_synchronous - ) - ); + DeviceReduceSelector reduce_selector; + reduce_selector.reduce_sum(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + input.size(), + stream, + debug_synchronous); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0U); @@ -163,13 +167,13 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - hipcub::DeviceReduce::Sum( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, input.size(), - stream, debug_synchronous - ) - ); + reduce_selector.reduce_sum(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + input.size(), + stream, + debug_synchronous); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -231,13 +235,17 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceMinimum) ); HIP_CHECK(hipDeviceSynchronize()); - hipcub::Min min_op; - // Calculate expected results on host - U expected = U(test_utils::numeric_limits::max()); + // Calculate expected results on host using the same accumulator type than on device + using Min = typename MinSelector::type; // For custom_type_test tests + using AccumT = hipcub::detail::accumulator_t; + Min min_op; + AccumT tmp_result = test_utils::numeric_limits< + T>::max(); // hipcub::Min uses as initial type the input type for(unsigned int i = 0; i < input.size(); i++) { - expected = min_op(expected, U(input[i])); + tmp_result = min_op(tmp_result, input[i]); } + const U expected = static_cast(tmp_result); // temp storage size_t temp_storage_size_bytes; @@ -548,3 +556,109 @@ TYPED_TEST(HipcubDeviceReduceArgMinMaxSpecialTests, ReduceArgMaxInf) test_utils::numeric_limits::lowest()); } #endif // __HIP_PLATFORM_AMD__ + +// --------------------------------------------------------- +// Test for large indices +// --------------------------------------------------------- + +template +class HipcubDeviceReduceLargeIndicesTests : public ::testing::Test +{ +public: + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; + static constexpr bool debug_synchronous = false; +}; + +typedef ::testing::Types, + DeviceReduceParams, + DeviceReduceParams, + DeviceReduceParams, + DeviceReduceParams, + DeviceReduceParams> + HipcubDeviceReduceLargeIndicesTestsParams; + +TYPED_TEST_SUITE(HipcubDeviceReduceLargeIndicesTests, HipcubDeviceReduceLargeIndicesTestsParams); + +TYPED_TEST(HipcubDeviceReduceLargeIndicesTests, LargeIndices) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + const bool debug_synchronous = TestFixture::debug_synchronous; + using IteratorType = hipcub::ConstantInputIterator; + + const std::vector exponents = {30, 31, 32, 33, 34}; + for(auto exponent : exponents) + { + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + const size_t size = 1ll << exponent; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + hipStream_t stream = 0; // default + + // Generate data + IteratorType d_input(T{1}); + std::vector output(1, (U)0.0f); + + U* d_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, output.size() * sizeof(U))); + HIP_CHECK(hipDeviceSynchronize()); + + // Calculate expected results on host + const U expected = static_cast(size); + + // Temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + + // Get size of d_temp_storage + hipcub::DeviceReduce::Sum(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + size, + stream, + debug_synchronous); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0U); + + // Allocate temporary storage + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Run + hipcub::DeviceReduce::Sum(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + size, + stream, + debug_synchronous); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Copy output to host + HIP_CHECK(hipMemcpy(output.data(), + d_output, + output.size() * sizeof(U), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Check if output values are as expected + const std::size_t result = output[0]; + ASSERT_EQ(result, size); + + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); + } + } +} diff --git a/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/test/hipcub/test_hipcub_device_segmented_reduce.cpp index dd53f2b9..4129af50 100644 --- a/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -21,7 +21,9 @@ // SOFTWARE. #include "common_test_header.hpp" -#include "test_utils_argminmax.hpp" + +// Thread operators fixes for extended float types +#include "test_utils_thread_operators.hpp" // hipcub API #include "hipcub/device/device_segmented_reduce.hpp" diff --git a/test/hipcub/test_hipcub_device_segmented_sort.hpp b/test/hipcub/test_hipcub_device_segmented_sort.hpp index 07ee815a..120cfbf6 100644 --- a/test/hipcub/test_hipcub_device_segmented_sort.hpp +++ b/test/hipcub/test_hipcub_device_segmented_sort.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -141,15 +141,13 @@ inline void generate_input_data(std::vector &keys_input, seed_value + seed_value_addition ); } - + offsets.clear(); - unsigned segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_distribution(gen); offsets.push_back(offset); - ++segments_count; offset += segment_length; } offsets.push_back(size); diff --git a/test/hipcub/test_hipcub_thread_operators.cpp b/test/hipcub/test_hipcub_thread_operators.cpp new file mode 100644 index 00000000..a4093101 --- /dev/null +++ b/test/hipcub/test_hipcub_thread_operators.cpp @@ -0,0 +1,796 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "common_test_header.hpp" + +#include "test_utils_assertions.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_thread_operators.hpp" +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +template +struct ThreadOperatorsParams +{ + using input_type = InputType; + using output_type = OutputType; +}; + +template +class HipcubThreadOperatorsTests : public ::testing::Test +{ +public: + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; +}; + +typedef ::testing::Types< + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, test_utils::custom_test_type>, + ThreadOperatorsParams, test_utils::custom_test_type> +#ifdef __HIP_PLATFORM_AMD__ + , + ThreadOperatorsParams, // Doesn't work on NVIDIA / CUB + ThreadOperatorsParams // Doesn't work on NVIDIA / CUB +#endif + > + ThreadOperatorsParameters; + +TYPED_TEST_SUITE(HipcubThreadOperatorsTests, ThreadOperatorsParameters); + +// Commutative operators tests. + +/// \brief Shared code for equality/inequality operators. +template +void equality_op_test(ScanOpT op, bool equality) +{ + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + // Generate random input value. + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + const InputT input_val + = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; + + OutputT output_val{}; + + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(input_val, input_val), equality)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(output_val, output_val), equality)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(output_val, input_val), !equality)); + + output_val = OutputT(input_val); + + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(output_val, input_val), equality)); + } +} + +TYPED_TEST(HipcubThreadOperatorsTests, Equality) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + + using Equality = typename EqualitySelector::type; + Equality op{}; + + equality_op_test(op, true); +} + +TYPED_TEST(HipcubThreadOperatorsTests, Inequality) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + + using Inequality = typename EqualitySelector::type; + Inequality op{}; + + equality_op_test(op, false); +} + +TYPED_TEST(HipcubThreadOperatorsTests, InequalityWrapper) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + + using Equality = typename EqualitySelector::type; + Equality wrapped_op{}; + hipcub::InequalityWrapper op{wrapped_op}; + + equality_op_test(op, false); +} + +/// \brief Shared code for algebraic operators. +template +void algebraic_op_test(const InputT input_val, OutputT init_val) +{ + using accum_type = hipcub::detail::accumulator_t; + + ScanOpT op{}; + + accum_type output_val = init_val; + + // Check result. + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(init_val, input_val), output_val)); + + // Check return type. + ASSERT_NO_FATAL_FAILURE(test_utils::assert_type(op(init_val, input_val), output_val)); +} + +TYPED_TEST(HipcubThreadOperatorsTests, Sum) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + using Sum = typename AlgebraicSelector::type; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + // Generate random initial value. + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + output_type init_val + = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; + + algebraic_op_test(input_type{}, init_val); + } +} + +TYPED_TEST(HipcubThreadOperatorsTests, Difference) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + using Difference = + typename AlgebraicSelector::type; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + // Generate random initial value. + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + output_type init_val + = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; + + algebraic_op_test(input_type{}, init_val); + } +} + +// Division operator is not defined for custom_test_type. +template +class HipcubDivisionOperatorTests : public ::testing::Test +{ +public: + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; +}; + +typedef ::testing::Types< + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams +#ifdef __HIP_PLATFORM_AMD__ + , + ThreadOperatorsParams, // Doesn't work on NVIDIA / CUB + ThreadOperatorsParams // Doesn't work on NVIDIA / CUB +#endif + > + DivisionOperatorParameters; +TYPED_TEST_SUITE(HipcubDivisionOperatorTests, DivisionOperatorParameters); + +TYPED_TEST(HipcubDivisionOperatorTests, Division) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + using Division = typename AlgebraicSelector::type; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + // Generate random input value. + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + input_type input_val + = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; + + algebraic_op_test(input_val, output_type{}); + } +} + +/// \brief Shared code for min/max operators. +template +void minmax_op_test(bool is_max) +{ + ScanOpT op{}; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + // Generate random initial and input values. + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + OutputT init_val = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; + InputT input_val = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; + + AccumT output_val + = is_max ? test_utils::max(init_val, input_val) : test_utils::min(init_val, input_val); + + // Check result. + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(init_val, input_val), output_val)); + + // Check return type. + ASSERT_NO_FATAL_FAILURE(test_utils::assert_type(op(init_val, input_val), output_val)); + } +} + +TYPED_TEST(HipcubThreadOperatorsTests, Max) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + using accum_type = typename std::common_type::type; + using Max = typename MaxSelector::type; + + minmax_op_test(true); +} + +TYPED_TEST(HipcubThreadOperatorsTests, Min) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + using accum_type = typename std::common_type::type; + using Min = typename MinSelector::type; + + minmax_op_test(false); +} + +/// \brief Shared code for ArgMin/ArgMax operators. +template +void arg_op_test(bool is_max) +{ + using input_pair_type = hipcub::KeyValuePair; + + ArgOpT op{}; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + // Generate random initial and input values. + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + std::vector generated_values + = test_utils::get_random_data(2, 1.0f, 100.0f, seed_value); + + InputT input_val = generated_values[0]; + InputT init_val = generated_values[1]; + InputT output_val + = is_max ? test_utils::max(init_val, input_val) : test_utils::min(init_val, input_val); + + input_pair_type init_pair(0, init_val); + input_pair_type input_pair(0, input_val); + input_pair_type output_pair = op(init_pair, input_pair); + + // Check result. + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_pair.value, output_val)); + } +} + +TYPED_TEST(HipcubThreadOperatorsTests, ArgMax) +{ + using input_type = typename TestFixture::input_type; + using ArgMax = typename ArgMaxSelector::type; + + arg_op_test(true); +} + +TYPED_TEST(HipcubThreadOperatorsTests, ArgMin) +{ + using input_type = typename TestFixture::input_type; + using ArgMin = typename ArgMinSelector::type; + + arg_op_test(false); +} + +// Non-commutative operators. + +template +class HipcubNCThreadOperatorsTests : public ::testing::Test +{ +public: + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; +}; + +typedef ::testing::Types, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams, + ThreadOperatorsParams> + NCThreadOperatorsParameters; + +std::vector get_sizes() +{ + // We generate size 208 as a maximum so the sum $\sum_{i = n/2 + 1}^n i$ does not overflow for sort type. + // This overflow does not happen for an unsigned int size n iff (3 * n^2 + 2 * n)/4 <= 32767 iff n <= 208. + std::vector sizes = {1, 8, 10, 53, 208}; + const std::vector random_sizes = test_utils::get_random_data(2, 1, 208, rand()); + sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); + std::sort(sizes.begin(), sizes.end()); + return sizes; +} + +TYPED_TEST_SUITE(HipcubNCThreadOperatorsTests, NCThreadOperatorsParameters); + +/// \brief Shared code for scan operators. +template +void scan_op_test(std::vector h_input, + std::vector h_expected, + ScanOpT op, + size_t input_size) +{ + // Set HIP device. + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + hipStream_t stream = 0; + const bool debug_synchronous = false; + + // Allocate input and output on device and copy input from host. + InputT* d_input{}; + OutputT* d_output{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input_size * sizeof(InputT))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input_size * sizeof(OutputT))); + HIP_CHECK( + hipMemcpy(d_input, h_input.data(), input_size * sizeof(InputT), hipMemcpyHostToDevice)); + + // Get size of temporary storage on device. + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + HIP_CHECK(hipcub::DeviceScan::InclusiveScan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + op, + input_size, + stream, + debug_synchronous)); + + // Size of temporary storage must be > 0. + ASSERT_GT(temp_storage_size_bytes, 0U); + + // Allocate temporary storage. + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run kernel. + HIP_CHECK(hipcub::DeviceScan::InclusiveScan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + op, + input_size, + stream, + debug_synchronous)); + HIP_CHECK(hipGetLastError()); + + // Copy output to host. + std::vector h_output(input_size); + HIP_CHECK( + hipMemcpy(h_output.data(), d_output, input_size * sizeof(OutputT), hipMemcpyDeviceToHost)); + + // Check output. + for(size_t i = 0; i < input_size; ++i) + { + auto tolerance = std::max(std::abs(0.1f * h_expected[i]), OutputT(0.01f)); + if(std::is_integral::value) + tolerance = 0; + ASSERT_NEAR(h_output[i], h_expected[i], tolerance) << "where index = " << i; + } + + // Check output type. + for(size_t i = 0; i < input_size; ++i) + { + ASSERT_NO_FATAL_FAILURE(test_utils::assert_type(h_output[i], h_expected[i])) + << "where index = " << i; + } + + // Free resources. + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); +} + +TYPED_TEST(HipcubNCThreadOperatorsTests, SwizzleScanOp) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + + // Generate input data. + const std::vector sizes = get_sizes(); + for(auto input_size : sizes) + { + SCOPED_TRACE(testing::Message() << "with size = " << input_size); + + std::vector h_input(input_size); + std::iota(h_input.begin(), h_input.end(), static_cast(1)); + + // Scan function: SwizzleScanOp. + hipcub::Sum sum_op{}; + hipcub::SwizzleScanOp scan_op(sum_op); + + // Calculate expected results on host. + std::vector h_expected(input_size); + test_utils::host_inclusive_scan(h_input.begin(), + h_input.end(), + h_expected.begin(), + scan_op); + + scan_op_test(h_input, h_expected, scan_op, input_size); + } +} + +TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceBySegmentOp) +{ + using key_type = int; + using input_type = typename TestFixture::input_type; + using output_type = input_type; + using pair_type = hipcub::KeyValuePair; + + const std::vector sizes = get_sizes(); + for(auto segment_size : sizes) + { + constexpr size_t segment_count = 2; + const size_t input_size = segment_count * segment_size; + + SCOPED_TRACE(testing::Message() << "with size = " << input_size); + + // Generate data. We generate the input {1, 2, 3, ... , n} and we want to compute the + // output {1 + 2 + ... + n/2, (n/2 + 1) + (n/2 + 2) + ... + n}. + std::vector input_values(input_size); + std::iota(input_values.begin(), input_values.end(), static_cast(1)); + + std::vector input_keys(input_size); + std::iota(input_keys.begin(), input_keys.begin() + segment_size, static_cast(0)); + std::iota(input_keys.begin() + segment_size, input_keys.end(), static_cast(0)); + + std::vector input{}; + for(size_t i = 0; i < input_size; ++i) + { + input.push_back(pair_type(input_keys[i], input_values[i])); + } + + // Reduce and scan operators. + hipcub::Sum sum_op{}; + hipcub::ReduceBySegmentOp op(sum_op); + + // Calculate expected results on host. + std::vector expected{}; + pair_type init(0, 0); + for(size_t offset = 0; offset < input_size; offset += segment_size) + { + const size_t end = std::min(input_size, offset + segment_size); + pair_type aggregate = init; + for(size_t i = offset; i < end; ++i) + { + pair_type input_pair = input[i]; + key_type expected_key = sum_op(aggregate.key, input_pair.key); + output_type expected_value + = input_pair.key ? input_pair.value : sum_op(aggregate.value, input_pair.value); + aggregate = pair_type(expected_key, expected_value); + } + expected.push_back(aggregate); + } + + // Get output on host. + std::vector output{}; + for(size_t offset = 0; offset < input_size; offset += segment_size) + { + const size_t end = std::min(input_size, offset + segment_size); + pair_type aggregate = init; + for(size_t i = offset; i < end; ++i) + { + aggregate = op(aggregate, input[i]); + } + output.push_back(aggregate); + } + + // Check if output pairs are as expected. + for(size_t i = 0; i < segment_count; i++) + { + // Check keys. + ASSERT_EQ(expected[i].key, output[i].key) << "where index = " << i; + + // Check values. + auto tolerance + = std::max(std::abs(0.1f * expected[i].value), output_type(0.01f)); + if(std::is_integral::value) + tolerance = 0; + ASSERT_NEAR(expected[i].value, output[i].value, tolerance) << "where index = " << i; + } + } +} + +TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceByKeyOp) +{ + using key_type = int; + using input_type = typename TestFixture::input_type; + using output_type = input_type; + using pair_type = hipcub::KeyValuePair; + + // Set HIP device. + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + hipStream_t stream = 0; + const bool debug_synchronous = false; + + const std::vector sizes = get_sizes(); + for(auto input_size : sizes) + { + const size_t h_unique_keys = input_size / 2 + (input_size % 2); + + // Generate data. We generate the input {1, 2, 3, ... , n} and we want to compute the + // output {1 + 2, 3 + 4, ... , (n - 1) + n}. + std::vector h_input(input_size); + std::iota(h_input.begin(), h_input.end(), static_cast(1)); + + std::vector h_keys(input_size); + for(size_t i = 0; i < input_size; ++i) + { + h_keys[i] = (i % 2) ? h_keys[i - 1] : i / 2; + } + + // Reduce operators. + hipcub::Sum sum_op; + hipcub::ReduceByKeyOp op{}; + + // Calculate output on host. + std::vector h_output(h_unique_keys); + std::vector h_keys_output(h_unique_keys); + h_keys_output[0] = h_keys[0]; + h_output[0] = h_input[0]; + pair_type first(h_keys[0], h_input[0]); + for(size_t i = 1; i < input_size; ++i) + { + pair_type second(h_keys[i], h_input[i]); + pair_type result = (first.key == second.key) ? op(first, second) : second; + h_keys_output[h_keys[i]] = result.key; + h_output[h_keys[i]] = result.value; + first = pair_type(second); + } + + // Allocate input, keys and expected results on device and copy input and keys from host. + input_type* d_input{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input_size * sizeof(input_type))); + HIP_CHECK(hipMemcpy(d_input, + h_input.data(), + input_size * sizeof(input_type), + hipMemcpyHostToDevice)); + + key_type* d_keys{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_keys, input_size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys, h_keys.data(), input_size * sizeof(key_type), hipMemcpyHostToDevice)); + + key_type* d_keys_expected{}; + output_type* d_expected{}; + size_t* d_unique_keys_expected{}; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_keys_expected, h_unique_keys * sizeof(key_type))); + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_expected, h_unique_keys * sizeof(output_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_unique_keys_expected, sizeof(size_t))); + + // Get size of temporary storage on device. + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_keys_expected, + d_input, + d_expected, + d_unique_keys_expected, + sum_op, + input_size, + stream, + debug_synchronous)); + + // Size of temporary storage must be > 0. + ASSERT_GT(temp_storage_size_bytes, 0U); + + // Allocate temporary storage. + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run kernel. + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_keys_expected, + d_input, + d_expected, + d_unique_keys_expected, + sum_op, + input_size, + stream, + debug_synchronous)); + HIP_CHECK(hipGetLastError()); + + // Copy expected results to host. + std::vector h_keys_expected(h_unique_keys); + HIP_CHECK(hipMemcpy(h_keys_expected.data(), + d_keys_expected, + h_unique_keys * sizeof(key_type), + hipMemcpyDeviceToHost)); + + std::vector h_expected(h_unique_keys); + HIP_CHECK(hipMemcpy(h_expected.data(), + d_expected, + h_unique_keys * sizeof(output_type), + hipMemcpyDeviceToHost)); + + std::vector h_unique_keys_expected(1); + HIP_CHECK(hipMemcpy(h_unique_keys_expected.data(), + d_unique_keys_expected, + sizeof(size_t), + hipMemcpyDeviceToHost)); + + // Check if output values are as expected. + // Check number of unique keys. + ASSERT_EQ(h_unique_keys_expected[0], h_unique_keys); + for(size_t i = 0; i < h_unique_keys; ++i) + { + // Check keys. + ASSERT_EQ(h_keys_expected[i], h_keys_output[i]) << "where index = " << i; + + // Check values. + auto tolerance + = std::max(std::abs(0.1f * h_expected[i]), output_type(0.01f)); + if(std::is_integral::value) + tolerance = 0; + ASSERT_NEAR(h_expected[i], h_output[i], tolerance) << "where index = " << i; + } + + // Free resources. + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_keys)); + HIP_CHECK(hipFree(d_keys_expected)); + HIP_CHECK(hipFree(d_expected)); + HIP_CHECK(hipFree(d_unique_keys_expected)); + } +} + +TYPED_TEST(HipcubNCThreadOperatorsTests, BinaryFlip) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + + const std::vector sizes = get_sizes(); + for(auto input_size : sizes) + { + // Generate data. + std::vector h_input(input_size); + std::iota(h_input.begin(), h_input.end(), static_cast(1)); + + // Scan function: BinaryFlip. + hipcub::Sum sum_op{}; + hipcub::BinaryFlip scan_op(sum_op); + + // Calculate expected results on host. + std::vector h_expected{}; + + // BinaryFlip's () operator is a device function, so cannot be called from the host function + // test_utils::host_inclusive_scan. We do the scan "manually". + output_type accum = h_input[0]; + h_expected.push_back(accum); + for(size_t i = 1; i < input_size; ++i) + { + // The host_inclusive_cast would do: + // + // accum = scan_op(accum, static_cast(h_input[i])); + // + // But for the BinaryFlip this is equivalent to: + accum = sum_op(static_cast(h_input[i]), accum); + h_expected.push_back(accum); + } + + scan_op_test(h_input, h_expected, scan_op, input_size); + } +} + +// Unary operators tests. + +TYPED_TEST(HipcubNCThreadOperatorsTests, CastOp) +{ + using input_type = typename TestFixture::input_type; + using output_type = typename TestFixture::output_type; + using IteratorType + = hipcub::TransformInputIterator, input_type*>; + + const std::vector sizes = get_sizes(); + for(auto input_size : sizes) + { + // Generate data. + std::vector input(input_size); + std::iota(input.begin(), input.end(), static_cast(0)); + + std::vector expected(input_size); + std::iota(expected.begin(), expected.end(), static_cast(0)); + + // Scan operator: CastOp. + hipcub::CastOp op{}; + + // Transform input applying the casting operator. + auto output = IteratorType(input.data(), op); + + // Check output. + for(size_t i = 0; i < input_size; ++i) + { + auto tolerance + = std::max(std::abs(0.1f * expected[i]), output_type(0.01f)); + if(std::is_integral::value) + tolerance = 0; + ASSERT_NEAR(output[i], expected[i], tolerance); + } + + // Check output type. + for(size_t i = 0; i < input_size; ++i) + { + ASSERT_NO_FATAL_FAILURE(test_utils::assert_type(output[i], expected[i])) + << "where index = " << i; + } + } +} diff --git a/test/hipcub/test_hipcub_warp_reduce.cpp b/test/hipcub/test_hipcub_warp_reduce.cpp index 60792522..82202a81 100644 --- a/test/hipcub/test_hipcub_warp_reduce.cpp +++ b/test/hipcub/test_hipcub_warp_reduce.cpp @@ -128,16 +128,18 @@ TYPED_TEST(HipcubWarpReduceTests, Reduce) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -273,16 +275,18 @@ TYPED_TEST(HipcubWarpReduceTests, ReduceValid) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -419,16 +423,18 @@ TYPED_TEST(HipcubWarpReduceTests, HeadSegmentedReduceSum) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -612,16 +618,18 @@ TYPED_TEST(HipcubWarpReduceTests, TailSegmentedReduceSum) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; diff --git a/test/hipcub/test_hipcub_warp_scan.cpp b/test/hipcub/test_hipcub_warp_scan.cpp index e8b76ccb..95a39dda 100644 --- a/test/hipcub/test_hipcub_warp_scan.cpp +++ b/test/hipcub/test_hipcub_warp_scan.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -127,16 +127,18 @@ TYPED_TEST(HipcubWarpScanTests, InclusiveScan) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -293,16 +295,18 @@ TYPED_TEST(HipcubWarpScanTests, InclusiveScanReduce) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -474,16 +478,18 @@ TYPED_TEST(HipcubWarpScanTests, ExclusiveScan) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -636,16 +642,18 @@ TYPED_TEST(HipcubWarpScanTests, ExclusiveReduceScan) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -831,16 +839,18 @@ TYPED_TEST(HipcubWarpScanTests, Scan) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; @@ -993,16 +1003,18 @@ TYPED_TEST(HipcubWarpScanTests, InclusiveScanCustomType) constexpr size_t ws64 = size_t(HIPCUB_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws32, logical_warp_size * 4) - : test_utils::max((ws32/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws32 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws32, logical_warp_size * 4) + : test_utils::max((ws32 / logical_warp_size) * logical_warp_size, + static_cast(1)); // Block size of warp size 64 - constexpr size_t block_size_ws64 = - test_utils::is_power_of_two(logical_warp_size) - ? test_utils::max(ws64, logical_warp_size * 4) - : test_utils::max((ws64/logical_warp_size) * logical_warp_size, 1); + constexpr size_t block_size_ws64 + = test_utils::is_power_of_two(logical_warp_size) + ? test_utils::max(ws64, logical_warp_size * 4) + : test_utils::max((ws64 / logical_warp_size) * logical_warp_size, + static_cast(1)); const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; diff --git a/test/hipcub/test_utils.hpp b/test/hipcub/test_utils.hpp index 82ee4df8..55c872d1 100644 --- a/test/hipcub/test_utils.hpp +++ b/test/hipcub/test_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -184,18 +184,114 @@ OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, return ++d_first; } +template +HIPCUB_HOST_DEVICE inline constexpr typename std::common_type::type max(const T& t, + const U& u) +{ + return t < u ? u : t; +} + +HIPCUB_HOST_DEVICE inline test_utils::half max(const test_utils::half& a, const test_utils::half& b) +{ + return test_utils::half_maximum{}(a, b); +} + template -HIPCUB_HOST_DEVICE inline -constexpr T max(const T& a, const T& b) +HIPCUB_HOST_DEVICE inline constexpr T max(const T& t, const test_utils::half& u) { - return a < b ? b : a; + return test_utils::max(t, static_cast(u)); } template -HIPCUB_HOST_DEVICE inline -constexpr T min(const T& a, const T& b) +HIPCUB_HOST_DEVICE inline constexpr T max(const test_utils::half& t, const T& u) +{ + return test_utils::max(static_cast(t), u); +} + +HIPCUB_HOST_DEVICE inline test_utils::bfloat16 max(const test_utils::bfloat16& a, + const test_utils::bfloat16& b) +{ + return test_utils::bfloat16_maximum{}(a, b); +} + +template +HIPCUB_HOST_DEVICE inline constexpr T max(const T& t, const test_utils::bfloat16& u) +{ + return test_utils::max(t, static_cast(u)); +} + +template +HIPCUB_HOST_DEVICE inline constexpr T max(const test_utils::bfloat16& t, const T& u) +{ + return test_utils::max(static_cast(t), u); +} + +template +HIPCUB_HOST_DEVICE inline constexpr typename std::common_type, + test_utils::custom_test_type>::type + min(const test_utils::custom_test_type& t, const test_utils::custom_test_type& u) +{ + using common_type = typename std::common_type, + test_utils::custom_test_type>::type; + const common_type common_t(t); + const common_type common_u(u); + + return common_t < common_u ? common_t : common_u; +} + +template +HIPCUB_HOST_DEVICE inline constexpr typename std::common_type::type min(const T& t, + const U& u) { - return a < b ? a : b; + return t < u ? t : u; +} + +template +HIPCUB_HOST_DEVICE inline constexpr T min(const T& t, const test_utils::half& u) +{ + return test_utils::min(t, static_cast(u)); +} + +template +HIPCUB_HOST_DEVICE inline constexpr T min(const test_utils::half& t, const T& u) +{ + return test_utils::min(static_cast(t), u); +} + +HIPCUB_HOST_DEVICE inline test_utils::half min(const test_utils::half& a, const test_utils::half& b) +{ + return test_utils::half_minimum{}(a, b); +} + +template +HIPCUB_HOST_DEVICE inline constexpr T min(const T& t, const test_utils::bfloat16& u) +{ + return test_utils::min(t, static_cast(u)); +} + +template +HIPCUB_HOST_DEVICE inline constexpr T min(const test_utils::bfloat16& t, const T& u) +{ + return test_utils::min(static_cast(t), u); +} + +HIPCUB_HOST_DEVICE inline test_utils::bfloat16 min(const test_utils::bfloat16& a, + const test_utils::bfloat16& b) +{ + return test_utils::bfloat16_minimum{}(a, b); +} + +template +HIPCUB_HOST_DEVICE inline constexpr typename std::common_type, + test_utils::custom_test_type>::type + max(const test_utils::custom_test_type& t, const test_utils::custom_test_type& u) +{ + using common_type = typename std::common_type, + test_utils::custom_test_type>::type; + const common_type common_t(t); + const common_type common_u(u); + + return common_t < common_u ? common_u : common_t; } template diff --git a/test/hipcub/test_utils_argminmax.hpp b/test/hipcub/test_utils_argminmax.hpp deleted file mode 100644 index 2745d9fd..00000000 --- a/test/hipcub/test_utils_argminmax.hpp +++ /dev/null @@ -1,117 +0,0 @@ -// MIT License -// -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN - -#ifndef HIPCUB_TEST_UTILS_ARGMINMAX_HPP -#define HIPCUB_TEST_UTILS_ARGMINMAX_HPP - -#include -#include - -/** - * \brief Arg max functor - Because NVIDIA's hipcub::ArgMax doesn't work with bfloat16 (HOST-SIDE) - */ -struct ArgMax -{ - template::value - || std::is_same::value, - bool> - = true> - HIPCUB_HOST_DEVICE __forceinline__ hipcub::KeyValuePair - operator()(const hipcub::KeyValuePair& a, - const hipcub::KeyValuePair& b) const - { - const hipcub::KeyValuePair native_a(a.key, a.value); - const hipcub::KeyValuePair native_b(b.key, b.value); - - if((native_b.value > native_a.value) - || ((native_a.value == native_b.value) && (native_b.key < native_a.key))) - return b; - return a; - } -}; -/** - * \brief Arg min functor - Because NVIDIA's hipcub::ArgMin doesn't work with bfloat16 (HOST-SIDE) - */ -struct ArgMin -{ - template::value - || std::is_same::value, - bool> - = true> - HIPCUB_HOST_DEVICE __forceinline__ hipcub::KeyValuePair - operator()(const hipcub::KeyValuePair& a, - const hipcub::KeyValuePair& b) const - { - const hipcub::KeyValuePair native_a(a.key, a.value); - const hipcub::KeyValuePair native_b(b.key, b.value); - - if((native_b.value < native_a.value) - || ((native_a.value == native_b.value) && (native_b.key < native_a.key))) - return b; - return a; - } -}; - -// Maximum to operator selector -template -struct ArgMaxSelector -{ - typedef hipcub::ArgMax type; -}; - -template<> -struct ArgMaxSelector -{ - typedef ArgMax type; -}; - -template<> -struct ArgMaxSelector -{ - typedef ArgMax type; -}; - -// Minimum to operator selector -template -struct ArgMinSelector -{ - typedef hipcub::ArgMin type; -}; - -#ifdef __HIP_PLATFORM_NVIDIA__ -template<> -struct ArgMinSelector -{ - typedef ArgMin type; -}; - -template<> -struct ArgMinSelector -{ - typedef ArgMin type; -}; -#endif - -#endif //HIPCUB_TEST_UTILS_ARGMINMAX_HPP diff --git a/test/hipcub/test_utils_assertions.hpp b/test/hipcub/test_utils_assertions.hpp index 05030d60..8fce0ce2 100644 --- a/test/hipcub/test_utils_assertions.hpp +++ b/test/hipcub/test_utils_assertions.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -238,5 +238,87 @@ inline void assert_bit_eq(const std::vector& result, const std::vector& ex } } +#if HIPCUB_IS_INT128_ENABLED +inline void assert_bit_eq(const std::vector<__int128_t>& result, + const std::vector<__int128_t>& expected) +{ + ASSERT_EQ(result.size(), expected.size()); + + auto to_string = [](__int128_t value) + { + static const char* charmap = "0123456789"; + + std::string result; + result.reserve(41); // max. 40 digits possible ( uint64_t has 20) plus sign + __uint128_t helper = (value < 0) ? -value : value; + + do + { + result += charmap[helper % 10]; + helper /= 10; + } + while(helper); + if(value < 0) + { + result += "-"; + } + std::reverse(result.begin(), result.end()); + return result; + }; + + for(size_t i = 0; i < result.size(); i++) + { + if(!bit_equal(result[i], expected[i])) + { + FAIL() << "Expected strict/bitwise equality of these values: " << std::endl + << " result[i]: " << to_string(result[i]) << std::endl + << " expected[i]: " << to_string(expected[i]) << std::endl + << "where index = " << i; + } + } +} + +inline void assert_bit_eq(const std::vector<__uint128_t>& result, + const std::vector<__uint128_t>& expected) +{ + ASSERT_EQ(result.size(), expected.size()); + + auto to_string = [](__uint128_t value) + { + static const char* charmap = "0123456789"; + + std::string result; + result.reserve(40); // max. 40 digits possible ( uint64_t has 20) + __uint128_t helper = value; + + do + { + result += charmap[helper % 10]; + helper /= 10; + } + while(helper); + std::reverse(result.begin(), result.end()); + return result; + }; + + for(size_t i = 0; i < result.size(); i++) + { + if(!bit_equal(result[i], expected[i])) + { + FAIL() << "Expected strict/bitwise equality of these values: " << std::endl + << " result[i]: " << to_string(result[i]) << std::endl + << " expected[i]: " << to_string(expected[i]) << std::endl + << "where index = " << i; + } + } +} +#endif //HIPCUB_IS_INT128_ENABLED + +/// Compile-time assertion for type equality of two objects. +template +inline void assert_type(ExpectedT /*obj1*/, ActualT /*obj2*/) +{ + testing::StaticAssertTypeEq(); +} } #endif // HIPCUB_TEST_HIPCUB_TEST_UTILS_ASSERTIONS_HPP_ diff --git a/test/hipcub/test_utils_bfloat16.hpp b/test/hipcub/test_utils_bfloat16.hpp index 854a4c6b..5348675e 100644 --- a/test/hipcub/test_utils_bfloat16.hpp +++ b/test/hipcub/test_utils_bfloat16.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -40,8 +40,8 @@ using bfloat16 = ::__nv_bfloat16; using native_bfloat16 = bfloat16_t; // Support bfloat16 operators on host side -HIPCUB_HOST inline -test_utils::bfloat16 native_to_bfloat16(const test_utils::native_bfloat16& x) +HIPCUB_HOST_DEVICE inline test_utils::bfloat16 + native_to_bfloat16(const test_utils::native_bfloat16& x) { return *reinterpret_cast(&x); } diff --git a/test/hipcub/test_utils_data_generation.hpp b/test/hipcub/test_utils_data_generation.hpp index b0ffdc32..3b739dd9 100644 --- a/test/hipcub/test_utils_data_generation.hpp +++ b/test/hipcub/test_utils_data_generation.hpp @@ -336,8 +336,11 @@ inline auto get_random_data(size_t size, T min, T max, int seed_value) } template -inline auto get_random_data(size_t size, S min, U max, int seed_value) - -> typename std::enable_if::value && !is_custom_test_type::value, std::vector>::type +inline auto get_random_data(size_t size, S min, U max, int seed_value) -> + typename std::enable_if::value && !is_custom_test_type::value + && !std::is_same::value + && !std::is_same::value, + std::vector>::type { std::default_random_engine gen(seed_value); using dis_type = @@ -352,6 +355,34 @@ inline auto get_random_data(size_t size, S min, U max, int seed_value) return data; } +template +inline auto get_random_data(size_t size, S min, U max, int seed_value) -> + typename std::enable_if::value, std::vector>::type +{ + std::default_random_engine gen(seed_value); + std::uniform_int_distribution distribution(static_cast(min), + static_cast(max)); + std::vector data(size); + std::generate(data.begin(), + data.end(), + [&]() { return static_cast(distribution(gen)) * 2; }); + return data; +} + +template +inline auto get_random_data(size_t size, S min, U max, int seed_value) -> + typename std::enable_if::value, std::vector>::type +{ + std::default_random_engine gen(seed_value); + std::uniform_int_distribution distribution(static_cast(min), + static_cast(max)); + std::vector data(size); + std::generate(data.begin(), + data.end(), + [&]() { return static_cast(distribution(gen)) * 2; }); + return data; +} + template inline auto get_random_data(size_t size, typename T::value_type min, diff --git a/test/hipcub/test_utils_thread_operators.hpp b/test/hipcub/test_utils_thread_operators.hpp new file mode 100644 index 00000000..4333dc2f --- /dev/null +++ b/test/hipcub/test_utils_thread_operators.hpp @@ -0,0 +1,449 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef HIPCUB_TEST_TEST_UTILS_THREAD_OPERATORS_HPP_ +#define HIPCUB_TEST_TEST_UTILS_THREAD_OPERATORS_HPP_ + +#include "test_utils.hpp" + +#include +#include +#include + +/** + * \brief ExtendedFloatBoolOp general functor - Because hipcub::Equality() and Inequality() + * don't work with input types and . + */ +template +struct ExtendedFloatBoolOp +{ + BoolOpT eq_op; + + HIPCUB_HOST_DEVICE inline ExtendedFloatBoolOp() {} + + template + HIPCUB_HOST_DEVICE bool operator()(T a, T b) const + { + return eq_op(a.raw(), b.raw()); + } + + HIPCUB_HOST_DEVICE bool operator()(float a, float b) const + { + return eq_op(a, b); + } + + HIPCUB_HOST_DEVICE bool operator()(test_utils::half a, test_utils::half b) const + { + return this->operator()(test_utils::native_half(a), test_utils::native_half(b)); + } + + HIPCUB_HOST_DEVICE bool operator()(test_utils::bfloat16 a, test_utils::bfloat16 b) const + { + return this->operator()(test_utils::native_bfloat16(a), test_utils::native_bfloat16(b)); + } + + HIPCUB_HOST_DEVICE bool operator()(float a, test_utils::half b) const + { + return this->operator()(a, float(b)); + } + + HIPCUB_HOST_DEVICE bool operator()(float a, test_utils::bfloat16 b) const + { + return this->operator()(a, float(b)); + } +}; + +/** + * \brief ExtendedFloatBinOp general functor - Because hipcub::Sum(), Difference(), Division(), + * Max() and Min() don't work with input types , + * and + * and . + * + * When using e.g. a constant input iterator of value 2 the CPU accumulator fails to keep adding + * 2 to 4096 because of precision limitations, as 2 (in half binary representation + * 0 10000 0000000000 = 1.0 x 2e1) needs to be converted to be able to sum it with 4096 + * (in half binary representation 0 11011 000000000 = 1.0 x 2e12), that is, the mantisa of 2 + * needs to be shifted to the left 11 times, but that yields a 0 and thus 4096 + 2 = 4096. + */ +template +struct ExtendedFloatBinOp +{ + BinOpT alg_op; + + HIPCUB_HOST_DEVICE inline ExtendedFloatBinOp() {} + + template + HIPCUB_HOST_DEVICE T operator()(T a, T b) const + { + T result{}; + result.__x = alg_op(a.raw(), b.raw()); + return result; + } + + HIPCUB_HOST_DEVICE float operator()(float a, float b) const + { + return alg_op(a, b); + } + + HIPCUB_HOST_DEVICE test_utils::half operator()(test_utils::half a, test_utils::half b) const + { + return test_utils::native_to_half( + this->operator()(test_utils::native_half(a), test_utils::native_half(b))); + } + + HIPCUB_HOST_DEVICE test_utils::bfloat16 operator()(test_utils::bfloat16 a, + test_utils::bfloat16 b) const + { + return test_utils::native_to_bfloat16( + this->operator()(test_utils::native_bfloat16(a), test_utils::native_bfloat16(b))); + } + + HIPCUB_HOST_DEVICE float operator()(float a, test_utils::half b) const + { + return this->operator()(a, float(b)); + } + + HIPCUB_HOST_DEVICE float operator()(float a, test_utils::bfloat16 b) const + { + return this->operator()(a, float(b)); + } +}; + +/** + * \brief Common type specialization - Because min and max don't work with + * . + */ +template<> +struct std::common_type +{ + using type = float; +}; + +/** + * \brief Common type specialization - Because min and max don't work with + * . + */ +template<> +struct std::common_type +{ + using type = float; +}; + +/** + * \brief ArgMax functor - Because NVIDIA's hipcub::ArgMax doesn't work with bfloat16 (HOST-SIDE) + */ +struct ArgMax +{ + template::value + || std::is_same::value, + bool> + = true> + HIPCUB_HOST_DEVICE __forceinline__ hipcub::KeyValuePair + operator()(const hipcub::KeyValuePair& a, + const hipcub::KeyValuePair& b) const + { + const hipcub::KeyValuePair native_a(a.key, a.value); + const hipcub::KeyValuePair native_b(b.key, b.value); + + if((native_b.value > native_a.value) + || ((native_a.value == native_b.value) && (native_b.key < native_a.key))) + return b; + return a; + } +}; +/** + * \brief ArgMin functor - Because NVIDIA's hipcub::ArgMin doesn't work with bfloat16 (HOST-SIDE) + */ +struct ArgMin +{ + template::value + || std::is_same::value, + bool> + = true> + HIPCUB_HOST_DEVICE __forceinline__ hipcub::KeyValuePair + operator()(const hipcub::KeyValuePair& a, + const hipcub::KeyValuePair& b) const + { + const hipcub::KeyValuePair native_a(a.key, a.value); + const hipcub::KeyValuePair native_b(b.key, b.value); + + if((native_b.value < native_a.value) + || ((native_a.value == native_b.value) && (native_b.key < native_a.key))) + return b; + return a; + } +}; + +/** + * \brief Common type specialization - Because some thread operators do not work with + * , custom_test_type> for different types T and U. + */ +template +struct std::common_type, test_utils::custom_test_type> +{ + using type = test_utils::custom_test_type::type>; +}; + +/** + * \brief CustomTestOp generic functor - Because some thread operators don't work with + * , custom_test_type> for different types T and U. + */ +template +struct CustomTestOp +{ + BinaryOpT binary_op; + + HIPCUB_HOST_DEVICE inline CustomTestOp() {} + + template + HIPCUB_HOST_DEVICE inline constexpr auto operator()(test_utils::custom_test_type t, + test_utils::custom_test_type u) const + -> decltype(auto) + { + using common_type = typename std::common_type, + test_utils::custom_test_type>::type; + const common_type common_t(t); + const common_type common_u(u); + return binary_op(common_t, common_u); + } +}; + +// Equality functor selector. +template +struct EqualitySelector +{ + typedef OpT type; +}; + +template +struct EqualitySelector +{ + typedef ExtendedFloatBoolOp type; +}; + +template +struct EqualitySelector +{ + typedef ExtendedFloatBoolOp type; +}; + +// Algebraic functor selector. +template +struct AlgebraicSelector +{ + typedef OpT type; +}; + +template +struct AlgebraicSelector, test_utils::custom_test_type> +{ + typedef CustomTestOp type; +}; + +template +struct AlgebraicSelector +{ + typedef ExtendedFloatBinOp type; +}; + +template +struct AlgebraicSelector +{ + typedef ExtendedFloatBinOp type; +}; + +// Max functor selector. +template +struct MaxSelector +{ + typedef hipcub::Max type; +}; + +template +struct MaxSelector, test_utils::custom_test_type> +{ + typedef CustomTestOp type; +}; + +template +struct MaxSelector +{ + typedef ExtendedFloatBinOp type; +}; + +template +struct MaxSelector +{ + typedef ExtendedFloatBinOp type; +}; + +// Min functor selector. +template +struct MinSelector +{ + typedef hipcub::Min type; +}; + +template +struct MinSelector, test_utils::custom_test_type> +{ + typedef CustomTestOp type; +}; + +template +struct MinSelector +{ + typedef ExtendedFloatBinOp type; +}; + +template +struct MinSelector +{ + typedef ExtendedFloatBinOp type; +}; + +// ArgMax functor selector +template +struct ArgMaxSelector +{ + typedef hipcub::ArgMax type; +}; + +#ifdef __HIP_PLATFORM_NVIDIA__ +template<> +struct ArgMaxSelector +{ + typedef ArgMax type; +}; + +template<> +struct ArgMaxSelector +{ + typedef ArgMax type; +}; +#endif + +// ArgMin functor selector +template +struct ArgMinSelector +{ + typedef hipcub::ArgMin type; +}; + +#ifdef __HIP_PLATFORM_NVIDIA__ +template<> +struct ArgMinSelector +{ + typedef ArgMin type; +}; + +template<> +struct ArgMinSelector +{ + typedef ArgMin type; +}; +#endif + +/** + * \brief DeviceReduce function selector - Because we need to resolve at compile time which function + * from namespace DeviceReduce we are calling: Sum or Reduce. + * + * When we want to compute the reduction using the hipcub::Sum operator() and extended float types + * we need to define our own functor due to extended floats not being arithmetically associative on CPU. + * + * But this new functor doesn't have an associated function in DeviceReduce, so we need to call + * to DeviceReduce::Reduce directly passing this functor, and thus we need to determine at + * compile time which function will be called so we don't get compile errors. + * For more clarity, we do get compile errors if we do a simple if..else because the compiler + * cannot determine which function will be called, and the new functor doesn't compile for all + * the types used in the tests. + * + * Note: with c++17 this selector can be substituted for an if..else in the test that uses + * "if constexpr", but currently we are using c++14. + */ +template +struct DeviceReduceSelector +{ + void reduce_sum_impl(std::true_type, + void* d_temp_storage, + size_t& temp_storage_size_bytes, + T* d_input, + U* d_output, + int num_items, + hipStream_t stream, + bool debug_synchronous) + { + HIP_CHECK(hipcub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + num_items, + ExtendedFloatBinOp(), + U(0.f), + stream, + debug_synchronous)); + } + + void reduce_sum_impl(std::false_type, + void* d_temp_storage, + size_t& temp_storage_size_bytes, + T* d_input, + U* d_output, + int num_items, + hipStream_t stream, + bool debug_synchronous) + { + HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + num_items, + stream, + debug_synchronous)); + } + + void reduce_sum(void* d_temp_storage, + size_t& temp_storage_size_bytes, + T* d_input, + U* d_output, + int num_items, + hipStream_t stream, + bool debug_synchronous) + { + reduce_sum_impl(std::integral_constant < bool, + std::is_same::value + || std::is_same::value > {}, + d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + num_items, + stream, + debug_synchronous); + } +}; + +#endif // HIPCUB_TEST_TEST_UTILS_THREAD_OPERATORS_HPP_