diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 49c8907..5c64bff 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -75,3 +75,61 @@ jobs: run: | cd ../boost-root/__build__ ctest --output-on-failure --no-tests=error + + nvrtc-cmake-test: + strategy: + fail-fast: false + + runs-on: gpu-runner-1 + + steps: + - uses: Jimver/cuda-toolkit@v0.2.16 + id: cuda-toolkit + with: + cuda: '12.5.0' + method: 'network' + + - name: Output CUDA information + run: | + echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"+ + echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" + nvcc -V + - uses: actions/checkout@v4 + + - name: Install Packages + run: | + sudo apt-get install -y cmake make + - name: Setup Boost + run: | + echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY + LIBRARY=${GITHUB_REPOSITORY#*/} + echo LIBRARY: $LIBRARY + echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV + echo GITHUB_BASE_REF: $GITHUB_BASE_REF + echo GITHUB_REF: $GITHUB_REF + REF=${GITHUB_BASE_REF:-$GITHUB_REF} + REF=${REF#refs/heads/} + echo REF: $REF + BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true + echo BOOST_BRANCH: $BOOST_BRANCH + cd .. + git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root + cd boost-root + mkdir -p libs/$LIBRARY + cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY + git submodule update --init tools/boostdep + python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY + - name: Configure + run: | + cd ../boost-root + mkdir __build__ && cd __build__ + cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_CRYPT_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_CRYPT_NVRTC_CI_RUN=1 .. + pwd + - name: Build tests + run: | + cd ../boost-root/__build__ + cmake --build . --target tests -j $(nproc) + - name: Run tests + run: | + cd ../boost-root/__build__ + ctest --output-on-failure --no-tests=error diff --git a/doc/crypt/overview.adoc b/doc/crypt/overview.adoc index 1fab53d..4a245b4 100644 --- a/doc/crypt/overview.adoc +++ b/doc/crypt/overview.adoc @@ -36,7 +36,8 @@ as well as emulated PPC64LE and STM32 using QEMU with the following compilers: * GCC 7 and later * Clang 6 and later * Visual Studio 2017 and later -* Intel OneAPI DPC++ +* Intel OneAPI DPC++ 2024.2 and later +* CUDA Toolkit 12.5 and later (Both NVCC and NVRTC) Tested on https://github.com/cppalliance/decimal/actions[Github Actions] and https://drone.cpp.al/cppalliance/decimal[Drone]. Coverage can be found on https://app.codecov.io/gh/cppalliance/decimal[Codecov]. diff --git a/include/boost/crypt/hash/md5.hpp b/include/boost/crypt/hash/md5.hpp index 3458e12..600c008 100644 --- a/include/boost/crypt/hash/md5.hpp +++ b/include/boost/crypt/hash/md5.hpp @@ -18,7 +18,7 @@ #include #include -#ifndef BOOST_CRYPT_BUILD_MODULE +#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA) #include #include #include diff --git a/include/boost/crypt/hash/sha1.hpp b/include/boost/crypt/hash/sha1.hpp index 6b14fa4..9378698 100644 --- a/include/boost/crypt/hash/sha1.hpp +++ b/include/boost/crypt/hash/sha1.hpp @@ -20,7 +20,7 @@ #include #include -#ifndef BOOST_CRYPT_BUILD_MODULE +#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA) #include #include #include diff --git a/include/boost/crypt/utility/array.hpp b/include/boost/crypt/utility/array.hpp index b6a628e..9b6886b 100644 --- a/include/boost/crypt/utility/array.hpp +++ b/include/boost/crypt/utility/array.hpp @@ -10,7 +10,7 @@ #include #include -#ifndef BOOST_CRYPT_BUILD_MODULE +#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA) #include #endif @@ -102,6 +102,7 @@ class array *this = temp; } + #ifndef BOOST_CRYPT_HAS_CUDA constexpr operator std::array() noexcept { std::array new_array{}; @@ -112,6 +113,7 @@ class array return new_array; } + #endif }; template diff --git a/include/boost/crypt/utility/byte.hpp b/include/boost/crypt/utility/byte.hpp index cf39298..a6c9ace 100644 --- a/include/boost/crypt/utility/byte.hpp +++ b/include/boost/crypt/utility/byte.hpp @@ -18,52 +18,52 @@ class byte boost::crypt::uint8_t bits_; public: - constexpr byte() noexcept : bits_ {} {} - explicit constexpr byte(boost::crypt::uint8_t bits) noexcept : bits_ {bits} {} + BOOST_CRYPT_GPU_ENABLED constexpr byte() noexcept : bits_ {} {} + BOOST_CRYPT_GPU_ENABLED explicit constexpr byte(boost::crypt::uint8_t bits) noexcept : bits_ {bits} {} template - constexpr auto to_integer() noexcept + BOOST_CRYPT_GPU_ENABLED constexpr auto to_integer() noexcept BOOST_CRYPT_REQUIRES(boost::crypt::is_integral_v, IntegerType) { return static_cast(bits_); } template - constexpr auto operator<<(IntegerType shift) noexcept + BOOST_CRYPT_GPU_ENABLED constexpr auto operator<<(IntegerType shift) noexcept BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte) { return byte{bits_ << shift}; } template - constexpr auto operator>>(IntegerType shift) noexcept + BOOST_CRYPT_GPU_ENABLED constexpr auto operator>>(IntegerType shift) noexcept BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte) { return byte{bits_ >> shift}; } - constexpr auto operator|(byte rhs) const noexcept -> byte + BOOST_CRYPT_GPU_ENABLED constexpr auto operator|(byte rhs) const noexcept -> byte { return byte{static_cast(bits_ | rhs.bits_)}; } - constexpr auto operator&(byte rhs) const noexcept -> byte + BOOST_CRYPT_GPU_ENABLED constexpr auto operator&(byte rhs) const noexcept -> byte { return byte{static_cast(bits_ & rhs.bits_)}; } - constexpr auto operator^(byte rhs) const noexcept -> byte + BOOST_CRYPT_GPU_ENABLED constexpr auto operator^(byte rhs) const noexcept -> byte { return byte{static_cast(bits_ ^ rhs.bits_)}; } - constexpr auto operator~() const noexcept -> byte + BOOST_CRYPT_GPU_ENABLED constexpr auto operator~() const noexcept -> byte { return byte{static_cast(~bits_)}; } template - constexpr auto operator<<=(IntegerType shift) noexcept + BOOST_CRYPT_GPU_ENABLED constexpr auto operator<<=(IntegerType shift) noexcept BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte&) { bits_ <<= shift; @@ -71,32 +71,32 @@ class byte } template - constexpr auto operator >>=(IntegerType shift) noexcept + BOOST_CRYPT_GPU_ENABLED constexpr auto operator >>=(IntegerType shift) noexcept BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte&) { bits_ >>= shift; return *this; } - constexpr auto operator|(byte rhs) noexcept -> byte& + BOOST_CRYPT_GPU_ENABLED constexpr auto operator|(byte rhs) noexcept -> byte& { bits_ = static_cast(bits_ | rhs.bits_); return *this; } - constexpr auto operator&(byte rhs) noexcept -> byte& + BOOST_CRYPT_GPU_ENABLED constexpr auto operator&(byte rhs) noexcept -> byte& { bits_ = static_cast(bits_ & rhs.bits_); return *this; } - constexpr auto operator^(byte rhs) noexcept -> byte& + BOOST_CRYPT_GPU_ENABLED constexpr auto operator^(byte rhs) noexcept -> byte& { bits_ = static_cast(bits_ ^ rhs.bits_); return *this; } - constexpr auto operator~() noexcept -> byte& + BOOST_CRYPT_GPU_ENABLED constexpr auto operator~() noexcept -> byte& { bits_ = static_cast(~bits_); return *this; diff --git a/include/boost/crypt/utility/config.hpp b/include/boost/crypt/utility/config.hpp index 210d14d..d839fb9 100644 --- a/include/boost/crypt/utility/config.hpp +++ b/include/boost/crypt/utility/config.hpp @@ -53,18 +53,25 @@ // ---- Constexpr arrays ----- // ----- Assertions ----- -#include -#define BOOST_CRYPT_ASSERT(x) assert(x) -#define BOOST_CRYPT_ASSERT_MSG(expr, msg) assert((expr)&&(msg)) +#ifndef BOOST_CRYPT_HAS_CUDA +# include +# define BOOST_CRYPT_ASSERT(x) assert(x) +# define BOOST_CRYPT_ASSERT_MSG(expr, msg) assert((expr)&&(msg)) +#else +# define BOOST_CRYPT_ASSERT(x) +# define BOOST_CRYPT_ASSERT_MSG(expr, msg) +#endif // ----- Assertions ----- // ----- Has something ----- // C++17 -#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) -# if __has_include() -# include -# if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L -# define BOOST_CRYPT_HAS_STRING_VIEW +#ifndef BOOST_CRYPT_HAS_CUDA +# if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +# if __has_include() +# include +# if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L +# define BOOST_CRYPT_HAS_STRING_VIEW +# endif # endif # endif #endif diff --git a/include/boost/crypt/utility/file.hpp b/include/boost/crypt/utility/file.hpp index fdf7f27..d548337 100644 --- a/include/boost/crypt/utility/file.hpp +++ b/include/boost/crypt/utility/file.hpp @@ -6,6 +6,10 @@ #define BOOST_CRYPT_UTILITY_FILE_HPP #include + +// Can't use file streaming on a CUDA device anyway +#ifndef BOOST_CRYPT_HAS_CUDA + #include #ifndef BOOST_CRYPT_BUILD_MODULE @@ -83,4 +87,6 @@ class file_reader } // namespace crypt } // namespace boost +#endif // BOOST_CRYPT_HAS_CUDA + #endif //BOOST_CRYPT_UTILITY_FILE_HPP diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index eb2ed22..c1a8f58 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,6 +16,22 @@ if(HAVE_BOOST_TEST) boost_test_jamfile(FILE nvcc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_LIBRARIES} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} ) + elseif (BOOST_CRYPT_ENABLE_NVRTC) + + message(STATUS "Building boost.crypt with NVRTC") + find_package(CUDA REQUIRED) + set(CUDA_nvrtc_LIBRARY /usr/local/cuda/lib64/libnvrtc.so) + + if (BOOST_CRYPT_NVRTC_CI_RUN) + + boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_CRYPT_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS}) + + else () + + boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} ) + + endif () + else () boost_test_jamfile(FILE Jamfile LINK_LIBRARIES Boost::crypt Boost::core Boost::uuid) diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile new file mode 100644 index 0000000..8952c30 --- /dev/null +++ b/test/nvrtc_jamfile @@ -0,0 +1,13 @@ +# Copyright 2024 Matt Borland +# Distributed under the Boost Software License, Version 1.0. +# https://www.boost.org/LICENSE_1_0.txt + +import testing ; +import ../../config/checks/config : requires ; + +project : requirements + [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] + ; + +run test_md5_nvrtc.cpp ; +run test_sha1_nvrtc.cpp ; diff --git a/test/test_md5_nvrtc.cpp b/test/test_md5_nvrtc.cpp new file mode 100644 index 0000000..59883e4 --- /dev/null +++ b/test/test_md5_nvrtc.cpp @@ -0,0 +1,202 @@ +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +// Must be included first + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include "generate_random_strings.hpp" +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +using digest_type = boost::crypt::array; + +const char* cuda_kernel = R"( + +#include +using digest_type = boost::crypt::array; +extern "C" __global__ +void test_md5_kernel(char** in, digest_type* out, int numElements) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::crypt::md5(in[i]); + } +} + +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_md5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_md5_kernel"); + + #ifdef BOOST_CRYPT_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_md5_kernel"), "Failed to get kernel function"); + + // Allocate memory + int numElements = 50000; + int elementSize = 64; + + char** input_vector1; + cudaMallocManaged(&input_vector1, numElements * sizeof(char*)); + + for (int i = 0; i < numElements; ++i) + { + cudaMallocManaged(&input_vector1[i], elementSize * sizeof(char)); + if (input_vector1[i] == nullptr) + { + throw std::runtime_error("Failed to allocate memory for input_vector1"); + } + boost::crypt::generate_random_string(input_vector1[i], elementSize); + } + + digest_type* output_vector; + cudaMallocManaged(&output_vector, numElements * sizeof(digest_type)); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &input_vector1, &output_vector, &numElements }; + + watch w; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + checkCUDAError(cudaDeviceSynchronize(), "Kernel execution failed"); + + double t = w.elapsed(); + // Verify the result + int fail_counter = 0; + for (int i = 0; i < numElements; ++i) + { + auto res = boost::crypt::md5(input_vector1[i]); + + for (int j = 0; j < res.size(); ++j) + { + if (res[j] != output_vector[i][j]) + { + std::cerr << std::hex << "Result verification failed at element " << i << "!\n" + << "Got: " << static_cast(output_vector[i][j]) << "\n" + << "Expected: " << static_cast(res[j]) << std::endl; + ++fail_counter; + if (fail_counter == 100) + { + break; + } + } + } + } + + if (fail_counter == 100) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + // Cleanup all the memory we allocated + for (int i = 0; i < numElements; ++i) + { + cudaFree(input_vector1[i]); + } + cudaFree(input_vector1); + cudaFree(output_vector); + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sha1_nvrtc.cpp b/test/test_sha1_nvrtc.cpp new file mode 100644 index 0000000..dc4a739 --- /dev/null +++ b/test/test_sha1_nvrtc.cpp @@ -0,0 +1,202 @@ +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +// Must be included first + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include "generate_random_strings.hpp" +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +using digest_type = boost::crypt::array; + +const char* cuda_kernel = R"( + +#include +using digest_type = boost::crypt::array; +extern "C" __global__ +void test_sha1_kernel(char** in, digest_type* out, int numElements) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::crypt::sha1(in[i]); + } +} + +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sha1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_sha1_kernel"); + + #ifdef BOOST_CRYPT_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_sha1_kernel"), "Failed to get kernel function"); + + // Allocate memory + int numElements = 50000; + int elementSize = 64; + + char** input_vector1; + cudaMallocManaged(&input_vector1, numElements * sizeof(char*)); + + for (int i = 0; i < numElements; ++i) + { + cudaMallocManaged(&input_vector1[i], elementSize * sizeof(char)); + if (input_vector1[i] == nullptr) + { + throw std::runtime_error("Failed to allocate memory for input_vector1"); + } + boost::crypt::generate_random_string(input_vector1[i], elementSize); + } + + digest_type* output_vector; + cudaMallocManaged(&output_vector, numElements * sizeof(digest_type)); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &input_vector1, &output_vector, &numElements }; + + watch w; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + checkCUDAError(cudaDeviceSynchronize(), "Kernel execution failed"); + + double t = w.elapsed(); + // Verify the result + int fail_counter = 0; + for (int i = 0; i < numElements; ++i) + { + auto res = boost::crypt::sha1(input_vector1[i]); + + for (int j = 0; j < res.size(); ++j) + { + if (res[j] != output_vector[i][j]) + { + std::cerr << std::hex << "Result verification failed at element " << i << "!\n" + << "Got: " << static_cast(output_vector[i][j]) << "\n" + << "Expected: " << static_cast(res[j]) << std::endl; + ++fail_counter; + if (fail_counter == 100) + { + break; + } + } + } + } + + if (fail_counter == 100) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + // Cleanup all the memory we allocated + for (int i = 0; i < numElements; ++i) + { + cudaFree(input_vector1[i]); + } + cudaFree(input_vector1); + cudaFree(output_vector); + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +}