From 812e8e1a2814f672491cb49c2b3d01ca83d3e2ca Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:09:36 -0400
Subject: [PATCH 01/13] Add SHA1 NVRTC test

---
 test/test_sha1_nvrtc.cpp | 176 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 test/test_sha1_nvrtc.cpp
diff --git a/test/test_sha1_nvrtc.cpp b/test/test_sha1_nvrtc.cpp
new file mode 100644
index 0000000..39f0937
--- /dev/null
+++ b/test/test_sha1_nvrtc.cpp
@@ -0,0 +1,176 @@
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// Must be included first
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+#include <boost/crypt/hash/sha1.hpp>
+#include "generate_random_strings.hpp"
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 20>;
+
+const char* cuda_kernel = R"(
+
+#include <boost/crypt/hash/sha1.hpp>
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 20>;
+extern "C" __global__
+void test_sha1_kernel(char** in, digest_type* out, int numElements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::crypt::sha1(in[i]);
+    }
+}
+
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cauchy_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function");
+
+        // Allocate memory
+        int numElements = 50000;
+        int elementSize = 64;
+
+        char** input_vector1;
+        cudaMallocManaged(&input_vector1, numElements * sizeof(char*));
+
+        for (int i = 0; i < numElements; ++i)
+        {
+            cudaMallocManaged(&input_vector1[i], elementSize * sizeof(char));
+            if (input_vector1[i] == nullptr)
+            {
+                throw std::runtime_error("Failed to allocate memory for input_vector1");
+            }
+            boost::crypt::generate_random_string(input_vector1[i], elementSize);
+        }
+
+        cuda_managed_ptr<digest_type> output_vector(numElements);
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &input_vector1, &output_vector, &numElements };
+
+        watch w;
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        double t = w.elapsed();
+        // Verify the result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = boost::crypt::sha1(input_vector1[i]);
+
+            if (res[0] != output_vector[i][0])
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+
+        // Cleanup all the memory we allocated
+        for (int i = 0; i < numElements; ++i)
+        {
+            cudaFree(input_vector1[i]);
+        }
+        cudaFree(input_vector1);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From a253b34bcf930afb931e67adac20d231a0b1339b Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:09:45 -0400
Subject: [PATCH 02/13] Add NVRTC jamfile

---
 test/nvrtc_jamfile | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 test/nvrtc_jamfile

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
new file mode 100644
index 0000000..230d188
--- /dev/null
+++ b/test/nvrtc_jamfile
@@ -0,0 +1,12 @@
+# Copyright 2024 Matt Borland
+# Distributed under the Boost Software License, Version 1.0.
+# https://www.boost.org/LICENSE_1_0.txt
+
+import testing ;
+import ../../config/checks/config : requires ;
+
+project : requirements
+    [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
+    ;
+
+run test_sha1_nvrtc.cpp ;

From 349d2a33b8c0dc11fc2c4ce36a46600549197dad Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:09:59 -0400
Subject: [PATCH 03/13] Add CMake Path for NVRTC testing

---
 test/CMakeLists.txt | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index eb2ed22..c1a8f58 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -16,6 +16,22 @@ if(HAVE_BOOST_TEST)
 
         boost_test_jamfile(FILE nvcc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_LIBRARIES} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
 
+    elseif (BOOST_CRYPT_ENABLE_NVRTC)
+
+        message(STATUS "Building boost.crypt with NVRTC")
+        find_package(CUDA REQUIRED)
+        set(CUDA_nvrtc_LIBRARY /usr/local/cuda/lib64/libnvrtc.so)
+
+        if (BOOST_CRYPT_NVRTC_CI_RUN)
+
+            boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_CRYPT_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS})
+
+        else ()
+
+            boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
+
+        endif ()
+
     else ()
 
         boost_test_jamfile(FILE Jamfile LINK_LIBRARIES Boost::crypt Boost::core Boost::uuid)

From ddf021f7e1927a7c91949c042b76e4d582dfc0d6 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:15:54 -0400
Subject: [PATCH 04/13] Keep unsupported headers from being pulled into CUDA
 builds

---
 include/boost/crypt/utility/config.hpp | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/include/boost/crypt/utility/config.hpp b/include/boost/crypt/utility/config.hpp
index 210d14d..d839fb9 100644
--- a/include/boost/crypt/utility/config.hpp
+++ b/include/boost/crypt/utility/config.hpp
@@ -53,18 +53,25 @@
 // ---- Constexpr arrays -----
 
 // ----- Assertions -----
-#include <cassert>
-#define BOOST_CRYPT_ASSERT(x) assert(x)
-#define BOOST_CRYPT_ASSERT_MSG(expr, msg) assert((expr)&&(msg))
+#ifndef BOOST_CRYPT_HAS_CUDA
+#  include <cassert>
+#  define BOOST_CRYPT_ASSERT(x) assert(x)
+#  define BOOST_CRYPT_ASSERT_MSG(expr, msg) assert((expr)&&(msg))
+#else
+#  define BOOST_CRYPT_ASSERT(x)
+#  define BOOST_CRYPT_ASSERT_MSG(expr, msg)
+#endif
 // ----- Assertions -----
 
 // ----- Has something -----
 // C++17
-#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
-#  if __has_include(<string_view>)
-#    include <string_view>
-#    if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L
-#      define BOOST_CRYPT_HAS_STRING_VIEW
+#ifndef BOOST_CRYPT_HAS_CUDA
+#  if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#    if __has_include(<string_view>)
+#      include <string_view>
+#      if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L
+#        define BOOST_CRYPT_HAS_STRING_VIEW
+#      endif
 #    endif
 #  endif
 #endif

From f54c600b0a73ff6ffa07219c6cf3ed6be09485eb Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:17:45 -0400
Subject: [PATCH 05/13] Add GPU_ENABLED annotations

---
 include/boost/crypt/utility/byte.hpp | 30 ++++++++++++++--------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/boost/crypt/utility/byte.hpp b/include/boost/crypt/utility/byte.hpp
index cf39298..a6c9ace 100644
--- a/include/boost/crypt/utility/byte.hpp
+++ b/include/boost/crypt/utility/byte.hpp
@@ -18,52 +18,52 @@ class byte
     boost::crypt::uint8_t bits_;
 
 public:
-    constexpr byte() noexcept : bits_ {} {}
-    explicit constexpr byte(boost::crypt::uint8_t bits) noexcept : bits_ {bits} {}
+    BOOST_CRYPT_GPU_ENABLED constexpr byte() noexcept : bits_ {} {}
+    BOOST_CRYPT_GPU_ENABLED explicit constexpr byte(boost::crypt::uint8_t bits) noexcept : bits_ {bits} {}
 
     template <typename IntegerType>
-    constexpr auto to_integer() noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto to_integer() noexcept
         BOOST_CRYPT_REQUIRES(boost::crypt::is_integral_v, IntegerType)
     {
         return static_cast<IntegerType>(bits_);
     }
 
     template <typename IntegerType>
-    constexpr auto operator<<(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator<<(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte)
     {
         return byte{bits_ << shift};
     }
 
     template <typename IntegerType>
-    constexpr auto operator>>(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator>>(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte)
     {
         return byte{bits_ >> shift};
     }
 
-    constexpr auto operator|(byte rhs) const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator|(byte rhs) const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(bits_ | rhs.bits_)};
     }
 
-    constexpr auto operator&(byte rhs) const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator&(byte rhs) const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(bits_ & rhs.bits_)};
     }
 
-    constexpr auto operator^(byte rhs) const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator^(byte rhs) const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(bits_ ^ rhs.bits_)};
     }
 
-    constexpr auto operator~() const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator~() const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(~bits_)};
     }
 
     template <typename IntegerType>
-    constexpr auto operator<<=(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator<<=(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte&)
     {
         bits_ <<= shift;
@@ -71,32 +71,32 @@ class byte
     }
 
     template <typename IntegerType>
-    constexpr auto operator >>=(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator >>=(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte&)
     {
         bits_ >>= shift;
         return *this;
     }
 
-    constexpr auto operator|(byte rhs) noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator|(byte rhs) noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(bits_ | rhs.bits_);
         return *this;
     }
 
-    constexpr auto operator&(byte rhs) noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator&(byte rhs) noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(bits_ & rhs.bits_);
         return *this;
     }
 
-    constexpr auto operator^(byte rhs) noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator^(byte rhs) noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(bits_ ^ rhs.bits_);
         return *this;
     }
 
-    constexpr auto operator~() noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator~() noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(~bits_);
         return *this;

From 6bcbb6957fb81acf434ef25b33ecd67e40987fcf Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:26:20 -0400
Subject: [PATCH 06/13] Disable support on CUDA devices

---
 include/boost/crypt/hash/sha1.hpp     | 2 +-
 include/boost/crypt/utility/array.hpp | 4 +++-
 include/boost/crypt/utility/file.hpp  | 6 ++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/boost/crypt/hash/sha1.hpp b/include/boost/crypt/hash/sha1.hpp
index 6b14fa4..9378698 100644
--- a/include/boost/crypt/hash/sha1.hpp
+++ b/include/boost/crypt/hash/sha1.hpp
@@ -20,7 +20,7 @@
 #include <boost/crypt/utility/file.hpp>
 #include <boost/crypt/utility/null.hpp>
 
-#ifndef BOOST_CRYPT_BUILD_MODULE
+#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA)
 #include <memory>
 #include <string>
 #include <cstdint>
diff --git a/include/boost/crypt/utility/array.hpp b/include/boost/crypt/utility/array.hpp
index b6a628e..9b6886b 100644
--- a/include/boost/crypt/utility/array.hpp
+++ b/include/boost/crypt/utility/array.hpp
@@ -10,7 +10,7 @@
 #include <boost/crypt/utility/cstdint.hpp>
 #include <boost/crypt/utility/cstddef.hpp>
 
-#ifndef BOOST_CRYPT_BUILD_MODULE
+#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA)
 #include <array>
 #endif
 
@@ -102,6 +102,7 @@ class array
         *this = temp;
     }
 
+    #ifndef BOOST_CRYPT_HAS_CUDA
     constexpr operator std::array<T, N>() noexcept
     {
         std::array<T, N> new_array{};
@@ -112,6 +113,7 @@ class array
 
         return new_array;
     }
+    #endif
 };
 
 template <typename ForwardIter, typename T>
diff --git a/include/boost/crypt/utility/file.hpp b/include/boost/crypt/utility/file.hpp
index fdf7f27..d548337 100644
--- a/include/boost/crypt/utility/file.hpp
+++ b/include/boost/crypt/utility/file.hpp
@@ -6,6 +6,10 @@
 #define BOOST_CRYPT_UTILITY_FILE_HPP
 
 #include <boost/crypt/utility/config.hpp>
+
+// Can't use file streaming on a CUDA device anyway
+#ifndef BOOST_CRYPT_HAS_CUDA
+
 #include <boost/crypt/utility/cstdint.hpp>
 
 #ifndef BOOST_CRYPT_BUILD_MODULE
@@ -83,4 +87,6 @@ class file_reader
 } // namespace crypt
 } // namespace boost
 
+#endif // BOOST_CRYPT_HAS_CUDA
+
 #endif //BOOST_CRYPT_UTILITY_FILE_HPP

From ae46b12bb3d0ce757882254d8ea4d873b1f79924 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:44:39 -0400
Subject: [PATCH 07/13] Fix kernel synchronization issue

---
 test/test_sha1_nvrtc.cpp | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/test/test_sha1_nvrtc.cpp b/test/test_sha1_nvrtc.cpp
index 39f0937..9e0755b 100644
--- a/test/test_sha1_nvrtc.cpp
+++ b/test/test_sha1_nvrtc.cpp
@@ -83,10 +83,10 @@ int main()
         nvrtcProgram prog;
         nvrtcResult res;
 
-        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr);
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sha1_kernel.cu", 0, nullptr, nullptr);
         checkNVRTCError(res, "Failed to create NVRTC program");
 
-        nvrtcAddNameExpression(prog, "test_cauchy_kernel");
+        nvrtcAddNameExpression(prog, "test_sha1_kernel");
 
         #ifdef BOOST_MATH_NVRTC_CI_RUN
         const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"};
@@ -117,7 +117,7 @@ int main()
         CUmodule module;
         CUfunction kernel;
         checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
-        checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sha1_kernel"), "Failed to get kernel function");
 
         // Allocate memory
         int numElements = 50000;
@@ -136,7 +136,8 @@ int main()
             boost::crypt::generate_random_string(input_vector1[i], elementSize);
         }
 
-        cuda_managed_ptr<digest_type> output_vector(numElements);
+        digest_type* output_vector;
+        cudaMallocManaged(&output_vector, numElements * sizeof(digest_type));
 
         int blockSize = 256;
         int numBlocks = (numElements + blockSize - 1) / blockSize;
@@ -144,20 +145,36 @@ int main()
 
         watch w;
         checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+        checkCUDAError(cudaDeviceSynchronize(), "Kernel execution failed");
 
         double t = w.elapsed();
         // Verify the result
+        int fail_counter = 0;
         for (int i = 0; i < numElements; ++i)
         {
             auto res = boost::crypt::sha1(input_vector1[i]);
 
-            if (res[0] != output_vector[i][0])
+            for (int j = 0; j < res.size(); ++j)
             {
-                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
-                return EXIT_FAILURE;
+                if (res[j] != output_vector[i][j])
+                {
+                    std::cerr << std::hex << "Result verification failed at element " << i << "!\n"
+                              << "Got: " << static_cast<std::uint32_t>(output_vector[i][j]) << "\n"
+                              << "Expected: " << static_cast<std::uint32_t>(res[j]) << std::endl;
+                    ++fail_counter;
+                    if (fail_counter == 100)
+                    {
+                        break;
+                    }
+                }
             }
         }
 
+        if (fail_counter == 100)
+        {
+            return EXIT_FAILURE;
+        }
+
         std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
         std::cout << "Done\n";
 

From f388754e8ea43a1ef2642b7574d0436f815197db Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:44:59 -0400
Subject: [PATCH 08/13] Add proper destruction of all allocated memory

---
 test/test_sha1_nvrtc.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test_sha1_nvrtc.cpp b/test/test_sha1_nvrtc.cpp
index 9e0755b..9fc84b0 100644
--- a/test/test_sha1_nvrtc.cpp
+++ b/test/test_sha1_nvrtc.cpp
@@ -184,6 +184,15 @@ int main()
             cudaFree(input_vector1[i]);
         }
         cudaFree(input_vector1);
+        cudaFree(output_vector);
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
     }
     catch(const std::exception& e)
     {

From 79279d1fd1d38887050b3d7aa192189913d317c8 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:47:39 -0400
Subject: [PATCH 09/13] Add NVRTC CI runner

---
 .github/workflows/cuda.yml | 58 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 49c8907..5c64bff 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -75,3 +75,61 @@ jobs:
         run: |
           cd ../boost-root/__build__
           ctest --output-on-failure --no-tests=error
+
+  nvrtc-cmake-test:
+    strategy:
+      fail-fast: false
+
+    runs-on: gpu-runner-1
+
+    steps:
+      - uses: Jimver/cuda-toolkit@v0.2.16
+        id: cuda-toolkit
+        with:
+          cuda: '12.5.0'
+          method: 'network'
+
+      - name: Output CUDA information
+        run: |
+          echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"+
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc -V
+      - uses: actions/checkout@v4
+
+      - name: Install Packages
+        run: |
+          sudo apt-get install -y cmake make
+      - name: Setup Boost
+        run: |
+          echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
+          LIBRARY=${GITHUB_REPOSITORY#*/}
+          echo LIBRARY: $LIBRARY
+          echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
+          echo GITHUB_BASE_REF: $GITHUB_BASE_REF
+          echo GITHUB_REF: $GITHUB_REF
+          REF=${GITHUB_BASE_REF:-$GITHUB_REF}
+          REF=${REF#refs/heads/}
+          echo REF: $REF
+          BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true
+          echo BOOST_BRANCH: $BOOST_BRANCH
+          cd ..
+          git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
+          cd boost-root
+          mkdir -p libs/$LIBRARY
+          cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY
+          git submodule update --init tools/boostdep
+          python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
+      - name: Configure
+        run: |
+          cd ../boost-root
+          mkdir __build__ && cd __build__
+          cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_CRYPT_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_CRYPT_NVRTC_CI_RUN=1 ..
+          pwd
+      - name: Build tests
+        run: |
+          cd ../boost-root/__build__
+          cmake --build . --target tests -j $(nproc)
+      - name: Run tests
+        run: |
+          cd ../boost-root/__build__
+          ctest --output-on-failure --no-tests=error

From 8867255223c413986bf24f96d0d62e64a0458f5c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 10:59:21 -0400
Subject: [PATCH 10/13] Fix macro for CI path

---
 test/test_sha1_nvrtc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_sha1_nvrtc.cpp b/test/test_sha1_nvrtc.cpp
index 9fc84b0..dc4a739 100644
--- a/test/test_sha1_nvrtc.cpp
+++ b/test/test_sha1_nvrtc.cpp
@@ -88,7 +88,7 @@ int main()
 
         nvrtcAddNameExpression(prog, "test_sha1_kernel");
 
-        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        #ifdef BOOST_CRYPT_NVRTC_CI_RUN
         const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"};
         #else
         const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"};

From ade7a69bee8666c39a1f09f6356705f035ff05bd Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 11:01:57 -0400
Subject: [PATCH 11/13] Disable STL headers in CUDA environment

---
 include/boost/crypt/hash/md5.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/crypt/hash/md5.hpp b/include/boost/crypt/hash/md5.hpp
index 3458e12..600c008 100644
--- a/include/boost/crypt/hash/md5.hpp
+++ b/include/boost/crypt/hash/md5.hpp
@@ -18,7 +18,7 @@
 #include <boost/crypt/utility/iterator.hpp>
 #include <boost/crypt/utility/file.hpp>
 
-#ifndef BOOST_CRYPT_BUILD_MODULE
+#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA)
 #include <memory>
 #include <string>
 #include <cstdint>

From 35aada09503e2f008adf7afcb5001ad7cb2f5cfb Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 11:02:05 -0400
Subject: [PATCH 12/13] Add NVRTC testing of MD5

---
 test/nvrtc_jamfile      |   1 +
 test/test_md5_nvrtc.cpp | 202 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 203 insertions(+)
 create mode 100644 test/test_md5_nvrtc.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 230d188..8952c30 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -9,4 +9,5 @@ project : requirements
     [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
     ;
 
+run test_md5_nvrtc.cpp ;
 run test_sha1_nvrtc.cpp ;
diff --git a/test/test_md5_nvrtc.cpp b/test/test_md5_nvrtc.cpp
new file mode 100644
index 0000000..59883e4
--- /dev/null
+++ b/test/test_md5_nvrtc.cpp
@@ -0,0 +1,202 @@
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// Must be included first
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+#include <boost/crypt/hash/md5.hpp>
+#include "generate_random_strings.hpp"
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 16>;
+
+const char* cuda_kernel = R"(
+
+#include <boost/crypt/hash/md5.hpp>
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 16>;
+extern "C" __global__
+void test_md5_kernel(char** in, digest_type* out, int numElements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::crypt::md5(in[i]);
+    }
+}
+
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_md5_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_md5_kernel");
+
+        #ifdef BOOST_CRYPT_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_md5_kernel"), "Failed to get kernel function");
+
+        // Allocate memory
+        int numElements = 50000;
+        int elementSize = 64;
+
+        char** input_vector1;
+        cudaMallocManaged(&input_vector1, numElements * sizeof(char*));
+
+        for (int i = 0; i < numElements; ++i)
+        {
+            cudaMallocManaged(&input_vector1[i], elementSize * sizeof(char));
+            if (input_vector1[i] == nullptr)
+            {
+                throw std::runtime_error("Failed to allocate memory for input_vector1");
+            }
+            boost::crypt::generate_random_string(input_vector1[i], elementSize);
+        }
+
+        digest_type* output_vector;
+        cudaMallocManaged(&output_vector, numElements * sizeof(digest_type));
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &input_vector1, &output_vector, &numElements };
+
+        watch w;
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+        checkCUDAError(cudaDeviceSynchronize(), "Kernel execution failed");
+
+        double t = w.elapsed();
+        // Verify the result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = boost::crypt::md5(input_vector1[i]);
+
+            for (int j = 0; j < res.size(); ++j)
+            {
+                if (res[j] != output_vector[i][j])
+                {
+                    std::cerr << std::hex << "Result verification failed at element " << i << "!\n"
+                              << "Got: " << static_cast<std::uint32_t>(output_vector[i][j]) << "\n"
+                              << "Expected: " << static_cast<std::uint32_t>(res[j]) << std::endl;
+                    ++fail_counter;
+                    if (fail_counter == 100)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (fail_counter == 100)
+        {
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+
+        // Cleanup all the memory we allocated
+        for (int i = 0; i < numElements; ++i)
+        {
+            cudaFree(input_vector1[i]);
+        }
+        cudaFree(input_vector1);
+        cudaFree(output_vector);
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From f8504663793a36de73cc1d3dee5fecc350e0c340 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 21 Oct 2024 11:11:37 -0400
Subject: [PATCH 13/13] Update compiler versions in the docs

---
 doc/crypt/overview.adoc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/crypt/overview.adoc b/doc/crypt/overview.adoc
index 1fab53d..4a245b4 100644
--- a/doc/crypt/overview.adoc
+++ b/doc/crypt/overview.adoc
@@ -36,7 +36,8 @@ as well as emulated PPC64LE and STM32 using QEMU with the following compilers:
 * GCC 7 and later
 * Clang 6 and later
 * Visual Studio 2017 and later
-* Intel OneAPI DPC++
+* Intel OneAPI DPC++ 2024.2 and later
+* CUDA Toolkit 12.5 and later (Both NVCC and NVRTC)
 
 Tested on https://github.com/cppalliance/decimal/actions[Github Actions] and https://drone.cpp.al/cppalliance/decimal[Drone].
 Coverage can be found on https://app.codecov.io/gh/cppalliance/decimal[Codecov].