diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 49c89076..5c64bff8 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -75,3 +75,61 @@ jobs:
         run: |
           cd ../boost-root/__build__
           ctest --output-on-failure --no-tests=error
+
+  nvrtc-cmake-test:  # Builds and runs the NVRTC tests through CMake on the GPU runner
+    strategy:
+      fail-fast: false
+
+    runs-on: gpu-runner-1
+
+    steps:
+      - uses: Jimver/cuda-toolkit@v0.2.16
+        id: cuda-toolkit
+        with:
+          cuda: '12.5.0'
+          method: 'network'
+
+      - name: Output CUDA information  # sanity-check the toolkit install before building
+        run: |
+          echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc -V
+      - uses: actions/checkout@v4
+
+      - name: Install Packages
+        run: |
+          sudo apt-get install -y cmake make
+      - name: Setup Boost  # clone the Boost superproject and copy this library into libs/
+        run: |
+          echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
+          LIBRARY=${GITHUB_REPOSITORY#*/}
+          echo LIBRARY: $LIBRARY
+          echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
+          echo GITHUB_BASE_REF: $GITHUB_BASE_REF
+          echo GITHUB_REF: $GITHUB_REF
+          REF=${GITHUB_BASE_REF:-$GITHUB_REF}
+          REF=${REF#refs/heads/}
+          echo REF: $REF
+          BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true
+          echo BOOST_BRANCH: $BOOST_BRANCH
+          cd ..
+          git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
+          cd boost-root
+          mkdir -p libs/$LIBRARY
+          cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY
+          git submodule update --init tools/boostdep
+          python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
+      - name: Configure  # BOOST_CRYPT_ENABLE_NVRTC selects the nvrtc_jamfile test set
+        run: |
+          cd ../boost-root
+          mkdir __build__ && cd __build__
+          cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_CRYPT_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_CRYPT_NVRTC_CI_RUN=1 ..
+          pwd
+      - name: Build tests
+        run: |
+          cd ../boost-root/__build__
+          cmake --build . --target tests -j $(nproc)
+      - name: Run tests
+        run: |
+          cd ../boost-root/__build__
+          ctest --output-on-failure --no-tests=error
diff --git a/doc/crypt/overview.adoc b/doc/crypt/overview.adoc
index 1fab53d8..4a245b4b 100644
--- a/doc/crypt/overview.adoc
+++ b/doc/crypt/overview.adoc
@@ -36,7 +36,8 @@ as well as emulated PPC64LE and STM32 using QEMU with the following compilers:
 * GCC 7 and later
 * Clang 6 and later
 * Visual Studio 2017 and later
-* Intel OneAPI DPC++
+* Intel OneAPI DPC++ 2024.2 and later
+* CUDA Toolkit 12.5 and later (both NVCC and NVRTC)
 
 Tested on https://github.com/cppalliance/decimal/actions[Github Actions] and https://drone.cpp.al/cppalliance/decimal[Drone].
 Coverage can be found on https://app.codecov.io/gh/cppalliance/decimal[Codecov].
diff --git a/include/boost/crypt/hash/md5.hpp b/include/boost/crypt/hash/md5.hpp
index 3458e127..600c008c 100644
--- a/include/boost/crypt/hash/md5.hpp
+++ b/include/boost/crypt/hash/md5.hpp
@@ -18,7 +18,7 @@
 #include <boost/crypt/utility/iterator.hpp>
 #include <boost/crypt/utility/file.hpp>
 
-#ifndef BOOST_CRYPT_BUILD_MODULE
+#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA)
 #include <memory>
 #include <string>
 #include <cstdint>
diff --git a/include/boost/crypt/hash/sha1.hpp b/include/boost/crypt/hash/sha1.hpp
index 6b14fa4a..93786986 100644
--- a/include/boost/crypt/hash/sha1.hpp
+++ b/include/boost/crypt/hash/sha1.hpp
@@ -20,7 +20,7 @@
 #include <boost/crypt/utility/file.hpp>
 #include <boost/crypt/utility/null.hpp>
 
-#ifndef BOOST_CRYPT_BUILD_MODULE
+#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA)
 #include <memory>
 #include <string>
 #include <cstdint>
diff --git a/include/boost/crypt/utility/array.hpp b/include/boost/crypt/utility/array.hpp
index b6a628e8..9b6886b9 100644
--- a/include/boost/crypt/utility/array.hpp
+++ b/include/boost/crypt/utility/array.hpp
@@ -10,7 +10,7 @@
 #include <boost/crypt/utility/cstdint.hpp>
 #include <boost/crypt/utility/cstddef.hpp>
 
-#ifndef BOOST_CRYPT_BUILD_MODULE
+#if !defined(BOOST_CRYPT_BUILD_MODULE) && !defined(BOOST_CRYPT_HAS_CUDA)
 #include <array>
 #endif
 
@@ -102,6 +102,7 @@ class array
         *this = temp;
     }
 
+    #ifndef BOOST_CRYPT_HAS_CUDA
     constexpr operator std::array<T, N>() noexcept
     {
         std::array<T, N> new_array{};
@@ -112,6 +113,7 @@ class array
 
         return new_array;
     }
+    #endif
 };
 
 template <typename ForwardIter, typename T>
diff --git a/include/boost/crypt/utility/byte.hpp b/include/boost/crypt/utility/byte.hpp
index cf392988..a6c9acea 100644
--- a/include/boost/crypt/utility/byte.hpp
+++ b/include/boost/crypt/utility/byte.hpp
@@ -18,52 +18,52 @@ class byte
     boost::crypt::uint8_t bits_;
 
 public:
-    constexpr byte() noexcept : bits_ {} {}
-    explicit constexpr byte(boost::crypt::uint8_t bits) noexcept : bits_ {bits} {}
+    BOOST_CRYPT_GPU_ENABLED constexpr byte() noexcept : bits_ {} {}
+    BOOST_CRYPT_GPU_ENABLED explicit constexpr byte(boost::crypt::uint8_t bits) noexcept : bits_ {bits} {}
 
     template <typename IntegerType>
-    constexpr auto to_integer() noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto to_integer() noexcept
         BOOST_CRYPT_REQUIRES(boost::crypt::is_integral_v, IntegerType)
     {
         return static_cast<IntegerType>(bits_);
     }
 
     template <typename IntegerType>
-    constexpr auto operator<<(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator<<(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte)
     {
         return byte{bits_ << shift};
     }
 
     template <typename IntegerType>
-    constexpr auto operator>>(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator>>(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte)
     {
         return byte{bits_ >> shift};
     }
 
-    constexpr auto operator|(byte rhs) const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator|(byte rhs) const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(bits_ | rhs.bits_)};
     }
 
-    constexpr auto operator&(byte rhs) const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator&(byte rhs) const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(bits_ & rhs.bits_)};
     }
 
-    constexpr auto operator^(byte rhs) const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator^(byte rhs) const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(bits_ ^ rhs.bits_)};
     }
 
-    constexpr auto operator~() const noexcept -> byte
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator~() const noexcept -> byte
     {
         return byte{static_cast<boost::crypt::uint8_t>(~bits_)};
     }
 
     template <typename IntegerType>
-    constexpr auto operator<<=(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator<<=(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte&)
     {
         bits_ <<= shift;
@@ -71,32 +71,32 @@ class byte
     }
 
     template <typename IntegerType>
-    constexpr auto operator >>=(IntegerType shift) noexcept
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator >>=(IntegerType shift) noexcept
         BOOST_CRYPT_REQUIRES_RETURN(boost::crypt::is_integral_v, IntegerType, byte&)
     {
         bits_ >>= shift;
         return *this;
     }
 
-    constexpr auto operator|(byte rhs) noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator|(byte rhs) noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(bits_ | rhs.bits_);
         return *this;
     }
 
-    constexpr auto operator&(byte rhs) noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator&(byte rhs) noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(bits_ & rhs.bits_);
         return *this;
     }
 
-    constexpr auto operator^(byte rhs) noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator^(byte rhs) noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(bits_ ^ rhs.bits_);
         return *this;
     }
 
-    constexpr auto operator~() noexcept -> byte&
+    BOOST_CRYPT_GPU_ENABLED constexpr auto operator~() noexcept -> byte&
     {
         bits_ = static_cast<boost::crypt::uint8_t>(~bits_);
         return *this;
diff --git a/include/boost/crypt/utility/config.hpp b/include/boost/crypt/utility/config.hpp
index 210d14da..d839fb99 100644
--- a/include/boost/crypt/utility/config.hpp
+++ b/include/boost/crypt/utility/config.hpp
@@ -53,18 +53,25 @@
 // ---- Constexpr arrays -----
 
 // ----- Assertions -----
-#include <cassert>
-#define BOOST_CRYPT_ASSERT(x) assert(x)
-#define BOOST_CRYPT_ASSERT_MSG(expr, msg) assert((expr)&&(msg))
+#ifndef BOOST_CRYPT_HAS_CUDA
+#  include <cassert>
+#  define BOOST_CRYPT_ASSERT(x) assert(x)
+#  define BOOST_CRYPT_ASSERT_MSG(expr, msg) assert((expr)&&(msg))
+#else
+#  define BOOST_CRYPT_ASSERT(x)
+#  define BOOST_CRYPT_ASSERT_MSG(expr, msg)
+#endif
 // ----- Assertions -----
 
 // ----- Has something -----
 // C++17
-#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
-#  if __has_include(<string_view>)
-#    include <string_view>
-#    if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L
-#      define BOOST_CRYPT_HAS_STRING_VIEW
+#ifndef BOOST_CRYPT_HAS_CUDA
+#  if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#    if __has_include(<string_view>)
+#      include <string_view>
+#      if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L
+#        define BOOST_CRYPT_HAS_STRING_VIEW
+#      endif
 #    endif
 #  endif
 #endif
diff --git a/include/boost/crypt/utility/file.hpp b/include/boost/crypt/utility/file.hpp
index fdf7f27a..d548337f 100644
--- a/include/boost/crypt/utility/file.hpp
+++ b/include/boost/crypt/utility/file.hpp
@@ -6,6 +6,10 @@
 #define BOOST_CRYPT_UTILITY_FILE_HPP
 
 #include <boost/crypt/utility/config.hpp>
+
+// Can't use file streaming on a CUDA device anyway
+#ifndef BOOST_CRYPT_HAS_CUDA
+
 #include <boost/crypt/utility/cstdint.hpp>
 
 #ifndef BOOST_CRYPT_BUILD_MODULE
@@ -83,4 +87,6 @@ class file_reader
 } // namespace crypt
 } // namespace boost
 
+#endif // BOOST_CRYPT_HAS_CUDA
+
 #endif //BOOST_CRYPT_UTILITY_FILE_HPP
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index eb2ed22a..c1a8f58a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -16,6 +16,22 @@ if(HAVE_BOOST_TEST)
 
         boost_test_jamfile(FILE nvcc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_LIBRARIES} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
 
+    elseif (BOOST_CRYPT_ENABLE_NVRTC)
+        # NVRTC test configuration: link the tests against the NVRTC and CUDA libraries located below
+        message(STATUS "Building boost.crypt with NVRTC")
+        find_package(CUDA REQUIRED)
+        find_library(CUDA_nvrtc_LIBRARY NAMES nvrtc HINTS ${CUDA_TOOLKIT_ROOT_DIR} /usr/local/cuda PATH_SUFFIXES lib64 lib/x64 lib)
+
+        if (BOOST_CRYPT_NVRTC_CI_RUN)
+
+            boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_CRYPT_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS})
+
+        else ()
+
+            boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::crypt ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
+
+        endif ()
+
     else ()
 
         boost_test_jamfile(FILE Jamfile LINK_LIBRARIES Boost::crypt Boost::core Boost::uuid)
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
new file mode 100644
index 00000000..8952c30f
--- /dev/null
+++ b/test/nvrtc_jamfile
@@ -0,0 +1,13 @@
+# Copyright 2024 Matt Borland
+# Distributed under the Boost Software License, Version 1.0.
+# https://www.boost.org/LICENSE_1_0.txt
+
+import testing ;
+import ../../config/checks/config : requires ;
+
+project : requirements
+    [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
+    ;
+
+run test_md5_nvrtc.cpp ;
+run test_sha1_nvrtc.cpp ;
diff --git a/test/test_md5_nvrtc.cpp b/test/test_md5_nvrtc.cpp
new file mode 100644
index 00000000..59883e43
--- /dev/null
+++ b/test/test_md5_nvrtc.cpp
@@ -0,0 +1,202 @@
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// Must be included first
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+#include <boost/crypt/hash/md5.hpp>
+#include "generate_random_strings.hpp"
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 16>;
+
+const char* cuda_kernel = R"(
+
+#include <boost/crypt/hash/md5.hpp>
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 16>;
+extern "C" __global__
+void test_md5_kernel(char** in, digest_type* out, int numElements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::crypt::md5(in[i]);
+    }
+}
+
+)";
+
+// Runtime-API guard: prints "<msg>: <CUDA error text>" to stderr and exits unless result is cudaSuccess.
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result == cudaSuccess) { return; }
+
+    std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+// Driver-API guard: prints "<msg>: <error text>" to stderr and exits unless result is CUDA_SUCCESS.
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result == CUDA_SUCCESS) { return; }
+    // cuGetErrorString stores NULL for unrecognized codes; never stream a null char*.
+    const char* errorStr = nullptr;
+    cuGetErrorString(result, &errorStr);
+    std::cerr << msg << ": " << (errorStr != nullptr ? errorStr : "unknown CUDA driver error") << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+// NVRTC guard: prints "<msg>: <NVRTC error text>" to stderr and exits unless result is NVRTC_SUCCESS.
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result == NVRTC_SUCCESS) { return; }
+
+    std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+int main()
+{
+    try
+    {
+        // Test flow: NVRTC-compile the kernel, launch it, then compare every GPU digest with the host digest
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_md5_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_md5_kernel");
+
+        #ifdef BOOST_CRYPT_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #else // NOTE(review): developer-local include path below; adjust for other non-CI machines
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size = 0;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size + 1](); // zero-filled, so the text is always NUL-terminated
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size = 0;
+        checkNVRTCError(nvrtcGetPTXSize(prog, &ptx_size), "Failed to get PTX size");
+        char* ptx = new char[ptx_size];
+        checkNVRTCError(nvrtcGetPTX(prog, ptx), "Failed to get PTX");
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_md5_kernel"), "Failed to get kernel function");
+
+        // Allocate memory
+        int numElements = 50000;
+        int elementSize = 64;
+
+        char** input_vector1 = nullptr;
+        checkCUDAError(cudaMallocManaged(&input_vector1, numElements * sizeof(char*)), "Failed to allocate input_vector1");
+
+        for (int i = 0; i < numElements; ++i)
+        {
+            // Check the returned status as well; the pointer value alone is not a reliable failure signal
+            if (cudaMallocManaged(&input_vector1[i], elementSize * sizeof(char)) != cudaSuccess || input_vector1[i] == nullptr)
+            {
+                throw std::runtime_error("Failed to allocate memory for input_vector1");
+            }
+            boost::crypt::generate_random_string(input_vector1[i], elementSize);
+        }
+
+        digest_type* output_vector = nullptr;
+        checkCUDAError(cudaMallocManaged(&output_vector, numElements * sizeof(digest_type)), "Failed to allocate output_vector");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &input_vector1, &output_vector, &numElements };
+
+        watch w;
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+        checkCUDAError(cudaDeviceSynchronize(), "Kernel execution failed");
+
+        double t = w.elapsed();
+        // Verify the result; stop comparing once 100 mismatching bytes have been reported
+        int fail_counter = 0;
+        for (int i = 0; i < numElements && fail_counter < 100; ++i)
+        {
+            auto expected = boost::crypt::md5(input_vector1[i]);
+
+            for (int j = 0; j < expected.size(); ++j)
+            {
+                if (expected[j] != output_vector[i][j])
+                {
+                    std::cerr << std::hex << "Result verification failed at element " << i << "!\n"
+                              << "Got: " << static_cast<std::uint32_t>(output_vector[i][j]) << "\n"
+                              << "Expected: " << static_cast<std::uint32_t>(expected[j]) << std::endl;
+                    ++fail_counter;
+                    if (fail_counter == 100)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+        // Any mismatch is a failure, not only exactly 100 of them
+        if (fail_counter != 0)
+        {
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+
+        // Cleanup all the memory we allocated
+        for (int i = 0; i < numElements; ++i)
+        {
+            cudaFree(input_vector1[i]);
+        }
+        cudaFree(input_vector1);
+        cudaFree(output_vector);
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sha1_nvrtc.cpp b/test/test_sha1_nvrtc.cpp
new file mode 100644
index 00000000..dc4a7391
--- /dev/null
+++ b/test/test_sha1_nvrtc.cpp
@@ -0,0 +1,202 @@
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// Must be included first
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+#include <boost/crypt/hash/sha1.hpp>
+#include "generate_random_strings.hpp"
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 20>;
+
+const char* cuda_kernel = R"(
+
+#include <boost/crypt/hash/sha1.hpp>
+using digest_type = boost::crypt::array<boost::crypt::uint8_t, 20>;
+extern "C" __global__
+void test_sha1_kernel(char** in, digest_type* out, int numElements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::crypt::sha1(in[i]);
+    }
+}
+
+)";
+
+// Runtime-API guard: prints "<msg>: <CUDA error text>" to stderr and exits unless result is cudaSuccess.
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result == cudaSuccess) { return; }
+
+    std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+// Driver-API guard: prints "<msg>: <error text>" to stderr and exits unless result is CUDA_SUCCESS.
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result == CUDA_SUCCESS) { return; }
+    // cuGetErrorString stores NULL for unrecognized codes; never stream a null char*.
+    const char* errorStr = nullptr;
+    cuGetErrorString(result, &errorStr);
+    std::cerr << msg << ": " << (errorStr != nullptr ? errorStr : "unknown CUDA driver error") << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+// NVRTC guard: prints "<msg>: <NVRTC error text>" to stderr and exits unless result is NVRTC_SUCCESS.
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result == NVRTC_SUCCESS) { return; }
+
+    std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+int main()
+{
+    try
+    {
+        // Test flow: NVRTC-compile the kernel, launch it, then compare every GPU digest with the host digest
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sha1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sha1_kernel");
+
+        #ifdef BOOST_CRYPT_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/crypt/boost-root/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #else // NOTE(review): developer-local include path below; adjust for other non-CI machines
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/crypt/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size = 0;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size + 1](); // zero-filled, so the text is always NUL-terminated
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size = 0;
+        checkNVRTCError(nvrtcGetPTXSize(prog, &ptx_size), "Failed to get PTX size");
+        char* ptx = new char[ptx_size];
+        checkNVRTCError(nvrtcGetPTX(prog, ptx), "Failed to get PTX");
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sha1_kernel"), "Failed to get kernel function");
+
+        // Allocate memory
+        int numElements = 50000;
+        int elementSize = 64;
+
+        char** input_vector1 = nullptr;
+        checkCUDAError(cudaMallocManaged(&input_vector1, numElements * sizeof(char*)), "Failed to allocate input_vector1");
+
+        for (int i = 0; i < numElements; ++i)
+        {
+            // Check the returned status as well; the pointer value alone is not a reliable failure signal
+            if (cudaMallocManaged(&input_vector1[i], elementSize * sizeof(char)) != cudaSuccess || input_vector1[i] == nullptr)
+            {
+                throw std::runtime_error("Failed to allocate memory for input_vector1");
+            }
+            boost::crypt::generate_random_string(input_vector1[i], elementSize);
+        }
+
+        digest_type* output_vector = nullptr;
+        checkCUDAError(cudaMallocManaged(&output_vector, numElements * sizeof(digest_type)), "Failed to allocate output_vector");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &input_vector1, &output_vector, &numElements };
+
+        watch w;
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+        checkCUDAError(cudaDeviceSynchronize(), "Kernel execution failed");
+
+        double t = w.elapsed();
+        // Verify the result; stop comparing once 100 mismatching bytes have been reported
+        int fail_counter = 0;
+        for (int i = 0; i < numElements && fail_counter < 100; ++i)
+        {
+            auto expected = boost::crypt::sha1(input_vector1[i]);
+
+            for (int j = 0; j < expected.size(); ++j)
+            {
+                if (expected[j] != output_vector[i][j])
+                {
+                    std::cerr << std::hex << "Result verification failed at element " << i << "!\n"
+                              << "Got: " << static_cast<std::uint32_t>(output_vector[i][j]) << "\n"
+                              << "Expected: " << static_cast<std::uint32_t>(expected[j]) << std::endl;
+                    ++fail_counter;
+                    if (fail_counter == 100)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+        // Any mismatch is a failure, not only exactly 100 of them
+        if (fail_counter != 0)
+        {
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+
+        // Cleanup all the memory we allocated
+        for (int i = 0; i < numElements; ++i)
+        {
+            cudaFree(input_vector1[i]);
+        }
+        cudaFree(input_vector1);
+        cudaFree(output_vector);
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}