From f225702e6c5811bbbc87ec651b1f7bcfcf00c504 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 15:39:56 -0400 Subject: [PATCH 01/26] Add GPU support to expint --- .../boost/math/special_functions/expint.hpp | 134 ++++++++++-------- 1 file changed, 75 insertions(+), 59 deletions(-) diff --git a/include/boost/math/special_functions/expint.hpp b/include/boost/math/special_functions/expint.hpp index 1475a9a88..79a46c52f 100644 --- a/include/boost/math/special_functions/expint.hpp +++ b/include/boost/math/special_functions/expint.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2007. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -12,6 +13,10 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include +#include +#include +#include #include #include #include @@ -35,13 +40,13 @@ namespace boost{ namespace math{ template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expint(unsigned n, T z, const Policy& /*pol*/); namespace detail{ template -inline T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline T expint_1_rational(const T& z, const boost::math::integral_constant&) { // this function is never actually called BOOST_MATH_ASSERT(0); @@ -49,7 +54,7 @@ inline T expint_1_rational(const T& z, const std::integral_constant&) } template -T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant&) { BOOST_MATH_STD_USING T result; @@ -123,7 +128,7 @@ T expint_1_rational(const T& z, const std::integral_constant&) } template -T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant&) { BOOST_MATH_STD_USING T result; @@ -204,7 +209,7 @@ T expint_1_rational(const T& z, const std::integral_constant&) } template -T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant&) { BOOST_MATH_STD_USING T result; @@ -351,14 +356,15 @@ T expint_1_rational(const T& z, const std::integral_constant&) return result; } + template struct expint_fraction { - typedef std::pair result_type; - expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){} - std::pair operator()() + typedef boost::math::pair result_type; + BOOST_MATH_GPU_ENABLED expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){} + BOOST_MATH_GPU_ENABLED boost::math::pair operator()() { - std::pair result = std::make_pair(-static_cast((i+1) * (n+i)), b); + boost::math::pair result = boost::math::make_pair(-static_cast((i+1) * (n+i)), b); b += 2; ++i; return result; @@ -374,7 +380,7 @@ inline T expint_as_fraction(unsigned n, T z, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_INSTRUMENT_VARIABLE(z) - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); expint_fraction f(n, z); T result = tools::continued_fraction_b( f, @@ -392,9 +398,9 @@ template struct expint_series { typedef T result_type; - expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_) + BOOST_MATH_GPU_ENABLED expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_) : k(k_), 
z(z_), x_k(x_k_), denom(denom_), fact(fact_){} - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { x_k *= -z; denom += 1; @@ -410,10 +416,10 @@ struct expint_series }; template -inline T expint_as_series(unsigned n, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T expint_as_series(unsigned n, T z, const Policy& pol) { BOOST_MATH_STD_USING - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); BOOST_MATH_INSTRUMENT_VARIABLE(z) @@ -443,10 +449,10 @@ inline T expint_as_series(unsigned n, T z, const Policy& pol) } template -T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag) +BOOST_MATH_GPU_ENABLED T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag) { BOOST_MATH_STD_USING - static const char* function = "boost::math::expint<%1%>(unsigned, %1%)"; + constexpr auto function = "boost::math::expint<%1%>(unsigned, %1%)"; if(z < 0) return policies::raise_domain_error(function, "Function requires z >= 0 but got %1%.", z, pol); if(z == 0) @@ -468,15 +474,21 @@ T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag) # pragma warning(disable:4127) // conditional expression is constant #endif if(n == 0) + { result = exp(-z) / z; + } else if((n == 1) && (Tag::value)) { result = expint_1_rational(z, tag); } else if(f) + { result = expint_as_series(n, z, pol); + } else + { result = expint_as_fraction(n, z, pol); + } #ifdef _MSC_VER # pragma warning(pop) #endif @@ -488,8 +500,8 @@ template struct expint_i_series { typedef T result_type; - expint_i_series(T z_) : k(0), z_k(1), z(z_){} - T operator()() + BOOST_MATH_GPU_ENABLED expint_i_series(T z_) : k(0), z_k(1), z(z_){} + BOOST_MATH_GPU_ENABLED T operator()() { z_k *= z / ++k; return z_k / k; @@ -501,22 +513,22 @@ struct expint_i_series }; template -T expint_i_as_series(T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T expint_i_as_series(T z, const Policy& pol) { BOOST_MATH_STD_USING T result = log(z); // (log(z) - log(1 / z)) / 2; result += constants::euler(); expint_i_series s(z); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); result = tools::sum_series(s, policies::get_epsilon(), max_iter, result); policies::check_series_iterations("boost::math::expint_i_series<%1%>(%1%)", max_iter, pol); return result; } template -T expint_i_imp(T z, const Policy& pol, const Tag& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const Tag& tag) { - static const char* function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -525,10 +537,10 @@ T expint_i_imp(T z, const Policy& pol, const Tag& tag) } template -T expint_i_imp(T z, const Policy& pol, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING - static const char* function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -739,10 +751,10 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant& ta } template -T expint_i_imp(T z, const Policy& pol, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING - static const char* 
function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -976,7 +988,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant& ta } template -void expint_i_imp_113a(T& result, const T& z, const Policy& pol) +BOOST_MATH_GPU_ENABLED void expint_i_imp_113a(T& result, const T& z, const Policy& pol) { BOOST_MATH_STD_USING // Maximum Deviation Found: 1.230e-36 @@ -1044,7 +1056,7 @@ void expint_i_imp_113a(T& result, const T& z, const Policy& pol) } template -void expint_i_113b(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113b(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 7.779e-36 @@ -1094,7 +1106,7 @@ void expint_i_113b(T& result, const T& z) } template -void expint_i_113c(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113c(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 1.082e-34 @@ -1147,7 +1159,7 @@ void expint_i_113c(T& result, const T& z) } template -void expint_i_113d(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113d(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 3.163e-35 @@ -1198,7 +1210,7 @@ void expint_i_113d(T& result, const T& z) } template -void expint_i_113e(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113e(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 7.972e-36 @@ -1252,7 +1264,7 @@ void expint_i_113e(T& result, const T& z) } template -void expint_i_113f(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113f(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 4.469e-36 @@ -1299,7 +1311,7 @@ void expint_i_113f(T& result, const T& z) } template -void expint_i_113g(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113g(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 5.588e-35 @@ -1344,7 +1356,7 @@ void expint_i_113g(T& result, const T& z) } template -void expint_i_113h(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113h(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 4.448e-36 @@ -1383,10 +1395,10 @@ void expint_i_113h(T& result, const T& z) } template -T expint_i_imp(T z, const Policy& pol, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING - static const char* function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -1491,12 +1503,12 @@ struct expint_i_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&){} - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(T(5), Policy()); boost::math::expint(T(7), Policy()); @@ -1504,7 +1516,7 @@ struct expint_i_initializer boost::math::expint(T(38), Policy()); boost::math::expint(T(45), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(T(5), Policy()); boost::math::expint(T(7), Policy()); @@ -1512,7 +1524,7 @@ struct 
expint_i_initializer boost::math::expint(T(38), Policy()); boost::math::expint(T(45), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(T(5), Policy()); boost::math::expint(T(7), Policy()); @@ -1524,12 +1536,14 @@ struct expint_i_initializer boost::math::expint(T(200), Policy()); boost::math::expint(T(220), Policy()); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -1541,33 +1555,35 @@ struct expint_1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&){} - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(1, T(0.5), Policy()); boost::math::expint(1, T(2), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(1, T(0.5), Policy()); boost::math::expint(1, T(2), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(1, T(0.5), Policy()); boost::math::expint(1, T(2), Policy()); boost::math::expint(1, T(6), Policy()); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -1575,8 +1591,8 @@ template const typename expint_1_initializer::init expint_1_initializer::initializer; template -inline typename tools::promote_args::type - expint_forwarder(T z, const Policy& /*pol*/, std::true_type const&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type + expint_forwarder(T z, const Policy& /*pol*/, boost::math::true_type const&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -1587,7 +1603,7 @@ inline typename tools::promote_args::type policies::promote_double, policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant::type } template -inline typename tools::promote_args::type -expint_forwarder(unsigned n, T z, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type +expint_forwarder(unsigned n, T z, const boost::math::false_type&) { return boost::math::expint(n, z, policies::policy<>()); } @@ -1612,7 +1628,7 @@ expint_forwarder(unsigned n, T z, const std::false_type&) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expint(unsigned n, T z, const Policy& /*pol*/) { typedef typename tools::promote_args::type result_type; @@ -1624,7 +1640,7 @@ inline typename tools::promote_args::type policies::promote_double, policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef 
std::integral_constant::type } template -inline typename detail::expint_result::type +BOOST_MATH_GPU_ENABLED inline typename detail::expint_result::type expint(T const z, U const u) { typedef typename policies::is_policy::type tag_type; @@ -1649,7 +1665,7 @@ inline typename detail::expint_result::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expint(T z) { return expint(z, policies::policy<>()); From ed381ab894dbd8dd0891325206507199ae3e8d01 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 15:40:04 -0400 Subject: [PATCH 02/26] Add SYCL testing of expint --- test/sycl_jamfile | 2 ++ test/test_expint.cpp | 11 +++++++++++ test/test_expint.hpp | 8 +++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index ff6d84fde..23adafdbb 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -71,6 +71,8 @@ run test_sign.cpp ; run test_round.cpp ; +run test_expint.cpp ; + run test_expm1_simple.cpp ; run test_log1p_simple.cpp ; diff --git a/test/test_expint.cpp b/test/test_expint.cpp index 3f44a8091..3eede5e38 100644 --- a/test/test_expint.cpp +++ b/test/test_expint.cpp @@ -3,7 +3,14 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#endif + #include "test_expint.hpp" // @@ -78,7 +85,11 @@ void expected_results() ".*", // platform "float|double|long double", // test type(s) ".*Ei.*", // test data group + #ifndef SYCL_LANGUAGE_VERSION ".*", 6, 3); // test function + #else + ".*", 10, 3); + #endif if(std::numeric_limits::digits > 100) { add_expected_result( diff --git a/test/test_expint.hpp b/test/test_expint.hpp index 491db2fcd..d6524810e 100644 --- a/test/test_expint.hpp +++ b/test/test_expint.hpp @@ -4,13 +4,19 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #include +#include #define BOOST_TEST_MAIN #include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include From 01985d6f54f385d36bedea6fd9c456449217bb12 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:06:36 -0400 Subject: [PATCH 03/26] Add markers to forward decls --- .../boost/math/special_functions/math_fwd.hpp | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 35f8dd1bb..fb3a2029f 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -50,6 +50,16 @@ namespace detail{ >::type; }; + template + struct expint_result + { + using type = typename boost::math::conditional< + policies::is_policy::value, + tools::promote_args_t, + typename tools::promote_args::type + >::type; + }; + } // namespace detail } // namespace math @@ -936,7 +946,7 @@ namespace boost template struct expint_result { - typedef typename std::conditional< + typedef typename boost::math::conditional< policies::is_policy::value, tools::promote_args_t, typename tools::promote_args::type @@ -946,13 +956,13 @@ namespace boost } // namespace detail template - tools::promote_args_t expint(unsigned n, T z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t expint(unsigned n, T z, const Policy&); template - typename detail::expint_result::type expint(T const z, U const u); + BOOST_MATH_GPU_ENABLED typename detail::expint_result::type expint(T const z, U const u); template - tools::promote_args_t expint(T z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t expint(T z); // Zeta: template @@ -1620,11 +1630,11 @@ template \ using boost::math::changesign;\ \ template \ - inline typename boost::math::tools::promote_args_t expint(T const& z, U const& u)\ + BOOST_MATH_GPU_ENABLED inline typename boost::math::tools::promote_args_t expint(T const& z, U const& u)\ { return boost::math::expint(z, u, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t expint(T z){ return boost::math::expint(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t expint(T z){ return boost::math::expint(z, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t zeta(T s){ return boost::math::zeta(s, Policy()); }\ From f829cc4f3cbd977707ec4f60b0cc5d4426b21ff6 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:06:46 -0400 Subject: [PATCH 04/26] Add CUDA testing of expint --- test/cuda_jamfile | 3 ++ test/test_expint_double.cu | 106 +++++++++++++++++++++++++++++++++++++ test/test_expint_float.cu | 106 +++++++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 test/test_expint_double.cu create mode 100644 test/test_expint_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 34ac3b965..9c583821a 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -340,6 +340,9 @@ run test_erfc_float.cu ; run test_erfc_inv_double.cu ; run test_erfc_inv_float.cu ; +run test_expint_double.cu ; +run test_expint_float.cu ; + run test_expm1_double.cu ; run test_expm1_float.cu ; diff --git a/test/test_expint_double.cu b/test/test_expint_double.cu new file mode 100644 index 000000000..d82e90a93 --- /dev/null 
+++ b/test/test_expint_double.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::expint(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_expint_float.cu b/test/test_expint_float.cu
new file mode 100644
index 000000000..dd1fccd1d
--- /dev/null
+++ b/test/test_expint_float.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::expint(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From ad66a6b41846fc6ceff8e59b03e7c3e32a410da4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:13:54 -0400 Subject: [PATCH 05/26] Fix static variable usage under NVRTC --- .../boost/math/special_functions/expint.hpp | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/include/boost/math/special_functions/expint.hpp b/include/boost/math/special_functions/expint.hpp index 79a46c52f..09e97bd4f 100644 --- a/include/boost/math/special_functions/expint.hpp +++ b/include/boost/math/special_functions/expint.hpp @@ -25,7 +25,6 @@ #include #include #include -#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -376,7 +375,7 @@ struct expint_fraction }; template -inline T expint_as_fraction(unsigned n, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T expint_as_fraction(unsigned n, T z, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_INSTRUMENT_VARIABLE(z) @@ -553,7 +552,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Maximum Deviation Found: 2.852e-18 // Expected Error Term: 2.852e-18 // Max Error found at double precision = Poly: 2.636335e-16 Cheb: 4.187027e-16 - static const T P[10] = { + BOOST_MATH_STATIC const T P[10] = { BOOST_MATH_BIG_CONSTANT(T, 53, 2.98677224343598593013), BOOST_MATH_BIG_CONSTANT(T, 53, 0.356343618769377415068), BOOST_MATH_BIG_CONSTANT(T, 53, 0.780836076283730801839), @@ -565,7 +564,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, 0.798296365679269702435e-5), BOOST_MATH_BIG_CONSTANT(T, 53, 0.2777056254402008721e-6) }; - static const T Q[8] = { + BOOST_MATH_STATIC const T Q[8] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, -1.17090412365413911947), BOOST_MATH_BIG_CONSTANT(T, 53, 0.62215109846016746276), @@ -576,11 +575,11 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.138972589601781706598e-4) }; - static const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0); - static const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0); - static const T r1 = static_cast(c1 / c2); - static const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16); - static const T r = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392)); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r1 = static_cast(c1 / c2); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392)); T t = (z / 3) - 1; result = tools::evaluate_polynomial(P, t) / tools::evaluate_polynomial(Q, t); @@ -600,8 +599,8 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Maximum Deviation Found: 6.546e-17 // 
Expected Error Term: 6.546e-17 // Max Error found at double precision = Poly: 6.890169e-17 Cheb: 6.772128e-17 - static const T Y = 1.158985137939453125F; - static const T P[8] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.158985137939453125F; + BOOST_MATH_STATIC const T P[8] = { BOOST_MATH_BIG_CONSTANT(T, 53, 0.00139324086199402804173), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0349921221823888744966), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0264095520754134848538), @@ -611,7 +610,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.554086272024881826253e-4), BOOST_MATH_BIG_CONSTANT(T, 53, -0.396487648924804510056e-5) }; - static const T Q[8] = { + BOOST_MATH_STATIC const T Q[8] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 0.744625566823272107711), BOOST_MATH_BIG_CONSTANT(T, 53, 0.329061095011767059236), @@ -633,8 +632,8 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Expected Error Term: -1.842e-17 // Max Error found at double precision = Poly: 4.375868e-17 Cheb: 5.860967e-17 - static const T Y = 1.0869731903076171875F; - static const T P[9] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.0869731903076171875F; + BOOST_MATH_STATIC const T P[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, -0.00893891094356945667451), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0484607730127134045806), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0652810444222236895772), @@ -645,7 +644,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.000209750022660200888349), BOOST_MATH_BIG_CONSTANT(T, 53, -0.138652200349182596186e-4) }; - static const T Q[9] = { + BOOST_MATH_STATIC const T Q[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 1.97017214039061194971), BOOST_MATH_BIG_CONSTANT(T, 53, 1.86232465043073157508), @@ -669,8 +668,8 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Max Error found at double precision = Poly: 1.441088e-16 Cheb: 1.864792e-16 - static const T Y = 1.03937530517578125F; - static const T P[9] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.03937530517578125F; + BOOST_MATH_STATIC const T P[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, -0.00356165148914447597995), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0229930320357982333406), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0449814350482277917716), @@ -681,7 +680,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.000192178045857733706044), BOOST_MATH_BIG_CONSTANT(T, 53, -0.113161784705911400295e-9) }; - static const T Q[9] = { + BOOST_MATH_STATIC const T Q[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 2.84354408840148561131), BOOST_MATH_BIG_CONSTANT(T, 53, 3.6599610090072393012), @@ -700,9 +699,9 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: else { // Max Error found at double precision = 3.381886e-17 - static const T exp40 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17)); - static const T Y= 1.013065338134765625F; - static const T P[6] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T exp40 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17)); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y= 1.013065338134765625F; + BOOST_MATH_STATIC const T P[6] = { BOOST_MATH_BIG_CONSTANT(T, 53, 
-0.0130653381347656243849), BOOST_MATH_BIG_CONSTANT(T, 53, 0.19029710559486576682), BOOST_MATH_BIG_CONSTANT(T, 53, 94.7365094537197236011), @@ -710,7 +709,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, 18932.0850014925993025), BOOST_MATH_BIG_CONSTANT(T, 53, -38703.1431362056714134) }; - static const T Q[7] = { + BOOST_MATH_STATIC const T Q[7] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 61.9733592849439884145), BOOST_MATH_BIG_CONSTANT(T, 53, -2354.56211323420194283), From 59faf9128e35e09df52690103243b13647fa3398 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:14:04 -0400 Subject: [PATCH 06/26] Add NVRTC testing --- test/nvrtc_jamfile | 6 +- test/test_expint_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++++++ test/test_expint_nvrtc_float.cpp | 190 ++++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+), 2 deletions(-) create mode 100644 test/test_expint_nvrtc_double.cpp create mode 100644 test/test_expint_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index eb5da37c8..294559821 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -9,8 +9,8 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; -run test_heumann_lambda_nvrtc_double.cpp ; -run test_heumann_lambda_nvrtc_float.cpp ; +run test_expint_nvrtc_double.cpp ; +run test_expint_nvrtc_float.cpp ; # Quad run test_exp_sinh_quad_nvrtc_float.cpp ; @@ -335,6 +335,8 @@ run test_ellint_d_nvrtc_double.cpp ; run test_ellint_d_nvrtc_float.cpp ; run test_jacobi_zeta_nvrtc_double.cpp ; run test_jacobi_zeta_nvrtc_float.cpp ; +run test_heumann_lambda_nvrtc_double.cpp ; +run test_heumann_lambda_nvrtc_float.cpp ; run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_expint_nvrtc_double.cpp b/test/test_expint_nvrtc_double.cpp new file mode 100644 index 000000000..3ab45e6a1 --- /dev/null +++ b/test/test_expint_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::expint(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_expint_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + 
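+        // NB: h_in2 is filled but never read by this kernel: the device
+        // function's second pointer parameter is left unnamed in the kernel
+        // string above, presumably so one launcher signature can serve both
+        // one- and two-argument special functions.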
} + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::expint(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_expint_nvrtc_float.cpp b/test/test_expint_nvrtc_float.cpp new file mode 100644 index 000000000..bff58580e --- /dev/null +++ b/test/test_expint_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::expint(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_expint_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } 
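+    // Note on the verification loop further down: boost::math::epsilon_difference
+    // reports the difference between device and host results scaled by machine
+    // epsilon, so the 300-eps budget used below allows roughly
+    // 300 * 2^-23 ~ 3.6e-5 relative error for float.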
+ + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::expint(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 9c565e92c16fcf0adb7b81a487a7e96246171b68 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 10:12:45 -0400 Subject: [PATCH 07/26] Add configurable definition of complex --- include/boost/math/tools/complex.hpp | 26 ++++++++++++++++++++++++-- include/boost/math/tools/config.hpp | 1 + 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp index f93d2958d..ec5144011 100644 --- a/include/boost/math/tools/complex.hpp +++ b/include/boost/math/tools/complex.hpp @@ -14,11 +14,33 @@ #include #ifdef BOOST_MATH_ENABLE_CUDA + #include -#endif +#include + +namespace boost { +namespace math { + +template +using complex = cuda::std::complex; + +} // namespace math +} // namespace boost + +#else -#ifndef BOOST_MATH_HAS_NVRTC #include +#include + +namespace boost { +namespace math { + +template +using complex = std::complex; + +} // namespace math +} // namespace boost + #endif namespace boost { diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 99bc4ee27..b3a8c2cf8 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -678,6 +678,7 @@ namespace boost{ namespace math{ #include #include #include +#include # define BOOST_MATH_CUDA_ENABLED __host__ __device__ # define BOOST_MATH_HAS_GPU_SUPPORT From bf43d6ea25103830c0090fb4df88c3429ee45140 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 10:19:10 -0400 Subject: [PATCH 08/26] Add function aliases --- include/boost/math/tools/complex.hpp | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git 
a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp index ec5144011..88db8e8a7 100644 --- a/include/boost/math/tools/complex.hpp +++ b/include/boost/math/tools/complex.hpp @@ -24,6 +24,36 @@ namespace math { template using complex = cuda::std::complex; +using cuda::std::real; +using cuda::std::imag; +using cuda::std::abs; +using cuda::std::arg; +using cuda::std::norm; +using cuda::std::conj; +using cuda::std::proj; +using cuda::std::polar; + +using cuda::std::exp; +using cuda::std::log; +using cuda::std::log10; + +using cuda::std::pow; +using cuda::std::sqrt; + +using cuda::std::sin; +using cuda::std::cos; +using cuda::std::tan; +using cuda::std::asin; +using cuda::std::acos; +using cuda::std::atan; + +using cuda::std::sinh; +using cuda::std::cosh; +using cuda::std::tanh; +using cuda::std::asinh; +using cuda::std::acosh; +using cuda::std::atanh; + } // namespace math } // namespace boost @@ -38,6 +68,36 @@ namespace math { template using complex = std::complex; +using std::real; +using std::imag; +using std::abs; +using std::arg; +using std::norm; +using std::conj; +using std::proj; +using std::polar; + +using std::exp; +using std::log; +using std::log10; + +using std::pow; +using std::sqrt; + +using std::sin; +using std::cos; +using std::tan; +using std::asin; +using std::acos; +using std::atan; + +using std::sinh; +using std::cosh; +using std::tanh; +using std::asinh; +using std::acosh; +using std::atanh; + } // namespace math } // namespace boost From 571958701bd445f0807f3e5c3bfdb3a584dc8015 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 10:53:17 -0400 Subject: [PATCH 09/26] Add GPU support to gegenbauer polynomials --- include/boost/math/special_functions/gegenbauer.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/gegenbauer.hpp b/include/boost/math/special_functions/gegenbauer.hpp index b7033cd14..41fcdbd34 100644 --- a/include/boost/math/special_functions/gegenbauer.hpp +++ b/include/boost/math/special_functions/gegenbauer.hpp @@ -1,4 +1,5 @@ // (C) Copyright Nick Thompson 2019. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -6,16 +7,20 @@
 #ifndef BOOST_MATH_SPECIAL_GEGENBAUER_HPP
 #define BOOST_MATH_SPECIAL_GEGENBAUER_HPP
 
-#include <limits>
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/type_traits.hpp>
+
+#ifndef BOOST_MATH_NO_EXCEPTIONS
 #include <stdexcept>
-#include <type_traits>
+#endif
 
 namespace boost { namespace math {
 
 template<typename Real>
 Real gegenbauer(unsigned n, Real lambda, Real x)
 {
-    static_assert(!std::is_integral<Real>::value, "Gegenbauer polynomials required floating point arguments.");
+    static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
     if (lambda <= -1/Real(2)) {
 #ifndef BOOST_MATH_NO_EXCEPTIONS
         throw std::domain_error("lambda > -1/2 is required.");
@@ -41,7 +46,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
     Real yk = y1;
     Real k = 2;
-    Real k_max = n*(1+std::numeric_limits<Real>::epsilon());
+    Real k_max = n*(1+boost::math::numeric_limits<Real>::epsilon());
     Real gamma = 2*(lambda - 1);
     while(k < k_max)
     {

From 8de822321a390aaec1c74582758da8e56ac189e1 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 10:53:27 -0400
Subject: [PATCH 10/26] Add SYCL testing of gegenbauer

---
 test/sycl_jamfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 23adafdbb..2f617be85 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -75,6 +75,8 @@ run test_expint.cpp ;
 
 run test_expm1_simple.cpp ;
 
+run gegenbauer_test.cpp ;
+
 run test_log1p_simple.cpp ;
 
 run test_digamma_simple.cpp ;

From 608ba2fe7b9346d1d1ab9750aa5a1e5a07847611 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 11:16:40 -0400
Subject: [PATCH 11/26] Add NVCC testing of gegenbauer

---
 .../math/special_functions/gegenbauer.hpp     |   8 +-
 test/cuda_jamfile                             |   3 +
 test/test_gegenbauer_double.cu                | 125 ++++++++++++++++++
 test/test_gegenbauer_float.cu                 | 124 +++++++++++++++++
 4 files changed, 256 insertions(+), 4 deletions(-)
 create mode 100644 test/test_gegenbauer_double.cu
 create mode 100644 test/test_gegenbauer_float.cu

diff --git a/include/boost/math/special_functions/gegenbauer.hpp b/include/boost/math/special_functions/gegenbauer.hpp
index 41fcdbd34..70324cf65 100644
--- a/include/boost/math/special_functions/gegenbauer.hpp
+++ b/include/boost/math/special_functions/gegenbauer.hpp
@@ -18,14 +18,14 @@ namespace boost { namespace math {
 
 template<typename Real>
-Real gegenbauer(unsigned n, Real lambda, Real x)
+BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x)
 {
     static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
     if (lambda <= -1/Real(2)) {
 #ifndef BOOST_MATH_NO_EXCEPTIONS
         throw std::domain_error("lambda > -1/2 is required.");
 #else
-        return std::numeric_limits<Real>::quiet_NaN();
+        return boost::math::numeric_limits<Real>::quiet_NaN();
 #endif
     }
     // The only reason to do this is because of some instability that could be present for x < 0 that is not present for x > 0.
@@ -60,7 +60,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x) template -Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k) +BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k) { if (k > n) { return Real(0); @@ -75,7 +75,7 @@ Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k) } template -Real gegenbauer_prime(unsigned n, Real lambda, Real x) { +BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x) { return gegenbauer_derivative(n, lambda, x, 1); } diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 9c583821a..6d6eea454 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; +run test_gegenbauer_double.cu ; +run test_gegenbauer_float.cu ; + # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; diff --git a/test/test_gegenbauer_double.cu b/test/test_gegenbauer_double.cu new file mode 100644 index 000000000..21278d7a8 --- /dev/null +++ b/test/test_gegenbauer_double.cu @@ -0,0 +1,125 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Gegenbauer prime uses all methods internally so it's the easy choice + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]); + // Most elements are under 50 but extremely small numbers very more greatly + if (eps > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_gegenbauer_float.cu b/test/test_gegenbauer_float.cu new file mode 100644 index 000000000..b7affaecd --- /dev/null +++ b/test/test_gegenbauer_float.cu @@ -0,0 +1,124 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Gegenbauer prime uses all methods internally so it's the easy choice + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.resize(numElements);   // resize, not reserve: operator[] below needs live elements
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
+            if (eps > 1000)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i]
+                          << "\n  Host: " << results[i]
+                          << "\n  Eps: " << eps << std::endl;
+                ++failure_counter;
+                if (failure_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
From dbd194ed88f119f47149704be852f897958ad693 Mon Sep 17 00:00:00 2001
From: Matt Borland 
Date: Tue, 17 Sep 2024 11:22:45 -0400
Subject: [PATCH 12/26] Add NVRTC testing of gegenbauer

---
 test/cuda_jamfile                     |   6 +-
 test/nvrtc_jamfile                    |   7 +-
 test/test_gegenbauer_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++
 test/test_gegenbauer_nvrtc_float.cpp  | 190 ++++++++++++++++++++++++++
 4 files changed, 388 insertions(+), 5 deletions(-)
 create mode 100644 test/test_gegenbauer_nvrtc_double.cpp
 create mode 100644 test/test_gegenbauer_nvrtc_float.cpp

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 6d6eea454..6abcf00b8 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -9,9 +9,6 @@ project : requirements
 [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
 ;
 
-run test_gegenbauer_double.cu ;
-run test_gegenbauer_float.cu ;
-
 # Quad
 run test_exp_sinh_quad_float.cu ;
 run test_exp_sinh_quad_double.cu ;
@@ -349,6 +346,9 @@ run test_expint_float.cu ;
 run test_expm1_double.cu ;
 run test_expm1_float.cu ;
 
+run test_gegenbauer_double.cu ;
+run test_gegenbauer_float.cu ;
+
 run test_lgamma_double.cu ;
 run test_lgamma_float.cu ;
 run test_tgamma_double.cu ;
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 294559821..2cd61028e 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -9,8 +9,8 @@ project : requirements
 [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
 ;
 
-run test_expint_nvrtc_double.cpp ;
-run test_expint_nvrtc_float.cpp ;
+run test_gegenbauer_nvrtc_double.cpp ;
+run test_gegenbauer_nvrtc_float.cpp ;
 
 # Quad
 run test_exp_sinh_quad_nvrtc_float.cpp ;
@@ -338,6 +338,9 @@ run test_jacobi_zeta_nvrtc_float.cpp ;
 run test_heumann_lambda_nvrtc_double.cpp ;
 run test_heumann_lambda_nvrtc_float.cpp ;
 
+run test_expint_nvrtc_double.cpp ;
+run test_expint_nvrtc_float.cpp ;
+
 run test_expm1_nvrtc_double.cpp ;
 run test_expm1_nvrtc_float.cpp ;
 
diff --git a/test/test_gegenbauer_nvrtc_double.cpp b/test/test_gegenbauer_nvrtc_double.cpp
new file mode 100644
index 000000000..0c8416cb6
--- /dev/null
+++ b/test/test_gegenbauer_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gegenbauer(2, in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gegenbauer_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
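+        // in1 supplies lambda and in2 supplies x for gegenbauer(2, lambda, x);
+        // uniform draws from [0, 1) keep lambda in the classical range
+        // (lambda > -1/2) and x inside the orthogonality interval [-1, 1].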
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gegenbauer_nvrtc_float.cpp b/test/test_gegenbauer_nvrtc_float.cpp new file mode 100644 index 000000000..c0d348417 --- /dev/null +++ b/test/test_gegenbauer_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gegenbauer(2, in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gegenbauer_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
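+        // The fixed seed (42) above makes every run use the same inputs, so
+        // host and device results stay directly comparable across invocations.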
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 0f17f14d9789f39b635fe4c97b424d21fa369b30 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 11:47:49 -0400 Subject: [PATCH 13/26] Add GPU support for hankel --- .../boost/math/special_functions/bessel.hpp | 44 ------------- .../boost/math/special_functions/hankel.hpp | 62 +++++++++--------- .../boost/math/special_functions/math_fwd.hpp | 63 +++++++++++++++---- 3 files changed, 84 insertions(+), 85 deletions(-) diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp index 081473442..c32f251bc 100644 --- a/include/boost/math/special_functions/bessel.hpp +++ b/include/boost/math/special_functions/bessel.hpp @@ -47,50 +47,6 @@ namespace boost{ namespace math{ -// Since we cannot pull this in from math fwd we need a copy -#ifdef BOOST_MATH_HAS_NVRTC - -namespace detail{ - - typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. - typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. - typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. 
- - template - struct bessel_traits - { - using result_type = typename boost::math::conditional< - boost::math::is_integral::value, - typename tools::promote_args::type, - tools::promote_args_t - >::type; - - typedef typename policies::precision::type precision_type; - - using optimisation_tag = typename boost::math::conditional< - (precision_type::value <= 0 || precision_type::value > 64), - bessel_no_int_tag, - typename boost::math::conditional< - boost::math::is_integral::value, - bessel_int_tag, - bessel_maybe_int_tag - >::type - >::type; - - using optimisation_tag128 = typename boost::math::conditional< - (precision_type::value <= 0 || precision_type::value > 113), - bessel_no_int_tag, - typename boost::math::conditional< - boost::math::is_integral::value, - bessel_int_tag, - bessel_maybe_int_tag - >::type - >::type; - }; - } // detail - -#endif - namespace detail{ template diff --git a/include/boost/math/special_functions/hankel.hpp b/include/boost/math/special_functions/hankel.hpp index 51b8390d9..d9cff8f92 100644 --- a/include/boost/math/special_functions/hankel.hpp +++ b/include/boost/math/special_functions/hankel.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2012. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -7,26 +8,31 @@ #ifndef BOOST_MATH_HANKEL_HPP #define BOOST_MATH_HANKEL_HPP +#include +#include #include #include +#include +#include +#include namespace boost{ namespace math{ namespace detail{ template -std::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) +boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING - static const char* function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"; if(x < 0) { bool isint_v = floor(v) == v; T j, y; bessel_jy(v, -x, &j, &y, need_j | need_y, pol); - std::complex cx(x), cv(v); - std::complex j_result, y_result; + boost::math::complex cx(x), cv(v); + boost::math::complex j_result, y_result; if(isint_v) { int s = (iround(v) & 1) ? 
-1 : 1; @@ -37,12 +43,12 @@ std::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol { j_result = pow(cx, v) * pow(-cx, -v) * j; T p1 = pow(-x, v); - std::complex p2 = pow(cx, v); + boost::math::complex p2 = pow(cx, v); y_result = p1 * y / p2 + (p2 / p1 - p1 / p2) * j / tan(constants::pi() * v); } // multiply y_result by i: - y_result = std::complex(-sign * y_result.imag(), sign * y_result.real()); + y_result = boost::math::complex(-sign * y_result.imag(), sign * y_result.real()); return j_result + y_result; } @@ -51,25 +57,25 @@ std::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol if(v == 0) { // J is 1, Y is -INF - return std::complex(1, sign * -policies::raise_overflow_error(function, nullptr, pol)); + return boost::math::complex(1, sign * -policies::raise_overflow_error(function, nullptr, pol)); } else { // At least one of J and Y is complex infinity: - return std::complex(policies::raise_overflow_error(function, nullptr, pol), sign * policies::raise_overflow_error(function, nullptr, pol)); + return boost::math::complex(policies::raise_overflow_error(function, nullptr, pol), sign * policies::raise_overflow_error(function, nullptr, pol)); } } T j, y; bessel_jy(v, x, &j, &y, need_j | need_y, pol); - return std::complex(j, sign * y); + return boost::math::complex(j, sign * y); } template -std::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); +boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); template -inline std::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) +inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING // ADL of std names. 
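+   // iconv rounds v to the nearest int under the current policy; a genuinely
+   // integral order can then take the optimised integer-order path, while
+   // everything else falls back to the general bessel_no_int_tag evaluation.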
int ival = detail::iconv(v, pol); @@ -81,57 +87,57 @@ inline std::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const P } template -inline std::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) +inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING - if((std::abs(v) < 200) && (x > 0)) - return std::complex(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol)); + if((abs(v) < 200) && (x > 0)) + return boost::math::complex(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol)); return hankel_imp(static_cast(v), x, bessel_no_int_tag(), pol, sign); } template -inline std::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) +inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) { BOOST_MATH_STD_USING - return constants::root_half_pi() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(std::complex(x)); + return constants::root_half_pi() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex(x)); } } // namespace detail template -inline std::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) +inline boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; typedef typename detail::bessel_traits::optimisation_tag tag_type; typedef typename policies::evaluation::type value_type; - return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> cyl_hankel_1(T1 v, T2 x) +inline boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x) { return cyl_hankel_1(v, x, policies::policy<>()); } template -inline std::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) +inline boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; typedef typename detail::bessel_traits::optimisation_tag tag_type; typedef typename policies::evaluation::type value_type; - return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> cyl_hankel_2(T1 v, T2 x) +inline boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x) { return cyl_hankel_2(v, x, policies::policy<>()); } template -inline std::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) +inline boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -143,17 +149,17 @@ inline std::complex::result_type> policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - return policies::checked_narrowing_cast, Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, 
Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> sph_hankel_1(T1 v, T2 x) +inline boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x) { return sph_hankel_1(v, x, policies::policy<>()); } template -inline std::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) +inline boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -165,11 +171,11 @@ inline std::complex::result_type> policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - return policies::checked_narrowing_cast, Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> sph_hankel_2(T1 v, T2 x) +inline boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x) { return sph_hankel_2(v, x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index fb3a2029f..e989f5b18 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -27,6 +27,7 @@ #include #include // for argument promotion. #include +#include #include #ifdef BOOST_MATH_HAS_NVRTC @@ -60,7 +61,43 @@ namespace detail{ >::type; }; - } // namespace detail + typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. + typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. + typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. 
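+
+   // The traits class below derives the promoted result type for a Bessel or
+   // Hankel call and selects one of the three tags above from the policy's
+   // working precision: the integer-order optimisation is only attempted when
+   // that precision is known and at most 64 bits (113 bits for the 128-bit
+   // variant). For example, an integer order evaluated in double precision
+   // (53 bits) selects bessel_int_tag, while an arbitrary-precision type
+   // selects bessel_no_int_tag. A sketch of the intent, assuming the usual
+   // template parameters (order type, argument type, policy):
+   //
+   //   static_assert(std::is_same<
+   //       bessel_traits::optimisation_tag,   // e.g. for an int order, double argument
+   //       bessel_int_tag>::value, "integer orders take the fast path");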
+ + template + struct bessel_traits + { + using result_type = typename boost::math::conditional< + boost::math::is_integral::value, + typename tools::promote_args::type, + tools::promote_args_t + >::type; + + typedef typename policies::precision::type precision_type; + + using optimisation_tag = typename boost::math::conditional< + (precision_type::value <= 0 || precision_type::value > 64), + bessel_no_int_tag, + typename boost::math::conditional< + boost::math::is_integral::value, + bessel_int_tag, + bessel_maybe_int_tag + >::type + >::type; + + using optimisation_tag128 = typename boost::math::conditional< + (precision_type::value <= 0 || precision_type::value > 113), + bessel_no_int_tag, + typename boost::math::conditional< + boost::math::is_integral::value, + bessel_int_tag, + bessel_maybe_int_tag + >::type + >::type; + }; + +} // namespace detail } // namespace math } // namespace boost @@ -818,28 +855,28 @@ namespace boost const Policy&); template - std::complex >::result_type> cyl_hankel_1(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x); template - std::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol); template - std::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol); template - std::complex >::result_type> cyl_hankel_2(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x); template - std::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol); template - std::complex >::result_type> sph_hankel_1(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x); template - std::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol); template - std::complex >::result_type> sph_hankel_2(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x); template BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_ai(T x, const Policy&); @@ -1679,19 +1716,19 @@ template \ inline boost::math::tools::promote_args_t owens_t(RT1 a, RT2 z){ return boost::math::owens_t(a, z, Policy()); }\ \ template \ - inline std::complex::result_type> cyl_hankel_1(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x)\ { return boost::math::cyl_hankel_1(v, x, Policy()); }\ \ template \ - inline std::complex::result_type> cyl_hankel_2(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x)\ { return boost::math::cyl_hankel_2(v, x, Policy()); }\ \ template \ - inline std::complex::result_type> sph_hankel_1(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x)\ { return boost::math::sph_hankel_1(v, x, Policy()); }\ \ template \ - inline std::complex::result_type> sph_hankel_2(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x)\ { return boost::math::sph_hankel_2(v, x, Policy()); }\ \ template \ From f561a7722ee8353c4cb88f609a30b41056db6629 Mon Sep 17 00:00:00 2001 From: Matt Borland 
Date: Tue, 17 Sep 2024 11:47:59 -0400 Subject: [PATCH 14/26] Add SYCL testing of hankel --- test/sycl_jamfile | 2 ++ test/test_hankel.cpp | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 2f617be85..991d34520 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -77,6 +77,8 @@ run test_expm1_simple.cpp ; run gegenbauer_test.cpp ; +run test_hankel.cpp ; + run test_log1p_simple.cpp ; run test_digamma_simple.cpp ; diff --git a/test/test_hankel.cpp b/test/test_hankel.cpp index f8bd173da..a93e90c4d 100644 --- a/test/test_hankel.cpp +++ b/test/test_hankel.cpp @@ -3,9 +3,13 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#include + #define BOOST_TEST_MAIN #include #include @@ -85,6 +89,7 @@ void test_hankel(T, const char* name) // // Instantiate a few instances to check our error handling code can cope with std::complex: // +#ifndef SYCL_LANGUAGE_VERSION typedef boost::math::policies::policy< boost::math::policies::overflow_error, boost::math::policies::denorm_error, @@ -120,7 +125,7 @@ typedef boost::math::policies::policy< boost::math::policies::indeterminate_result_error > pol3; template std::complex boost::math::cyl_hankel_1(double, double, const pol3&); - +#endif BOOST_AUTO_TEST_CASE( test_main ) { From 597d6a833fc75b2df729f1eff17a0396b001d14a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 12:22:16 -0400 Subject: [PATCH 15/26] Add NVCC testing of cyl_hankel_1 --- .../boost/math/special_functions/hankel.hpp | 26 ++-- test/cuda_jamfile | 3 + test/test_cyl_hankel_1_double.cu | 119 ++++++++++++++++++ test/test_cyl_hankel_1_float.cu | 119 ++++++++++++++++++ 4 files changed, 254 insertions(+), 13 deletions(-) create mode 100644 test/test_cyl_hankel_1_double.cu create mode 100644 test/test_cyl_hankel_1_float.cu diff --git a/include/boost/math/special_functions/hankel.hpp b/include/boost/math/special_functions/hankel.hpp index d9cff8f92..730c7afa0 100644 --- a/include/boost/math/special_functions/hankel.hpp +++ b/include/boost/math/special_functions/hankel.hpp @@ -21,7 +21,7 @@ namespace boost{ namespace math{ namespace detail{ template -boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"; @@ -72,10 +72,10 @@ boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Pol } template -boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); +BOOST_MATH_GPU_ENABLED boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); template -inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING // ADL of std names. 
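+   // BOOST_MATH_GPU_ENABLED marks this overload as callable from device code
+   // as well as the host; BOOST_MATH_STD_USING above is what lets the
+   // unqualified abs/floor calls in this file resolve to the device-side
+   // overloads under CUDA rather than being hard-wired to std::.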
int ival = detail::iconv(v, pol); @@ -87,7 +87,7 @@ inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, } template -inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING if((abs(v) < 200) && (x > 0)) @@ -96,7 +96,7 @@ inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, con } template -inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) { BOOST_MATH_STD_USING return constants::root_half_pi() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex(x)); @@ -105,7 +105,7 @@ inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int s } // namespace detail template -inline boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -115,13 +115,13 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x) { return cyl_hankel_1(v, x, policies::policy<>()); } template -inline boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -131,13 +131,13 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x) { return cyl_hankel_2(v, x, policies::policy<>()); } template -inline boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -153,13 +153,13 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x) { return sph_hankel_1(v, x, policies::policy<>()); } template -inline boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -175,7 +175,7 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x) { return sph_hankel_2(v, x, policies::policy<>()); } diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 6abcf00b8..01b2334b9 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction 
cxx14_variable_templates cxx14_constexpr ] ; +run test_cyl_hankel_1_double.cu ; +run test_cyl_hankel_1_float.cu ; + # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; diff --git a/test/test_cyl_hankel_1_double.cu b/test/test_cyl_hankel_1_double.cu new file mode 100644 index 000000000..b718521c6 --- /dev/null +++ b/test/test_cyl_hankel_1_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_hankel_1_float.cu b/test/test_cyl_hankel_1_float.cu new file mode 100644 index 000000000..794d6d763 --- /dev/null +++ b/test/test_cyl_hankel_1_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From cdb600c2f6c26d9a4b244e28f3651a5b3fc43761 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 12:32:37 -0400 Subject: [PATCH 16/26] Add comprehensive NVCC testing --- test/cuda_jamfile | 11 ++- test/test_cyl_hankel_2_double.cu | 119 +++++++++++++++++++++++++++++++ test/test_cyl_hankel_2_float.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_1_double.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_1_float.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_2_double.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_2_float.cu | 119 +++++++++++++++++++++++++++++++ 7 files changed, 722 insertions(+), 3 deletions(-) create mode 100644 test/test_cyl_hankel_2_double.cu create mode 100644 test/test_cyl_hankel_2_float.cu create mode 100644 test/test_sph_hankel_1_double.cu create mode 100644 test/test_sph_hankel_1_float.cu create mode 100644 test/test_sph_hankel_2_double.cu create mode 100644 test/test_sph_hankel_2_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 01b2334b9..32f87f44e 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,9 +9,6 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; -run test_cyl_hankel_1_double.cu ; -run test_cyl_hankel_1_float.cu ; - # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; @@ -308,6 +305,14 @@ run test_cyl_neumann_double.cu ; run test_cyl_neumann_float.cu ; run test_sph_neumann_double.cu ; run test_sph_neumann_float.cu ; +run test_cyl_hankel_1_double.cu ; +run test_cyl_hankel_1_float.cu ; +run test_cyl_hankel_2_double.cu ; +run test_cyl_hankel_2_float.cu ; +run test_sph_hankel_1_double.cu ; +run test_sph_hankel_1_float.cu ; +run test_sph_hankel_2_double.cu ; +run test_sph_hankel_2_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/test_cyl_hankel_2_double.cu b/test/test_cyl_hankel_2_double.cu new file mode 100644 index 000000000..8218b5347 --- /dev/null +++ b/test/test_cyl_hankel_2_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_hankel_2_float.cu b/test/test_cyl_hankel_2_float.cu new file mode 100644 index 000000000..f00174b47 --- /dev/null +++ b/test/test_cyl_hankel_2_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_hankel_1_double.cu b/test/test_sph_hankel_1_double.cu new file mode 100644 index 000000000..3d8473e22 --- /dev/null +++ b/test/test_sph_hankel_1_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_1_float.cu b/test/test_sph_hankel_1_float.cu
new file mode 100644
index 000000000..9cf17ffb2
--- /dev/null
+++ b/test/test_sph_hankel_1_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_2_double.cu b/test/test_sph_hankel_2_double.cu
new file mode 100644
index 000000000..6c7a032f6
--- /dev/null
+++ b/test/test_sph_hankel_2_double.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_2_float.cu b/test/test_sph_hankel_2_float.cu
new file mode 100644
index 000000000..bcf41e797
--- /dev/null
+++ b/test/test_sph_hankel_2_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
From 17c2207fa1303ff9cd7dba97f0c1e342ccb68b6b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 12:52:43 -0400
Subject: [PATCH 17/26] Add NVRTC testing of cyl and sph hankel

---
 test/nvrtc_jamfile                      |  14 +-
 test/test_cyl_hankel_1_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_cyl_hankel_1_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 test/test_cyl_hankel_2_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_cyl_hankel_2_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_1_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_1_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_2_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_2_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 9 files changed, 1603 insertions(+), 3 deletions(-)
 create mode 100644 test/test_cyl_hankel_1_nvrtc_double.cpp
 create mode 100644 test/test_cyl_hankel_1_nvrtc_float.cpp
 create mode 100644 test/test_cyl_hankel_2_nvrtc_double.cpp
 create mode 100644 test/test_cyl_hankel_2_nvrtc_float.cpp
 create mode 100644 test/test_sph_hankel_1_nvrtc_double.cpp
 create mode 100644 test/test_sph_hankel_1_nvrtc_float.cpp
 create mode 100644 test/test_sph_hankel_2_nvrtc_double.cpp
 create mode 100644 test/test_sph_hankel_2_nvrtc_float.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 2cd61028e..734c87ad4 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -9,9 +9,6 @@ project : requirements
   [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
   ;
 
-run test_gegenbauer_nvrtc_double.cpp ;
-run test_gegenbauer_nvrtc_float.cpp ;
-
 # Quad
 run test_exp_sinh_quad_nvrtc_float.cpp ;
 run test_exp_sinh_quad_nvrtc_double.cpp ;
@@ -306,6 +303,14 @@ run test_cyl_neumann_nvrtc_double.cpp ;
 run test_cyl_neumann_nvrtc_float.cpp ;
 run test_sph_neumann_nvrtc_double.cpp ;
 run test_sph_neumann_nvrtc_float.cpp ;
+run test_cyl_hankel_1_nvrtc_double.cpp ;
+run test_cyl_hankel_1_nvrtc_float.cpp ;
+run test_cyl_hankel_2_nvrtc_double.cpp ;
+run test_cyl_hankel_2_nvrtc_float.cpp ;
+run test_sph_hankel_1_nvrtc_double.cpp ;
+run test_sph_hankel_1_nvrtc_float.cpp ;
+run test_sph_hankel_2_nvrtc_double.cpp ;
+run test_sph_hankel_2_nvrtc_float.cpp ;
 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
 
@@ -356,6 +361,9 @@ run test_gamma_p_inv_nvrtc_float.cpp ;
 run test_tgamma_ratio_nvrtc_double.cpp ;
 run test_tgamma_ratio_nvrtc_float.cpp ;
 
+run test_gegenbauer_nvrtc_double.cpp ;
+run test_gegenbauer_nvrtc_float.cpp ;
+
 run test_log1p_nvrtc_double.cpp ;
 run test_log1p_nvrtc_float.cpp ;
diff --git a/test/test_cyl_hankel_1_nvrtc_double.cpp b/test/test_cyl_hankel_1_nvrtc_double.cpp
new file mode 100644
index 000000000..01f68ecb5
--- /dev/null
+++ b/test/test_cyl_hankel_1_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
+
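+        // NVRTC compilation is finished at this point: the kernel string
+        // above was compiled to PTX at runtime and loaded through the CUDA
+        // driver API. The remainder of the test stages the inputs, launches
+        // the kernel, and cross-checks every element against the serial
+        // host-side Boost.Math computation.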
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_1_nvrtc_float.cpp b/test/test_cyl_hankel_1_nvrtc_float.cpp
new file mode 100644
index 000000000..68fb928c2
--- /dev/null
+++ b/test/test_cyl_hankel_1_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_2_nvrtc_double.cpp b/test/test_cyl_hankel_2_nvrtc_double.cpp
new file mode 100644
index 000000000..188d93bbe
--- /dev/null
+++ b/test/test_cyl_hankel_2_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_2_nvrtc_float.cpp b/test/test_cyl_hankel_2_nvrtc_float.cpp
new file mode 100644
index 000000000..9349f3c0d
--- /dev/null
+++ b/test/test_cyl_hankel_2_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_1_nvrtc_double.cpp b/test/test_sph_hankel_1_nvrtc_double.cpp
new file mode 100644
index 000000000..1e6863c55
--- /dev/null
+++ b/test/test_sph_hankel_1_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_1_nvrtc_float.cpp b/test/test_sph_hankel_1_nvrtc_float.cpp
new file mode 100644
index 000000000..28346d284
--- /dev/null
+++ b/test/test_sph_hankel_1_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_2_nvrtc_double.cpp b/test/test_sph_hankel_2_nvrtc_double.cpp
new file mode 100644
index 000000000..17b8c7ff0
--- /dev/null
+++ b/test/test_sph_hankel_2_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_2_nvrtc_float.cpp b/test/test_sph_hankel_2_nvrtc_float.cpp
new file mode 100644
index 000000000..038e215d9
--- /dev/null
+++ b/test/test_sph_hankel_2_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 4f2fe1183e0874828151dfb1c5122bb683a4aebb Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 12:59:39 -0400 Subject: [PATCH 18/26] Update docs --- doc/sf/airy.qbk | 32 +++++++++++++-------------- doc/sf/expint.qbk | 16 +++++++------- doc/sf/gegenbauer.qbk | 6 ++--- doc/sf/hankel.qbk | 51 ++++++++++++++++++++++++++++++++++++------- 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/doc/sf/airy.qbk b/doc/sf/airy.qbk index 5ff4c7cb5..4756bee2d 100644 --- a/doc/sf/airy.qbk +++ b/doc/sf/airy.qbk @@ -18,10 +18,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` airy_ai(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x); template - ``__sf_result`` airy_ai(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x, const Policy&); }} // namespaces @@ -78,10 +78,10 @@ This function is implemented in terms of the Bessel functions using the relation namespace boost { namespace math { template - ``__sf_result`` airy_bi(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x); template - ``__sf_result`` airy_bi(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x, const Policy&); }} // namespaces @@ -132,10 +132,10 @@ This function is implemented in terms of the Bessel functions using the relation namespace boost { namespace math { template - ``__sf_result`` airy_ai_prime(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x); template - ``__sf_result`` airy_ai_prime(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x, const Policy&); }} // namespaces @@ -186,10 +186,10 @@ This function is implemented in terms of the Bessel functions using the relation namespace boost { namespace math { template - ``__sf_result`` airy_bi_prime(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x); template - ``__sf_result`` airy_bi_prime(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x, const Policy&); }} // namespaces @@ -242,23 +242,23 @@ by providing an output iterator. The signature of the single value functions are: template - T airy_ai_zero( + BOOST_MATH_GPU_ENABLED T airy_ai_zero( int m); // 1-based index of zero. template - T airy_bi_zero( + BOOST_MATH_GPU_ENABLED T airy_bi_zero( int m); // 1-based index of zero. and for multiple zeros: template - OutputIterator airy_ai_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero( int start_index, // 1-based index of first zero. unsigned number_of_zeros, // How many zeros to generate. OutputIterator out_it); // Destination for zeros. template - OutputIterator airy_bi_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero( int start_index, // 1-based index of zero. unsigned number_of_zeros, // How many zeros to generate OutputIterator out_it); // Destination for zeros. @@ -266,25 +266,25 @@ and for multiple zeros: There are also versions which allow control of the __policy_section for error handling and precision. template - T airy_ai_zero( + BOOST_MATH_GPU_ENABLED T airy_ai_zero( int m, // 1-based index of zero. const Policy&); // Policy to use. template - T airy_bi_zero( + BOOST_MATH_GPU_ENABLED T airy_bi_zero( int m, // 1-based index of zero. const Policy&); // Policy to use. 
    template <class OutputIterator, class Policy>
-   OutputIterator airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
                         int start_index, // 1-based index of first zero.
                         unsigned number_of_zeros, // How many zeros to generate.
                         OutputIterator out_it, // Destination for zeros.
                         const Policy& pol); // Policy to use.
 
    template <class OutputIterator, class Policy>
-   OutputIterator airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
                         int start_index, // 1-based index of zero.
                         unsigned number_of_zeros, // How many zeros to generate.
                         OutputIterator out_it, // Destination for zeros.
diff --git a/doc/sf/expint.qbk b/doc/sf/expint.qbk
index 89554730d..f0abf090e 100644
--- a/doc/sf/expint.qbk
+++ b/doc/sf/expint.qbk
@@ -11,10 +11,10 @@
    namespace boost{ namespace math{
 
    template <class T>
-   ``__sf_result`` expint(unsigned n, T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
 
    }} // namespaces
 
@@ -26,10 +26,10 @@ the return type is `double` if T is an integer type, and T otherwise.
 
 [h4 Description]
 
    template <class T>
-   ``__sf_result`` expint(unsigned n, T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
 
 Returns the [@http://mathworld.wolfram.com/En-Function.html exponential integral En] of z:
 
@@ -100,10 +100,10 @@ is used.
 
    namespace boost{ namespace math{
 
    template <class T>
-   ``__sf_result`` expint(T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
 
    }} // namespaces
 
@@ -115,10 +115,10 @@ the return type is `double` if T is an integer type, and T otherwise.
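A minimal host-side usage sketch for the two `expint` overloads documented above (illustrative only, not part of this patch, and assuming nothing beyond the signatures shown):

    #include <boost/math/special_functions/expint.hpp>
    #include <iostream>

    int main()
    {
        // En(z): exponential integral of order n, here E2(0.5)
        double En = boost::math::expint(2, 0.5);
        // Ei(z): the one-argument exponential integral, here Ei(0.5)
        double Ei = boost::math::expint(0.5);
        std::cout << En << ' ' << Ei << '\n';
    }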
 [h4 Description]
 
    template <class T>
-   ``__sf_result`` expint(T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
 
 Returns the [@http://mathworld.wolfram.com/ExponentialIntegral.html exponential integral] of z:
 
diff --git a/doc/sf/gegenbauer.qbk b/doc/sf/gegenbauer.qbk
index a6afc53d8..69671917c 100644
--- a/doc/sf/gegenbauer.qbk
+++ b/doc/sf/gegenbauer.qbk
@@ -16,13 +16,13 @@
    namespace boost{ namespace math{
 
    template <typename Real>
-   Real gegenbauer(unsigned n, Real lambda, Real x);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x);
 
    template <typename Real>
-   Real gegenbauer_prime(unsigned n, Real lambda, Real x);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x);
 
    template <typename Real>
-   Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
 
    }} // namespaces
 
diff --git a/doc/sf/hankel.qbk b/doc/sf/hankel.qbk
index 4d8a5eda1..05d65201b 100644
--- a/doc/sf/hankel.qbk
+++ b/doc/sf/hankel.qbk
@@ -3,18 +3,36 @@
 
 [h4 Synopsis]
 
+   #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
+
    template <class T1, class T2>
-   std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
 
    template <class T1, class T2>
-   std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #else // When using cuda we use namespace cuda::std:: instead of std::
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #endif
+
 
 [h4 Description]
 
@@ -77,18 +95,35 @@ routines for integer order are used.
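A minimal host-side sketch of the cyl_hankel functions above (illustrative only, not part of this patch; per the synopsis, on a CUDA device the result type is cuda::std::complex rather than std::complex). The components are printed individually, which also works for cuda::std::complex:

    #include <boost/math/special_functions/hankel.hpp>
    #include <complex>
    #include <iostream>

    int main()
    {
        // H1_v(x) = J_v(x) + i Y_v(x) and H2_v(x) = J_v(x) - i Y_v(x)
        std::complex<double> h1 = boost::math::cyl_hankel_1(0.5, 2.0);
        std::complex<double> h2 = boost::math::cyl_hankel_2(0.5, 2.0);
        std::cout << h1.real() << ", " << h1.imag() << '\n'
                  << h2.real() << ", " << h2.imag() << '\n';
    }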
[h4 Synopsis] + #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) + template - std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x); template - std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&); template - std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x); + + template + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&); + #else // When using cuda we use namespace cuda::std:: instead of std:: + + template + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x); + template - std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&); + + template + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x); + template + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&); + + #endif [h4 Description] From 79c8b49894597eed9a6cb8b1607950fdfccd7e7c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 13:12:37 -0400 Subject: [PATCH 19/26] Fix writing cuda::std::complex to stdout --- test/test_cyl_hankel_1_double.cu | 4 ++-- test/test_cyl_hankel_1_float.cu | 4 ++-- test/test_cyl_hankel_1_nvrtc_double.cpp | 4 ++-- test/test_cyl_hankel_1_nvrtc_float.cpp | 4 ++-- test/test_cyl_hankel_2_double.cu | 4 ++-- test/test_cyl_hankel_2_float.cu | 4 ++-- test/test_cyl_hankel_2_nvrtc_double.cpp | 4 ++-- test/test_cyl_hankel_2_nvrtc_float.cpp | 4 ++-- test/test_sph_hankel_1_double.cu | 4 ++-- test/test_sph_hankel_1_float.cu | 4 ++-- test/test_sph_hankel_1_nvrtc_double.cpp | 4 ++-- test/test_sph_hankel_1_nvrtc_float.cpp | 4 ++-- test/test_sph_hankel_2_double.cu | 4 ++-- test/test_sph_hankel_2_float.cu | 4 ++-- test/test_sph_hankel_2_nvrtc_double.cpp | 4 ++-- test/test_sph_hankel_2_nvrtc_float.cpp | 4 ++-- 16 files changed, 32 insertions(+), 32 deletions(-) diff --git a/test/test_cyl_hankel_1_double.cu b/test/test_cyl_hankel_1_double.cu index b718521c6..134946934 100644 --- a/test/test_cyl_hankel_1_double.cu +++ b/test/test_cyl_hankel_1_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_1_float.cu b/test/test_cyl_hankel_1_float.cu index 794d6d763..da78c375c 100644 --- a/test/test_cyl_hankel_1_float.cu +++ b/test/test_cyl_hankel_1_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_1_nvrtc_double.cpp b/test/test_cyl_hankel_1_nvrtc_double.cpp 
index 01f68ecb5..298436d06 100644 --- a/test/test_cyl_hankel_1_nvrtc_double.cpp +++ b/test/test_cyl_hankel_1_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_cyl_hankel_1_nvrtc_float.cpp b/test/test_cyl_hankel_1_nvrtc_float.cpp index 68fb928c2..d505c7bc4 100644 --- a/test/test_cyl_hankel_1_nvrtc_float.cpp +++ b/test/test_cyl_hankel_1_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_cyl_hankel_2_double.cu b/test/test_cyl_hankel_2_double.cu index 8218b5347..55b643173 100644 --- a/test/test_cyl_hankel_2_double.cu +++ b/test/test_cyl_hankel_2_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_2_float.cu b/test/test_cyl_hankel_2_float.cu index f00174b47..5766ebeb4 100644 --- a/test/test_cyl_hankel_2_float.cu +++ b/test/test_cyl_hankel_2_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_2_nvrtc_double.cpp b/test/test_cyl_hankel_2_nvrtc_double.cpp index 188d93bbe..f7589d201 100644 --- a/test/test_cyl_hankel_2_nvrtc_double.cpp +++ b/test/test_cyl_hankel_2_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_cyl_hankel_2_nvrtc_float.cpp b/test/test_cyl_hankel_2_nvrtc_float.cpp index 9349f3c0d..54216d39c 100644 --- a/test/test_cyl_hankel_2_nvrtc_float.cpp +++ b/test/test_cyl_hankel_2_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << 
"\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_1_double.cu b/test/test_sph_hankel_1_double.cu index 3d8473e22..ea9ec2306 100644 --- a/test/test_sph_hankel_1_double.cu +++ b/test/test_sph_hankel_1_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_1_float.cu b/test/test_sph_hankel_1_float.cu index 9cf17ffb2..4b01fe02a 100644 --- a/test/test_sph_hankel_1_float.cu +++ b/test/test_sph_hankel_1_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_1_nvrtc_double.cpp b/test/test_sph_hankel_1_nvrtc_double.cpp index 1e6863c55..3ff1da8b2 100644 --- a/test/test_sph_hankel_1_nvrtc_double.cpp +++ b/test/test_sph_hankel_1_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_1_nvrtc_float.cpp b/test/test_sph_hankel_1_nvrtc_float.cpp index 28346d284..0b0796653 100644 --- a/test/test_sph_hankel_1_nvrtc_float.cpp +++ b/test/test_sph_hankel_1_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_2_double.cu b/test/test_sph_hankel_2_double.cu index 6c7a032f6..6631f73a0 100644 --- a/test/test_sph_hankel_2_double.cu +++ b/test/test_sph_hankel_2_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_2_float.cu b/test/test_sph_hankel_2_float.cu index bcf41e797..1910aef04 100644 --- a/test/test_sph_hankel_2_float.cu +++ 
b/test/test_sph_hankel_2_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_2_nvrtc_double.cpp b/test/test_sph_hankel_2_nvrtc_double.cpp index 17b8c7ff0..fa57fcbb1 100644 --- a/test/test_sph_hankel_2_nvrtc_double.cpp +++ b/test/test_sph_hankel_2_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_2_nvrtc_float.cpp b/test/test_sph_hankel_2_nvrtc_float.cpp index 038e215d9..be6fd0d09 100644 --- a/test/test_sph_hankel_2_nvrtc_float.cpp +++ b/test/test_sph_hankel_2_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) From f10f18a27cca678e74913b1bd4d9c2f8b68171e1 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 13:59:19 -0400 Subject: [PATCH 20/26] Add GPU support to hermite --- include/boost/math/special_functions/hermite.hpp | 14 ++++++++------ include/boost/math/special_functions/math_fwd.hpp | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/boost/math/special_functions/hermite.hpp b/include/boost/math/special_functions/hermite.hpp index 81ccb2ac6..3d77fc03e 100644 --- a/include/boost/math/special_functions/hermite.hpp +++ b/include/boost/math/special_functions/hermite.hpp @@ -1,5 +1,6 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -11,8 +12,9 @@ #pragma once #endif -#include #include +#include +#include #include namespace boost{ @@ -20,7 +22,7 @@ namespace math{ // Recurrence relation for Hermite polynomials: template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1) { using promoted_type = tools::promote_args_t; @@ -31,7 +33,7 @@ namespace detail{ // Implement Hermite polynomials via recurrence: template -T hermite_imp(unsigned n, T x) +BOOST_MATH_GPU_ENABLED T hermite_imp(unsigned n, T x) { T p0 = 1; T p1 = 2 * x; @@ -43,7 +45,7 @@ T hermite_imp(unsigned n, T x) while(c < n) { - std::swap(p0, p1); + BOOST_MATH_GPU_SAFE_SWAP(p0, p1); p1 = static_cast(hermite_next(c, x, p0, p1)); ++c; } @@ -53,7 +55,7 @@ T hermite_imp(unsigned n, T x) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hermite(unsigned n, T x, const Policy&) { typedef typename tools::promote_args::type result_type; @@ -62,7 +64,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hermite(unsigned n, T x) { return boost::math::hermite(n, x, policies::policy<>()); diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index e989f5b18..91dde74cc 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -331,15 +331,15 @@ namespace boost laguerre(unsigned n, T1 m, T2 x); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hermite(unsigned n, T x); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hermite(unsigned n, T x, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1); template @@ -1393,7 +1393,7 @@ namespace boost laguerre(unsigned n, T1 m, T2 x) { return ::boost::math::laguerre(n, m, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ hermite(unsigned n, T x){ return ::boost::math::hermite(n, x, Policy()); }\ \ using boost::math::hermite_next;\ From acb686132023fd3c2639994d4fd23cdffc8b2440 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 13:59:27 -0400 Subject: [PATCH 21/26] Add SYCL testing of hermite --- test/sycl_jamfile | 2 ++ test/test_hermite.cpp | 5 ++++- test/test_hermite.hpp | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 991d34520..582eaea40 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -91,3 +91,5 @@ run test_gamma.cpp ; run test_igamma.cpp ; run test_igamma_inv.cpp ; run test_igamma_inva.cpp ; + +run test_hermite.cpp ; diff --git a/test/test_hermite.cpp b/test/test_hermite.cpp index d1127feec..c7dd8b8c4 100644 --- a/test/test_hermite.cpp +++ b/test/test_hermite.cpp @@ -5,8 +5,11 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include -#include"test_hermite.hpp" +#endif + +#include "test_hermite.hpp" // // DESCRIPTION: diff --git a/test/test_hermite.hpp b/test/test_hermite.hpp index 0b00677ee..8f7c55ff1 100644 --- a/test/test_hermite.hpp +++ b/test/test_hermite.hpp @@ -11,11 +11,17 @@ // Constants are too big for float case, but this doesn't matter for test. #endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include #include #include #include "functor.hpp" From 9f05a5288a5c3991514960847a10f26fdaf2b4f8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 14:11:31 -0400 Subject: [PATCH 22/26] Add CUDA testing of hermite --- test/cuda_jamfile | 3 + test/test_hermite_double.cu | 120 ++++++++++++++++++++++++++++++++++++ test/test_hermite_float.cu | 120 ++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+) create mode 100644 test/test_hermite_double.cu create mode 100644 test/test_hermite_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 32f87f44e..d72501559 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -357,6 +357,9 @@ run test_expm1_float.cu ; run test_gegenbauer_double.cu ; run test_gegenbauer_float.cu ; +run test_hermite_double.cu ; +run test_hermite_float.cu ; + run test_lgamma_double.cu ; run test_lgamma_float.cu ; run test_tgamma_double.cu ; diff --git a/test/test_hermite_double.cu b/test/test_hermite_double.cu new file mode 100644 index 000000000..a53766171 --- /dev/null +++ b/test/test_hermite_double.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hermite.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::hermite(1U, in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::hermite(1U, input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_hermite_float.cu b/test/test_hermite_float.cu
new file mode 100644
index 000000000..c48560bbe
--- /dev/null
+++ b/test/test_hermite_float.cu
@@ -0,0 +1,120 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hermite.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::hermite(1U, in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!"
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::hermite(1U, input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 677e699cdc508a8ede4d197990db8a663fece48c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 14:16:04 -0400 Subject: [PATCH 23/26] Add NVRTC testing of hermite --- test/nvrtc_jamfile | 3 + test/test_hermite_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++++ test/test_hermite_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 test/test_hermite_nvrtc_double.cpp create mode 100644 test/test_hermite_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 734c87ad4..e049a24d2 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -364,6 +364,9 @@ run test_tgamma_ratio_nvrtc_float.cpp ; run test_gegenbauer_nvrtc_double.cpp ; run test_gegenbauer_nvrtc_float.cpp ; +run test_hermite_nvrtc_double.cpp ; +run test_hermite_nvrtc_float.cpp ; + run test_log1p_nvrtc_double.cpp ; run test_log1p_nvrtc_float.cpp ; diff --git a/test/test_hermite_nvrtc_double.cpp b/test/test_hermite_nvrtc_double.cpp new file mode 100644 index 000000000..569d975cb --- /dev/null +++ b/test/test_hermite_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_hermite_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::hermite(1U, h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_hermite_nvrtc_float.cpp b/test/test_hermite_nvrtc_float.cpp new file mode 100644 index 000000000..e2e907c51 --- /dev/null +++ b/test/test_hermite_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_hermite_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::hermite(1U, h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From cecd0e2cd85fcc9444f160783f0a2b321d3894ad Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 14:16:41 -0400
Subject: [PATCH 24/26] Add markers to hermite docs

---
 doc/sf/hermite.qbk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/sf/hermite.qbk b/doc/sf/hermite.qbk
index c88aadc34..965aa8092 100644
--- a/doc/sf/hermite.qbk
+++ b/doc/sf/hermite.qbk
@@ -9,13 +9,13 @@
    namespace boost{ namespace math{
 
    template <class T>
-   ``__sf_result`` hermite(unsigned n, T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
 
    }} // namespaces
 
@@ -26,10 +26,10 @@ note that when there is a single template argument the result is the same type as that argument or `double` if the template argument is an integer type.
    template <class T>
-   ``__sf_result`` hermite(unsigned n, T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
 
 Returns the value of the Hermite Polynomial of order /n/ at point /x/:
 
@@ -43,7 +43,7 @@ Hermite Polynomials:
 
 [graph hermite]
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
 
 Implements the three term recurrence relation for the Hermite
 polynomials, this function can be used to create a sequence of

From 11529ca0e30e6a3955acc7d2deef5f9e9bdbbfa4 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 18 Sep 2024 09:59:26 -0400
Subject: [PATCH 25/26] Remove using statements for complex functions

---
 include/boost/math/tools/complex.hpp | 60 ----------
 1 file changed, 60 deletions(-)

diff --git a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp
index 88db8e8a7..ec5144011 100644
--- a/include/boost/math/tools/complex.hpp
+++ b/include/boost/math/tools/complex.hpp
@@ -24,36 +24,6 @@ namespace math {
 
 template <typename T> using complex = cuda::std::complex<T>;
 
-using cuda::std::real;
-using cuda::std::imag;
-using cuda::std::abs;
-using cuda::std::arg;
-using cuda::std::norm;
-using cuda::std::conj;
-using cuda::std::proj;
-using cuda::std::polar;
-
-using cuda::std::exp;
-using cuda::std::log;
-using cuda::std::log10;
-
-using cuda::std::pow;
-using cuda::std::sqrt;
-
-using cuda::std::sin;
-using cuda::std::cos;
-using cuda::std::tan;
-using cuda::std::asin;
-using cuda::std::acos;
-using cuda::std::atan;
-
-using cuda::std::sinh;
-using cuda::std::cosh;
-using cuda::std::tanh;
-using cuda::std::asinh;
-using cuda::std::acosh;
-using cuda::std::atanh;
-
 } // namespace math
 } // namespace boost
 
@@ -68,36 +38,6 @@ namespace math {
 
 template <typename T> using complex = std::complex<T>;
 
-using std::real;
-using std::imag;
-using std::abs;
-using std::arg;
-using std::norm;
-using std::conj;
-using std::proj;
-using std::polar;
-
-using std::exp;
-using std::log;
-using std::log10;
-
-using std::pow;
-using std::sqrt;
-
-using std::sin;
-using std::cos;
-using std::tan;
-using std::asin;
-using std::acos;
-using std::atan;
-
-using std::sinh;
-using std::cosh;
-using std::tanh;
-using std::asinh;
-using std::acosh;
-using std::atanh;
-
 } // namespace math
 } // namespace boost
 

From c0046200fc92955654608dececc31757a351aabb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 18 Sep 2024 12:51:00 -0400
Subject: [PATCH 26/26] Fix overflow policy definition location on MS platforms

---
 test/test_hermite.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_hermite.cpp b/test/test_hermite.cpp
index c7dd8b8c4..60dafdb8f 100644
--- a/test/test_hermite.cpp
+++ b/test/test_hermite.cpp
@@ -9,6 +9,10 @@
 #include <pch_light.hpp>
 #endif
 
+#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#endif
+
 #include "test_hermite.hpp"
 
 //
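To close, a brief illustration of the three-term recurrence that `hermite_next` implements, mirroring the `hermite_imp` loop added in PATCH 20 (an editorial sketch, not part of the patch series; it assumes only `hermite` and `hermite_next` as declared in this series):

    #include <boost/math/special_functions/hermite.hpp>
    #include <iostream>

    int main()
    {
        double x  = 0.75;
        double p0 = boost::math::hermite(0, x);  // H_0(x) = 1
        double p1 = boost::math::hermite(1, x);  // H_1(x) = 2x
        // hermite_next(n, x, Hn, Hnm1) returns H_{n+1}(x) = 2x*H_n(x) - 2n*H_{n-1}(x)
        for (unsigned n = 1; n < 5; ++n)
        {
            double next = boost::math::hermite_next(n, x, p1, p0);
            p0 = p1;
            p1 = next;
        }
        std::cout << "H_5(0.75) = " << p1 << '\n';  // agrees with boost::math::hermite(5, 0.75)
    }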