From e0db5811100af3f8ecd0c4d6eb077eec49083666 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 29 Jul 2024 16:37:39 -0400
Subject: [PATCH 01/33] Add GPU support to sqrt1pm1

---
 include/boost/math/special_functions/sqrt1pm1.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/boost/math/special_functions/sqrt1pm1.hpp b/include/boost/math/special_functions/sqrt1pm1.hpp
index 041916a53..4d8aeb38c 100644
--- a/include/boost/math/special_functions/sqrt1pm1.hpp
+++ b/include/boost/math/special_functions/sqrt1pm1.hpp
@@ -10,6 +10,7 @@
 #pragma once
 #endif
 
+#include <boost/math/tools/config.hpp>
 #include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/special_functions/log1p.hpp>
 #include <boost/math/special_functions/expm1.hpp>
@@ -21,7 +22,7 @@
 namespace boost{ namespace math{
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type sqrt1pm1(const T& val, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type sqrt1pm1(const T& val, const Policy& pol)
 {
    typedef typename tools::promote_args<T>::type result_type;
    BOOST_MATH_STD_USING
@@ -32,7 +33,7 @@ inline typename tools::promote_args<T>::type sqrt1pm1(const T& val, const Policy
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type sqrt1pm1(const T& val)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type sqrt1pm1(const T& val)
 {
    return sqrt1pm1(val, policies::policy<>());
 }

From 24c0c01a7216b965dcbd8e526efd41fc4bc523a2 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 29 Jul 2024 17:10:15 -0400
Subject: [PATCH 02/33] Add GPU support to continued fractions and remove
 recursion

---
 include/boost/math/tools/fraction.hpp | 125 ++++++++++++++++++++------
 1 file changed, 96 insertions(+), 29 deletions(-)

diff --git a/include/boost/math/tools/fraction.hpp b/include/boost/math/tools/fraction.hpp
index a64c07025..1970e5ca3 100644
--- a/include/boost/math/tools/fraction.hpp
+++ b/include/boost/math/tools/fraction.hpp
@@ -1,4 +1,5 @@
 //  (C) Copyright John Maddock 2005-2006.
+//  (C) Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,6 +11,8 @@
 #pragma once
 #endif
 
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/tools/precision.hpp>
 #include <boost/math/tools/complex.hpp>
 #include <type_traits>
@@ -25,7 +28,7 @@ namespace detail
    struct is_pair : public std::false_type{};
 
    template <typename T, typename U>
-   struct is_pair<std::pair<T,U>> : public std::true_type{};
+   struct is_pair<boost::math::pair<T,U>> : public std::true_type{};
 
    template <typename Gen>
    struct fraction_traits_simple
@@ -33,11 +36,11 @@ namespace detail
       using result_type = typename Gen::result_type;
       using  value_type = typename Gen::result_type;
 
-      static result_type a(const value_type&) BOOST_MATH_NOEXCEPT(value_type)
+      BOOST_MATH_GPU_ENABLED static result_type a(const value_type&) BOOST_MATH_NOEXCEPT(value_type)
       {
          return 1;
       }
-      static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type)
+      BOOST_MATH_GPU_ENABLED static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type)
       {
          return v;
       }
@@ -49,11 +52,11 @@ namespace detail
       using  value_type = typename Gen::result_type;
       using result_type = typename value_type::first_type;
 
-      static result_type a(const value_type& v) BOOST_MATH_NOEXCEPT(value_type)
+      BOOST_MATH_GPU_ENABLED static result_type a(const value_type& v) BOOST_MATH_NOEXCEPT(value_type)
       {
          return v.first;
       }
-      static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type)
+      BOOST_MATH_GPU_ENABLED static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type)
       {
          return v.second;
       }
@@ -74,7 +77,7 @@ namespace detail
       // For float, double, and long double, 1/min_value<T>() is finite.
       // But for mpfr_float and cpp_bin_float, 1/min_value<T>() is inf.
       // Multiply the min by 16 so that the reciprocal doesn't overflow.
-      static T get() {
+      BOOST_MATH_GPU_ENABLED static T get() {
          return 16*tools::min_value<T>();
       }
    };
@@ -82,13 +85,15 @@ namespace detail
    struct tiny_value<T, true>
    {
       using value_type = typename T::value_type;
-      static T get() {
+      BOOST_MATH_GPU_ENABLED static T get() {
          return 16*tools::min_value<value_type>();
       }
    };
 
 } // namespace detail
 
+namespace detail {
+
 //
 // continued_fraction_b
 // Evaluates:
@@ -103,9 +108,15 @@ namespace detail
 //
 // Note that the first a0 returned by generator Gen is discarded.
 //
+
 template <typename Gen, typename U>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, const U& factor, std::uintmax_t& max_terms)
-      noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b_impl(Gen& g, const U& factor, std::uintmax_t& max_terms)
+      noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+      #ifndef BOOST_MATH_ENABLE_SYCL
+      // SYCL can not handle this condition so we only check float on that platform
+      && noexcept(std::declval<Gen>()())
+      #endif
+      )
 {
    BOOST_MATH_STD_USING // ADL of std names
 
@@ -148,17 +159,38 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(G
    return f;
 }
 
+} // namespace detail
+
+template <typename Gen, typename U>
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, const U& factor, std::uintmax_t& max_terms)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+         #ifndef BOOST_MATH_ENABLE_SYCL
+         && noexcept(std::declval<Gen>()())
+         #endif
+         )
+{
+   return detail::continued_fraction_b_impl(g, factor, max_terms);
+}
+
 template <typename Gen, typename U>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, const U& factor)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, const U& factor)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    std::uintmax_t max_terms = (std::numeric_limits<std::uintmax_t>::max)();
-   return continued_fraction_b(g, factor, max_terms);
+   return detail::continued_fraction_b_impl(g, factor, max_terms);
 }
 
 template <typename Gen>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, int bits)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, int bits)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    BOOST_MATH_STD_USING // ADL of std names
 
@@ -167,12 +199,16 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(G
 
    result_type factor = ldexp(1.0f, 1 - bits); // 1 / pow(result_type(2), bits);
    std::uintmax_t max_terms = (std::numeric_limits<std::uintmax_t>::max)();
-   return continued_fraction_b(g, factor, max_terms);
+   return detail::continued_fraction_b_impl(g, factor, max_terms);
 }
 
 template <typename Gen>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, int bits, std::uintmax_t& max_terms)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(Gen& g, int bits, std::uintmax_t& max_terms)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    BOOST_MATH_STD_USING // ADL of std names
 
@@ -180,9 +216,11 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(G
    using result_type = typename traits::result_type;
 
    result_type factor = ldexp(1.0f, 1 - bits); // 1 / pow(result_type(2), bits);
-   return continued_fraction_b(g, factor, max_terms);
+   return detail::continued_fraction_b_impl(g, factor, max_terms);
 }
 
+namespace detail {
+
 //
 // continued_fraction_a
 // Evaluates:
@@ -198,8 +236,12 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_b(G
 // Note that the first a1 and b1 returned by generator Gen are both used.
 //
 template <typename Gen, typename U>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, const U& factor, std::uintmax_t& max_terms)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a_impl(Gen& g, const U& factor, std::uintmax_t& max_terms)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    BOOST_MATH_STD_USING // ADL of std names
 
@@ -244,17 +286,38 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(G
    return a0/f;
 }
 
+} // namespace detail
+
+template <typename Gen, typename U>
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, const U& factor, std::uintmax_t& max_terms)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
+{
+   return detail::continued_fraction_a_impl(g, factor, max_terms);
+}
+
 template <typename Gen, typename U>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, const U& factor)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, const U& factor)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type)
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    std::uintmax_t max_iter = (std::numeric_limits<std::uintmax_t>::max)();
-   return continued_fraction_a(g, factor, max_iter);
+   return detail::continued_fraction_a_impl(g, factor, max_iter);
 }
 
 template <typename Gen>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, int bits)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, int bits)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    BOOST_MATH_STD_USING // ADL of std names
 
@@ -264,12 +327,16 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(G
    result_type factor = ldexp(1.0f, 1-bits); // 1 / pow(result_type(2), bits);
    std::uintmax_t max_iter = (std::numeric_limits<std::uintmax_t>::max)();
 
-   return continued_fraction_a(g, factor, max_iter);
+   return detail::continued_fraction_a_impl(g, factor, max_iter);
 }
 
 template <typename Gen>
-inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, int bits, std::uintmax_t& max_terms)
-   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) && noexcept(std::declval<Gen>()()))
+BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(Gen& g, int bits, std::uintmax_t& max_terms)
+   noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits<Gen>::result_type) 
+   #ifndef BOOST_MATH_ENABLE_SYCL
+   && noexcept(std::declval<Gen>()())
+   #endif
+   )
 {
    BOOST_MATH_STD_USING // ADL of std names
 
@@ -277,7 +344,7 @@ inline typename detail::fraction_traits<Gen>::result_type continued_fraction_a(G
    using result_type = typename traits::result_type;
 
    result_type factor = ldexp(1.0f, 1-bits); // 1 / pow(result_type(2), bits);
-   return continued_fraction_a(g, factor, max_terms);
+   return detail::continued_fraction_a_impl(g, factor, max_terms);
 }
 
 } // namespace tools

From 96953ea52186d70905efc64294b0e0cb3d98f30f Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 29 Jul 2024 17:10:38 -0400
Subject: [PATCH 03/33] Add GPU support to erf and erfc

---
 include/boost/math/special_functions/erf.hpp  | 90 +++++++++++--------
 .../boost/math/special_functions/math_fwd.hpp | 12 +--
 2 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/include/boost/math/special_functions/erf.hpp b/include/boost/math/special_functions/erf.hpp
index 57ff60529..323a91937 100644
--- a/include/boost/math/special_functions/erf.hpp
+++ b/include/boost/math/special_functions/erf.hpp
@@ -1,4 +1,5 @@
 //  (C) Copyright John Maddock 2006.
+//  (C) Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -39,7 +40,7 @@ template <class T>
 struct erf_asympt_series_t
 {
    // LCOV_EXCL_START multiprecision case only, excluded from coverage analysis
-   erf_asympt_series_t(T z) : xx(2 * -z * z), tk(1)
+   BOOST_MATH_GPU_ENABLED erf_asympt_series_t(T z) : xx(2 * -z * z), tk(1)
    {
       BOOST_MATH_STD_USING
       result = -exp(-z * z) / sqrt(boost::math::constants::pi<T>());
@@ -48,7 +49,7 @@ struct erf_asympt_series_t
 
    typedef T result_type;
 
-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       BOOST_MATH_STD_USING
       T r = result;
@@ -68,33 +69,33 @@ struct erf_asympt_series_t
 // How large z has to be in order to ensure that the series converges:
 //
 template <class T>
-inline float erf_asymptotic_limit_N(const T&)
+BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const T&)
 {
    return (std::numeric_limits<float>::max)();
 }
-inline float erf_asymptotic_limit_N(const std::integral_constant<int, 24>&)
+BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant<int, 24>&)
 {
    return 2.8F;
 }
-inline float erf_asymptotic_limit_N(const std::integral_constant<int, 53>&)
+BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant<int, 53>&)
 {
    return 4.3F;
 }
-inline float erf_asymptotic_limit_N(const std::integral_constant<int, 64>&)
+BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant<int, 64>&)
 {
    return 4.8F;
 }
-inline float erf_asymptotic_limit_N(const std::integral_constant<int, 106>&)
+BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant<int, 106>&)
 {
    return 6.5F;
 }
-inline float erf_asymptotic_limit_N(const std::integral_constant<int, 113>&)
+BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant<int, 113>&)
 {
    return 6.8F;
 }
 
 template <class T, class Policy>
-inline T erf_asymptotic_limit()
+BOOST_MATH_GPU_ENABLED inline T erf_asymptotic_limit()
 {
    typedef typename policies::precision<T, Policy>::type precision_type;
    typedef std::integral_constant<int,
@@ -198,7 +199,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const Tag& t)
 }
 
 template <class T, class Policy>
-T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int, 53>& t)
+BOOST_MATH_GPU_ENABLED T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int, 53>& t)
 {
    BOOST_MATH_STD_USING
 
@@ -207,14 +208,30 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
    if ((boost::math::isnan)(z))
       return policies::raise_domain_error("boost::math::erf<%1%>(%1%)", "Expected a finite argument but got %1%", z, pol);
 
+   int prefix_multiplier = 1;
+   int prefix_adder = 0;
+
    if(z < 0)
    {
+      // Recursion is logically simpler here, but confuses static analyzers that need to be
+      // able to calculate the maximimum program stack size at compile time (ie CUDA).
+      z = -z;
       if(!invert)
-         return -erf_imp(T(-z), invert, pol, t);
-      else if(z < T(-0.5))
-         return 2 - erf_imp(T(-z), invert, pol, t);
+      {
+         prefix_multiplier = -1;
+         // return -erf_imp(T(-z), invert, pol, t);
+      }
+      else if(z < -0.5)
+      {
+         prefix_adder = 2;
+         // return 2 - erf_imp(T(-z), invert, pol, t);
+      }
       else
-         return 1 + erf_imp(T(-z), false, pol, t);
+      {
+         invert = false;
+         prefix_adder = 1;
+         // return 1 + erf_imp(T(-z), false, pol, t);
+      }
    }
 
    T result;
@@ -237,7 +254,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
          }
          else
          {
-            static const T c = BOOST_MATH_BIG_CONSTANT(T, 53, 0.003379167095512573896158903121545171688);  // LCOV_EXCL_LINE
+            BOOST_MATH_STATIC_LOCAL_VARIABLE const T c = BOOST_MATH_BIG_CONSTANT(T, 53, 0.003379167095512573896158903121545171688);  // LCOV_EXCL_LINE
             result = static_cast<T>(z * 1.125f + z * c);
          }
       }
@@ -248,15 +265,15 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
          // Maximum Relative Change in Control Points:   1.155e-04
          // Max Error found at double precision =        2.961182e-17
          // LCOV_EXCL_START
-         static const T Y = 1.044948577880859375f;
-         static const T P[] = {    
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.044948577880859375f;
+         BOOST_MATH_STATIC const T P[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.0834305892146531832907),
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.338165134459360935041),
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.0509990735146777432841),
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.00772758345802133288487),
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.000322780120964605683831),
          };
-         static const T Q[] = {    
+         BOOST_MATH_STATIC const T Q[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.455004033050794024546),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.0875222600142252549554),
@@ -281,8 +298,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
          // Maximum Relative Change in Control Points:   2.845e-04
          // Max Error found at double precision =        4.841816e-17
          // LCOV_EXCL_START
-         static const T Y = 0.405935764312744140625f;
-         static const T P[] = {    
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 0.405935764312744140625f;
+         BOOST_MATH_STATIC const T P[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.098090592216281240205),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.178114665841120341155),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.191003695796775433986),
@@ -290,7 +307,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.0195049001251218801359),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.00180424538297014223957),
          };
-         static const T Q[] = {    
+         BOOST_MATH_STATIC const T Q[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.84759070983002217845),
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.42628004845511324508),
@@ -316,8 +333,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
          // Expected Error Term:                         3.909e-18
          // Maximum Relative Change in Control Points:   9.886e-05
          // LCOV_EXCL_START
-         static const T Y = 0.50672817230224609375f;
-         static const T P[] = {    
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 0.50672817230224609375f;
+         BOOST_MATH_STATIC const T P[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.0243500476207698441272),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.0386540375035707201728),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.04394818964209516296),
@@ -325,7 +342,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.00323962406290842133584),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.000235839115596880717416),
          };
-         static const T Q[] = {    
+         BOOST_MATH_STATIC const T Q[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.53991494948552447182),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.982403709157920235114),
@@ -351,8 +368,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
          // Maximum Relative Change in Control Points:   2.222e-04
          // Max Error found at double precision =        2.062515e-17
          // LCOV_EXCL_START
-         static const T Y = 0.5405750274658203125f;
-         static const T P[] = {    
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 0.5405750274658203125f;
+         BOOST_MATH_STATIC const T P[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.00295276716530971662634),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.0137384425896355332126),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.00840807615555585383007),
@@ -360,7 +377,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.000250269961544794627958),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.113212406648847561139e-4),
          };
-         static const T Q[] = {    
+         BOOST_MATH_STATIC const T Q[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.04217814166938418171),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.442597659481563127003),
@@ -386,8 +403,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
          // Expected Error Term:                         2.859e-17
          // Maximum Relative Change in Control Points:   1.357e-05
          // LCOV_EXCL_START
-         static const T Y = 0.5579090118408203125f;
-         static const T P[] = {    
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 0.5579090118408203125f;
+         BOOST_MATH_STATIC const T P[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.00628057170626964891937),
             BOOST_MATH_BIG_CONSTANT(T, 53, 0.0175389834052493308818),
             BOOST_MATH_BIG_CONSTANT(T, 53, -0.212652252872804219852),
@@ -396,7 +413,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
             BOOST_MATH_BIG_CONSTANT(T, 53, -3.22729451764143718517),
             BOOST_MATH_BIG_CONSTANT(T, 53, -2.8175401114513378771),
          };
-         static const T Q[] = {    
+         BOOST_MATH_STATIC const T Q[] = {    
             BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 53, 2.79257750980575282228),
             BOOST_MATH_BIG_CONSTANT(T, 53, 11.0567237927800161565),
@@ -428,10 +445,11 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
 
    if(invert)
    {
-      result = 1 - result;
+      prefix_adder += prefix_multiplier * 1;
+      prefix_multiplier = -prefix_multiplier;
    }
 
-   return result;
+   return prefix_adder + prefix_multiplier * result;
 } // template <class T, class Lanczos>T erf_imp(T z, bool invert, const Lanczos& l, const std::integral_constant<int, 53>& t)
 
 
@@ -1175,7 +1193,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant<int,
 } // namespace detail
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type erf(T z, const Policy& /* pol */)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type erf(T z, const Policy& /* pol */)
 {
    typedef typename tools::promote_args<T>::type result_type;
    typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -1208,7 +1226,7 @@ inline typename tools::promote_args<T>::type erf(T z, const Policy& /* pol */)
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type erfc(T z, const Policy& /* pol */)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type erfc(T z, const Policy& /* pol */)
 {
    typedef typename tools::promote_args<T>::type result_type;
    typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -1241,13 +1259,13 @@ inline typename tools::promote_args<T>::type erfc(T z, const Policy& /* pol */)
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type erf(T z)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type erf(T z)
 {
    return boost::math::erf(z, policies::policy<>());
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type erfc(T z)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type erfc(T z)
 {
    return boost::math::erfc(z, policies::policy<>());
 }
diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index 5e8153734..cbfe27e81 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -156,14 +156,14 @@ namespace boost
 
    // erf & erfc error functions.
    template <class RT> // Error function.
-   tools::promote_args_t<RT> erf(RT z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erf(RT z);
    template <class RT, class Policy> // Error function.
-   tools::promote_args_t<RT> erf(RT z, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erf(RT z, const Policy&);
 
    template <class RT>// Error function complement.
-   tools::promote_args_t<RT> erfc(RT z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erfc(RT z);
    template <class RT, class Policy>// Error function complement.
-   tools::promote_args_t<RT> erfc(RT z, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erfc(RT z, const Policy&);
 
    template <class RT>// Error function inverse.
    tools::promote_args_t<RT> erf_inv(RT z);
@@ -1272,10 +1272,10 @@ namespace boost
    template <class T> T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient<T, Policy>(n, k, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> erf(RT z) { return ::boost::math::erf(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erf(RT z) { return ::boost::math::erf(z, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> erfc(RT z){ return ::boost::math::erfc(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erfc(RT z){ return ::boost::math::erfc(z, Policy()); }\
 \
    template <class RT>\
    inline boost::math::tools::promote_args_t<RT> erf_inv(RT z) { return ::boost::math::erf_inv(z, Policy()); }\

From a8af41e17e845f5fc50bafb0486951604530727a Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 29 Jul 2024 17:10:47 -0400
Subject: [PATCH 04/33] Add SYCL erf testing

---
 test/sycl_jamfile |  1 +
 test/test_erf.cpp |  4 ++++
 test/test_erf.hpp | 11 +++++++++--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 642329806..e336523e6 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -32,3 +32,4 @@ run test_expm1_simple.cpp ;
 run test_log1p_simple.cpp ;
 run test_digamma_simple.cpp ;
 run test_trigamma.cpp ;
+run test_erf.cpp ;
diff --git a/test/test_erf.cpp b/test/test_erf.cpp
index 535903983..beb521ac4 100644
--- a/test/test_erf.cpp
+++ b/test/test_erf.cpp
@@ -4,7 +4,11 @@
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
+#include <boost/math/special_functions/erf.hpp>
 #include "test_erf.hpp"
 
 //
diff --git a/test/test_erf.hpp b/test/test_erf.hpp
index dc42c8124..b70c73953 100644
--- a/test/test_erf.hpp
+++ b/test/test_erf.hpp
@@ -1,9 +1,11 @@
-// Copyright John Maddock 2006.
-// Copyright Paul A. Bristow 2007, 2009
+//  Copyright John Maddock 2006.
+//  Copyright Paul A. Bristow 2007, 2009
+//  Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#include <boost/math/tools/config.hpp>
 #include <boost/math/concepts/real_concept.hpp>
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
@@ -21,6 +23,11 @@
 #define SC_(x) static_cast<typename table_type<T>::type>(BOOST_JOIN(x, L))
 #endif
 
+#ifdef BOOST_MATH_NO_EXCEPTIONS
+#  undef BOOST_CHECK_THROW
+#  define BOOST_CHECK_THROW(x, y)
+#endif
+
 template <class Real, class T>
 void do_test_erf(const T& data, const char* type_name, const char* test_name)
 {

From c91ac7ccb147855ca16739ec24b76f5b7e6e8590 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 29 Jul 2024 17:18:00 -0400
Subject: [PATCH 05/33] Add sqrt1pm1 to fwd

---
 include/boost/math/special_functions/math_fwd.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index cbfe27e81..c789d82df 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -600,10 +600,10 @@ namespace boost
 
    // sqrt(1+x) - 1
    template <class T>
-   tools::promote_args_t<T> sqrt1pm1(const T& val);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> sqrt1pm1(const T& val);
 
    template <class T, class Policy>
-   tools::promote_args_t<T> sqrt1pm1(const T& val, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> sqrt1pm1(const T& val, const Policy&);
 
    // sinus cardinals:
    template <class T>
@@ -1484,7 +1484,7 @@ namespace boost
    powm1(const T1 a, const T2 z){ return boost::math::powm1(a, z, Policy()); }\
 \
    template <class T>\
-   inline boost::math::tools::promote_args_t<T> sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\
 \
    template <class T>\
    inline boost::math::tools::promote_args_t<T> sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\

From 325720d9035546f865603a719271f94a7a6a467e Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 29 Jul 2024 17:18:42 -0400
Subject: [PATCH 06/33] Add CUDA erf and erfc testing

---
 test/cuda_jamfile        |   4 ++
 test/test_erf_double.cu  | 100 +++++++++++++++++++++++++++++++++++++++
 test/test_erf_float.cu   | 100 +++++++++++++++++++++++++++++++++++++++
 test/test_erfc_double.cu | 100 +++++++++++++++++++++++++++++++++++++++
 test/test_erfc_float.cu  | 100 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 404 insertions(+)
 create mode 100644 test/test_erf_double.cu
 create mode 100644 test/test_erf_float.cu
 create mode 100644 test/test_erfc_double.cu
 create mode 100644 test/test_erfc_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index cb067d564..fe67d9f4e 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -100,6 +100,10 @@ run test_cos_pi_double.cu ;
 run test_cos_pi_float.cu ;
 run test_digamma_double.cu ;
 run test_digamma_float.cu ;
+run test_erf_double.cu ;
+run test_erf_float.cu ;
+run test_erfc_double.cu ;
+run test_erfc_float.cu ;
 run test_expm1_double.cu ;
 run test_expm1_float.cu ;
 run test_log1p_double.cu ;
diff --git a/test/test_erf_double.cu b/test/test_erf_double.cu
new file mode 100644
index 000000000..3e8398262
--- /dev/null
+++ b/test/test_erf_double.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erf(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erf(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_erf_float.cu b/test/test_erf_float.cu
new file mode 100644
index 000000000..6cbd07e6a
--- /dev/null
+++ b/test/test_erf_float.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erf(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erf(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_erfc_double.cu b/test/test_erfc_double.cu
new file mode 100644
index 000000000..86d3c6e5b
--- /dev/null
+++ b/test/test_erfc_double.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erfc(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erfc(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_erfc_float.cu b/test/test_erfc_float.cu
new file mode 100644
index 000000000..7970063a4
--- /dev/null
+++ b/test/test_erfc_float.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erfc(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erfc(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}

From d1b582332b23e6695dce58fa5b2cfb7fd6277df2 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 08:22:19 -0400
Subject: [PATCH 07/33] Replace integral constant pointers with references

---
 .../boost/math/special_functions/detail/erf_inv.hpp   | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp
index 0054a7426..690ea4a44 100644
--- a/include/boost/math/special_functions/detail/erf_inv.hpp
+++ b/include/boost/math/special_functions/detail/erf_inv.hpp
@@ -1,4 +1,5 @@
 //  (C) Copyright John Maddock 2006.
+//  (C) Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -23,7 +24,7 @@ namespace detail{
 // this version is for 80-bit long double's and smaller:
 //
 template <class T, class Policy>
-T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant<int, 64>*)
+T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant<int, 64>&)
 {
    BOOST_MATH_STD_USING // for ADL of std names.
 
@@ -310,12 +311,12 @@ struct erf_roots
 };
 
 template <class T, class Policy>
-T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_constant<int, 0>*)
+T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_constant<int, 0>&)
 {
    //
    // Generic version, get a guess that's accurate to 64-bits (10^-19)
    //
-   T guess = erf_inv_imp(p, q, pol, static_cast<std::integral_constant<int, 64> const*>(nullptr));
+   T guess = erf_inv_imp(p, q, pol, std::integral_constant<int, 64>());
    T result;
    //
    // If T has more bit's than 64 in it's mantissa then we need to iterate,
@@ -401,7 +402,7 @@ typename tools::promote_args<T>::type erfc_inv(T z, const Policy& pol)
    // And get the result, negating where required:
    //
    return s * policies::checked_narrowing_cast<result_type, forwarding_policy>(
-      detail::erf_inv_imp(static_cast<eval_type>(p), static_cast<eval_type>(q), forwarding_policy(), static_cast<tag_type const*>(nullptr)), function);
+      detail::erf_inv_imp(static_cast<eval_type>(p), static_cast<eval_type>(q), forwarding_policy(), tag_type()), function);
 }
 
 template <class T, class Policy>
@@ -469,7 +470,7 @@ typename tools::promote_args<T>::type erf_inv(T z, const Policy& pol)
    // And get the result, negating where required:
    //
    return s * policies::checked_narrowing_cast<result_type, forwarding_policy>(
-      detail::erf_inv_imp(static_cast<eval_type>(p), static_cast<eval_type>(q), forwarding_policy(), static_cast<tag_type const*>(nullptr)), function);
+      detail::erf_inv_imp(static_cast<eval_type>(p), static_cast<eval_type>(q), forwarding_policy(), tag_type()), function);
 }
 
 template <class T>

From 6a12bf06a745ccb7a031dd6d3fd5c3a9223a5808 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 08:31:37 -0400
Subject: [PATCH 08/33] Make erf_inv and erfc_inv GPU enabled

---
 .../math/special_functions/detail/erf_inv.hpp | 57 ++++++++++---------
 .../boost/math/special_functions/math_fwd.hpp | 12 ++--
 2 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp
index 690ea4a44..09772632d 100644
--- a/include/boost/math/special_functions/detail/erf_inv.hpp
+++ b/include/boost/math/special_functions/detail/erf_inv.hpp
@@ -14,6 +14,7 @@
 #pragma warning(disable:4702) // Unreachable code: optimization warning
 #endif
 
+#include <boost/math/tools/config.hpp>
 #include <type_traits>
 
 namespace boost{ namespace math{
@@ -24,7 +25,7 @@ namespace detail{
 // this version is for 80-bit long double's and smaller:
 //
 template <class T, class Policy>
-T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant<int, 64>&)
+BOOST_MATH_GPU_ENABLED T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant<int, 64>&)
 {
    BOOST_MATH_STD_USING // for ADL of std names.
 
@@ -45,8 +46,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       // Maximum Deviation Found (actual error term at infinite precision) 8.030e-21
       //
       // LCOV_EXCL_START
-      static const float Y = 0.0891314744949340820313f;
-      static const T P[] = {
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.0891314744949340820313f;
+      BOOST_MATH_STATIC const T P[] = {
          BOOST_MATH_BIG_CONSTANT(T, 64, -0.000508781949658280665617),
          BOOST_MATH_BIG_CONSTANT(T, 64, -0.00836874819741736770379),
          BOOST_MATH_BIG_CONSTANT(T, 64, 0.0334806625409744615033),
@@ -56,7 +57,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
          BOOST_MATH_BIG_CONSTANT(T, 64, 0.00822687874676915743155),
          BOOST_MATH_BIG_CONSTANT(T, 64, -0.00538772965071242932965)
       };
-      static const T Q[] = {
+      BOOST_MATH_STATIC const T Q[] = {
          BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
          BOOST_MATH_BIG_CONSTANT(T, 64, -0.970005043303290640362),
          BOOST_MATH_BIG_CONSTANT(T, 64, -1.56574558234175846809),
@@ -88,8 +89,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       // Maximum Deviation Found (error term) 4.811e-20
       //
       // LCOV_EXCL_START
-      static const float Y = 2.249481201171875f;
-      static const T P[] = {
+      BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 2.249481201171875f;
+      BOOST_MATH_STATIC const T P[] = {
          BOOST_MATH_BIG_CONSTANT(T, 64, -0.202433508355938759655),
          BOOST_MATH_BIG_CONSTANT(T, 64, 0.105264680699391713268),
          BOOST_MATH_BIG_CONSTANT(T, 64, 8.37050328343119927838),
@@ -100,7 +101,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
          BOOST_MATH_BIG_CONSTANT(T, 64, 21.1294655448340526258),
          BOOST_MATH_BIG_CONSTANT(T, 64, -3.67192254707729348546)
       };
-      static const T Q[] = {
+      BOOST_MATH_STATIC const T Q[] = {
          BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
          BOOST_MATH_BIG_CONSTANT(T, 64, 6.24264124854247537712),
          BOOST_MATH_BIG_CONSTANT(T, 64, 3.9713437953343869095),
@@ -143,8 +144,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       {
          // LCOV_EXCL_START
          // Max error found: 1.089051e-20
-         static const float Y = 0.807220458984375f;
-         static const T P[] = {
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.807220458984375f;
+         BOOST_MATH_STATIC const T P[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.131102781679951906451),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.163794047193317060787),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.117030156341995252019),
@@ -157,7 +158,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.285225331782217055858e-7),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.681149956853776992068e-9)
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 64, 3.46625407242567245975),
             BOOST_MATH_BIG_CONSTANT(T, 64, 5.38168345707006855425),
@@ -176,8 +177,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       {
          // LCOV_EXCL_START
          // Max error found: 8.389174e-21
-         static const float Y = 0.93995571136474609375f;
-         static const T P[] = {
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.93995571136474609375f;
+         BOOST_MATH_STATIC const T P[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.0350353787183177984712),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.00222426529213447927281),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.0185573306514231072324),
@@ -188,7 +189,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.230404776911882601748e-9),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.266339227425782031962e-11)
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 64, 1.3653349817554063097),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.762059164553623404043),
@@ -206,8 +207,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       {
          // LCOV_EXCL_START
          // Max error found: 1.481312e-19
-         static const float Y = 0.98362827301025390625f;
-         static const T P[] = {
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.98362827301025390625f;
+         BOOST_MATH_STATIC const T P[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.0167431005076633737133),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.00112951438745580278863),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.00105628862152492910091),
@@ -218,7 +219,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.281128735628831791805e-13),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.99055709973310326855e-16)
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.591429344886417493481),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.138151865749083321638),
@@ -236,8 +237,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       {
          // LCOV_EXCL_START
          // Max error found: 5.697761e-20
-         static const float Y = 0.99714565277099609375f;
-         static const T P[] = {
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.99714565277099609375f;
+         BOOST_MATH_STATIC const T P[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.0024978212791898131227),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.779190719229053954292e-5),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.254723037413027451751e-4),
@@ -247,7 +248,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.145596286718675035587e-11),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.116765012397184275695e-17)
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.207123112214422517181),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.0169410838120975906478),
@@ -265,8 +266,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
       {
          // LCOV_EXCL_START
          // Max error found: 1.279746e-20
-         static const float Y = 0.99941349029541015625f;
-         static const T P[] = {
+         BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.99941349029541015625f;
+         BOOST_MATH_STATIC const T P[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.000539042911019078575891),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.28398759004727721098e-6),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.899465114892291446442e-6),
@@ -276,7 +277,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.135880130108924861008e-14),
             BOOST_MATH_BIG_CONSTANT(T, 64, -0.348890393399948882918e-21)
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.0845746234001899436914),
             BOOST_MATH_BIG_CONSTANT(T, 64, 0.00282092984726264681981),
@@ -345,14 +346,14 @@ T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_con
 } // namespace detail
 
 template <class T, class Policy>
-typename tools::promote_args<T>::type erfc_inv(T z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED typename tools::promote_args<T>::type erfc_inv(T z, const Policy& pol)
 {
    typedef typename tools::promote_args<T>::type result_type;
 
    //
    // Begin by testing for domain errors, and other special cases:
    //
-   static const char* function = "boost::math::erfc_inv<%1%>(%1%, %1%)";
+   constexpr auto function = "boost::math::erfc_inv<%1%>(%1%, %1%)";
    if((z < 0) || (z > 2))
       return policies::raise_domain_error<result_type>(function, "Argument outside range [0,2] in inverse erfc function (got p=%1%).", z, pol);
    if(z == 0)
@@ -406,14 +407,14 @@ typename tools::promote_args<T>::type erfc_inv(T z, const Policy& pol)
 }
 
 template <class T, class Policy>
-typename tools::promote_args<T>::type erf_inv(T z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED typename tools::promote_args<T>::type erf_inv(T z, const Policy& pol)
 {
    typedef typename tools::promote_args<T>::type result_type;
 
    //
    // Begin by testing for domain errors, and other special cases:
    //
-   static const char* function = "boost::math::erf_inv<%1%>(%1%, %1%)";
+   constexpr auto function = "boost::math::erf_inv<%1%>(%1%, %1%)";
    if((z < -1) || (z > 1))
       return policies::raise_domain_error<result_type>(function, "Argument outside range [-1, 1] in inverse erf function (got p=%1%).", z, pol);
    if(z == 1)
@@ -474,13 +475,13 @@ typename tools::promote_args<T>::type erf_inv(T z, const Policy& pol)
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type erfc_inv(T z)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type erfc_inv(T z)
 {
    return erfc_inv(z, policies::policy<>());
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type erf_inv(T z)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type erf_inv(T z)
 {
    return erf_inv(z, policies::policy<>());
 }
diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index c789d82df..1d22f61b1 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -166,14 +166,14 @@ namespace boost
    BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erfc(RT z, const Policy&);
 
    template <class RT>// Error function inverse.
-   tools::promote_args_t<RT> erf_inv(RT z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erf_inv(RT z);
    template <class RT, class Policy>// Error function inverse.
-   tools::promote_args_t<RT> erf_inv(RT z, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erf_inv(RT z, const Policy& pol);
 
    template <class RT>// Error function complement inverse.
-   tools::promote_args_t<RT> erfc_inv(RT z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erfc_inv(RT z);
    template <class RT, class Policy>// Error function complement inverse.
-   tools::promote_args_t<RT> erfc_inv(RT z, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> erfc_inv(RT z, const Policy& pol);
 
    // Polynomials:
    template <class T1, class T2, class T3>
@@ -1278,10 +1278,10 @@ namespace boost
    BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erfc(RT z){ return ::boost::math::erfc(z, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> erf_inv(RT z) { return ::boost::math::erf_inv(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erf_inv(RT z) { return ::boost::math::erf_inv(z, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> erfc_inv(RT z){ return ::boost::math::erfc_inv(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erfc_inv(RT z){ return ::boost::math::erfc_inv(z, Policy()); }\
 \
    using boost::math::legendre_next;\
 \

From 0e1be528125151c78b9cbb2b78fb7fddda0c70e4 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 08:31:49 -0400
Subject: [PATCH 09/33] Add erf_inv and erfc_inv CUDA testing

---
 test/cuda_jamfile            |   4 ++
 test/test_erf_inv_double.cu  | 100 +++++++++++++++++++++++++++++++++++
 test/test_erf_inv_float.cu   | 100 +++++++++++++++++++++++++++++++++++
 test/test_erfc_inv_double.cu | 100 +++++++++++++++++++++++++++++++++++
 test/test_erfc_inv_float.cu  | 100 +++++++++++++++++++++++++++++++++++
 5 files changed, 404 insertions(+)
 create mode 100644 test/test_erf_inv_double.cu
 create mode 100644 test/test_erf_inv_float.cu
 create mode 100644 test/test_erfc_inv_double.cu
 create mode 100644 test/test_erfc_inv_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index fe67d9f4e..7af3b9e48 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -102,8 +102,12 @@ run test_digamma_double.cu ;
 run test_digamma_float.cu ;
 run test_erf_double.cu ;
 run test_erf_float.cu ;
+run test_erf_inv_double.cu ;
+run test_erf_inv_float.cu ;
 run test_erfc_double.cu ;
 run test_erfc_float.cu ;
+run test_erfc_inv_double.cu ;
+run test_erfc_inv_float.cu ;
 run test_expm1_double.cu ;
 run test_expm1_float.cu ;
 run test_log1p_double.cu ;
diff --git a/test/test_erf_inv_double.cu b/test/test_erf_inv_double.cu
new file mode 100644
index 000000000..f540babbb
--- /dev/null
+++ b/test/test_erf_inv_double.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erf_inv(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erf_inv(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_erf_inv_float.cu b/test/test_erf_inv_float.cu
new file mode 100644
index 000000000..d9f37687f
--- /dev/null
+++ b/test/test_erf_inv_float.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erf_inv(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erf_inv(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_erfc_inv_double.cu b/test/test_erfc_inv_double.cu
new file mode 100644
index 000000000..68642cd10
--- /dev/null
+++ b/test/test_erfc_inv_double.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erfc_inv(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erfc_inv(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_erfc_inv_float.cu b/test/test_erfc_inv_float.cu
new file mode 100644
index 000000000..b5b72cd05
--- /dev/null
+++ b/test/test_erfc_inv_float.cu
@@ -0,0 +1,100 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::erfc_inv(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::erfc_inv(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}

From 3d0bd18f11529f1669e2751104244d216d9b95f2 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 08:39:00 -0400
Subject: [PATCH 10/33] Replace igamma_temme_large integral constant pointer
 with reference

---
 .../math/special_functions/detail/igamma_large.hpp  | 13 ++++++++-----
 include/boost/math/special_functions/gamma.hpp      |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp
index 5483b53fb..e8162095c 100644
--- a/include/boost/math/special_functions/detail/igamma_large.hpp
+++ b/include/boost/math/special_functions/detail/igamma_large.hpp
@@ -1,4 +1,5 @@
 //  Copyright John Maddock 2006.
+//  Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -59,13 +60,15 @@
 #pragma GCC system_header
 #endif
 
+#include <boost/math/tools/config.hpp>
+
 namespace boost{ namespace math{ namespace detail{
 
 // This version will never be called (at runtime), it's a stub used
 // when T is unsuitable to be passed to these routines:
 //
 template <class T, class Policy>
-inline T igamma_temme_large(T, T, const Policy& /* pol */, std::integral_constant<int, 0> const *)
+inline T igamma_temme_large(T, T, const Policy& /* pol */, const std::integral_constant<int, 0>&)
 {
    // stub function, should never actually be called
    BOOST_MATH_ASSERT(0);
@@ -76,7 +79,7 @@ inline T igamma_temme_large(T, T, const Policy& /* pol */, std::integral_constan
 // (80-bit long double, or 10^-20).
 //
 template <class T, class Policy>
-T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 64> const *)
+T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 64>&)
 {
    BOOST_MATH_STD_USING // ADL of std functions
    T sigma = (x - a) / a;
@@ -281,7 +284,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 64
 // (IEEE double precision or 10^-17).
 //
 template <class T, class Policy>
-T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 53> const *)
+T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 53>&)
 {
    BOOST_MATH_STD_USING // ADL of std functions
    T sigma = (x - a) / a;
@@ -423,7 +426,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 53
 // (IEEE float precision, or 10^-8)
 //
 template <class T, class Policy>
-T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 24> const *)
+T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 24>&)
 {
    BOOST_MATH_STD_USING // ADL of std functions
    T sigma = (x - a) / a;
@@ -479,7 +482,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 24
 // require many more terms in the polynomials.
 //
 template <class T, class Policy>
-T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant<int, 113> const *)
+T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 113>&)
 {
    BOOST_MATH_STD_USING // ADL of std functions
    T sigma = (x - a) / a;
diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index a58ea3e69..0c3b32b46 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -1454,7 +1454,7 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert,
             precision_type::value <= 113 ? 113 : 0
          > tag_type;
 
-         result = igamma_temme_large(a, x, pol, static_cast<tag_type const*>(nullptr));
+         result = igamma_temme_large(a, x, pol, tag_type());
          if(x >= a)
             invert = !invert;
          if(p_derivative)

From 8862896ff8dffe8abde3e277e8729612f322dd6d Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 08:40:50 -0400
Subject: [PATCH 11/33] Add GPU support to igamma_temme_large

---
 .../special_functions/detail/igamma_large.hpp | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp
index e8162095c..1fa13c692 100644
--- a/include/boost/math/special_functions/detail/igamma_large.hpp
+++ b/include/boost/math/special_functions/detail/igamma_large.hpp
@@ -284,7 +284,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
 // (IEEE double precision or 10^-17).
 //
 template <class T, class Policy>
-T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 53>&)
+BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 53>&)
 {
    BOOST_MATH_STD_USING // ADL of std functions
    T sigma = (x - a) / a;
@@ -296,7 +296,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
 
    T workspace[10];
 
-   static const T C0[] = {
+   BOOST_MATH_STATIC const T C0[] = {
       static_cast<T>(-0.33333333333333333L),
       static_cast<T>(0.083333333333333333L),
       static_cast<T>(-0.014814814814814815L),
@@ -315,7 +315,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[0] = tools::evaluate_polynomial(C0, z);
 
-   static const T C1[] = {
+   BOOST_MATH_STATIC const T C1[] = {
       static_cast<T>(-0.0018518518518518519L),
       static_cast<T>(-0.0034722222222222222L),
       static_cast<T>(0.0026455026455026455L),
@@ -332,7 +332,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[1] = tools::evaluate_polynomial(C1, z);
 
-   static const T C2[] = {
+   BOOST_MATH_STATIC const T C2[] = {
       static_cast<T>(0.0041335978835978836L),
       static_cast<T>(-0.0026813271604938272L),
       static_cast<T>(0.00077160493827160494L),
@@ -347,7 +347,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[2] = tools::evaluate_polynomial(C2, z);
 
-   static const T C3[] = {
+   BOOST_MATH_STATIC const T C3[] = {
       static_cast<T>(0.00064943415637860082L),
       static_cast<T>(0.00022947209362139918L),
       static_cast<T>(-0.00046918949439525571L),
@@ -360,7 +360,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[3] = tools::evaluate_polynomial(C3, z);
 
-   static const T C4[] = {
+   BOOST_MATH_STATIC const T C4[] = {
       static_cast<T>(-0.0008618882909167117L),
       static_cast<T>(0.00078403922172006663L),
       static_cast<T>(-0.00029907248030319018L),
@@ -371,7 +371,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[4] = tools::evaluate_polynomial(C4, z);
 
-   static const T C5[] = {
+   BOOST_MATH_STATIC const T C5[] = {
       static_cast<T>(-0.00033679855336635815L),
       static_cast<T>(-0.69728137583658578e-4L),
       static_cast<T>(0.00027727532449593921L),
@@ -384,7 +384,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[5] = tools::evaluate_polynomial(C5, z);
 
-   static const T C6[] = {
+   BOOST_MATH_STATIC const T C6[] = {
       static_cast<T>(0.00053130793646399222L),
       static_cast<T>(-0.00059216643735369388L),
       static_cast<T>(0.00027087820967180448L),
@@ -395,7 +395,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[6] = tools::evaluate_polynomial(C6, z);
 
-   static const T C7[] = {
+   BOOST_MATH_STATIC const T C7[] = {
       static_cast<T>(0.00034436760689237767L),
       static_cast<T>(0.51717909082605922e-4L),
       static_cast<T>(-0.00033493161081142236L),
@@ -404,7 +404,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[7] = tools::evaluate_polynomial(C7, z);
 
-   static const T C8[] = {
+   BOOST_MATH_STATIC const T C8[] = {
       static_cast<T>(-0.00065262391859530942L),
       static_cast<T>(0.00083949872067208728L),
       static_cast<T>(-0.00043829709854172101L),
@@ -426,7 +426,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
 // (IEEE float precision, or 10^-8)
 //
 template <class T, class Policy>
-T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 24>&)
+BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<int, 24>&)
 {
    BOOST_MATH_STD_USING // ADL of std functions
    T sigma = (x - a) / a;
@@ -438,7 +438,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
 
    T workspace[3];
 
-   static const T C0[] = {
+   BOOST_MATH_STATIC const T C0[] = {
       static_cast<T>(-0.333333333L),
       static_cast<T>(0.0833333333L),
       static_cast<T>(-0.0148148148L),
@@ -449,7 +449,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[0] = tools::evaluate_polynomial(C0, z);
 
-   static const T C1[] = {
+   BOOST_MATH_STATIC const T C1[] = {
       static_cast<T>(-0.00185185185L),
       static_cast<T>(-0.00347222222L),
       static_cast<T>(0.00264550265L),
@@ -458,7 +458,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant<i
    };
    workspace[1] = tools::evaluate_polynomial(C1, z);
 
-   static const T C2[] = {
+   BOOST_MATH_STATIC const T C2[] = {
       static_cast<T>(0.00413359788L),
       static_cast<T>(-0.00268132716L),
       static_cast<T>(0.000771604938L),

From 7cb386068f2cd213c9cd6f809bc227b0299e305f Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 11:52:57 -0400
Subject: [PATCH 12/33] Add GPU support to lgamma_small_imp

---
 .../special_functions/detail/lgamma_small.hpp | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp
index 6a4a9171f..a4d3bac70 100644
--- a/include/boost/math/special_functions/detail/lgamma_small.hpp
+++ b/include/boost/math/special_functions/detail/lgamma_small.hpp
@@ -1,4 +1,5 @@
 //  (C) Copyright John Maddock 2006.
+//  (C) Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,6 +11,7 @@
 #pragma once
 #endif
 
+#include <boost/math/tools/config.hpp>
 #include <boost/math/tools/big_constant.hpp>
 
 #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
@@ -36,7 +38,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l);
 // lgamma for small arguments:
 //
 template <class T, class Policy, class Lanczos>
-T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, const Policy& /* l */, const Lanczos&)
+BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, const Policy& /* l */, const Lanczos&)
 {
    // This version uses rational approximations for small
    // values of z accurate enough for 64-bit mantissas
@@ -87,7 +89,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
       // At long double: Max error found:               1.987e-21
       // Maximum Deviation Found (approximation error): 5.900e-24
       //
-      static const T P[] = {
+      BOOST_MATH_STATIC const T P[] = {
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.180355685678449379109e-1)),
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.25126649619989678683e-1)),
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.494103151567532234274e-1)),
@@ -96,7 +98,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.541009869215204396339e-3)),
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.324588649825948492091e-4))
       };
-      static const T Q[] = {
+      BOOST_MATH_STATIC const T Q[] = {
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)),
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.196202987197795200688e1)),
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.148019669424231326694e1)),
@@ -107,7 +109,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
          static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.223352763208617092964e-6))
       };
 
-      static const float Y = 0.158963680267333984375e0f;
+      constexpr float Y = 0.158963680267333984375e0f;
 
       T r = zm2 * (z + 1);
       T R = tools::evaluate_polynomial(P, zm2);
@@ -152,9 +154,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
          // Expected Error Term:                                 3.139e-021
 
          //
-         static const float Y = 0.52815341949462890625f;
+         constexpr float Y = 0.52815341949462890625f;
 
-         static const T P[] = {
+         BOOST_MATH_STATIC const T P[] = {
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.490622454069039543534e-1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.969117530159521214579e-1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.414983358359495381969e0)),
@@ -163,7 +165,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.240149820648571559892e-1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.100346687696279557415e-2))
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.302349829846463038743e1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.348739585360723852576e1)),
@@ -197,9 +199,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
          // Maximum Deviation Found:                           2.151e-021
          // Expected Error Term:                               2.150e-021
          //
-         static const float Y = 0.452017307281494140625f;
+         constexpr float Y = 0.452017307281494140625f;
 
-         static const T P[] = {
+         BOOST_MATH_STATIC const T P[] = {
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.292329721830270012337e-1)), 
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.144216267757192309184e0)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.142440390738631274135e0)),
@@ -207,7 +209,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, co
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.850535976868336437746e-2)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.431171342679297331241e-3))
          };
-         static const T Q[] = {
+         BOOST_MATH_STATIC const T Q[] = {
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -0.150169356054485044494e1)),
             static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.846973248876495016101e0)),

From cc8f359d565efe429c1e7f8f52009304fe433b5f Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:10:28 -0400
Subject: [PATCH 13/33] Add additional factorial overloads for GPU times

---
 .../detail/unchecked_factorial.hpp            | 122 +++++++++++++++++-
 1 file changed, 119 insertions(+), 3 deletions(-)

diff --git a/include/boost/math/special_functions/detail/unchecked_factorial.hpp b/include/boost/math/special_functions/detail/unchecked_factorial.hpp
index b528a24fe..f7720a2ab 100644
--- a/include/boost/math/special_functions/detail/unchecked_factorial.hpp
+++ b/include/boost/math/special_functions/detail/unchecked_factorial.hpp
@@ -46,6 +46,8 @@ struct max_factorial;
 template <class T, bool = true>
 struct unchecked_factorial_data;
 
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+
 template <bool b>
 struct unchecked_factorial_data<float, b>
 {
@@ -137,17 +139,67 @@ template<bool b>
 
 // Definitions:
 template <>
-inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial<float>(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float))
+BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial<float>(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float))
 {
    return unchecked_factorial_data<float>::factorials[i];
 }
 
+#else
+
+template <>
+BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial<float>(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float))
+{
+   constexpr float factorials[] = {
+      1.0F,
+      1.0F,
+      2.0F,
+      6.0F,
+      24.0F,
+      120.0F,
+      720.0F,
+      5040.0F,
+      40320.0F,
+      362880.0F,
+      3628800.0F,
+      39916800.0F,
+      479001600.0F,
+      6227020800.0F,
+      87178291200.0F,
+      1307674368000.0F,
+      20922789888000.0F,
+      355687428096000.0F,
+      6402373705728000.0F,
+      121645100408832000.0F,
+      0.243290200817664e19F,
+      0.5109094217170944e20F,
+      0.112400072777760768e22F,
+      0.2585201673888497664e23F,
+      0.62044840173323943936e24F,
+      0.15511210043330985984e26F,
+      0.403291461126605635584e27F,
+      0.10888869450418352160768e29F,
+      0.304888344611713860501504e30F,
+      0.8841761993739701954543616e31F,
+      0.26525285981219105863630848e33F,
+      0.822283865417792281772556288e34F,
+      0.26313083693369353016721801216e36F,
+      0.868331761881188649551819440128e37F,
+      0.29523279903960414084761860964352e39F,
+   };
+
+   return factorials[i];
+}
+
+#endif
+
 template <>
 struct max_factorial<float>
 {
    static constexpr unsigned value = 34;
 };
 
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+
 template <bool b>
 struct unchecked_factorial_data<double, b>
 {
@@ -510,7 +562,7 @@ template <bool b>
 #endif
 
 template <>
-inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial<double>(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double))
+BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial<double>(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double))
 {
    return unchecked_factorial_data<double>::factorials[i];
 }
@@ -521,6 +573,62 @@ struct max_factorial<double>
    static constexpr unsigned value = 170;
 };
 
+#else
+
+template <>
+BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial<double>(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double))
+{
+   constexpr double factorials[] = {
+      1,
+      1,
+      2,
+      6,
+      24,
+      120,
+      720,
+      5040,
+      40320,
+      362880.0,
+      3628800.0,
+      39916800.0,
+      479001600.0,
+      6227020800.0,
+      87178291200.0,
+      1307674368000.0,
+      20922789888000.0,
+      355687428096000.0,
+      6402373705728000.0,
+      121645100408832000.0,
+      0.243290200817664e19,
+      0.5109094217170944e20,
+      0.112400072777760768e22,
+      0.2585201673888497664e23,
+      0.62044840173323943936e24,
+      0.15511210043330985984e26,
+      0.403291461126605635584e27,
+      0.10888869450418352160768e29,
+      0.304888344611713860501504e30,
+      0.8841761993739701954543616e31,
+      0.26525285981219105863630848e33,
+      0.822283865417792281772556288e34,
+      0.26313083693369353016721801216e36,
+      0.868331761881188649551819440128e37,
+      0.29523279903960414084761860964352e39,
+   };
+
+   return factorials[i];
+}
+
+template <>
+struct max_factorial<double>
+{
+   static constexpr unsigned value = 34;
+};
+
+#endif
+
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+
 template <bool b>
 struct unchecked_factorial_data<long double, b>
 {
@@ -1556,6 +1664,8 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0
    return factorials[i];
 }
 
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
+
 template <class T>
 inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, std::numeric_limits<float>::digits>&)
 {
@@ -1568,6 +1678,8 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, s
    return unchecked_factorial<double>(i);
 }
 
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+
 #if DBL_MANT_DIG != LDBL_MANT_DIG
 template <class T>
 inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, LDBL_MANT_DIG>&)
@@ -1583,6 +1695,8 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 1
 }
 #endif
 
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
+
 template <class T>
 inline T unchecked_factorial(unsigned i)
 {
@@ -1601,9 +1715,11 @@ struct max_factorial
 {
    static constexpr unsigned value = 
       std::numeric_limits<T>::digits == std::numeric_limits<float>::digits ? max_factorial<float>::value 
-      : std::numeric_limits<T>::digits == std::numeric_limits<double>::digits ? max_factorial<double>::value 
+      : std::numeric_limits<T>::digits == std::numeric_limits<double>::digits ? max_factorial<double>::value
+      #ifndef BOOST_MATH_GPU_ENABLED 
       : std::numeric_limits<T>::digits == std::numeric_limits<long double>::digits ? max_factorial<long double>::value 
       BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL
+      #endif
       : 100;
 };
 

From d3fc301f8425bc493ae4a4371931081924cbf43c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:10:41 -0400
Subject: [PATCH 14/33] Add GPU support to lgamma_small

---
 include/boost/math/special_functions/detail/lgamma_small.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp
index a4d3bac70..82a1761f1 100644
--- a/include/boost/math/special_functions/detail/lgamma_small.hpp
+++ b/include/boost/math/special_functions/detail/lgamma_small.hpp
@@ -30,7 +30,7 @@ namespace boost{ namespace math{ namespace detail{
 // These need forward declaring to keep GCC happy:
 //
 template <class T, class Policy, class Lanczos>
-T gamma_imp(T z, const Policy& pol, const Lanczos& l);
+BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l);
 template <class T, class Policy>
 T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l);
 

From d02674c50c22407a2154c0c5474319cf97ac5398 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:10:59 -0400
Subject: [PATCH 15/33] Fix signbit GPU support

---
 include/boost/math/special_functions/sign.hpp | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/boost/math/special_functions/sign.hpp b/include/boost/math/special_functions/sign.hpp
index cd7fdfb2a..2092a41bb 100644
--- a/include/boost/math/special_functions/sign.hpp
+++ b/include/boost/math/special_functions/sign.hpp
@@ -26,9 +26,10 @@ namespace detail {
 
 #ifdef BOOST_MATH_USE_STD_FPCLASSIFY
     template<class T> 
-    inline int signbit_impl(T x, native_tag const&)
+    BOOST_MATH_GPU_ENABLED inline int signbit_impl(T x, native_tag const&)
     {
-        return (std::signbit)(x) ? 1 : 0;
+        using std::signbit;
+        return (signbit)(x) ? 1 : 0;
     }
 #endif
 
@@ -66,7 +67,7 @@ namespace detail {
 #endif
 
     template<class T>
-    inline int signbit_impl(T x, ieee_copy_all_bits_tag const&)
+    BOOST_MATH_GPU_ENABLED inline int signbit_impl(T x, ieee_copy_all_bits_tag const&)
     {
         typedef typename fp_traits<T>::type traits;
 
@@ -76,7 +77,7 @@ namespace detail {
     }
 
     template<class T> 
-    inline int signbit_impl(T x, ieee_copy_leading_bits_tag const&)
+    BOOST_MATH_GPU_ENABLED inline int signbit_impl(T x, ieee_copy_leading_bits_tag const&)
     {
         typedef typename fp_traits<T>::type traits;
 
@@ -161,6 +162,22 @@ BOOST_MATH_GPU_ENABLED int (signbit)(T x)
    return detail::signbit_impl(static_cast<result_type>(x), method());
 }
 
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
+template<>
+BOOST_MATH_GPU_ENABLED int signbit<float>(float x)
+{
+    using std::signbit;
+    return (signbit)(x);
+}
+
+template<>
+BOOST_MATH_GPU_ENABLED int signbit<double>(double x)
+{
+    using std::signbit;
+    return (signbit)(x);
+}
+#endif
+
 template <class T>
 BOOST_MATH_GPU_ENABLED inline int sign BOOST_NO_MACRO_EXPAND(const T& z)
 {

From b0496b15c98c359852eabd4ce70023fe29dc3929 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:11:16 -0400
Subject: [PATCH 16/33] Support GPU with root epsilon functions

---
 include/boost/math/tools/precision.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp
index 3f2f6f3ac..28caefff9 100644
--- a/include/boost/math/tools/precision.hpp
+++ b/include/boost/math/tools/precision.hpp
@@ -316,13 +316,13 @@ BOOST_MATH_GPU_ENABLED constexpr long double epsilon(BOOST_MATH_EXPLICIT_TEMPLAT
 namespace detail{
 
 template <class T>
-inline constexpr T root_epsilon_imp(const std::integral_constant<int, 24>&) noexcept(std::is_floating_point<T>::value)
+BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const std::integral_constant<int, 24>&) noexcept(std::is_floating_point<T>::value)
 {
    return static_cast<T>(0.00034526698300124390839884978618400831996329879769945L);
 }
 
 template <class T>
-inline constexpr T root_epsilon_imp(const T*, const std::integral_constant<int, 53>&) noexcept(std::is_floating_point<T>::value)
+BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const std::integral_constant<int, 53>&) noexcept(std::is_floating_point<T>::value)
 {
    return static_cast<T>(0.1490116119384765625e-7L);
 }
@@ -355,13 +355,13 @@ inline T root_epsilon_imp(const T*, const std::integral_constant<int, 0>&)
 }
 
 template <class T>
-inline constexpr T cbrt_epsilon_imp(const std::integral_constant<int, 24>&) noexcept(std::is_floating_point<T>::value)
+BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const std::integral_constant<int, 24>&) noexcept(std::is_floating_point<T>::value)
 {
    return static_cast<T>(0.0049215666011518482998719164346805794944150447839903L);
 }
 
 template <class T>
-inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant<int, 53>&) noexcept(std::is_floating_point<T>::value)
+BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant<int, 53>&) noexcept(std::is_floating_point<T>::value)
 {
    return static_cast<T>(6.05545445239333906078989272793696693569753008995e-6L);
 }
@@ -394,13 +394,13 @@ inline T cbrt_epsilon_imp(const T*, const std::integral_constant<int, 0>&)
 }
 
 template <class T>
-inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant<int, 24>&) noexcept(std::is_floating_point<T>::value)
+BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant<int, 24>&) noexcept(std::is_floating_point<T>::value)
 {
    return static_cast<T>(0.018581361171917516667460937040007436176452688944747L);
 }
 
 template <class T>
-inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant<int, 53>&) noexcept(std::is_floating_point<T>::value)
+BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant<int, 53>&) noexcept(std::is_floating_point<T>::value)
 {
    return static_cast<T>(0.0001220703125L);
 }
@@ -442,19 +442,19 @@ struct root_epsilon_traits
 }
 
 template <class T>
-inline constexpr T root_epsilon() noexcept(std::is_floating_point<T>::value && detail::root_epsilon_traits<T>::has_noexcept)
+BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon() noexcept(std::is_floating_point<T>::value && detail::root_epsilon_traits<T>::has_noexcept)
 {
    return detail::root_epsilon_imp(static_cast<T const*>(nullptr), typename detail::root_epsilon_traits<T>::tag_type());
 }
 
 template <class T>
-inline constexpr T cbrt_epsilon() noexcept(std::is_floating_point<T>::value && detail::root_epsilon_traits<T>::has_noexcept)
+BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon() noexcept(std::is_floating_point<T>::value && detail::root_epsilon_traits<T>::has_noexcept)
 {
    return detail::cbrt_epsilon_imp(static_cast<T const*>(nullptr), typename detail::root_epsilon_traits<T>::tag_type());
 }
 
 template <class T>
-inline constexpr T forth_root_epsilon() noexcept(std::is_floating_point<T>::value && detail::root_epsilon_traits<T>::has_noexcept)
+BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon() noexcept(std::is_floating_point<T>::value && detail::root_epsilon_traits<T>::has_noexcept)
 {
    return detail::forth_root_epsilon_imp(static_cast<T const*>(nullptr), typename detail::root_epsilon_traits<T>::tag_type());
 }

From 58a5bc47ca2b03a4a165c8a5016a922602755d45 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:11:35 -0400
Subject: [PATCH 17/33] Add GPU support to gamma functions

---
 .../boost/math/special_functions/gamma.hpp    | 148 +++++++++---------
 1 file changed, 76 insertions(+), 72 deletions(-)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index 0c3b32b46..b77f4f5ea 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -2,7 +2,7 @@
 //  Copyright Paul A. Bristow 2007, 2013-14.
 //  Copyright Nikhar Agrawal 2013-14
 //  Copyright Christopher Kormanyos 2013-14, 2020, 2024
-
+//  Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -14,12 +14,12 @@
 #pragma once
 #endif
 
+#include <boost/math/tools/config.hpp>
 #include <boost/math/tools/series.hpp>
 #include <boost/math/tools/fraction.hpp>
 #include <boost/math/tools/precision.hpp>
 #include <boost/math/tools/promotion.hpp>
 #include <boost/math/tools/assert.hpp>
-#include <boost/math/tools/config.hpp>
 #include <boost/math/policies/error_handling.hpp>
 #include <boost/math/constants/constants.hpp>
 #include <boost/math/special_functions/math_fwd.hpp>
@@ -56,13 +56,13 @@ namespace boost{ namespace math{
 namespace detail{
 
 template <class T>
-inline bool is_odd(T v, const std::true_type&)
+BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::true_type&)
 {
    int i = static_cast<int>(v);
    return i&1;
 }
 template <class T>
-inline bool is_odd(T v, const std::false_type&)
+BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::false_type&)
 {
    // Oh dear can't cast T to int!
    BOOST_MATH_STD_USING
@@ -70,13 +70,13 @@ inline bool is_odd(T v, const std::false_type&)
    return static_cast<bool>(modulus != 0);
 }
 template <class T>
-inline bool is_odd(T v)
+BOOST_MATH_GPU_ENABLED inline bool is_odd(T v)
 {
    return is_odd(v, ::std::is_convertible<T, int>());
 }
 
 template <class T>
-T sinpx(T z)
+BOOST_MATH_GPU_ENABLED T sinpx(T z)
 {
    // Ad hoc function calculates x * sin(pi * x),
    // taking extra care near when x is near a whole number.
@@ -108,7 +108,7 @@ T sinpx(T z)
 // tgamma(z), with Lanczos support:
 //
 template <class T, class Policy, class Lanczos>
-T gamma_imp(T z, const Policy& pol, const Lanczos& l)
+BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l)
 {
    BOOST_MATH_STD_USING
 
@@ -122,7 +122,7 @@ T gamma_imp(T z, const Policy& pol, const Lanczos& l)
       b = true;
    }
 #endif
-   static const char* function = "boost::math::tgamma<%1%>(%1%)";
+   constexpr auto function = "boost::math::tgamma<%1%>(%1%)";
 
    if(z <= 0)
    {
@@ -199,7 +199,7 @@ T gamma_imp(T z, const Policy& pol, const Lanczos& l)
 // lgamma(z) with Lanczos support:
 //
 template <class T, class Policy, class Lanczos>
-T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
+BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
 {
 #ifdef BOOST_MATH_INSTRUMENT
    static bool b = false;
@@ -212,7 +212,7 @@ T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
 
    BOOST_MATH_STD_USING
 
-   static const char* function = "boost::math::lgamma<%1%>(%1%)";
+   constexpr auto function = "boost::math::lgamma<%1%>(%1%)";
 
    T result = 0;
    int sresult = 1;
@@ -291,12 +291,12 @@ struct upper_incomplete_gamma_fract
 public:
    typedef std::pair<T,T> result_type;
 
-   upper_incomplete_gamma_fract(T a1, T z1)
+   BOOST_MATH_GPU_ENABLED upper_incomplete_gamma_fract(T a1, T z1)
       : z(z1-a1+1), a(a1), k(0)
    {
    }
 
-   result_type operator()()
+   BOOST_MATH_GPU_ENABLED result_type operator()()
    {
       ++k;
       z += 2;
@@ -305,7 +305,7 @@ struct upper_incomplete_gamma_fract
 };
 
 template <class T>
-inline T upper_gamma_fraction(T a, T z, T eps)
+BOOST_MATH_GPU_ENABLED inline T upper_gamma_fraction(T a, T z, T eps)
 {
    // Multiply result by z^a * e^-z to get the full
    // upper incomplete integral.  Divide by tgamma(z)
@@ -321,9 +321,9 @@ struct lower_incomplete_gamma_series
    T a, z, result;
 public:
    typedef T result_type;
-   lower_incomplete_gamma_series(T a1, T z1) : a(a1), z(z1), result(1){}
+   BOOST_MATH_GPU_ENABLED lower_incomplete_gamma_series(T a1, T z1) : a(a1), z(z1), result(1){}
 
-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       T r = result;
       a += 1;
@@ -333,7 +333,7 @@ struct lower_incomplete_gamma_series
 };
 
 template <class T, class Policy>
-inline T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0)
+BOOST_MATH_GPU_ENABLED inline T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0)
 {
    // Multiply result by ((z^a) * (e^-z) / a) to get the full
    // lower incomplete integral. Then divide by tgamma(a)
@@ -460,7 +460,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&)
 {
    BOOST_MATH_STD_USING
 
-   static const char* function = "boost::math::tgamma<%1%>(%1%)";
+   constexpr auto function = "boost::math::tgamma<%1%>(%1%)";
 
    // Check if the argument of tgamma is identically zero.
    const bool is_at_zero = (z == 0);
@@ -610,7 +610,7 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig
 {
    BOOST_MATH_STD_USING
 
-   static const char* function = "boost::math::lgamma<%1%>(%1%)";
+   constexpr auto function = "boost::math::lgamma<%1%>(%1%)";
 
    // Check if the argument of lgamma is identically zero.
    const bool is_at_zero = (z == 0);
@@ -720,7 +720,7 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig
 // used by the upper incomplete gamma with z < 1:
 //
 template <class T, class Policy, class Lanczos>
-T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l)
+BOOST_MATH_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l)
 {
    BOOST_MATH_STD_USING
 
@@ -789,9 +789,9 @@ struct small_gamma2_series
 {
    typedef T result_type;
 
-   small_gamma2_series(T a_, T x_) : result(-x_), x(-x_), apn(a_+1), n(1){}
+   BOOST_MATH_GPU_ENABLED small_gamma2_series(T a_, T x_) : result(-x_), x(-x_), apn(a_+1), n(1){}
 
-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       T r = result / (apn);
       result *= x;
@@ -809,7 +809,7 @@ struct small_gamma2_series
 // incomplete gammas:
 //
 template <class T, class Policy>
-T full_igamma_prefix(T a, T z, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T full_igamma_prefix(T a, T z, const Policy& pol)
 {
    BOOST_MATH_STD_USING
 
@@ -864,7 +864,7 @@ T full_igamma_prefix(T a, T z, const Policy& pol)
 // most if the error occurs in this function:
 //
 template <class T, class Policy, class Lanczos>
-T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l)
+BOOST_MATH_GPU_ENABLED T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l)
 {
    BOOST_MATH_STD_USING
    if (z >= tools::max_value<T>())
@@ -1017,7 +1017,7 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined
 // Upper gamma fraction for very small a:
 //
 template <class T, class Policy>
-inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool invert = false, T* pderivative = 0)
+BOOST_MATH_GPU_ENABLED inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool invert = false, T* pderivative = 0)
 {
    BOOST_MATH_STD_USING  // ADL of std functions.
    //
@@ -1047,7 +1047,7 @@ inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool
 // Upper gamma fraction for integer a:
 //
 template <class T, class Policy>
-inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0)
+BOOST_MATH_GPU_ENABLED inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0)
 {
    //
    // Calculates normalised Q when a is an integer:
@@ -1075,7 +1075,7 @@ inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0)
 // Upper gamma fraction for half integer a:
 //
 template <class T, class Policy>
-T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol)
 {
    //
    // Calculates normalised Q when a is a half-integer:
@@ -1115,9 +1115,9 @@ template <class T>
 struct incomplete_tgamma_large_x_series
 {
    typedef T result_type;
-   incomplete_tgamma_large_x_series(const T& a, const T& x)
+   BOOST_MATH_GPU_ENABLED incomplete_tgamma_large_x_series(const T& a, const T& x)
       : a_poch(a - 1), z(x), term(1) {}
-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
       T result = term;
       term *= a_poch / z;
@@ -1128,7 +1128,7 @@ struct incomplete_tgamma_large_x_series
 };
 
 template <class T, class Policy>
-T incomplete_tgamma_large_x(const T& a, const T& x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T incomplete_tgamma_large_x(const T& a, const T& x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    incomplete_tgamma_large_x_series<T> s(a, x);
@@ -1143,10 +1143,10 @@ T incomplete_tgamma_large_x(const T& a, const T& x, const Policy& pol)
 // Main incomplete gamma entry point, handles all four incomplete gamma's:
 //
 template <class T, class Policy>
-T gamma_incomplete_imp(T a, T x, bool normalised, bool invert,
+BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool invert,
                        const Policy& pol, T* p_derivative)
 {
-   static const char* function = "boost::math::gamma_p<%1%>(%1%, %1%)";
+   constexpr auto function = "boost::math::gamma_p<%1%>(%1%, %1%)";
    if(a <= 0)
       return policies::raise_domain_error<T>(function, "Argument a to the incomplete gamma function must be greater than zero (got a=%1%).", a, pol);
    if(x < 0)
@@ -1529,7 +1529,7 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert,
 // Ratios of two gamma functions:
 //
 template <class T, class Policy, class Lanczos>
-T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l)
+BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l)
 {
    BOOST_MATH_STD_USING
    if(z < tools::epsilon<T>())
@@ -1648,7 +1648,7 @@ T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos:
 }
 
 template <class T, class Policy>
-T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol)
 {
    BOOST_MATH_STD_USING
 
@@ -1706,7 +1706,7 @@ T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol)
 }
 
 template <class T, class Policy>
-T tgamma_ratio_imp(T x, T y, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T tgamma_ratio_imp(T x, T y, const Policy& pol)
 {
    BOOST_MATH_STD_USING
 
@@ -1775,7 +1775,7 @@ T tgamma_ratio_imp(T x, T y, const Policy& pol)
 }
 
 template <class T, class Policy>
-T gamma_p_derivative_imp(T a, T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T gamma_p_derivative_imp(T a, T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
    //
@@ -1816,7 +1816,7 @@ T gamma_p_derivative_imp(T a, T x, const Policy& pol)
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    tgamma(T z, const Policy& /* pol */, const std::true_type)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -1837,7 +1837,7 @@ struct igamma_initializer
 {
    struct init
    {
-      init()
+      BOOST_MATH_GPU_ENABLED init()
       {
          typedef typename policies::precision<T, Policy>::type precision_type;
 
@@ -1851,24 +1851,26 @@ struct igamma_initializer
          do_init(tag_type());
       }
       template <int N>
-      static void do_init(const std::integral_constant<int, N>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant<int, N>&)
       {
          // If std::numeric_limits<T>::digits is zero, we must not call
          // our initialization code here as the precision presumably
          // varies at runtime, and will not have been set yet.  Plus the
          // code requiring initialization isn't called when digits == 0.
-         if(std::numeric_limits<T>::digits)
+         BOOST_MATH_IF_CONSTEXPR (std::numeric_limits<T>::digits)
          {
             boost::math::gamma_p(static_cast<T>(400), static_cast<T>(400), Policy());
          }
       }
-      static void do_init(const std::integral_constant<int, 53>&){}
+      BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant<int, 53>&){}
       void force_instantiate()const{}
    };
-   static const init initializer;
-   static void force_instantiate()
+   BOOST_MATH_STATIC const init initializer;
+   BOOST_MATH_GPU_ENABLED static void force_instantiate()
    {
+      #ifndef BOOST_MATH_HAS_GPU_SUPPORT
       initializer.force_instantiate();
+      #endif
    }
 };
 
@@ -1880,7 +1882,7 @@ struct lgamma_initializer
 {
    struct init
    {
-      init()
+      BOOST_MATH_GPU_ENABLED init()
       {
          typedef typename policies::precision<T, Policy>::type precision_type;
          typedef std::integral_constant<int,
@@ -1891,28 +1893,30 @@ struct lgamma_initializer
 
          do_init(tag_type());
       }
-      static void do_init(const std::integral_constant<int, 64>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant<int, 64>&)
       {
          boost::math::lgamma(static_cast<T>(2.5), Policy());
          boost::math::lgamma(static_cast<T>(1.25), Policy());
          boost::math::lgamma(static_cast<T>(1.75), Policy());
       }
-      static void do_init(const std::integral_constant<int, 113>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant<int, 113>&)
       {
          boost::math::lgamma(static_cast<T>(2.5), Policy());
          boost::math::lgamma(static_cast<T>(1.25), Policy());
          boost::math::lgamma(static_cast<T>(1.5), Policy());
          boost::math::lgamma(static_cast<T>(1.75), Policy());
       }
-      static void do_init(const std::integral_constant<int, 0>&)
+      BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant<int, 0>&)
       {
       }
-      void force_instantiate()const{}
+      BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
    };
-   static const init initializer;
-   static void force_instantiate()
+   BOOST_MATH_STATIC const init initializer;
+   BOOST_MATH_GPU_ENABLED static void force_instantiate()
    {
+      #ifndef BOOST_MATH_HAS_GPU_SUPPORT
       initializer.force_instantiate();
+      #endif
    }
 };
 
@@ -1920,7 +1924,7 @@ template <class T, class Policy>
 const typename lgamma_initializer<T, Policy>::init lgamma_initializer<T, Policy>::initializer;
 
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma(T1 a, T2 z, const Policy&, const std::false_type)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -1943,7 +1947,7 @@ inline tools::promote_args_t<T1, T2>
 }
 
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma(T1 a, T2 z, const std::false_type& tag)
 {
    return tgamma(a, z, policies::policy<>(), tag);
@@ -1953,14 +1957,14 @@ inline tools::promote_args_t<T1, T2>
 } // namespace detail
 
 template <class T>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    tgamma(T z)
 {
    return tgamma(z, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    lgamma(T z, int* sign, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -1980,28 +1984,28 @@ inline typename tools::promote_args<T>::type
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    lgamma(T z, int* sign)
 {
    return lgamma(z, sign, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    lgamma(T x, const Policy& pol)
 {
    return ::boost::math::lgamma(x, nullptr, pol);
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    lgamma(T x)
 {
    return ::boost::math::lgamma(x, nullptr, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    tgamma1pm1(T z, const Policy& /* pol */)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -2019,7 +2023,7 @@ inline typename tools::promote_args<T>::type
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
    tgamma1pm1(T z)
 {
    return tgamma1pm1(z, policies::policy<>());
@@ -2029,7 +2033,7 @@ inline typename tools::promote_args<T>::type
 // Full upper incomplete gamma:
 //
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma(T1 a, T2 z)
 {
    //
@@ -2041,7 +2045,7 @@ inline tools::promote_args_t<T1, T2>
    return static_cast<result_type>(detail::tgamma(a, z, maybe_policy()));
 }
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma(T1 a, T2 z, const Policy& pol)
 {
    using result_type = tools::promote_args_t<T1, T2>;
@@ -2051,7 +2055,7 @@ inline tools::promote_args_t<T1, T2>
 // Full lower incomplete gamma:
 //
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma_lower(T1 a, T2 z, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -2073,7 +2077,7 @@ inline tools::promote_args_t<T1, T2>
       forwarding_policy(), static_cast<value_type*>(nullptr)), "tgamma_lower<%1%>(%1%, %1%)");
 }
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma_lower(T1 a, T2 z)
 {
    return tgamma_lower(a, z, policies::policy<>());
@@ -2082,7 +2086,7 @@ inline tools::promote_args_t<T1, T2>
 // Regularised upper incomplete gamma:
 //
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    gamma_q(T1 a, T2 z, const Policy& /* pol */)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -2104,7 +2108,7 @@ inline tools::promote_args_t<T1, T2>
       forwarding_policy(), static_cast<value_type*>(nullptr)), "gamma_q<%1%>(%1%, %1%)");
 }
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    gamma_q(T1 a, T2 z)
 {
    return gamma_q(a, z, policies::policy<>());
@@ -2113,7 +2117,7 @@ inline tools::promote_args_t<T1, T2>
 // Regularised lower incomplete gamma:
 //
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    gamma_p(T1 a, T2 z, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -2135,7 +2139,7 @@ inline tools::promote_args_t<T1, T2>
       forwarding_policy(), static_cast<value_type*>(nullptr)), "gamma_p<%1%>(%1%, %1%)");
 }
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    gamma_p(T1 a, T2 z)
 {
    return gamma_p(a, z, policies::policy<>());
@@ -2143,7 +2147,7 @@ inline tools::promote_args_t<T1, T2>
 
 // ratios of gamma functions:
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma_delta_ratio(T1 z, T2 delta, const Policy& /* pol */)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -2159,13 +2163,13 @@ inline tools::promote_args_t<T1, T2>
    return policies::checked_narrowing_cast<result_type, forwarding_policy>(detail::tgamma_delta_ratio_imp(static_cast<value_type>(z), static_cast<value_type>(delta), forwarding_policy()), "boost::math::tgamma_delta_ratio<%1%>(%1%, %1%)");
 }
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma_delta_ratio(T1 z, T2 delta)
 {
    return tgamma_delta_ratio(z, delta, policies::policy<>());
 }
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma_ratio(T1 a, T2 b, const Policy&)
 {
    typedef tools::promote_args_t<T1, T2> result_type;
@@ -2180,14 +2184,14 @@ inline tools::promote_args_t<T1, T2>
    return policies::checked_narrowing_cast<result_type, forwarding_policy>(detail::tgamma_ratio_imp(static_cast<value_type>(a), static_cast<value_type>(b), forwarding_policy()), "boost::math::tgamma_delta_ratio<%1%>(%1%, %1%)");
 }
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    tgamma_ratio(T1 a, T2 b)
 {
    return tgamma_ratio(a, b, policies::policy<>());
 }
 
 template <class T1, class T2, class Policy>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    gamma_p_derivative(T1 a, T2 x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
@@ -2203,7 +2207,7 @@ inline tools::promote_args_t<T1, T2>
    return policies::checked_narrowing_cast<result_type, forwarding_policy>(detail::gamma_p_derivative_imp(static_cast<value_type>(a), static_cast<value_type>(x), forwarding_policy()), "boost::math::gamma_p_derivative<%1%>(%1%, %1%)");
 }
 template <class T1, class T2>
-inline tools::promote_args_t<T1, T2>
+BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
    gamma_p_derivative(T1 a, T2 x)
 {
    return gamma_p_derivative(a, x, policies::policy<>());

From 4362b0241b81e13cc1a759c57dc9c416a14d6003 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:11:55 -0400
Subject: [PATCH 18/33] Add GPU markers to math_fwd for gamma functions

---
 .../boost/math/special_functions/math_fwd.hpp | 66 +++++++++----------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index 1d22f61b1..fdccfad8b 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -424,7 +424,7 @@ namespace boost
    template <class RT, class Policy>
    RT factorial(unsigned int, const Policy& pol);
    template <class RT>
-   RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT));
+   BOOST_MATH_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT));
    template <class RT>
    RT double_factorial(unsigned i);
    template <class RT, class Policy>
@@ -444,67 +444,67 @@ namespace boost
 
    // Gamma functions.
    template <class RT>
-   tools::promote_args_t<RT> tgamma(RT z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> tgamma(RT z);
 
    template <class RT>
-   tools::promote_args_t<RT> tgamma1pm1(RT z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> tgamma1pm1(RT z);
 
    template <class RT, class Policy>
-   tools::promote_args_t<RT> tgamma1pm1(RT z, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> tgamma1pm1(RT z, const Policy& pol);
 
    template <class RT1, class RT2>
-   tools::promote_args_t<RT1, RT2> tgamma(RT1 a, RT2 z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> tgamma(RT1 a, RT2 z);
 
    template <class RT1, class RT2, class Policy>
-   tools::promote_args_t<RT1, RT2> tgamma(RT1 a, RT2 z, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> tgamma(RT1 a, RT2 z, const Policy& pol);
 
    template <class RT>
-   tools::promote_args_t<RT> lgamma(RT z, int* sign);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> lgamma(RT z, int* sign);
 
    template <class RT, class Policy>
-   tools::promote_args_t<RT> lgamma(RT z, int* sign, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> lgamma(RT z, int* sign, const Policy& pol);
 
    template <class RT>
-   tools::promote_args_t<RT> lgamma(RT x);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> lgamma(RT x);
 
    template <class RT, class Policy>
-   tools::promote_args_t<RT> lgamma(RT x, const Policy& pol);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT> lgamma(RT x, const Policy& pol);
 
    template <class RT1, class RT2>
-   tools::promote_args_t<RT1, RT2> tgamma_lower(RT1 a, RT2 z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> tgamma_lower(RT1 a, RT2 z);
 
    template <class RT1, class RT2, class Policy>
-   tools::promote_args_t<RT1, RT2> tgamma_lower(RT1 a, RT2 z, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> tgamma_lower(RT1 a, RT2 z, const Policy&);
 
    template <class RT1, class RT2>
-   tools::promote_args_t<RT1, RT2> gamma_q(RT1 a, RT2 z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> gamma_q(RT1 a, RT2 z);
 
    template <class RT1, class RT2, class Policy>
-   tools::promote_args_t<RT1, RT2> gamma_q(RT1 a, RT2 z, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> gamma_q(RT1 a, RT2 z, const Policy&);
 
    template <class RT1, class RT2>
-   tools::promote_args_t<RT1, RT2> gamma_p(RT1 a, RT2 z);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> gamma_p(RT1 a, RT2 z);
 
    template <class RT1, class RT2, class Policy>
-   tools::promote_args_t<RT1, RT2> gamma_p(RT1 a, RT2 z, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<RT1, RT2> gamma_p(RT1 a, RT2 z, const Policy&);
 
    template <class T1, class T2>
-   tools::promote_args_t<T1, T2> tgamma_delta_ratio(T1 z, T2 delta);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2> tgamma_delta_ratio(T1 z, T2 delta);
 
    template <class T1, class T2, class Policy>
-   tools::promote_args_t<T1, T2> tgamma_delta_ratio(T1 z, T2 delta, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2> tgamma_delta_ratio(T1 z, T2 delta, const Policy&);
 
    template <class T1, class T2>
-   tools::promote_args_t<T1, T2> tgamma_ratio(T1 a, T2 b);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2> tgamma_ratio(T1 a, T2 b);
 
    template <class T1, class T2, class Policy>
-   tools::promote_args_t<T1, T2> tgamma_ratio(T1 a, T2 b, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2> tgamma_ratio(T1 a, T2 b, const Policy&);
 
    template <class T1, class T2>
-   tools::promote_args_t<T1, T2> gamma_p_derivative(T1 a, T2 x);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2> gamma_p_derivative(T1 a, T2 x);
 
    template <class T1, class T2, class Policy>
-   tools::promote_args_t<T1, T2> gamma_p_derivative(T1 a, T2 x, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2> gamma_p_derivative(T1 a, T2 x, const Policy&);
 
    // gamma inverse.
    template <class T1, class T2>
@@ -1410,37 +1410,37 @@ namespace boost
    inline boost::math::tools::promote_args_t<RT> rising_factorial(RT x, unsigned n){ return boost::math::rising_factorial(x, n, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> tgamma(RT z){ return boost::math::tgamma(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> tgamma(RT z){ return boost::math::tgamma(z, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> tgamma1pm1(RT z){ return boost::math::tgamma1pm1(z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> tgamma1pm1(RT z){ return boost::math::tgamma1pm1(z, Policy()); }\
 \
    template <class RT1, class RT2>\
-   inline boost::math::tools::promote_args_t<RT1, RT2> tgamma(RT1 a, RT2 z){ return boost::math::tgamma(a, z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2> tgamma(RT1 a, RT2 z){ return boost::math::tgamma(a, z, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> lgamma(RT z, int* sign){ return boost::math::lgamma(z, sign, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> lgamma(RT z, int* sign){ return boost::math::lgamma(z, sign, Policy()); }\
 \
    template <class RT>\
-   inline boost::math::tools::promote_args_t<RT> lgamma(RT x){ return boost::math::lgamma(x, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> lgamma(RT x){ return boost::math::lgamma(x, Policy()); }\
 \
    template <class RT1, class RT2>\
-   inline boost::math::tools::promote_args_t<RT1, RT2> tgamma_lower(RT1 a, RT2 z){ return boost::math::tgamma_lower(a, z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2> tgamma_lower(RT1 a, RT2 z){ return boost::math::tgamma_lower(a, z, Policy()); }\
 \
    template <class RT1, class RT2>\
-   inline boost::math::tools::promote_args_t<RT1, RT2> gamma_q(RT1 a, RT2 z){ return boost::math::gamma_q(a, z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2> gamma_q(RT1 a, RT2 z){ return boost::math::gamma_q(a, z, Policy()); }\
 \
    template <class RT1, class RT2>\
-   inline boost::math::tools::promote_args_t<RT1, RT2> gamma_p(RT1 a, RT2 z){ return boost::math::gamma_p(a, z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2> gamma_p(RT1 a, RT2 z){ return boost::math::gamma_p(a, z, Policy()); }\
 \
    template <class T1, class T2>\
-   inline boost::math::tools::promote_args_t<T1, T2> tgamma_delta_ratio(T1 z, T2 delta){ return boost::math::tgamma_delta_ratio(z, delta, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2> tgamma_delta_ratio(T1 z, T2 delta){ return boost::math::tgamma_delta_ratio(z, delta, Policy()); }\
 \
    template <class T1, class T2>\
-   inline boost::math::tools::promote_args_t<T1, T2> tgamma_ratio(T1 a, T2 b) { return boost::math::tgamma_ratio(a, b, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2> tgamma_ratio(T1 a, T2 b) { return boost::math::tgamma_ratio(a, b, Policy()); }\
 \
    template <class T1, class T2>\
-   inline boost::math::tools::promote_args_t<T1, T2> gamma_p_derivative(T1 a, T2 x){ return boost::math::gamma_p_derivative(a, x, Policy()); }\
+   BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2> gamma_p_derivative(T1 a, T2 x){ return boost::math::gamma_p_derivative(a, x, Policy()); }\
 \
    template <class T1, class T2>\
    inline boost::math::tools::promote_args_t<T1, T2> gamma_p_inv(T1 a, T2 p){ return boost::math::gamma_p_inv(a, p, Policy()); }\

From 443887e26c4fca7daa69aa3a74eedac185223cd9 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:12:05 -0400
Subject: [PATCH 19/33] Add CUDA testing for tgamma and lgamma

---
 test/cuda_jamfile          |   4 ++
 test/test_lgamma_double.cu | 102 +++++++++++++++++++++++++++++++++++++
 test/test_lgamma_float.cu  | 102 +++++++++++++++++++++++++++++++++++++
 test/test_tgamma_double.cu | 102 +++++++++++++++++++++++++++++++++++++
 test/test_tgamma_float.cu  | 102 +++++++++++++++++++++++++++++++++++++
 5 files changed, 412 insertions(+)
 create mode 100644 test/test_lgamma_double.cu
 create mode 100644 test/test_lgamma_float.cu
 create mode 100644 test/test_tgamma_double.cu
 create mode 100644 test/test_tgamma_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 7af3b9e48..69bdbfb7f 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -110,6 +110,8 @@ run test_erfc_inv_double.cu ;
 run test_erfc_inv_float.cu ;
 run test_expm1_double.cu ;
 run test_expm1_float.cu ;
+run test_lgamma_double.cu ;
+run test_lgamma_float.cu ;
 run test_log1p_double.cu ;
 run test_log1p_float.cu ;
 run test_modf_double.cu ;
@@ -118,6 +120,8 @@ run test_round_double.cu ;
 run test_round_float.cu ;
 run test_sin_pi_double.cu ; 
 run test_sin_pi_float.cu ;
+run test_tgamma_double.cu ;
+run test_tgamma_float.cu ;
 run test_trigamma_double.cu ;
 run test_trigamma_float.cu ;
 run test_trunc_double.cu ;
diff --git a/test/test_lgamma_double.cu b/test/test_lgamma_double.cu
new file mode 100644
index 000000000..776ff5d27
--- /dev/null
+++ b/test/test_lgamma_double.cu
@@ -0,0 +1,102 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::lgamma(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::lgamma(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_lgamma_float.cu b/test/test_lgamma_float.cu
new file mode 100644
index 000000000..101037ab3
--- /dev/null
+++ b/test/test_lgamma_float.cu
@@ -0,0 +1,102 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::lgamma(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::lgamma(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_tgamma_double.cu b/test/test_tgamma_double.cu
new file mode 100644
index 000000000..6e4140ab6
--- /dev/null
+++ b/test/test_tgamma_double.cu
@@ -0,0 +1,102 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::tgamma(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::tgamma(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_tgamma_float.cu b/test/test_tgamma_float.cu
new file mode 100644
index 000000000..cb2d01482
--- /dev/null
+++ b/test/test_tgamma_float.cu
@@ -0,0 +1,102 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::tgamma(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::tgamma(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}

From fcb1c5bf4a95a77e6f883b462cca3e9d4e6714b1 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:32:08 -0400
Subject: [PATCH 20/33] Disable factorial check on GPU platform

---
 include/boost/math/special_functions/beta.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp
index 25e524d0b..07ac9d5a6 100644
--- a/include/boost/math/special_functions/beta.hpp
+++ b/include/boost/math/special_functions/beta.hpp
@@ -930,7 +930,9 @@ template <>
 struct Pn_size<long double>
 {
    static constexpr unsigned value = 50; // ~35-50 digit accuracy
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
    static_assert(::boost::math::max_factorial<long double>::value >= 100, "Type does not provide for ~35-50 digits of accuracy");
+#endif
 };
 
 template <class T, class Policy>

From e2a9039b2324c2a0f4a55d19f0e26ebd5b0f44d0 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:37:44 -0400
Subject: [PATCH 21/33] Add SYCL gamma testing

---
 test/sycl_jamfile   | 1 +
 test/test_gamma.cpp | 4 ++++
 test/test_gamma.hpp | 5 ++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index e336523e6..d0a458cce 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -33,3 +33,4 @@ run test_log1p_simple.cpp ;
 run test_digamma_simple.cpp ;
 run test_trigamma.cpp ;
 run test_erf.cpp ;
+run test_gamma.cpp ;
diff --git a/test/test_gamma.cpp b/test/test_gamma.cpp
index 6b2d19cca..565d5888a 100644
--- a/test/test_gamma.cpp
+++ b/test/test_gamma.cpp
@@ -3,7 +3,11 @@
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
+#include <boost/math/special_functions/gamma.hpp>
 #include "test_gamma.hpp"
 
 //
diff --git a/test/test_gamma.hpp b/test/test_gamma.hpp
index c21573dac..729a8266d 100644
--- a/test/test_gamma.hpp
+++ b/test/test_gamma.hpp
@@ -6,6 +6,7 @@
 
 #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
 
+#include <boost/math/tools/config.hpp>
 #include <boost/math/concepts/real_concept.hpp>
 #include <boost/math/special_functions/math_fwd.hpp>
 #define BOOST_TEST_MAIN
@@ -13,7 +14,7 @@
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/next.hpp>  // for has_denorm_now
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>
@@ -320,11 +321,13 @@ void test_spots(T, const char* name)
       BOOST_CHECK(sign == -1);
    }
 
+   #ifndef BOOST_MATH_HAS_GPU_SUPPORT
    if(boost::math::detail::has_denorm_now<T>() && std::numeric_limits<T>::has_infinity && (boost::math::isinf)(1 / std::numeric_limits<T>::denorm_min()))
    {
       BOOST_CHECK_EQUAL(boost::math::tgamma(-std::numeric_limits<T>::denorm_min()), -std::numeric_limits<T>::infinity());
       BOOST_CHECK_EQUAL(boost::math::tgamma(std::numeric_limits<T>::denorm_min()), std::numeric_limits<T>::infinity());
    }
+   #endif
    //
    // Extra large values for lgamma, see https://github.com/boostorg/math/issues/242
    //

From 7dcb4320abf6be960b18f7b606c295e122dc04a3 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 14:48:18 -0400
Subject: [PATCH 22/33] Remove extra overloads

---
 include/boost/math/special_functions/sign.hpp | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/include/boost/math/special_functions/sign.hpp b/include/boost/math/special_functions/sign.hpp
index 2092a41bb..d147b5b1b 100644
--- a/include/boost/math/special_functions/sign.hpp
+++ b/include/boost/math/special_functions/sign.hpp
@@ -162,22 +162,6 @@ BOOST_MATH_GPU_ENABLED int (signbit)(T x)
    return detail::signbit_impl(static_cast<result_type>(x), method());
 }
 
-#ifndef BOOST_MATH_HAS_GPU_SUPPORT
-template<>
-BOOST_MATH_GPU_ENABLED int signbit<float>(float x)
-{
-    using std::signbit;
-    return (signbit)(x);
-}
-
-template<>
-BOOST_MATH_GPU_ENABLED int signbit<double>(double x)
-{
-    using std::signbit;
-    return (signbit)(x);
-}
-#endif
-
 template <class T>
 BOOST_MATH_GPU_ENABLED inline int sign BOOST_NO_MACRO_EXPAND(const T& z)
 {

From f1f470ec67800cf7e982ef844435e09b9f69afa3 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 15:08:54 -0400
Subject: [PATCH 23/33] Remove recursion from tgamma

---
 .../boost/math/special_functions/gamma.hpp    | 42 ++++++++++++-------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index b77f4f5ea..62448d69e 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -108,7 +108,7 @@ BOOST_MATH_GPU_ENABLED T sinpx(T z)
 // tgamma(z), with Lanczos support:
 //
 template <class T, class Policy, class Lanczos>
-BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l)
+BOOST_MATH_GPU_ENABLED T gamma_imp_final(T z, const Policy& pol, const Lanczos& l)
 {
    BOOST_MATH_STD_USING
 
@@ -127,20 +127,8 @@ BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l)
    if(z <= 0)
    {
       if(floor(z) == z)
-         return policies::raise_pole_error<T>(function, "Evaluation of tgamma at a negative integer %1%.", z, pol);
-      if(z <= -20)
       {
-         result = gamma_imp(T(-z), pol, l) * sinpx(z);
-         BOOST_MATH_INSTRUMENT_VARIABLE(result);
-         if((fabs(result) < 1) && (tools::max_value<T>() * fabs(result) < boost::math::constants::pi<T>()))
-            return -boost::math::sign(result) * policies::raise_overflow_error<T>(function, "Result of tgamma is too large to represent.", pol);
-         result = -boost::math::constants::pi<T>() / result;
-         if(result == 0)
-            return policies::raise_underflow_error<T>(function, "Result of tgamma is too small to represent.", pol);
-         if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL)
-            return policies::raise_denorm_error<T>(function, "Result of tgamma is denormalized.", result, pol);
-         BOOST_MATH_INSTRUMENT_VARIABLE(result);
-         return result;
+         return policies::raise_pole_error<T>(function, "Evaluation of tgamma at a negative integer %1%.", z, pol);
       }
 
       // shift z to > 1:
@@ -195,6 +183,32 @@ BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l)
    }
    return result;
 }
+// SYCL compilers can not support recursion so we extract it into a dispatch function
+template <class T, class Policy, class Lanczos>
+BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol, const Lanczos& l)
+{
+   BOOST_MATH_STD_USING
+   
+   if(z <= -20)
+   {
+      constexpr auto function = "boost::math::tgamma<%1%>(%1%)";
+      T result = gamma_imp_final(T(-z), pol, l) * sinpx(z);
+      BOOST_MATH_INSTRUMENT_VARIABLE(result);
+      if((fabs(result) < 1) && (tools::max_value<T>() * fabs(result) < boost::math::constants::pi<T>()))
+         return -boost::math::sign(result) * policies::raise_overflow_error<T>(function, "Result of tgamma is too large to represent.", pol);
+      result = -boost::math::constants::pi<T>() / result;
+      if(result == 0)
+         return policies::raise_underflow_error<T>(function, "Result of tgamma is too small to represent.", pol);
+      if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL)
+         return policies::raise_denorm_error<T>(function, "Result of tgamma is denormalized.", result, pol);
+      BOOST_MATH_INSTRUMENT_VARIABLE(result);
+      return result;
+   }
+   else
+   {
+      return gamma_imp_final(T(z), pol, l);
+   }
+}
 //
 // lgamma(z) with Lanczos support:
 //

From 30a7ad1da9e2334ca81b8c94add283359a206757 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 15:47:04 -0400
Subject: [PATCH 24/33] Remove recursion in lgamma

---
 .../boost/math/special_functions/gamma.hpp    | 66 +++++++++++++------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index 62448d69e..7601d5295 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -188,7 +188,7 @@ template <class T, class Policy, class Lanczos>
 BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol, const Lanczos& l)
 {
    BOOST_MATH_STD_USING
-   
+
    if(z <= -20)
    {
       constexpr auto function = "boost::math::tgamma<%1%>(%1%)";
@@ -213,7 +213,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol
 // lgamma(z) with Lanczos support:
 //
 template <class T, class Policy, class Lanczos>
-BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
+BOOST_MATH_GPU_ENABLED T lgamma_imp_final(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
 {
 #ifdef BOOST_MATH_INSTRUMENT
    static bool b = false;
@@ -230,25 +230,8 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, in
 
    T result = 0;
    int sresult = 1;
-   if(z <= -tools::root_epsilon<T>())
-   {
-      // reflection formula:
-      if(floor(z) == z)
-         return policies::raise_pole_error<T>(function, "Evaluation of lgamma at a negative integer %1%.", z, pol);
-
-      T t = sinpx(z);
-      z = -z;
-      if(t < 0)
-      {
-         t = -t;
-      }
-      else
-      {
-         sresult = -sresult;
-      }
-      result = log(boost::math::constants::pi<T>()) - lgamma_imp(z, pol, l) - log(t);
-   }
-   else if (z < tools::root_epsilon<T>())
+   
+   if (z < tools::root_epsilon<T>())
    {
       if (0 == z)
          return policies::raise_pole_error<T>(function, "Evaluation of lgamma at %1%.", z, pol);
@@ -293,6 +276,47 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, in
    return result;
 }
 
+template <class T, class Policy, class Lanczos>
+BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
+{
+   BOOST_MATH_STD_USING
+
+   if(z <= -tools::root_epsilon<T>())
+   {
+      constexpr auto function = "boost::math::lgamma<%1%>(%1%)";
+
+      T result = 0;
+      int sresult = 1;
+
+      // reflection formula:
+      if(floor(z) == z)
+         return policies::raise_pole_error<T>(function, "Evaluation of lgamma at a negative integer %1%.", z, pol);
+
+      T t = sinpx(z);
+      z = -z;
+      if(t < 0)
+      {
+         t = -t;
+      }
+      else
+      {
+         sresult = -sresult;
+      }
+      result = log(boost::math::constants::pi<T>()) - lgamma_imp_final(T(z), pol, l) - log(t);
+
+      if(sign)
+      {
+         *sign = sresult;
+      }
+
+      return result;
+   }
+   else
+   {
+      return lgamma_imp_final(T(z), pol, l, sign);
+   }
+}
+
 //
 // Incomplete gamma functions follow:
 //

From 082b625460cd8fbf1585bf2d60099321d5dbe88a Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 15:51:14 -0400
Subject: [PATCH 25/33] Ignore CUDA warning about GNU force inline

---
 include/boost/math/special_functions/gamma.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index 7601d5295..af0599c88 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -183,6 +183,11 @@ BOOST_MATH_GPU_ENABLED T gamma_imp_final(T z, const Policy& pol, const Lanczos&
    }
    return result;
 }
+
+#ifdef BOOST_MATH_ENABLE_CUDA
+#  pragma nv_diag_suppress 2190
+#endif
+
 // SYCL compilers can not support recursion so we extract it into a dispatch function
 template <class T, class Policy, class Lanczos>
 BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol, const Lanczos& l)
@@ -209,6 +214,11 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol
       return gamma_imp_final(T(z), pol, l);
    }
 }
+
+#ifdef BOOST_MATH_ENABLE_CUDA
+#  pragma nv_diag_default 2190
+#endif
+
 //
 // lgamma(z) with Lanczos support:
 //
@@ -276,6 +286,10 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp_final(T z, const Policy& pol, const Lanczos&
    return result;
 }
 
+#ifdef BOOST_MATH_ENABLE_CUDA
+#  pragma nv_diag_suppress 2190
+#endif
+
 template <class T, class Policy, class Lanczos>
 BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr)
 {
@@ -317,6 +331,10 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T lgamma_imp(T z, const Policy& po
    }
 }
 
+#ifdef BOOST_MATH_ENABLE_CUDA
+#  pragma nv_diag_default 2190
+#endif
+
 //
 // Incomplete gamma functions follow:
 //

From 6d19b67bf323692046f6d1998d5897477ffce720 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 15:57:04 -0400
Subject: [PATCH 26/33] Ignore literal range warnings

---
 test/test_erf.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/test_erf.cpp b/test/test_erf.cpp
index beb521ac4..b4513d96d 100644
--- a/test/test_erf.cpp
+++ b/test/test_erf.cpp
@@ -8,6 +8,14 @@
 #include <pch_light.hpp>
 #endif
 
+#ifdef __clang__
+#  pragma clang diagnostic push 
+#  pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+#  pragma gcc diagnostic push 
+#  pragma gcc diagnostic ignored "-Wliteral-range"
+#endif
+
 #include <boost/math/special_functions/erf.hpp>
 #include "test_erf.hpp"
 

From 19f166bfe60ee1d7bbb629ddd9a66fd9af1b8312 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 16:15:14 -0400
Subject: [PATCH 27/33] Ignore mapairy literal range warnings

---
 test/test_mapairy.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp
index c1d1c739b..096abde8b 100644
--- a/test/test_mapairy.cpp
+++ b/test/test_mapairy.cpp
@@ -4,6 +4,14 @@
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifdef __clang__
+#  pragma clang diagnostic push 
+#  pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+#  pragma gcc diagnostic push 
+#  pragma gcc diagnostic ignored "-Wliteral-range"
+#endif
+
 #define BOOST_TEST_MAIN
 #define BOOST_TEST_MODULE StatsMapAiryTest
 #include <boost/math/tools/config.hpp>

From 84a7272e6cf962340700653ab77431d77477a044 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 16:16:28 -0400
Subject: [PATCH 28/33] Make powm1 GPU compatible

---
 .../boost/math/special_functions/math_fwd.hpp |  6 +-
 .../boost/math/special_functions/powm1.hpp    | 66 +++++++++++--------
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index fdccfad8b..289b27592 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -591,11 +591,11 @@ namespace boost
 
    // Power - 1
    template <class T1, class T2>
-   tools::promote_args_t<T1, T2>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
          powm1(const T1 a, const T2 z);
 
    template <class T1, class T2, class Policy>
-   tools::promote_args_t<T1, T2>
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
          powm1(const T1 a, const T2 z, const Policy&);
 
    // sqrt(1+x) - 1
@@ -1481,7 +1481,7 @@ namespace boost
 \
    template <class T1, class T2>\
    inline boost::math::tools::promote_args_t<T1, T2> \
-   powm1(const T1 a, const T2 z){ return boost::math::powm1(a, z, Policy()); }\
+   BOOST_MATH_GPU_ENABLED powm1(const T1 a, const T2 z){ return boost::math::powm1(a, z, Policy()); }\
 \
    template <class T>\
    BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\
diff --git a/include/boost/math/special_functions/powm1.hpp b/include/boost/math/special_functions/powm1.hpp
index e52277b16..f00af2264 100644
--- a/include/boost/math/special_functions/powm1.hpp
+++ b/include/boost/math/special_functions/powm1.hpp
@@ -1,4 +1,5 @@
 //  (C) Copyright John Maddock 2006.
+//  (C) Copyright Matt Borland 2024.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -12,6 +13,7 @@
 #pragma warning(disable:4702) // Unreachable code (release mode only warning)
 #endif
 
+#include <boost/math/tools/config.hpp>
 #include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/special_functions/log1p.hpp>
 #include <boost/math/special_functions/expm1.hpp>
@@ -22,32 +24,23 @@
 namespace boost{ namespace math{ namespace detail{
 
 template <class T, class Policy>
-inline T powm1_imp(const T x, const T y, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline T powm1_imp(const T x, const T y, const Policy& pol)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::powm1<%1%>(%1%, %1%)";
-   if (x > 0)
+   constexpr auto function = "boost::math::powm1<%1%>(%1%, %1%)";
+
+   if ((fabs(y * (x - 1)) < T(0.5)) || (fabs(y) < T(0.2)))
    {
-      if ((fabs(y * (x - 1)) < T(0.5)) || (fabs(y) < T(0.2)))
-      {
-         // We don't have any good/quick approximation for log(x) * y
-         // so just try it and see:
-         T l = y * log(x);
-         if (l < T(0.5))
-            return boost::math::expm1(l, pol);
-         if (l > boost::math::tools::log_max_value<T>())
-            return boost::math::policies::raise_overflow_error<T>(function, nullptr, pol);
-         // fall through....
-      }
-   }
-   else if ((boost::math::signbit)(x)) // Need to error check -0 here as well
-   {
-      // y had better be an integer:
-      if (boost::math::trunc(y) != y)
-         return boost::math::policies::raise_domain_error<T>(function, "For non-integral exponent, expected base > 0 but got %1%", x, pol);
-      if (boost::math::trunc(y / 2) == y / 2)
-         return powm1_imp(T(-x), y, pol);
+      // We don't have any good/quick approximation for log(x) * y
+      // so just try it and see:
+      T l = y * log(x);
+      if (l < T(0.5))
+         return boost::math::expm1(l, pol);
+      if (l > boost::math::tools::log_max_value<T>())
+         return boost::math::policies::raise_overflow_error<T>(function, nullptr, pol);
+      // fall through....
    }
+   
    T result = pow(x, y) - 1;
    if((boost::math::isinf)(result))
       return result < 0 ? -boost::math::policies::raise_overflow_error<T>(function, nullptr, pol) : boost::math::policies::raise_overflow_error<T>(function, nullptr, pol);
@@ -56,22 +49,43 @@ inline T powm1_imp(const T x, const T y, const Policy& pol)
    return result;
 }
 
+template <class T, class Policy>
+BOOST_MATH_GPU_ENABLED inline T powm1_imp_dispatch(const T x, const T y, const Policy& pol)
+{
+   BOOST_MATH_STD_USING
+
+   if ((boost::math::signbit)(x)) // Need to error check -0 here as well
+   {
+      constexpr auto function = "boost::math::powm1<%1%>(%1%, %1%)";
+
+      // y had better be an integer:
+      if (boost::math::trunc(y) != y)
+         return boost::math::policies::raise_domain_error<T>(function, "For non-integral exponent, expected base > 0 but got %1%", x, pol);
+      if (boost::math::trunc(y / 2) == y / 2)
+         return powm1_imp(T(-x), T(y), pol);
+   }
+   else
+   {
+      return powm1_imp(T(x), T(y), pol);
+   }
+}
+
 } // detail
 
 template <class T1, class T2>
-inline typename tools::promote_args<T1, T2>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
    powm1(const T1 a, const T2 z)
 {
    typedef typename tools::promote_args<T1, T2>::type result_type;
-   return detail::powm1_imp(static_cast<result_type>(a), static_cast<result_type>(z), policies::policy<>());
+   return detail::powm1_imp_dispatch(static_cast<result_type>(a), static_cast<result_type>(z), policies::policy<>());
 }
 
 template <class T1, class T2, class Policy>
-inline typename tools::promote_args<T1, T2>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
    powm1(const T1 a, const T2 z, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2>::type result_type;
-   return detail::powm1_imp(static_cast<result_type>(a), static_cast<result_type>(z), pol);
+   return detail::powm1_imp_dispatch(static_cast<result_type>(a), static_cast<result_type>(z), pol);
 }
 
 } // namespace math

From 7bd6f957aef5515a80a8ac103bf920116f0b524c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 16:23:52 -0400
Subject: [PATCH 29/33] Fix GCC warnings

---
 include/boost/math/special_functions/powm1.hpp | 6 ++----
 test/test_erf.cpp                              | 4 ++--
 test/test_mapairy.cpp                          | 4 ++--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/boost/math/special_functions/powm1.hpp b/include/boost/math/special_functions/powm1.hpp
index f00af2264..80d02dc29 100644
--- a/include/boost/math/special_functions/powm1.hpp
+++ b/include/boost/math/special_functions/powm1.hpp
@@ -64,10 +64,8 @@ BOOST_MATH_GPU_ENABLED inline T powm1_imp_dispatch(const T x, const T y, const P
       if (boost::math::trunc(y / 2) == y / 2)
          return powm1_imp(T(-x), T(y), pol);
    }
-   else
-   {
-      return powm1_imp(T(x), T(y), pol);
-   }
+
+   return powm1_imp(T(x), T(y), pol);
 }
 
 } // detail
diff --git a/test/test_erf.cpp b/test/test_erf.cpp
index b4513d96d..d01f8024d 100644
--- a/test/test_erf.cpp
+++ b/test/test_erf.cpp
@@ -12,8 +12,8 @@
 #  pragma clang diagnostic push 
 #  pragma clang diagnostic ignored "-Wliteral-range"
 #elif defined(__GNUC__)
-#  pragma gcc diagnostic push 
-#  pragma gcc diagnostic ignored "-Wliteral-range"
+#  pragma GCC diagnostic push 
+#  pragma GCC diagnostic ignored "-Wliteral-range"
 #endif
 
 #include <boost/math/special_functions/erf.hpp>
diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp
index 096abde8b..ee8e43bf0 100644
--- a/test/test_mapairy.cpp
+++ b/test/test_mapairy.cpp
@@ -8,8 +8,8 @@
 #  pragma clang diagnostic push 
 #  pragma clang diagnostic ignored "-Wliteral-range"
 #elif defined(__GNUC__)
-#  pragma gcc diagnostic push 
-#  pragma gcc diagnostic ignored "-Wliteral-range"
+#  pragma GCC diagnostic push 
+#  pragma GCC diagnostic ignored "-Wliteral-range"
 #endif
 
 #define BOOST_TEST_MAIN

From c86e74424dea51d6ed677ecbedf0dbd46eface4c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Tue, 30 Jul 2024 17:20:49 -0400
Subject: [PATCH 30/33] Revert if constexpr change

---
 include/boost/math/special_functions/gamma.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index af0599c88..b1beeb766 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -1913,7 +1913,7 @@ struct igamma_initializer
          // our initialization code here as the precision presumably
          // varies at runtime, and will not have been set yet.  Plus the
          // code requiring initialization isn't called when digits == 0.
-         BOOST_MATH_IF_CONSTEXPR (std::numeric_limits<T>::digits)
+         if (std::numeric_limits<T>::digits)
          {
             boost::math::gamma_p(static_cast<T>(400), static_cast<T>(400), Policy());
          }

From 4f846a9527fde46b0759d8ac3ab6f27a1b580588 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 31 Jul 2024 12:05:00 -0400
Subject: [PATCH 31/33] Fix typo

---
 include/boost/math/special_functions/detail/lgamma_small.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp
index 82a1761f1..d9fa88b8e 100644
--- a/include/boost/math/special_functions/detail/lgamma_small.hpp
+++ b/include/boost/math/special_functions/detail/lgamma_small.hpp
@@ -38,7 +38,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l);
 // lgamma for small arguments:
 //
 template <class T, class Policy, class Lanczos>
-BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, const Policy& /* l */, const Lanczos&)
+BOOST_MATH_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant<int, 64>&, const Policy& /* l */, const Lanczos&)
 {
    // This version uses rational approximations for small
    // values of z accurate enough for 64-bit mantissas

From d13ff4e498ebf984a846a0a6a32f44f338da0077 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 31 Jul 2024 14:29:09 -0400
Subject: [PATCH 32/33] Fix comparison

---
 include/boost/math/special_functions/erf.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/math/special_functions/erf.hpp b/include/boost/math/special_functions/erf.hpp
index 323a91937..0b311c464 100644
--- a/include/boost/math/special_functions/erf.hpp
+++ b/include/boost/math/special_functions/erf.hpp
@@ -221,7 +221,7 @@ BOOST_MATH_GPU_ENABLED T erf_imp(T z, bool invert, const Policy& pol, const std:
          prefix_multiplier = -1;
          // return -erf_imp(T(-z), invert, pol, t);
       }
-      else if(z < -0.5)
+      else if(z < T(-0.5))
       {
          prefix_adder = 2;
          // return 2 - erf_imp(T(-z), invert, pol, t);

From 43d5905134126549b4a80546f81a29a0549cfdd1 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 31 Jul 2024 14:29:54 -0400
Subject: [PATCH 33/33] Use tag type idiom

---
 include/boost/math/special_functions/detail/erf_inv.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp
index 09772632d..69a08abe6 100644
--- a/include/boost/math/special_functions/detail/erf_inv.hpp
+++ b/include/boost/math/special_functions/detail/erf_inv.hpp
@@ -317,7 +317,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_con
    //
    // Generic version, get a guess that's accurate to 64-bits (10^-19)
    //
-   T guess = erf_inv_imp(p, q, pol, std::integral_constant<int, 64>());
+   using tag_type = std::integral_constant<int, 64>;
+   T guess = erf_inv_imp(p, q, pol, tag_type());
    T result;
    //
    // If T has more bit's than 64 in it's mantissa then we need to iterate,