From e0db5811100af3f8ecd0c4d6eb077eec49083666 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 29 Jul 2024 16:37:39 -0400 Subject: [PATCH 01/33] Add GPU support to sqrt1pm1 --- include/boost/math/special_functions/sqrt1pm1.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/boost/math/special_functions/sqrt1pm1.hpp b/include/boost/math/special_functions/sqrt1pm1.hpp index 041916a53..4d8aeb38c 100644 --- a/include/boost/math/special_functions/sqrt1pm1.hpp +++ b/include/boost/math/special_functions/sqrt1pm1.hpp @@ -10,6 +10,7 @@ #pragma once #endif +#include #include #include #include @@ -21,7 +22,7 @@ namespace boost{ namespace math{ template -inline typename tools::promote_args::type sqrt1pm1(const T& val, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type sqrt1pm1(const T& val, const Policy& pol) { typedef typename tools::promote_args::type result_type; BOOST_MATH_STD_USING @@ -32,7 +33,7 @@ inline typename tools::promote_args::type sqrt1pm1(const T& val, const Policy } template -inline typename tools::promote_args::type sqrt1pm1(const T& val) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type sqrt1pm1(const T& val) { return sqrt1pm1(val, policies::policy<>()); } From 24c0c01a7216b965dcbd8e526efd41fc4bc523a2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 29 Jul 2024 17:10:15 -0400 Subject: [PATCH 02/33] Add GPU support to continued fractions and remove recursion --- include/boost/math/tools/fraction.hpp | 125 ++++++++++++++++++++------ 1 file changed, 96 insertions(+), 29 deletions(-) diff --git a/include/boost/math/tools/fraction.hpp b/include/boost/math/tools/fraction.hpp index a64c07025..1970e5ca3 100644 --- a/include/boost/math/tools/fraction.hpp +++ b/include/boost/math/tools/fraction.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2005-2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,6 +11,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -25,7 +28,7 @@ namespace detail struct is_pair : public std::false_type{}; template - struct is_pair> : public std::true_type{}; + struct is_pair> : public std::true_type{}; template struct fraction_traits_simple @@ -33,11 +36,11 @@ namespace detail using result_type = typename Gen::result_type; using value_type = typename Gen::result_type; - static result_type a(const value_type&) BOOST_MATH_NOEXCEPT(value_type) + BOOST_MATH_GPU_ENABLED static result_type a(const value_type&) BOOST_MATH_NOEXCEPT(value_type) { return 1; } - static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) + BOOST_MATH_GPU_ENABLED static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) { return v; } @@ -49,11 +52,11 @@ namespace detail using value_type = typename Gen::result_type; using result_type = typename value_type::first_type; - static result_type a(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) + BOOST_MATH_GPU_ENABLED static result_type a(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) { return v.first; } - static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) + BOOST_MATH_GPU_ENABLED static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) { return v.second; } @@ -74,7 +77,7 @@ namespace detail // For float, double, and long double, 1/min_value() is finite. // But for mpfr_float and cpp_bin_float, 1/min_value() is inf. // Multiply the min by 16 so that the reciprocal doesn't overflow. - static T get() { + BOOST_MATH_GPU_ENABLED static T get() { return 16*tools::min_value(); } }; @@ -82,13 +85,15 @@ namespace detail struct tiny_value { using value_type = typename T::value_type; - static T get() { + BOOST_MATH_GPU_ENABLED static T get() { return 16*tools::min_value(); } }; } // namespace detail +namespace detail { + // // continued_fraction_b // Evaluates: @@ -103,9 +108,15 @@ namespace detail // // Note that the first a0 returned by generator Gen is discarded. // + template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, std::uintmax_t& max_terms) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b_impl(Gen& g, const U& factor, std::uintmax_t& max_terms) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + // SYCL can not handle this condition so we only check float on that platform + && noexcept(std::declval()()) + #endif + ) { BOOST_MATH_STD_USING // ADL of std names @@ -148,17 +159,38 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G return f; } +} // namespace detail + +template +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, std::uintmax_t& max_terms) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) +{ + return detail::continued_fraction_b_impl(g, factor, max_terms); +} + template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { std::uintmax_t max_terms = (std::numeric_limits::max)(); - return continued_fraction_b(g, factor, max_terms); + return detail::continued_fraction_b_impl(g, factor, max_terms); } template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { BOOST_MATH_STD_USING // ADL of std names @@ -167,12 +199,16 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G result_type factor = ldexp(1.0f, 1 - bits); // 1 / pow(result_type(2), bits); std::uintmax_t max_terms = (std::numeric_limits::max)(); - return continued_fraction_b(g, factor, max_terms); + return detail::continued_fraction_b_impl(g, factor, max_terms); } template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, std::uintmax_t& max_terms) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, std::uintmax_t& max_terms) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { BOOST_MATH_STD_USING // ADL of std names @@ -180,9 +216,11 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G using result_type = typename traits::result_type; result_type factor = ldexp(1.0f, 1 - bits); // 1 / pow(result_type(2), bits); - return continued_fraction_b(g, factor, max_terms); + return detail::continued_fraction_b_impl(g, factor, max_terms); } +namespace detail { + // // continued_fraction_a // Evaluates: @@ -198,8 +236,12 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G // Note that the first a1 and b1 returned by generator Gen are both used. // template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, std::uintmax_t& max_terms) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a_impl(Gen& g, const U& factor, std::uintmax_t& max_terms) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { BOOST_MATH_STD_USING // ADL of std names @@ -244,17 +286,38 @@ inline typename detail::fraction_traits::result_type continued_fraction_a(G return a0/f; } +} // namespace detail + +template +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, std::uintmax_t& max_terms) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) +{ + return detail::continued_fraction_a_impl(g, factor, max_terms); +} + template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { std::uintmax_t max_iter = (std::numeric_limits::max)(); - return continued_fraction_a(g, factor, max_iter); + return detail::continued_fraction_a_impl(g, factor, max_iter); } template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { BOOST_MATH_STD_USING // ADL of std names @@ -264,12 +327,16 @@ inline typename detail::fraction_traits::result_type continued_fraction_a(G result_type factor = ldexp(1.0f, 1-bits); // 1 / pow(result_type(2), bits); std::uintmax_t max_iter = (std::numeric_limits::max)(); - return continued_fraction_a(g, factor, max_iter); + return detail::continued_fraction_a_impl(g, factor, max_iter); } template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, std::uintmax_t& max_terms) - noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, std::uintmax_t& max_terms) + noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) + #ifndef BOOST_MATH_ENABLE_SYCL + && noexcept(std::declval()()) + #endif + ) { BOOST_MATH_STD_USING // ADL of std names @@ -277,7 +344,7 @@ inline typename detail::fraction_traits::result_type continued_fraction_a(G using result_type = typename traits::result_type; result_type factor = ldexp(1.0f, 1-bits); // 1 / pow(result_type(2), bits); - return continued_fraction_a(g, factor, max_terms); + return detail::continued_fraction_a_impl(g, factor, max_terms); } } // namespace tools From 96953ea52186d70905efc64294b0e0cb3d98f30f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 29 Jul 2024 17:10:38 -0400 Subject: [PATCH 03/33] Add GPU support to erf and erfc --- include/boost/math/special_functions/erf.hpp | 90 +++++++++++-------- .../boost/math/special_functions/math_fwd.hpp | 12 +-- 2 files changed, 60 insertions(+), 42 deletions(-) diff --git a/include/boost/math/special_functions/erf.hpp b/include/boost/math/special_functions/erf.hpp index 57ff60529..323a91937 100644 --- a/include/boost/math/special_functions/erf.hpp +++ b/include/boost/math/special_functions/erf.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -39,7 +40,7 @@ template struct erf_asympt_series_t { // LCOV_EXCL_START multiprecision case only, excluded from coverage analysis - erf_asympt_series_t(T z) : xx(2 * -z * z), tk(1) + BOOST_MATH_GPU_ENABLED erf_asympt_series_t(T z) : xx(2 * -z * z), tk(1) { BOOST_MATH_STD_USING result = -exp(-z * z) / sqrt(boost::math::constants::pi()); @@ -48,7 +49,7 @@ struct erf_asympt_series_t typedef T result_type; - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { BOOST_MATH_STD_USING T r = result; @@ -68,33 +69,33 @@ struct erf_asympt_series_t // How large z has to be in order to ensure that the series converges: // template -inline float erf_asymptotic_limit_N(const T&) +BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const T&) { return (std::numeric_limits::max)(); } -inline float erf_asymptotic_limit_N(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant&) { return 2.8F; } -inline float erf_asymptotic_limit_N(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant&) { return 4.3F; } -inline float erf_asymptotic_limit_N(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant&) { return 4.8F; } -inline float erf_asymptotic_limit_N(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant&) { return 6.5F; } -inline float erf_asymptotic_limit_N(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline float erf_asymptotic_limit_N(const std::integral_constant&) { return 6.8F; } template -inline T erf_asymptotic_limit() +BOOST_MATH_GPU_ENABLED inline T erf_asymptotic_limit() { typedef typename policies::precision::type precision_type; typedef std::integral_constant -T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant& t) +BOOST_MATH_GPU_ENABLED T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant& t) { BOOST_MATH_STD_USING @@ -207,14 +208,30 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant(%1%)", "Expected a finite argument but got %1%", z, pol); + int prefix_multiplier = 1; + int prefix_adder = 0; + if(z < 0) { + // Recursion is logically simpler here, but confuses static analyzers that need to be + // able to calculate the maximimum program stack size at compile time (ie CUDA). + z = -z; if(!invert) - return -erf_imp(T(-z), invert, pol, t); - else if(z < T(-0.5)) - return 2 - erf_imp(T(-z), invert, pol, t); + { + prefix_multiplier = -1; + // return -erf_imp(T(-z), invert, pol, t); + } + else if(z < -0.5) + { + prefix_adder = 2; + // return 2 - erf_imp(T(-z), invert, pol, t); + } else - return 1 + erf_imp(T(-z), false, pol, t); + { + invert = false; + prefix_adder = 1; + // return 1 + erf_imp(T(-z), false, pol, t); + } } T result; @@ -237,7 +254,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant(z * 1.125f + z * c); } } @@ -248,15 +265,15 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constantT erf_imp(T z, bool invert, const Lanczos& l, const std::integral_constant& t) @@ -1175,7 +1193,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const std::integral_constant -inline typename tools::promote_args::type erf(T z, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type erf(T z, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -1208,7 +1226,7 @@ inline typename tools::promote_args::type erf(T z, const Policy& /* pol */) } template -inline typename tools::promote_args::type erfc(T z, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type erfc(T z, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -1241,13 +1259,13 @@ inline typename tools::promote_args::type erfc(T z, const Policy& /* pol */) } template -inline typename tools::promote_args::type erf(T z) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type erf(T z) { return boost::math::erf(z, policies::policy<>()); } template -inline typename tools::promote_args::type erfc(T z) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type erfc(T z) { return boost::math::erfc(z, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 5e8153734..cbfe27e81 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -156,14 +156,14 @@ namespace boost // erf & erfc error functions. template // Error function. - tools::promote_args_t erf(RT z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erf(RT z); template // Error function. - tools::promote_args_t erf(RT z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erf(RT z, const Policy&); template // Error function complement. - tools::promote_args_t erfc(RT z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erfc(RT z); template // Error function complement. - tools::promote_args_t erfc(RT z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erfc(RT z, const Policy&); template // Error function inverse. tools::promote_args_t erf_inv(RT z); @@ -1272,10 +1272,10 @@ namespace boost template T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t erf(RT z) { return ::boost::math::erf(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erf(RT z) { return ::boost::math::erf(z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t erfc(RT z){ return ::boost::math::erfc(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erfc(RT z){ return ::boost::math::erfc(z, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t erf_inv(RT z) { return ::boost::math::erf_inv(z, Policy()); }\ From a8af41e17e845f5fc50bafb0486951604530727a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 29 Jul 2024 17:10:47 -0400 Subject: [PATCH 04/33] Add SYCL erf testing --- test/sycl_jamfile | 1 + test/test_erf.cpp | 4 ++++ test/test_erf.hpp | 11 +++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 642329806..e336523e6 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -32,3 +32,4 @@ run test_expm1_simple.cpp ; run test_log1p_simple.cpp ; run test_digamma_simple.cpp ; run test_trigamma.cpp ; +run test_erf.cpp ; diff --git a/test/test_erf.cpp b/test/test_erf.cpp index 535903983..beb521ac4 100644 --- a/test/test_erf.cpp +++ b/test/test_erf.cpp @@ -4,7 +4,11 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#include #include "test_erf.hpp" // diff --git a/test/test_erf.hpp b/test/test_erf.hpp index dc42c8124..b70c73953 100644 --- a/test/test_erf.hpp +++ b/test/test_erf.hpp @@ -1,9 +1,11 @@ -// Copyright John Maddock 2006. -// Copyright Paul A. Bristow 2007, 2009 +// Copyright John Maddock 2006. +// Copyright Paul A. Bristow 2007, 2009 +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#include #include #define BOOST_TEST_MAIN #include @@ -21,6 +23,11 @@ #define SC_(x) static_cast::type>(BOOST_JOIN(x, L)) #endif +#ifdef BOOST_MATH_NO_EXCEPTIONS +# undef BOOST_CHECK_THROW +# define BOOST_CHECK_THROW(x, y) +#endif + template void do_test_erf(const T& data, const char* type_name, const char* test_name) { From c91ac7ccb147855ca16739ec24b76f5b7e6e8590 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 29 Jul 2024 17:18:00 -0400 Subject: [PATCH 05/33] Add sqrt1pm1 to fwd --- include/boost/math/special_functions/math_fwd.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index cbfe27e81..c789d82df 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -600,10 +600,10 @@ namespace boost // sqrt(1+x) - 1 template - tools::promote_args_t sqrt1pm1(const T& val); + BOOST_MATH_GPU_ENABLED tools::promote_args_t sqrt1pm1(const T& val); template - tools::promote_args_t sqrt1pm1(const T& val, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t sqrt1pm1(const T& val, const Policy&); // sinus cardinals: template @@ -1484,7 +1484,7 @@ namespace boost powm1(const T1 a, const T2 z){ return boost::math::powm1(a, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\ From 325720d9035546f865603a719271f94a7a6a467e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 29 Jul 2024 17:18:42 -0400 Subject: [PATCH 06/33] Add CUDA erf and erfc testing --- test/cuda_jamfile | 4 ++ test/test_erf_double.cu | 100 +++++++++++++++++++++++++++++++++++++++ test/test_erf_float.cu | 100 +++++++++++++++++++++++++++++++++++++++ test/test_erfc_double.cu | 100 +++++++++++++++++++++++++++++++++++++++ test/test_erfc_float.cu | 100 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 404 insertions(+) create mode 100644 test/test_erf_double.cu create mode 100644 test/test_erf_float.cu create mode 100644 test/test_erfc_double.cu create mode 100644 test/test_erfc_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index cb067d564..fe67d9f4e 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -100,6 +100,10 @@ run test_cos_pi_double.cu ; run test_cos_pi_float.cu ; run test_digamma_double.cu ; run test_digamma_float.cu ; +run test_erf_double.cu ; +run test_erf_float.cu ; +run test_erfc_double.cu ; +run test_erfc_float.cu ; run test_expm1_double.cu ; run test_expm1_float.cu ; run test_log1p_double.cu ; diff --git a/test/test_erf_double.cu b/test/test_erf_double.cu new file mode 100644 index 000000000..3e8398262 --- /dev/null +++ b/test/test_erf_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_erf_float.cu b/test/test_erf_float.cu new file mode 100644 index 000000000..6cbd07e6a --- /dev/null +++ b/test/test_erf_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_erfc_double.cu b/test/test_erfc_double.cu new file mode 100644 index 000000000..86d3c6e5b --- /dev/null +++ b/test/test_erfc_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erfc(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erfc(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_erfc_float.cu b/test/test_erfc_float.cu new file mode 100644 index 000000000..7970063a4 --- /dev/null +++ b/test/test_erfc_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erfc(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erfc(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From d1b582332b23e6695dce58fa5b2cfb7fd6277df2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 08:22:19 -0400 Subject: [PATCH 07/33] Replace integral constant pointers with references --- .../boost/math/special_functions/detail/erf_inv.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp index 0054a7426..690ea4a44 100644 --- a/include/boost/math/special_functions/detail/erf_inv.hpp +++ b/include/boost/math/special_functions/detail/erf_inv.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -23,7 +24,7 @@ namespace detail{ // this version is for 80-bit long double's and smaller: // template -T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant*) +T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant&) { BOOST_MATH_STD_USING // for ADL of std names. @@ -310,12 +311,12 @@ struct erf_roots }; template -T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_constant*) +T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_constant&) { // // Generic version, get a guess that's accurate to 64-bits (10^-19) // - T guess = erf_inv_imp(p, q, pol, static_cast const*>(nullptr)); + T guess = erf_inv_imp(p, q, pol, std::integral_constant()); T result; // // If T has more bit's than 64 in it's mantissa then we need to iterate, @@ -401,7 +402,7 @@ typename tools::promote_args::type erfc_inv(T z, const Policy& pol) // And get the result, negating where required: // return s * policies::checked_narrowing_cast( - detail::erf_inv_imp(static_cast(p), static_cast(q), forwarding_policy(), static_cast(nullptr)), function); + detail::erf_inv_imp(static_cast(p), static_cast(q), forwarding_policy(), tag_type()), function); } template @@ -469,7 +470,7 @@ typename tools::promote_args::type erf_inv(T z, const Policy& pol) // And get the result, negating where required: // return s * policies::checked_narrowing_cast( - detail::erf_inv_imp(static_cast(p), static_cast(q), forwarding_policy(), static_cast(nullptr)), function); + detail::erf_inv_imp(static_cast(p), static_cast(q), forwarding_policy(), tag_type()), function); } template From 6a12bf06a745ccb7a031dd6d3fd5c3a9223a5808 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 08:31:37 -0400 Subject: [PATCH 08/33] Make erf_inv and erfc_inv GPU enabled --- .../math/special_functions/detail/erf_inv.hpp | 57 ++++++++++--------- .../boost/math/special_functions/math_fwd.hpp | 12 ++-- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp index 690ea4a44..09772632d 100644 --- a/include/boost/math/special_functions/detail/erf_inv.hpp +++ b/include/boost/math/special_functions/detail/erf_inv.hpp @@ -14,6 +14,7 @@ #pragma warning(disable:4702) // Unreachable code: optimization warning #endif +#include #include namespace boost{ namespace math{ @@ -24,7 +25,7 @@ namespace detail{ // this version is for 80-bit long double's and smaller: // template -T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constant&) { BOOST_MATH_STD_USING // for ADL of std names. @@ -45,8 +46,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan // Maximum Deviation Found (actual error term at infinite precision) 8.030e-21 // // LCOV_EXCL_START - static const float Y = 0.0891314744949340820313f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.0891314744949340820313f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.000508781949658280665617), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00836874819741736770379), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0334806625409744615033), @@ -56,7 +57,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, 0.00822687874676915743155), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00538772965071242932965) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.970005043303290640362), BOOST_MATH_BIG_CONSTANT(T, 64, -1.56574558234175846809), @@ -88,8 +89,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan // Maximum Deviation Found (error term) 4.811e-20 // // LCOV_EXCL_START - static const float Y = 2.249481201171875f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 2.249481201171875f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.202433508355938759655), BOOST_MATH_BIG_CONSTANT(T, 64, 0.105264680699391713268), BOOST_MATH_BIG_CONSTANT(T, 64, 8.37050328343119927838), @@ -100,7 +101,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, 21.1294655448340526258), BOOST_MATH_BIG_CONSTANT(T, 64, -3.67192254707729348546) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 6.24264124854247537712), BOOST_MATH_BIG_CONSTANT(T, 64, 3.9713437953343869095), @@ -143,8 +144,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan { // LCOV_EXCL_START // Max error found: 1.089051e-20 - static const float Y = 0.807220458984375f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.807220458984375f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.131102781679951906451), BOOST_MATH_BIG_CONSTANT(T, 64, -0.163794047193317060787), BOOST_MATH_BIG_CONSTANT(T, 64, 0.117030156341995252019), @@ -157,7 +158,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, 0.285225331782217055858e-7), BOOST_MATH_BIG_CONSTANT(T, 64, -0.681149956853776992068e-9) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 3.46625407242567245975), BOOST_MATH_BIG_CONSTANT(T, 64, 5.38168345707006855425), @@ -176,8 +177,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan { // LCOV_EXCL_START // Max error found: 8.389174e-21 - static const float Y = 0.93995571136474609375f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.93995571136474609375f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.0350353787183177984712), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00222426529213447927281), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0185573306514231072324), @@ -188,7 +189,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, -0.230404776911882601748e-9), BOOST_MATH_BIG_CONSTANT(T, 64, 0.266339227425782031962e-11) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 1.3653349817554063097), BOOST_MATH_BIG_CONSTANT(T, 64, 0.762059164553623404043), @@ -206,8 +207,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan { // LCOV_EXCL_START // Max error found: 1.481312e-19 - static const float Y = 0.98362827301025390625f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.98362827301025390625f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.0167431005076633737133), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00112951438745580278863), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00105628862152492910091), @@ -218,7 +219,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, -0.281128735628831791805e-13), BOOST_MATH_BIG_CONSTANT(T, 64, 0.99055709973310326855e-16) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.591429344886417493481), BOOST_MATH_BIG_CONSTANT(T, 64, 0.138151865749083321638), @@ -236,8 +237,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan { // LCOV_EXCL_START // Max error found: 5.697761e-20 - static const float Y = 0.99714565277099609375f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.99714565277099609375f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.0024978212791898131227), BOOST_MATH_BIG_CONSTANT(T, 64, -0.779190719229053954292e-5), BOOST_MATH_BIG_CONSTANT(T, 64, 0.254723037413027451751e-4), @@ -247,7 +248,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, 0.145596286718675035587e-11), BOOST_MATH_BIG_CONSTANT(T, 64, -0.116765012397184275695e-17) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.207123112214422517181), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0169410838120975906478), @@ -265,8 +266,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan { // LCOV_EXCL_START // Max error found: 1.279746e-20 - static const float Y = 0.99941349029541015625f; - static const T P[] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const float Y = 0.99941349029541015625f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.000539042911019078575891), BOOST_MATH_BIG_CONSTANT(T, 64, -0.28398759004727721098e-6), BOOST_MATH_BIG_CONSTANT(T, 64, 0.899465114892291446442e-6), @@ -276,7 +277,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const std::integral_constan BOOST_MATH_BIG_CONSTANT(T, 64, 0.135880130108924861008e-14), BOOST_MATH_BIG_CONSTANT(T, 64, -0.348890393399948882918e-21) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0845746234001899436914), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00282092984726264681981), @@ -345,14 +346,14 @@ T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_con } // namespace detail template -typename tools::promote_args::type erfc_inv(T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type erfc_inv(T z, const Policy& pol) { typedef typename tools::promote_args::type result_type; // // Begin by testing for domain errors, and other special cases: // - static const char* function = "boost::math::erfc_inv<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::erfc_inv<%1%>(%1%, %1%)"; if((z < 0) || (z > 2)) return policies::raise_domain_error(function, "Argument outside range [0,2] in inverse erfc function (got p=%1%).", z, pol); if(z == 0) @@ -406,14 +407,14 @@ typename tools::promote_args::type erfc_inv(T z, const Policy& pol) } template -typename tools::promote_args::type erf_inv(T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type erf_inv(T z, const Policy& pol) { typedef typename tools::promote_args::type result_type; // // Begin by testing for domain errors, and other special cases: // - static const char* function = "boost::math::erf_inv<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::erf_inv<%1%>(%1%, %1%)"; if((z < -1) || (z > 1)) return policies::raise_domain_error(function, "Argument outside range [-1, 1] in inverse erf function (got p=%1%).", z, pol); if(z == 1) @@ -474,13 +475,13 @@ typename tools::promote_args::type erf_inv(T z, const Policy& pol) } template -inline typename tools::promote_args::type erfc_inv(T z) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type erfc_inv(T z) { return erfc_inv(z, policies::policy<>()); } template -inline typename tools::promote_args::type erf_inv(T z) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type erf_inv(T z) { return erf_inv(z, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index c789d82df..1d22f61b1 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -166,14 +166,14 @@ namespace boost BOOST_MATH_GPU_ENABLED tools::promote_args_t erfc(RT z, const Policy&); template // Error function inverse. - tools::promote_args_t erf_inv(RT z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erf_inv(RT z); template // Error function inverse. - tools::promote_args_t erf_inv(RT z, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erf_inv(RT z, const Policy& pol); template // Error function complement inverse. - tools::promote_args_t erfc_inv(RT z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erfc_inv(RT z); template // Error function complement inverse. - tools::promote_args_t erfc_inv(RT z, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t erfc_inv(RT z, const Policy& pol); // Polynomials: template @@ -1278,10 +1278,10 @@ namespace boost BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erfc(RT z){ return ::boost::math::erfc(z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t erf_inv(RT z) { return ::boost::math::erf_inv(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erf_inv(RT z) { return ::boost::math::erf_inv(z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t erfc_inv(RT z){ return ::boost::math::erfc_inv(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erfc_inv(RT z){ return ::boost::math::erfc_inv(z, Policy()); }\ \ using boost::math::legendre_next;\ \ From 0e1be528125151c78b9cbb2b78fb7fddda0c70e4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 08:31:49 -0400 Subject: [PATCH 09/33] Add erf_inv and erfc_inv CUDA testing --- test/cuda_jamfile | 4 ++ test/test_erf_inv_double.cu | 100 +++++++++++++++++++++++++++++++++++ test/test_erf_inv_float.cu | 100 +++++++++++++++++++++++++++++++++++ test/test_erfc_inv_double.cu | 100 +++++++++++++++++++++++++++++++++++ test/test_erfc_inv_float.cu | 100 +++++++++++++++++++++++++++++++++++ 5 files changed, 404 insertions(+) create mode 100644 test/test_erf_inv_double.cu create mode 100644 test/test_erf_inv_float.cu create mode 100644 test/test_erfc_inv_double.cu create mode 100644 test/test_erfc_inv_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index fe67d9f4e..7af3b9e48 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -102,8 +102,12 @@ run test_digamma_double.cu ; run test_digamma_float.cu ; run test_erf_double.cu ; run test_erf_float.cu ; +run test_erf_inv_double.cu ; +run test_erf_inv_float.cu ; run test_erfc_double.cu ; run test_erfc_float.cu ; +run test_erfc_inv_double.cu ; +run test_erfc_inv_float.cu ; run test_expm1_double.cu ; run test_expm1_float.cu ; run test_log1p_double.cu ; diff --git a/test/test_erf_inv_double.cu b/test/test_erf_inv_double.cu new file mode 100644 index 000000000..f540babbb --- /dev/null +++ b/test/test_erf_inv_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf_inv(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf_inv(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_erf_inv_float.cu b/test/test_erf_inv_float.cu new file mode 100644 index 000000000..d9f37687f --- /dev/null +++ b/test/test_erf_inv_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf_inv(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf_inv(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_erfc_inv_double.cu b/test/test_erfc_inv_double.cu new file mode 100644 index 000000000..68642cd10 --- /dev/null +++ b/test/test_erfc_inv_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erfc_inv(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erfc_inv(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_erfc_inv_float.cu b/test/test_erfc_inv_float.cu new file mode 100644 index 000000000..b5b72cd05 --- /dev/null +++ b/test/test_erfc_inv_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erfc_inv(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erfc_inv(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 3d0bd18f11529f1669e2751104244d216d9b95f2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 08:39:00 -0400 Subject: [PATCH 10/33] Replace igamma_temme_large integral constant pointer with reference --- .../math/special_functions/detail/igamma_large.hpp | 13 ++++++++----- include/boost/math/special_functions/gamma.hpp | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp index 5483b53fb..e8162095c 100644 --- a/include/boost/math/special_functions/detail/igamma_large.hpp +++ b/include/boost/math/special_functions/detail/igamma_large.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2006. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -59,13 +60,15 @@ #pragma GCC system_header #endif +#include + namespace boost{ namespace math{ namespace detail{ // This version will never be called (at runtime), it's a stub used // when T is unsuitable to be passed to these routines: // template -inline T igamma_temme_large(T, T, const Policy& /* pol */, std::integral_constant const *) +inline T igamma_temme_large(T, T, const Policy& /* pol */, const std::integral_constant&) { // stub function, should never actually be called BOOST_MATH_ASSERT(0); @@ -76,7 +79,7 @@ inline T igamma_temme_large(T, T, const Policy& /* pol */, std::integral_constan // (80-bit long double, or 10^-20). // template -T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant const *) +T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -281,7 +284,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant -T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant const *) +T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -423,7 +426,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant -T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant const *) +T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -479,7 +482,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant -T igamma_temme_large(T a, T x, const Policy& pol, std::integral_constant const *) +T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index a58ea3e69..0c3b32b46 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1454,7 +1454,7 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, precision_type::value <= 113 ? 113 : 0 > tag_type; - result = igamma_temme_large(a, x, pol, static_cast(nullptr)); + result = igamma_temme_large(a, x, pol, tag_type()); if(x >= a) invert = !invert; if(p_derivative) From 8862896ff8dffe8abde3e277e8729612f322dd6d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 08:40:50 -0400 Subject: [PATCH 11/33] Add GPU support to igamma_temme_large --- .../special_functions/detail/igamma_large.hpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp index e8162095c..1fa13c692 100644 --- a/include/boost/math/special_functions/detail/igamma_large.hpp +++ b/include/boost/math/special_functions/detail/igamma_large.hpp @@ -284,7 +284,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant -T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -296,7 +296,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.33333333333333333L), static_cast(0.083333333333333333L), static_cast(-0.014814814814814815L), @@ -315,7 +315,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.0018518518518518519L), static_cast(-0.0034722222222222222L), static_cast(0.0026455026455026455L), @@ -332,7 +332,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(0.0041335978835978836L), static_cast(-0.0026813271604938272L), static_cast(0.00077160493827160494L), @@ -347,7 +347,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(0.00064943415637860082L), static_cast(0.00022947209362139918L), static_cast(-0.00046918949439525571L), @@ -360,7 +360,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.0008618882909167117L), static_cast(0.00078403922172006663L), static_cast(-0.00029907248030319018L), @@ -371,7 +371,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.00033679855336635815L), static_cast(-0.69728137583658578e-4L), static_cast(0.00027727532449593921L), @@ -384,7 +384,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(0.00053130793646399222L), static_cast(-0.00059216643735369388L), static_cast(0.00027087820967180448L), @@ -395,7 +395,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(0.00034436760689237767L), static_cast(0.51717909082605922e-4L), static_cast(-0.00033493161081142236L), @@ -404,7 +404,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.00065262391859530942L), static_cast(0.00083949872067208728L), static_cast(-0.00043829709854172101L), @@ -426,7 +426,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant -T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -438,7 +438,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.333333333L), static_cast(0.0833333333L), static_cast(-0.0148148148L), @@ -449,7 +449,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(-0.00185185185L), static_cast(-0.00347222222L), static_cast(0.00264550265L), @@ -458,7 +458,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant(0.00413359788L), static_cast(-0.00268132716L), static_cast(0.000771604938L), From 7cb386068f2cd213c9cd6f809bc227b0299e305f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 11:52:57 -0400 Subject: [PATCH 12/33] Add GPU support to lgamma_small_imp --- .../special_functions/detail/lgamma_small.hpp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp index 6a4a9171f..a4d3bac70 100644 --- a/include/boost/math/special_functions/detail/lgamma_small.hpp +++ b/include/boost/math/special_functions/detail/lgamma_small.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,6 +11,7 @@ #pragma once #endif +#include #include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) @@ -36,7 +38,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); // lgamma for small arguments: // template -T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& /* l */, const Lanczos&) +BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& /* l */, const Lanczos&) { // This version uses rational approximations for small // values of z accurate enough for 64-bit mantissas @@ -87,7 +89,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co // At long double: Max error found: 1.987e-21 // Maximum Deviation Found (approximation error): 5.900e-24 // - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.180355685678449379109e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.25126649619989678683e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.494103151567532234274e-1)), @@ -96,7 +98,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.541009869215204396339e-3)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.324588649825948492091e-4)) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.196202987197795200688e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.148019669424231326694e1)), @@ -107,7 +109,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.223352763208617092964e-6)) }; - static const float Y = 0.158963680267333984375e0f; + constexpr float Y = 0.158963680267333984375e0f; T r = zm2 * (z + 1); T R = tools::evaluate_polynomial(P, zm2); @@ -152,9 +154,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co // Expected Error Term: 3.139e-021 // - static const float Y = 0.52815341949462890625f; + constexpr float Y = 0.52815341949462890625f; - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.490622454069039543534e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.969117530159521214579e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.414983358359495381969e0)), @@ -163,7 +165,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.240149820648571559892e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.100346687696279557415e-2)) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.302349829846463038743e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.348739585360723852576e1)), @@ -197,9 +199,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co // Maximum Deviation Found: 2.151e-021 // Expected Error Term: 2.150e-021 // - static const float Y = 0.452017307281494140625f; + constexpr float Y = 0.452017307281494140625f; - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.292329721830270012337e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.144216267757192309184e0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.142440390738631274135e0)), @@ -207,7 +209,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, co static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.850535976868336437746e-2)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.431171342679297331241e-3)) }; - static const T Q[] = { + BOOST_MATH_STATIC const T Q[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.150169356054485044494e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.846973248876495016101e0)), From cc8f359d565efe429c1e7f8f52009304fe433b5f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:10:28 -0400 Subject: [PATCH 13/33] Add additional factorial overloads for GPU times --- .../detail/unchecked_factorial.hpp | 122 +++++++++++++++++- 1 file changed, 119 insertions(+), 3 deletions(-) diff --git a/include/boost/math/special_functions/detail/unchecked_factorial.hpp b/include/boost/math/special_functions/detail/unchecked_factorial.hpp index b528a24fe..f7720a2ab 100644 --- a/include/boost/math/special_functions/detail/unchecked_factorial.hpp +++ b/include/boost/math/special_functions/detail/unchecked_factorial.hpp @@ -46,6 +46,8 @@ struct max_factorial; template struct unchecked_factorial_data; +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template struct unchecked_factorial_data { @@ -137,17 +139,67 @@ template // Definitions: template <> -inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float)) +BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float)) { return unchecked_factorial_data::factorials[i]; } +#else + +template <> +BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float)) +{ + constexpr float factorials[] = { + 1.0F, + 1.0F, + 2.0F, + 6.0F, + 24.0F, + 120.0F, + 720.0F, + 5040.0F, + 40320.0F, + 362880.0F, + 3628800.0F, + 39916800.0F, + 479001600.0F, + 6227020800.0F, + 87178291200.0F, + 1307674368000.0F, + 20922789888000.0F, + 355687428096000.0F, + 6402373705728000.0F, + 121645100408832000.0F, + 0.243290200817664e19F, + 0.5109094217170944e20F, + 0.112400072777760768e22F, + 0.2585201673888497664e23F, + 0.62044840173323943936e24F, + 0.15511210043330985984e26F, + 0.403291461126605635584e27F, + 0.10888869450418352160768e29F, + 0.304888344611713860501504e30F, + 0.8841761993739701954543616e31F, + 0.26525285981219105863630848e33F, + 0.822283865417792281772556288e34F, + 0.26313083693369353016721801216e36F, + 0.868331761881188649551819440128e37F, + 0.29523279903960414084761860964352e39F, + }; + + return factorials[i]; +} + +#endif + template <> struct max_factorial { static constexpr unsigned value = 34; }; +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template struct unchecked_factorial_data { @@ -510,7 +562,7 @@ template #endif template <> -inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double)) +BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double)) { return unchecked_factorial_data::factorials[i]; } @@ -521,6 +573,62 @@ struct max_factorial static constexpr unsigned value = 170; }; +#else + +template <> +BOOST_MATH_GPU_ENABLED inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double)) +{ + constexpr double factorials[] = { + 1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880.0, + 3628800.0, + 39916800.0, + 479001600.0, + 6227020800.0, + 87178291200.0, + 1307674368000.0, + 20922789888000.0, + 355687428096000.0, + 6402373705728000.0, + 121645100408832000.0, + 0.243290200817664e19, + 0.5109094217170944e20, + 0.112400072777760768e22, + 0.2585201673888497664e23, + 0.62044840173323943936e24, + 0.15511210043330985984e26, + 0.403291461126605635584e27, + 0.10888869450418352160768e29, + 0.304888344611713860501504e30, + 0.8841761993739701954543616e31, + 0.26525285981219105863630848e33, + 0.822283865417792281772556288e34, + 0.26313083693369353016721801216e36, + 0.868331761881188649551819440128e37, + 0.29523279903960414084761860964352e39, + }; + + return factorials[i]; +} + +template <> +struct max_factorial +{ + static constexpr unsigned value = 34; +}; + +#endif + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template struct unchecked_factorial_data { @@ -1556,6 +1664,8 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant inline T unchecked_factorial_imp(unsigned i, const std::integral_constant::digits>&) { @@ -1568,6 +1678,8 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant(i); } +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + #if DBL_MANT_DIG != LDBL_MANT_DIG template inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) @@ -1583,6 +1695,8 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant inline T unchecked_factorial(unsigned i) { @@ -1601,9 +1715,11 @@ struct max_factorial { static constexpr unsigned value = std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value - : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value + : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value + #ifndef BOOST_MATH_GPU_ENABLED : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL + #endif : 100; }; From d3fc301f8425bc493ae4a4371931081924cbf43c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:10:41 -0400 Subject: [PATCH 14/33] Add GPU support to lgamma_small --- include/boost/math/special_functions/detail/lgamma_small.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp index a4d3bac70..82a1761f1 100644 --- a/include/boost/math/special_functions/detail/lgamma_small.hpp +++ b/include/boost/math/special_functions/detail/lgamma_small.hpp @@ -30,7 +30,7 @@ namespace boost{ namespace math{ namespace detail{ // These need forward declaring to keep GCC happy: // template -T gamma_imp(T z, const Policy& pol, const Lanczos& l); +BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l); template T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); From d02674c50c22407a2154c0c5474319cf97ac5398 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:10:59 -0400 Subject: [PATCH 15/33] Fix signbit GPU support --- include/boost/math/special_functions/sign.hpp | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/sign.hpp b/include/boost/math/special_functions/sign.hpp index cd7fdfb2a..2092a41bb 100644 --- a/include/boost/math/special_functions/sign.hpp +++ b/include/boost/math/special_functions/sign.hpp @@ -26,9 +26,10 @@ namespace detail { #ifdef BOOST_MATH_USE_STD_FPCLASSIFY template - inline int signbit_impl(T x, native_tag const&) + BOOST_MATH_GPU_ENABLED inline int signbit_impl(T x, native_tag const&) { - return (std::signbit)(x) ? 1 : 0; + using std::signbit; + return (signbit)(x) ? 1 : 0; } #endif @@ -66,7 +67,7 @@ namespace detail { #endif template - inline int signbit_impl(T x, ieee_copy_all_bits_tag const&) + BOOST_MATH_GPU_ENABLED inline int signbit_impl(T x, ieee_copy_all_bits_tag const&) { typedef typename fp_traits::type traits; @@ -76,7 +77,7 @@ namespace detail { } template - inline int signbit_impl(T x, ieee_copy_leading_bits_tag const&) + BOOST_MATH_GPU_ENABLED inline int signbit_impl(T x, ieee_copy_leading_bits_tag const&) { typedef typename fp_traits::type traits; @@ -161,6 +162,22 @@ BOOST_MATH_GPU_ENABLED int (signbit)(T x) return detail::signbit_impl(static_cast(x), method()); } +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +template<> +BOOST_MATH_GPU_ENABLED int signbit(float x) +{ + using std::signbit; + return (signbit)(x); +} + +template<> +BOOST_MATH_GPU_ENABLED int signbit(double x) +{ + using std::signbit; + return (signbit)(x); +} +#endif + template BOOST_MATH_GPU_ENABLED inline int sign BOOST_NO_MACRO_EXPAND(const T& z) { From b0496b15c98c359852eabd4ce70023fe29dc3929 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:11:16 -0400 Subject: [PATCH 16/33] Support GPU with root epsilon functions --- include/boost/math/tools/precision.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp index 3f2f6f3ac..28caefff9 100644 --- a/include/boost/math/tools/precision.hpp +++ b/include/boost/math/tools/precision.hpp @@ -316,13 +316,13 @@ BOOST_MATH_GPU_ENABLED constexpr long double epsilon(BOOST_MATH_EXPLICIT_TEMPLAT namespace detail{ template -inline constexpr T root_epsilon_imp(const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const std::integral_constant&) noexcept(std::is_floating_point::value) { return static_cast(0.00034526698300124390839884978618400831996329879769945L); } template -inline constexpr T root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) { return static_cast(0.1490116119384765625e-7L); } @@ -355,13 +355,13 @@ inline T root_epsilon_imp(const T*, const std::integral_constant&) } template -inline constexpr T cbrt_epsilon_imp(const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const std::integral_constant&) noexcept(std::is_floating_point::value) { return static_cast(0.0049215666011518482998719164346805794944150447839903L); } template -inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) { return static_cast(6.05545445239333906078989272793696693569753008995e-6L); } @@ -394,13 +394,13 @@ inline T cbrt_epsilon_imp(const T*, const std::integral_constant&) } template -inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) { return static_cast(0.018581361171917516667460937040007436176452688944747L); } template -inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) { return static_cast(0.0001220703125L); } @@ -442,19 +442,19 @@ struct root_epsilon_traits } template -inline constexpr T root_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) { return detail::root_epsilon_imp(static_cast(nullptr), typename detail::root_epsilon_traits::tag_type()); } template -inline constexpr T cbrt_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) { return detail::cbrt_epsilon_imp(static_cast(nullptr), typename detail::root_epsilon_traits::tag_type()); } template -inline constexpr T forth_root_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) { return detail::forth_root_epsilon_imp(static_cast(nullptr), typename detail::root_epsilon_traits::tag_type()); } From 58a5bc47ca2b03a4a165c8a5016a922602755d45 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:11:35 -0400 Subject: [PATCH 17/33] Add GPU support to gamma functions --- .../boost/math/special_functions/gamma.hpp | 148 +++++++++--------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 0c3b32b46..b77f4f5ea 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -2,7 +2,7 @@ // Copyright Paul A. Bristow 2007, 2013-14. // Copyright Nikhar Agrawal 2013-14 // Copyright Christopher Kormanyos 2013-14, 2020, 2024 - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -14,12 +14,12 @@ #pragma once #endif +#include #include #include #include #include #include -#include #include #include #include @@ -56,13 +56,13 @@ namespace boost{ namespace math{ namespace detail{ template -inline bool is_odd(T v, const std::true_type&) +BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::true_type&) { int i = static_cast(v); return i&1; } template -inline bool is_odd(T v, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::false_type&) { // Oh dear can't cast T to int! BOOST_MATH_STD_USING @@ -70,13 +70,13 @@ inline bool is_odd(T v, const std::false_type&) return static_cast(modulus != 0); } template -inline bool is_odd(T v) +BOOST_MATH_GPU_ENABLED inline bool is_odd(T v) { return is_odd(v, ::std::is_convertible()); } template -T sinpx(T z) +BOOST_MATH_GPU_ENABLED T sinpx(T z) { // Ad hoc function calculates x * sin(pi * x), // taking extra care near when x is near a whole number. @@ -108,7 +108,7 @@ T sinpx(T z) // tgamma(z), with Lanczos support: // template -T gamma_imp(T z, const Policy& pol, const Lanczos& l) +BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING @@ -122,7 +122,7 @@ T gamma_imp(T z, const Policy& pol, const Lanczos& l) b = true; } #endif - static const char* function = "boost::math::tgamma<%1%>(%1%)"; + constexpr auto function = "boost::math::tgamma<%1%>(%1%)"; if(z <= 0) { @@ -199,7 +199,7 @@ T gamma_imp(T z, const Policy& pol, const Lanczos& l) // lgamma(z) with Lanczos support: // template -T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) +BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) { #ifdef BOOST_MATH_INSTRUMENT static bool b = false; @@ -212,7 +212,7 @@ T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) BOOST_MATH_STD_USING - static const char* function = "boost::math::lgamma<%1%>(%1%)"; + constexpr auto function = "boost::math::lgamma<%1%>(%1%)"; T result = 0; int sresult = 1; @@ -291,12 +291,12 @@ struct upper_incomplete_gamma_fract public: typedef std::pair result_type; - upper_incomplete_gamma_fract(T a1, T z1) + BOOST_MATH_GPU_ENABLED upper_incomplete_gamma_fract(T a1, T z1) : z(z1-a1+1), a(a1), k(0) { } - result_type operator()() + BOOST_MATH_GPU_ENABLED result_type operator()() { ++k; z += 2; @@ -305,7 +305,7 @@ struct upper_incomplete_gamma_fract }; template -inline T upper_gamma_fraction(T a, T z, T eps) +BOOST_MATH_GPU_ENABLED inline T upper_gamma_fraction(T a, T z, T eps) { // Multiply result by z^a * e^-z to get the full // upper incomplete integral. Divide by tgamma(z) @@ -321,9 +321,9 @@ struct lower_incomplete_gamma_series T a, z, result; public: typedef T result_type; - lower_incomplete_gamma_series(T a1, T z1) : a(a1), z(z1), result(1){} + BOOST_MATH_GPU_ENABLED lower_incomplete_gamma_series(T a1, T z1) : a(a1), z(z1), result(1){} - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = result; a += 1; @@ -333,7 +333,7 @@ struct lower_incomplete_gamma_series }; template -inline T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0) +BOOST_MATH_GPU_ENABLED inline T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0) { // Multiply result by ((z^a) * (e^-z) / a) to get the full // lower incomplete integral. Then divide by tgamma(a) @@ -460,7 +460,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&) { BOOST_MATH_STD_USING - static const char* function = "boost::math::tgamma<%1%>(%1%)"; + constexpr auto function = "boost::math::tgamma<%1%>(%1%)"; // Check if the argument of tgamma is identically zero. const bool is_at_zero = (z == 0); @@ -610,7 +610,7 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig { BOOST_MATH_STD_USING - static const char* function = "boost::math::lgamma<%1%>(%1%)"; + constexpr auto function = "boost::math::lgamma<%1%>(%1%)"; // Check if the argument of lgamma is identically zero. const bool is_at_zero = (z == 0); @@ -720,7 +720,7 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig // used by the upper incomplete gamma with z < 1: // template -T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l) +BOOST_MATH_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l) { BOOST_MATH_STD_USING @@ -789,9 +789,9 @@ struct small_gamma2_series { typedef T result_type; - small_gamma2_series(T a_, T x_) : result(-x_), x(-x_), apn(a_+1), n(1){} + BOOST_MATH_GPU_ENABLED small_gamma2_series(T a_, T x_) : result(-x_), x(-x_), apn(a_+1), n(1){} - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = result / (apn); result *= x; @@ -809,7 +809,7 @@ struct small_gamma2_series // incomplete gammas: // template -T full_igamma_prefix(T a, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T full_igamma_prefix(T a, T z, const Policy& pol) { BOOST_MATH_STD_USING @@ -864,7 +864,7 @@ T full_igamma_prefix(T a, T z, const Policy& pol) // most if the error occurs in this function: // template -T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) +BOOST_MATH_GPU_ENABLED T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING if (z >= tools::max_value()) @@ -1017,7 +1017,7 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined // Upper gamma fraction for very small a: // template -inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool invert = false, T* pderivative = 0) +BOOST_MATH_GPU_ENABLED inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool invert = false, T* pderivative = 0) { BOOST_MATH_STD_USING // ADL of std functions. // @@ -1047,7 +1047,7 @@ inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool // Upper gamma fraction for integer a: // template -inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0) +BOOST_MATH_GPU_ENABLED inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0) { // // Calculates normalised Q when a is an integer: @@ -1075,7 +1075,7 @@ inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0) // Upper gamma fraction for half integer a: // template -T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol) +BOOST_MATH_GPU_ENABLED T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol) { // // Calculates normalised Q when a is a half-integer: @@ -1115,9 +1115,9 @@ template struct incomplete_tgamma_large_x_series { typedef T result_type; - incomplete_tgamma_large_x_series(const T& a, const T& x) + BOOST_MATH_GPU_ENABLED incomplete_tgamma_large_x_series(const T& a, const T& x) : a_poch(a - 1), z(x), term(1) {} - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T result = term; term *= a_poch / z; @@ -1128,7 +1128,7 @@ struct incomplete_tgamma_large_x_series }; template -T incomplete_tgamma_large_x(const T& a, const T& x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T incomplete_tgamma_large_x(const T& a, const T& x, const Policy& pol) { BOOST_MATH_STD_USING incomplete_tgamma_large_x_series s(a, x); @@ -1143,10 +1143,10 @@ T incomplete_tgamma_large_x(const T& a, const T& x, const Policy& pol) // Main incomplete gamma entry point, handles all four incomplete gamma's: // template -T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, +BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, const Policy& pol, T* p_derivative) { - static const char* function = "boost::math::gamma_p<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::gamma_p<%1%>(%1%, %1%)"; if(a <= 0) return policies::raise_domain_error(function, "Argument a to the incomplete gamma function must be greater than zero (got a=%1%).", a, pol); if(x < 0) @@ -1529,7 +1529,7 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, // Ratios of two gamma functions: // template -T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) +BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING if(z < tools::epsilon()) @@ -1648,7 +1648,7 @@ T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos: } template -T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) +BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) { BOOST_MATH_STD_USING @@ -1706,7 +1706,7 @@ T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) } template -T tgamma_ratio_imp(T x, T y, const Policy& pol) +BOOST_MATH_GPU_ENABLED T tgamma_ratio_imp(T x, T y, const Policy& pol) { BOOST_MATH_STD_USING @@ -1775,7 +1775,7 @@ T tgamma_ratio_imp(T x, T y, const Policy& pol) } template -T gamma_p_derivative_imp(T a, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T gamma_p_derivative_imp(T a, T x, const Policy& pol) { BOOST_MATH_STD_USING // @@ -1816,7 +1816,7 @@ T gamma_p_derivative_imp(T a, T x, const Policy& pol) } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type tgamma(T z, const Policy& /* pol */, const std::true_type) { BOOST_FPU_EXCEPTION_GUARD @@ -1837,7 +1837,7 @@ struct igamma_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { typedef typename policies::precision::type precision_type; @@ -1851,24 +1851,26 @@ struct igamma_initializer do_init(tag_type()); } template - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) { // If std::numeric_limits::digits is zero, we must not call // our initialization code here as the precision presumably // varies at runtime, and will not have been set yet. Plus the // code requiring initialization isn't called when digits == 0. - if(std::numeric_limits::digits) + BOOST_MATH_IF_CONSTEXPR (std::numeric_limits::digits) { boost::math::gamma_p(static_cast(400), static_cast(400), Policy()); } } - static void do_init(const std::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&){} void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -1880,7 +1882,7 @@ struct lgamma_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { typedef typename policies::precision::type precision_type; typedef std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) { boost::math::lgamma(static_cast(2.5), Policy()); boost::math::lgamma(static_cast(1.25), Policy()); boost::math::lgamma(static_cast(1.75), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) { boost::math::lgamma(static_cast(2.5), Policy()); boost::math::lgamma(static_cast(1.25), Policy()); boost::math::lgamma(static_cast(1.5), Policy()); boost::math::lgamma(static_cast(1.75), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) { } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -1920,7 +1924,7 @@ template const typename lgamma_initializer::init lgamma_initializer::initializer; template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma(T1 a, T2 z, const Policy&, const std::false_type) { BOOST_FPU_EXCEPTION_GUARD @@ -1943,7 +1947,7 @@ inline tools::promote_args_t } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma(T1 a, T2 z, const std::false_type& tag) { return tgamma(a, z, policies::policy<>(), tag); @@ -1953,14 +1957,14 @@ inline tools::promote_args_t } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type tgamma(T z) { return tgamma(z, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type lgamma(T z, int* sign, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1980,28 +1984,28 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type lgamma(T z, int* sign) { return lgamma(z, sign, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type lgamma(T x, const Policy& pol) { return ::boost::math::lgamma(x, nullptr, pol); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type lgamma(T x) { return ::boost::math::lgamma(x, nullptr, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type tgamma1pm1(T z, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD @@ -2019,7 +2023,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type tgamma1pm1(T z) { return tgamma1pm1(z, policies::policy<>()); @@ -2029,7 +2033,7 @@ inline typename tools::promote_args::type // Full upper incomplete gamma: // template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma(T1 a, T2 z) { // @@ -2041,7 +2045,7 @@ inline tools::promote_args_t return static_cast(detail::tgamma(a, z, maybe_policy())); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma(T1 a, T2 z, const Policy& pol) { using result_type = tools::promote_args_t; @@ -2051,7 +2055,7 @@ inline tools::promote_args_t // Full lower incomplete gamma: // template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma_lower(T1 a, T2 z, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -2073,7 +2077,7 @@ inline tools::promote_args_t forwarding_policy(), static_cast(nullptr)), "tgamma_lower<%1%>(%1%, %1%)"); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma_lower(T1 a, T2 z) { return tgamma_lower(a, z, policies::policy<>()); @@ -2082,7 +2086,7 @@ inline tools::promote_args_t // Regularised upper incomplete gamma: // template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t gamma_q(T1 a, T2 z, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD @@ -2104,7 +2108,7 @@ inline tools::promote_args_t forwarding_policy(), static_cast(nullptr)), "gamma_q<%1%>(%1%, %1%)"); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t gamma_q(T1 a, T2 z) { return gamma_q(a, z, policies::policy<>()); @@ -2113,7 +2117,7 @@ inline tools::promote_args_t // Regularised lower incomplete gamma: // template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t gamma_p(T1 a, T2 z, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -2135,7 +2139,7 @@ inline tools::promote_args_t forwarding_policy(), static_cast(nullptr)), "gamma_p<%1%>(%1%, %1%)"); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t gamma_p(T1 a, T2 z) { return gamma_p(a, z, policies::policy<>()); @@ -2143,7 +2147,7 @@ inline tools::promote_args_t // ratios of gamma functions: template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD @@ -2159,13 +2163,13 @@ inline tools::promote_args_t return policies::checked_narrowing_cast(detail::tgamma_delta_ratio_imp(static_cast(z), static_cast(delta), forwarding_policy()), "boost::math::tgamma_delta_ratio<%1%>(%1%, %1%)"); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta) { return tgamma_delta_ratio(z, delta, policies::policy<>()); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma_ratio(T1 a, T2 b, const Policy&) { typedef tools::promote_args_t result_type; @@ -2180,14 +2184,14 @@ inline tools::promote_args_t return policies::checked_narrowing_cast(detail::tgamma_ratio_imp(static_cast(a), static_cast(b), forwarding_policy()), "boost::math::tgamma_delta_ratio<%1%>(%1%, %1%)"); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t tgamma_ratio(T1 a, T2 b) { return tgamma_ratio(a, b, policies::policy<>()); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t gamma_p_derivative(T1 a, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -2203,7 +2207,7 @@ inline tools::promote_args_t return policies::checked_narrowing_cast(detail::gamma_p_derivative_imp(static_cast(a), static_cast(x), forwarding_policy()), "boost::math::gamma_p_derivative<%1%>(%1%, %1%)"); } template -inline tools::promote_args_t +BOOST_MATH_GPU_ENABLED inline tools::promote_args_t gamma_p_derivative(T1 a, T2 x) { return gamma_p_derivative(a, x, policies::policy<>()); From 4362b0241b81e13cc1a759c57dc9c416a14d6003 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:11:55 -0400 Subject: [PATCH 18/33] Add GPU markers to math_fwd for gamma functions --- .../boost/math/special_functions/math_fwd.hpp | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 1d22f61b1..fdccfad8b 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -424,7 +424,7 @@ namespace boost template RT factorial(unsigned int, const Policy& pol); template - RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT)); + BOOST_MATH_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT)); template RT double_factorial(unsigned i); template @@ -444,67 +444,67 @@ namespace boost // Gamma functions. template - tools::promote_args_t tgamma(RT z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma(RT z); template - tools::promote_args_t tgamma1pm1(RT z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma1pm1(RT z); template - tools::promote_args_t tgamma1pm1(RT z, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma1pm1(RT z, const Policy& pol); template - tools::promote_args_t tgamma(RT1 a, RT2 z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma(RT1 a, RT2 z); template - tools::promote_args_t tgamma(RT1 a, RT2 z, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma(RT1 a, RT2 z, const Policy& pol); template - tools::promote_args_t lgamma(RT z, int* sign); + BOOST_MATH_GPU_ENABLED tools::promote_args_t lgamma(RT z, int* sign); template - tools::promote_args_t lgamma(RT z, int* sign, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t lgamma(RT z, int* sign, const Policy& pol); template - tools::promote_args_t lgamma(RT x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t lgamma(RT x); template - tools::promote_args_t lgamma(RT x, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t lgamma(RT x, const Policy& pol); template - tools::promote_args_t tgamma_lower(RT1 a, RT2 z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma_lower(RT1 a, RT2 z); template - tools::promote_args_t tgamma_lower(RT1 a, RT2 z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma_lower(RT1 a, RT2 z, const Policy&); template - tools::promote_args_t gamma_q(RT1 a, RT2 z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_q(RT1 a, RT2 z); template - tools::promote_args_t gamma_q(RT1 a, RT2 z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_q(RT1 a, RT2 z, const Policy&); template - tools::promote_args_t gamma_p(RT1 a, RT2 z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p(RT1 a, RT2 z); template - tools::promote_args_t gamma_p(RT1 a, RT2 z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p(RT1 a, RT2 z, const Policy&); template - tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta); template - tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta, const Policy&); template - tools::promote_args_t tgamma_ratio(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma_ratio(T1 a, T2 b); template - tools::promote_args_t tgamma_ratio(T1 a, T2 b, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma_ratio(T1 a, T2 b, const Policy&); template - tools::promote_args_t gamma_p_derivative(T1 a, T2 x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p_derivative(T1 a, T2 x); template - tools::promote_args_t gamma_p_derivative(T1 a, T2 x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p_derivative(T1 a, T2 x, const Policy&); // gamma inverse. template @@ -1410,37 +1410,37 @@ namespace boost inline boost::math::tools::promote_args_t rising_factorial(RT x, unsigned n){ return boost::math::rising_factorial(x, n, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t tgamma(RT z){ return boost::math::tgamma(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t tgamma(RT z){ return boost::math::tgamma(z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t tgamma1pm1(RT z){ return boost::math::tgamma1pm1(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t tgamma1pm1(RT z){ return boost::math::tgamma1pm1(z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t tgamma(RT1 a, RT2 z){ return boost::math::tgamma(a, z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t tgamma(RT1 a, RT2 z){ return boost::math::tgamma(a, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t lgamma(RT z, int* sign){ return boost::math::lgamma(z, sign, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t lgamma(RT z, int* sign){ return boost::math::lgamma(z, sign, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t lgamma(RT x){ return boost::math::lgamma(x, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t lgamma(RT x){ return boost::math::lgamma(x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t tgamma_lower(RT1 a, RT2 z){ return boost::math::tgamma_lower(a, z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t tgamma_lower(RT1 a, RT2 z){ return boost::math::tgamma_lower(a, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_q(RT1 a, RT2 z){ return boost::math::gamma_q(a, z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_q(RT1 a, RT2 z){ return boost::math::gamma_q(a, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_p(RT1 a, RT2 z){ return boost::math::gamma_p(a, z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_p(RT1 a, RT2 z){ return boost::math::gamma_p(a, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta){ return boost::math::tgamma_delta_ratio(z, delta, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t tgamma_delta_ratio(T1 z, T2 delta){ return boost::math::tgamma_delta_ratio(z, delta, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t tgamma_ratio(T1 a, T2 b) { return boost::math::tgamma_ratio(a, b, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t tgamma_ratio(T1 a, T2 b) { return boost::math::tgamma_ratio(a, b, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_p_derivative(T1 a, T2 x){ return boost::math::gamma_p_derivative(a, x, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_p_derivative(T1 a, T2 x){ return boost::math::gamma_p_derivative(a, x, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t gamma_p_inv(T1 a, T2 p){ return boost::math::gamma_p_inv(a, p, Policy()); }\ From 443887e26c4fca7daa69aa3a74eedac185223cd9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:12:05 -0400 Subject: [PATCH 19/33] Add CUDA testing for tgamma and lgamma --- test/cuda_jamfile | 4 ++ test/test_lgamma_double.cu | 102 +++++++++++++++++++++++++++++++++++++ test/test_lgamma_float.cu | 102 +++++++++++++++++++++++++++++++++++++ test/test_tgamma_double.cu | 102 +++++++++++++++++++++++++++++++++++++ test/test_tgamma_float.cu | 102 +++++++++++++++++++++++++++++++++++++ 5 files changed, 412 insertions(+) create mode 100644 test/test_lgamma_double.cu create mode 100644 test/test_lgamma_float.cu create mode 100644 test/test_tgamma_double.cu create mode 100644 test/test_tgamma_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 7af3b9e48..69bdbfb7f 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -110,6 +110,8 @@ run test_erfc_inv_double.cu ; run test_erfc_inv_float.cu ; run test_expm1_double.cu ; run test_expm1_float.cu ; +run test_lgamma_double.cu ; +run test_lgamma_float.cu ; run test_log1p_double.cu ; run test_log1p_float.cu ; run test_modf_double.cu ; @@ -118,6 +120,8 @@ run test_round_double.cu ; run test_round_float.cu ; run test_sin_pi_double.cu ; run test_sin_pi_float.cu ; +run test_tgamma_double.cu ; +run test_tgamma_float.cu ; run test_trigamma_double.cu ; run test_trigamma_float.cu ; run test_trunc_double.cu ; diff --git a/test/test_lgamma_double.cu b/test/test_lgamma_double.cu new file mode 100644 index 000000000..776ff5d27 --- /dev/null +++ b/test/test_lgamma_double.cu @@ -0,0 +1,102 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::lgamma(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::lgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_lgamma_float.cu b/test/test_lgamma_float.cu new file mode 100644 index 000000000..101037ab3 --- /dev/null +++ b/test/test_lgamma_float.cu @@ -0,0 +1,102 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::lgamma(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::lgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_tgamma_double.cu b/test/test_tgamma_double.cu new file mode 100644 index 000000000..6e4140ab6 --- /dev/null +++ b/test/test_tgamma_double.cu @@ -0,0 +1,102 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::tgamma(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::tgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_tgamma_float.cu b/test/test_tgamma_float.cu new file mode 100644 index 000000000..cb2d01482 --- /dev/null +++ b/test/test_tgamma_float.cu @@ -0,0 +1,102 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::tgamma(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::tgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From fcb1c5bf4a95a77e6f883b462cca3e9d4e6714b1 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:32:08 -0400 Subject: [PATCH 20/33] Disable factorial check on GPU platform --- include/boost/math/special_functions/beta.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index 25e524d0b..07ac9d5a6 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -930,7 +930,9 @@ template <> struct Pn_size { static constexpr unsigned value = 50; // ~35-50 digit accuracy +#ifndef BOOST_MATH_HAS_GPU_SUPPORT static_assert(::boost::math::max_factorial::value >= 100, "Type does not provide for ~35-50 digits of accuracy"); +#endif }; template From e2a9039b2324c2a0f4a55d19f0e26ebd5b0f44d0 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:37:44 -0400 Subject: [PATCH 21/33] Add SYCL gamma testing --- test/sycl_jamfile | 1 + test/test_gamma.cpp | 4 ++++ test/test_gamma.hpp | 5 ++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index e336523e6..d0a458cce 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -33,3 +33,4 @@ run test_log1p_simple.cpp ; run test_digamma_simple.cpp ; run test_trigamma.cpp ; run test_erf.cpp ; +run test_gamma.cpp ; diff --git a/test/test_gamma.cpp b/test/test_gamma.cpp index 6b2d19cca..565d5888a 100644 --- a/test/test_gamma.cpp +++ b/test/test_gamma.cpp @@ -3,7 +3,11 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#include #include "test_gamma.hpp" // diff --git a/test/test_gamma.hpp b/test/test_gamma.hpp index c21573dac..729a8266d 100644 --- a/test/test_gamma.hpp +++ b/test/test_gamma.hpp @@ -6,6 +6,7 @@ #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#include #include #include #define BOOST_TEST_MAIN @@ -13,7 +14,7 @@ #include #include // for has_denorm_now #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include @@ -320,11 +321,13 @@ void test_spots(T, const char* name) BOOST_CHECK(sign == -1); } + #ifndef BOOST_MATH_HAS_GPU_SUPPORT if(boost::math::detail::has_denorm_now() && std::numeric_limits::has_infinity && (boost::math::isinf)(1 / std::numeric_limits::denorm_min())) { BOOST_CHECK_EQUAL(boost::math::tgamma(-std::numeric_limits::denorm_min()), -std::numeric_limits::infinity()); BOOST_CHECK_EQUAL(boost::math::tgamma(std::numeric_limits::denorm_min()), std::numeric_limits::infinity()); } + #endif // // Extra large values for lgamma, see https://github.com/boostorg/math/issues/242 // From 7dcb4320abf6be960b18f7b606c295e122dc04a3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 14:48:18 -0400 Subject: [PATCH 22/33] Remove extra overloads --- include/boost/math/special_functions/sign.hpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/include/boost/math/special_functions/sign.hpp b/include/boost/math/special_functions/sign.hpp index 2092a41bb..d147b5b1b 100644 --- a/include/boost/math/special_functions/sign.hpp +++ b/include/boost/math/special_functions/sign.hpp @@ -162,22 +162,6 @@ BOOST_MATH_GPU_ENABLED int (signbit)(T x) return detail::signbit_impl(static_cast(x), method()); } -#ifndef BOOST_MATH_HAS_GPU_SUPPORT -template<> -BOOST_MATH_GPU_ENABLED int signbit(float x) -{ - using std::signbit; - return (signbit)(x); -} - -template<> -BOOST_MATH_GPU_ENABLED int signbit(double x) -{ - using std::signbit; - return (signbit)(x); -} -#endif - template BOOST_MATH_GPU_ENABLED inline int sign BOOST_NO_MACRO_EXPAND(const T& z) { From f1f470ec67800cf7e982ef844435e09b9f69afa3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 15:08:54 -0400 Subject: [PATCH 23/33] Remove recursion from tgamma --- .../boost/math/special_functions/gamma.hpp | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index b77f4f5ea..62448d69e 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -108,7 +108,7 @@ BOOST_MATH_GPU_ENABLED T sinpx(T z) // tgamma(z), with Lanczos support: // template -BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l) +BOOST_MATH_GPU_ENABLED T gamma_imp_final(T z, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING @@ -127,20 +127,8 @@ BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l) if(z <= 0) { if(floor(z) == z) - return policies::raise_pole_error(function, "Evaluation of tgamma at a negative integer %1%.", z, pol); - if(z <= -20) { - result = gamma_imp(T(-z), pol, l) * sinpx(z); - BOOST_MATH_INSTRUMENT_VARIABLE(result); - if((fabs(result) < 1) && (tools::max_value() * fabs(result) < boost::math::constants::pi())) - return -boost::math::sign(result) * policies::raise_overflow_error(function, "Result of tgamma is too large to represent.", pol); - result = -boost::math::constants::pi() / result; - if(result == 0) - return policies::raise_underflow_error(function, "Result of tgamma is too small to represent.", pol); - if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL) - return policies::raise_denorm_error(function, "Result of tgamma is denormalized.", result, pol); - BOOST_MATH_INSTRUMENT_VARIABLE(result); - return result; + return policies::raise_pole_error(function, "Evaluation of tgamma at a negative integer %1%.", z, pol); } // shift z to > 1: @@ -195,6 +183,32 @@ BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l) } return result; } +// SYCL compilers can not support recursion so we extract it into a dispatch function +template +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol, const Lanczos& l) +{ + BOOST_MATH_STD_USING + + if(z <= -20) + { + constexpr auto function = "boost::math::tgamma<%1%>(%1%)"; + T result = gamma_imp_final(T(-z), pol, l) * sinpx(z); + BOOST_MATH_INSTRUMENT_VARIABLE(result); + if((fabs(result) < 1) && (tools::max_value() * fabs(result) < boost::math::constants::pi())) + return -boost::math::sign(result) * policies::raise_overflow_error(function, "Result of tgamma is too large to represent.", pol); + result = -boost::math::constants::pi() / result; + if(result == 0) + return policies::raise_underflow_error(function, "Result of tgamma is too small to represent.", pol); + if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL) + return policies::raise_denorm_error(function, "Result of tgamma is denormalized.", result, pol); + BOOST_MATH_INSTRUMENT_VARIABLE(result); + return result; + } + else + { + return gamma_imp_final(T(z), pol, l); + } +} // // lgamma(z) with Lanczos support: // From 30a7ad1da9e2334ca81b8c94add283359a206757 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 15:47:04 -0400 Subject: [PATCH 24/33] Remove recursion in lgamma --- .../boost/math/special_functions/gamma.hpp | 66 +++++++++++++------ 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 62448d69e..7601d5295 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -188,7 +188,7 @@ template BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING - + if(z <= -20) { constexpr auto function = "boost::math::tgamma<%1%>(%1%)"; @@ -213,7 +213,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol // lgamma(z) with Lanczos support: // template -BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) +BOOST_MATH_GPU_ENABLED T lgamma_imp_final(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) { #ifdef BOOST_MATH_INSTRUMENT static bool b = false; @@ -230,25 +230,8 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, in T result = 0; int sresult = 1; - if(z <= -tools::root_epsilon()) - { - // reflection formula: - if(floor(z) == z) - return policies::raise_pole_error(function, "Evaluation of lgamma at a negative integer %1%.", z, pol); - - T t = sinpx(z); - z = -z; - if(t < 0) - { - t = -t; - } - else - { - sresult = -sresult; - } - result = log(boost::math::constants::pi()) - lgamma_imp(z, pol, l) - log(t); - } - else if (z < tools::root_epsilon()) + + if (z < tools::root_epsilon()) { if (0 == z) return policies::raise_pole_error(function, "Evaluation of lgamma at %1%.", z, pol); @@ -293,6 +276,47 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, in return result; } +template +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) +{ + BOOST_MATH_STD_USING + + if(z <= -tools::root_epsilon()) + { + constexpr auto function = "boost::math::lgamma<%1%>(%1%)"; + + T result = 0; + int sresult = 1; + + // reflection formula: + if(floor(z) == z) + return policies::raise_pole_error(function, "Evaluation of lgamma at a negative integer %1%.", z, pol); + + T t = sinpx(z); + z = -z; + if(t < 0) + { + t = -t; + } + else + { + sresult = -sresult; + } + result = log(boost::math::constants::pi()) - lgamma_imp_final(T(z), pol, l) - log(t); + + if(sign) + { + *sign = sresult; + } + + return result; + } + else + { + return lgamma_imp_final(T(z), pol, l, sign); + } +} + // // Incomplete gamma functions follow: // From 082b625460cd8fbf1585bf2d60099321d5dbe88a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 15:51:14 -0400 Subject: [PATCH 25/33] Ignore CUDA warning about GNU force inline --- include/boost/math/special_functions/gamma.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 7601d5295..af0599c88 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -183,6 +183,11 @@ BOOST_MATH_GPU_ENABLED T gamma_imp_final(T z, const Policy& pol, const Lanczos& } return result; } + +#ifdef BOOST_MATH_ENABLE_CUDA +# pragma nv_diag_suppress 2190 +#endif + // SYCL compilers can not support recursion so we extract it into a dispatch function template BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol, const Lanczos& l) @@ -209,6 +214,11 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol return gamma_imp_final(T(z), pol, l); } } + +#ifdef BOOST_MATH_ENABLE_CUDA +# pragma nv_diag_default 2190 +#endif + // // lgamma(z) with Lanczos support: // @@ -276,6 +286,10 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp_final(T z, const Policy& pol, const Lanczos& return result; } +#ifdef BOOST_MATH_ENABLE_CUDA +# pragma nv_diag_suppress 2190 +#endif + template BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = nullptr) { @@ -317,6 +331,10 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T lgamma_imp(T z, const Policy& po } } +#ifdef BOOST_MATH_ENABLE_CUDA +# pragma nv_diag_default 2190 +#endif + // // Incomplete gamma functions follow: // From 6d19b67bf323692046f6d1998d5897477ffce720 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 15:57:04 -0400 Subject: [PATCH 26/33] Ignore literal range warnings --- test/test_erf.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_erf.cpp b/test/test_erf.cpp index beb521ac4..b4513d96d 100644 --- a/test/test_erf.cpp +++ b/test/test_erf.cpp @@ -8,6 +8,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma gcc diagnostic push +# pragma gcc diagnostic ignored "-Wliteral-range" +#endif + #include #include "test_erf.hpp" From 19f166bfe60ee1d7bbb629ddd9a66fd9af1b8312 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 16:15:14 -0400 Subject: [PATCH 27/33] Ignore mapairy literal range warnings --- test/test_mapairy.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp index c1d1c739b..096abde8b 100644 --- a/test/test_mapairy.cpp +++ b/test/test_mapairy.cpp @@ -4,6 +4,14 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma gcc diagnostic push +# pragma gcc diagnostic ignored "-Wliteral-range" +#endif + #define BOOST_TEST_MAIN #define BOOST_TEST_MODULE StatsMapAiryTest #include From 84a7272e6cf962340700653ab77431d77477a044 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 16:16:28 -0400 Subject: [PATCH 28/33] Make powm1 GPU compatible --- .../boost/math/special_functions/math_fwd.hpp | 6 +- .../boost/math/special_functions/powm1.hpp | 66 +++++++++++-------- 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index fdccfad8b..289b27592 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -591,11 +591,11 @@ namespace boost // Power - 1 template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t powm1(const T1 a, const T2 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t powm1(const T1 a, const T2 z, const Policy&); // sqrt(1+x) - 1 @@ -1481,7 +1481,7 @@ namespace boost \ template \ inline boost::math::tools::promote_args_t \ - powm1(const T1 a, const T2 z){ return boost::math::powm1(a, z, Policy()); }\ + BOOST_MATH_GPU_ENABLED powm1(const T1 a, const T2 z){ return boost::math::powm1(a, z, Policy()); }\ \ template \ BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\ diff --git a/include/boost/math/special_functions/powm1.hpp b/include/boost/math/special_functions/powm1.hpp index e52277b16..f00af2264 100644 --- a/include/boost/math/special_functions/powm1.hpp +++ b/include/boost/math/special_functions/powm1.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -12,6 +13,7 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include #include #include #include @@ -22,32 +24,23 @@ namespace boost{ namespace math{ namespace detail{ template -inline T powm1_imp(const T x, const T y, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T powm1_imp(const T x, const T y, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::powm1<%1%>(%1%, %1%)"; - if (x > 0) + constexpr auto function = "boost::math::powm1<%1%>(%1%, %1%)"; + + if ((fabs(y * (x - 1)) < T(0.5)) || (fabs(y) < T(0.2))) { - if ((fabs(y * (x - 1)) < T(0.5)) || (fabs(y) < T(0.2))) - { - // We don't have any good/quick approximation for log(x) * y - // so just try it and see: - T l = y * log(x); - if (l < T(0.5)) - return boost::math::expm1(l, pol); - if (l > boost::math::tools::log_max_value()) - return boost::math::policies::raise_overflow_error(function, nullptr, pol); - // fall through.... - } - } - else if ((boost::math::signbit)(x)) // Need to error check -0 here as well - { - // y had better be an integer: - if (boost::math::trunc(y) != y) - return boost::math::policies::raise_domain_error(function, "For non-integral exponent, expected base > 0 but got %1%", x, pol); - if (boost::math::trunc(y / 2) == y / 2) - return powm1_imp(T(-x), y, pol); + // We don't have any good/quick approximation for log(x) * y + // so just try it and see: + T l = y * log(x); + if (l < T(0.5)) + return boost::math::expm1(l, pol); + if (l > boost::math::tools::log_max_value()) + return boost::math::policies::raise_overflow_error(function, nullptr, pol); + // fall through.... } + T result = pow(x, y) - 1; if((boost::math::isinf)(result)) return result < 0 ? -boost::math::policies::raise_overflow_error(function, nullptr, pol) : boost::math::policies::raise_overflow_error(function, nullptr, pol); @@ -56,22 +49,43 @@ inline T powm1_imp(const T x, const T y, const Policy& pol) return result; } +template +BOOST_MATH_GPU_ENABLED inline T powm1_imp_dispatch(const T x, const T y, const Policy& pol) +{ + BOOST_MATH_STD_USING + + if ((boost::math::signbit)(x)) // Need to error check -0 here as well + { + constexpr auto function = "boost::math::powm1<%1%>(%1%, %1%)"; + + // y had better be an integer: + if (boost::math::trunc(y) != y) + return boost::math::policies::raise_domain_error(function, "For non-integral exponent, expected base > 0 but got %1%", x, pol); + if (boost::math::trunc(y / 2) == y / 2) + return powm1_imp(T(-x), T(y), pol); + } + else + { + return powm1_imp(T(x), T(y), pol); + } +} + } // detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type powm1(const T1 a, const T2 z) { typedef typename tools::promote_args::type result_type; - return detail::powm1_imp(static_cast(a), static_cast(z), policies::policy<>()); + return detail::powm1_imp_dispatch(static_cast(a), static_cast(z), policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type powm1(const T1 a, const T2 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; - return detail::powm1_imp(static_cast(a), static_cast(z), pol); + return detail::powm1_imp_dispatch(static_cast(a), static_cast(z), pol); } } // namespace math From 7bd6f957aef5515a80a8ac103bf920116f0b524c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 16:23:52 -0400 Subject: [PATCH 29/33] Fix GCC warnings --- include/boost/math/special_functions/powm1.hpp | 6 ++---- test/test_erf.cpp | 4 ++-- test/test_mapairy.cpp | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/boost/math/special_functions/powm1.hpp b/include/boost/math/special_functions/powm1.hpp index f00af2264..80d02dc29 100644 --- a/include/boost/math/special_functions/powm1.hpp +++ b/include/boost/math/special_functions/powm1.hpp @@ -64,10 +64,8 @@ BOOST_MATH_GPU_ENABLED inline T powm1_imp_dispatch(const T x, const T y, const P if (boost::math::trunc(y / 2) == y / 2) return powm1_imp(T(-x), T(y), pol); } - else - { - return powm1_imp(T(x), T(y), pol); - } + + return powm1_imp(T(x), T(y), pol); } } // detail diff --git a/test/test_erf.cpp b/test/test_erf.cpp index b4513d96d..d01f8024d 100644 --- a/test/test_erf.cpp +++ b/test/test_erf.cpp @@ -12,8 +12,8 @@ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) -# pragma gcc diagnostic push -# pragma gcc diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" #endif #include diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp index 096abde8b..ee8e43bf0 100644 --- a/test/test_mapairy.cpp +++ b/test/test_mapairy.cpp @@ -8,8 +8,8 @@ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) -# pragma gcc diagnostic push -# pragma gcc diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" #endif #define BOOST_TEST_MAIN From c86e74424dea51d6ed677ecbedf0dbd46eface4c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 30 Jul 2024 17:20:49 -0400 Subject: [PATCH 30/33] Revert if constexpr change --- include/boost/math/special_functions/gamma.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index af0599c88..b1beeb766 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1913,7 +1913,7 @@ struct igamma_initializer // our initialization code here as the precision presumably // varies at runtime, and will not have been set yet. Plus the // code requiring initialization isn't called when digits == 0. - BOOST_MATH_IF_CONSTEXPR (std::numeric_limits::digits) + if (std::numeric_limits::digits) { boost::math::gamma_p(static_cast(400), static_cast(400), Policy()); } From 4f846a9527fde46b0759d8ac3ab6f27a1b580588 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 31 Jul 2024 12:05:00 -0400 Subject: [PATCH 31/33] Fix typo --- include/boost/math/special_functions/detail/lgamma_small.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp index 82a1761f1..d9fa88b8e 100644 --- a/include/boost/math/special_functions/detail/lgamma_small.hpp +++ b/include/boost/math/special_functions/detail/lgamma_small.hpp @@ -38,7 +38,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); // lgamma for small arguments: // template -BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& /* l */, const Lanczos&) +BOOST_MATH_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& /* l */, const Lanczos&) { // This version uses rational approximations for small // values of z accurate enough for 64-bit mantissas From d13ff4e498ebf984a846a0a6a32f44f338da0077 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 31 Jul 2024 14:29:09 -0400 Subject: [PATCH 32/33] Fix comparison --- include/boost/math/special_functions/erf.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/erf.hpp b/include/boost/math/special_functions/erf.hpp index 323a91937..0b311c464 100644 --- a/include/boost/math/special_functions/erf.hpp +++ b/include/boost/math/special_functions/erf.hpp @@ -221,7 +221,7 @@ BOOST_MATH_GPU_ENABLED T erf_imp(T z, bool invert, const Policy& pol, const std: prefix_multiplier = -1; // return -erf_imp(T(-z), invert, pol, t); } - else if(z < -0.5) + else if(z < T(-0.5)) { prefix_adder = 2; // return 2 - erf_imp(T(-z), invert, pol, t); From 43d5905134126549b4a80546f81a29a0549cfdd1 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 31 Jul 2024 14:29:54 -0400 Subject: [PATCH 33/33] Use tag type idiom --- include/boost/math/special_functions/detail/erf_inv.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp index 09772632d..69a08abe6 100644 --- a/include/boost/math/special_functions/detail/erf_inv.hpp +++ b/include/boost/math/special_functions/detail/erf_inv.hpp @@ -317,7 +317,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy& pol, const std::integral_con // // Generic version, get a guess that's accurate to 64-bits (10^-19) // - T guess = erf_inv_imp(p, q, pol, std::integral_constant()); + using tag_type = std::integral_constant; + T guess = erf_inv_imp(p, q, pol, tag_type()); T result; // // If T has more bit's than 64 in it's mantissa then we need to iterate,