From f225702e6c5811bbbc87ec651b1f7bcfcf00c504 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 15:39:56 -0400 Subject: [PATCH 01/26] Add GPU support to expint --- .../boost/math/special_functions/expint.hpp | 134 ++++++++++-------- 1 file changed, 75 insertions(+), 59 deletions(-) diff --git a/include/boost/math/special_functions/expint.hpp b/include/boost/math/special_functions/expint.hpp index 1475a9a88..79a46c52f 100644 --- a/include/boost/math/special_functions/expint.hpp +++ b/include/boost/math/special_functions/expint.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2007. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -12,6 +13,10 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include +#include +#include +#include #include #include #include @@ -35,13 +40,13 @@ namespace boost{ namespace math{ template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expint(unsigned n, T z, const Policy& /*pol*/); namespace detail{ template -inline T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline T expint_1_rational(const T& z, const boost::math::integral_constant&) { // this function is never actually called BOOST_MATH_ASSERT(0); @@ -49,7 +54,7 @@ inline T expint_1_rational(const T& z, const std::integral_constant&) } template -T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant&) { BOOST_MATH_STD_USING T result; @@ -123,7 +128,7 @@ T expint_1_rational(const T& z, const std::integral_constant&) } template -T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant&) { BOOST_MATH_STD_USING T result; @@ -204,7 +209,7 @@ T expint_1_rational(const T& z, const std::integral_constant&) } template -T expint_1_rational(const T& z, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant&) { BOOST_MATH_STD_USING T result; @@ -351,14 +356,15 @@ T expint_1_rational(const T& z, const std::integral_constant&) return result; } + template struct expint_fraction { - typedef std::pair result_type; - expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){} - std::pair operator()() + typedef boost::math::pair result_type; + BOOST_MATH_GPU_ENABLED expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){} + BOOST_MATH_GPU_ENABLED boost::math::pair operator()() { - std::pair result = std::make_pair(-static_cast((i+1) * (n+i)), b); + boost::math::pair result = boost::math::make_pair(-static_cast((i+1) * (n+i)), b); b += 2; ++i; return result; @@ -374,7 +380,7 @@ inline T expint_as_fraction(unsigned n, T z, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_INSTRUMENT_VARIABLE(z) - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); expint_fraction f(n, z); T result = tools::continued_fraction_b( f, @@ -392,9 +398,9 @@ template struct expint_series { typedef T result_type; - expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_) + BOOST_MATH_GPU_ENABLED expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_) : k(k_), 
z(z_), x_k(x_k_), denom(denom_), fact(fact_){} - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { x_k *= -z; denom += 1; @@ -410,10 +416,10 @@ struct expint_series }; template -inline T expint_as_series(unsigned n, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T expint_as_series(unsigned n, T z, const Policy& pol) { BOOST_MATH_STD_USING - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); BOOST_MATH_INSTRUMENT_VARIABLE(z) @@ -443,10 +449,10 @@ inline T expint_as_series(unsigned n, T z, const Policy& pol) } template -T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag) +BOOST_MATH_GPU_ENABLED T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag) { BOOST_MATH_STD_USING - static const char* function = "boost::math::expint<%1%>(unsigned, %1%)"; + constexpr auto function = "boost::math::expint<%1%>(unsigned, %1%)"; if(z < 0) return policies::raise_domain_error(function, "Function requires z >= 0 but got %1%.", z, pol); if(z == 0) @@ -468,15 +474,21 @@ T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag) # pragma warning(disable:4127) // conditional expression is constant #endif if(n == 0) + { result = exp(-z) / z; + } else if((n == 1) && (Tag::value)) { result = expint_1_rational(z, tag); } else if(f) + { result = expint_as_series(n, z, pol); + } else + { result = expint_as_fraction(n, z, pol); + } #ifdef _MSC_VER # pragma warning(pop) #endif @@ -488,8 +500,8 @@ template struct expint_i_series { typedef T result_type; - expint_i_series(T z_) : k(0), z_k(1), z(z_){} - T operator()() + BOOST_MATH_GPU_ENABLED expint_i_series(T z_) : k(0), z_k(1), z(z_){} + BOOST_MATH_GPU_ENABLED T operator()() { z_k *= z / ++k; return z_k / k; @@ -501,22 +513,22 @@ struct expint_i_series }; template -T expint_i_as_series(T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T expint_i_as_series(T z, const Policy& pol) { BOOST_MATH_STD_USING T result = log(z); // (log(z) - log(1 / z)) / 2; result += constants::euler(); expint_i_series s(z); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); result = tools::sum_series(s, policies::get_epsilon(), max_iter, result); policies::check_series_iterations("boost::math::expint_i_series<%1%>(%1%)", max_iter, pol); return result; } template -T expint_i_imp(T z, const Policy& pol, const Tag& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const Tag& tag) { - static const char* function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -525,10 +537,10 @@ T expint_i_imp(T z, const Policy& pol, const Tag& tag) } template -T expint_i_imp(T z, const Policy& pol, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING - static const char* function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -739,10 +751,10 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant& ta } template -T expint_i_imp(T z, const Policy& pol, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING - static const char* 
function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -976,7 +988,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant& ta } template -void expint_i_imp_113a(T& result, const T& z, const Policy& pol) +BOOST_MATH_GPU_ENABLED void expint_i_imp_113a(T& result, const T& z, const Policy& pol) { BOOST_MATH_STD_USING // Maximum Deviation Found: 1.230e-36 @@ -1044,7 +1056,7 @@ void expint_i_imp_113a(T& result, const T& z, const Policy& pol) } template -void expint_i_113b(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113b(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 7.779e-36 @@ -1094,7 +1106,7 @@ void expint_i_113b(T& result, const T& z) } template -void expint_i_113c(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113c(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 1.082e-34 @@ -1147,7 +1159,7 @@ void expint_i_113c(T& result, const T& z) } template -void expint_i_113d(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113d(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 3.163e-35 @@ -1198,7 +1210,7 @@ void expint_i_113d(T& result, const T& z) } template -void expint_i_113e(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113e(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 7.972e-36 @@ -1252,7 +1264,7 @@ void expint_i_113e(T& result, const T& z) } template -void expint_i_113f(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113f(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 4.469e-36 @@ -1299,7 +1311,7 @@ void expint_i_113f(T& result, const T& z) } template -void expint_i_113g(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113g(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 5.588e-35 @@ -1344,7 +1356,7 @@ void expint_i_113g(T& result, const T& z) } template -void expint_i_113h(T& result, const T& z) +BOOST_MATH_GPU_ENABLED void expint_i_113h(T& result, const T& z) { BOOST_MATH_STD_USING // Maximum Deviation Found: 4.448e-36 @@ -1383,10 +1395,10 @@ void expint_i_113h(T& result, const T& z) } template -T expint_i_imp(T z, const Policy& pol, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING - static const char* function = "boost::math::expint<%1%>(%1%)"; + constexpr auto function = "boost::math::expint<%1%>(%1%)"; if(z < 0) return -expint_imp(1, T(-z), pol, tag); if(z == 0) @@ -1491,12 +1503,12 @@ struct expint_i_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&){} - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(T(5), Policy()); boost::math::expint(T(7), Policy()); @@ -1504,7 +1516,7 @@ struct expint_i_initializer boost::math::expint(T(38), Policy()); boost::math::expint(T(45), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(T(5), Policy()); boost::math::expint(T(7), Policy()); @@ -1512,7 +1524,7 @@ struct 
expint_i_initializer boost::math::expint(T(38), Policy()); boost::math::expint(T(45), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(T(5), Policy()); boost::math::expint(T(7), Policy()); @@ -1524,12 +1536,14 @@ struct expint_i_initializer boost::math::expint(T(200), Policy()); boost::math::expint(T(220), Policy()); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -1541,33 +1555,35 @@ struct expint_1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&){} - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(1, T(0.5), Policy()); boost::math::expint(1, T(2), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(1, T(0.5), Policy()); boost::math::expint(1, T(2), Policy()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::expint(1, T(0.5), Policy()); boost::math::expint(1, T(2), Policy()); boost::math::expint(1, T(6), Policy()); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -1575,8 +1591,8 @@ template const typename expint_1_initializer::init expint_1_initializer::initializer; template -inline typename tools::promote_args::type - expint_forwarder(T z, const Policy& /*pol*/, std::true_type const&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type + expint_forwarder(T z, const Policy& /*pol*/, boost::math::true_type const&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -1587,7 +1603,7 @@ inline typename tools::promote_args::type policies::promote_double, policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant::type } template -inline typename tools::promote_args::type -expint_forwarder(unsigned n, T z, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type +expint_forwarder(unsigned n, T z, const boost::math::false_type&) { return boost::math::expint(n, z, policies::policy<>()); } @@ -1612,7 +1628,7 @@ expint_forwarder(unsigned n, T z, const std::false_type&) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expint(unsigned n, T z, const Policy& /*pol*/) { typedef typename tools::promote_args::type result_type; @@ -1624,7 +1640,7 @@ inline typename tools::promote_args::type policies::promote_double, policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef 
std::integral_constant::type } template -inline typename detail::expint_result::type +BOOST_MATH_GPU_ENABLED inline typename detail::expint_result::type expint(T const z, U const u) { typedef typename policies::is_policy::type tag_type; @@ -1649,7 +1665,7 @@ inline typename detail::expint_result::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expint(T z) { return expint(z, policies::policy<>()); From ed381ab894dbd8dd0891325206507199ae3e8d01 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 15:40:04 -0400 Subject: [PATCH 02/26] Add SYCL testing of expint --- test/sycl_jamfile | 2 ++ test/test_expint.cpp | 11 +++++++++++ test/test_expint.hpp | 8 +++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index ff6d84fde..23adafdbb 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -71,6 +71,8 @@ run test_sign.cpp ; run test_round.cpp ; +run test_expint.cpp ; + run test_expm1_simple.cpp ; run test_log1p_simple.cpp ; diff --git a/test/test_expint.cpp b/test/test_expint.cpp index 3f44a8091..3eede5e38 100644 --- a/test/test_expint.cpp +++ b/test/test_expint.cpp @@ -3,7 +3,14 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#endif + #include "test_expint.hpp" // @@ -78,7 +85,11 @@ void expected_results() ".*", // platform "float|double|long double", // test type(s) ".*Ei.*", // test data group + #ifndef SYCL_LANGUAGE_VERSION ".*", 6, 3); // test function + #else + ".*", 10, 3); + #endif if(std::numeric_limits::digits > 100) { add_expected_result( diff --git a/test/test_expint.hpp b/test/test_expint.hpp index 491db2fcd..d6524810e 100644 --- a/test/test_expint.hpp +++ b/test/test_expint.hpp @@ -4,13 +4,19 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #include +#include #define BOOST_TEST_MAIN #include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include From 01985d6f54f385d36bedea6fd9c456449217bb12 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:06:36 -0400 Subject: [PATCH 03/26] Add markers to forward decls --- .../boost/math/special_functions/math_fwd.hpp | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 35f8dd1bb..fb3a2029f 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -50,6 +50,16 @@ namespace detail{ >::type; }; + template + struct expint_result + { + using type = typename boost::math::conditional< + policies::is_policy::value, + tools::promote_args_t, + typename tools::promote_args::type + >::type; + }; + } // namespace detail } // namespace math @@ -936,7 +946,7 @@ namespace boost template struct expint_result { - typedef typename std::conditional< + typedef typename boost::math::conditional< policies::is_policy::value, tools::promote_args_t, typename tools::promote_args::type @@ -946,13 +956,13 @@ namespace boost } // namespace detail template - tools::promote_args_t expint(unsigned n, T z, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t expint(unsigned n, T z, const Policy&); template - typename detail::expint_result::type expint(T const z, U const u); + BOOST_MATH_GPU_ENABLED typename detail::expint_result::type expint(T const z, U const u); template - tools::promote_args_t expint(T z); + BOOST_MATH_GPU_ENABLED tools::promote_args_t expint(T z); // Zeta: template @@ -1620,11 +1630,11 @@ template \ using boost::math::changesign;\ \ template \ - inline typename boost::math::tools::promote_args_t expint(T const& z, U const& u)\ + BOOST_MATH_GPU_ENABLED inline typename boost::math::tools::promote_args_t expint(T const& z, U const& u)\ { return boost::math::expint(z, u, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t expint(T z){ return boost::math::expint(z, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t expint(T z){ return boost::math::expint(z, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t zeta(T s){ return boost::math::zeta(s, Policy()); }\ From f829cc4f3cbd977707ec4f60b0cc5d4426b21ff6 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:06:46 -0400 Subject: [PATCH 04/26] Add CUDA testing of expint --- test/cuda_jamfile | 3 ++ test/test_expint_double.cu | 106 +++++++++++++++++++++++++++++++++++++ test/test_expint_float.cu | 106 +++++++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 test/test_expint_double.cu create mode 100644 test/test_expint_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 34ac3b965..9c583821a 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -340,6 +340,9 @@ run test_erfc_float.cu ; run test_erfc_inv_double.cu ; run test_erfc_inv_float.cu ; +run test_expint_double.cu ; +run test_expint_float.cu ; + run test_expm1_double.cu ; run test_expm1_float.cu ; diff --git a/test/test_expint_double.cu b/test/test_expint_double.cu new file mode 100644 index 000000000..d82e90a93 --- /dev/null 
+++ b/test/test_expint_double.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::expint(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_expint_float.cu b/test/test_expint_float.cu
new file mode 100644
index 000000000..dd1fccd1d
--- /dev/null
+++ b/test/test_expint_float.cu
@@ -0,0 +1,106 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/expint.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::expint(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::expint(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From ad66a6b41846fc6ceff8e59b03e7c3e32a410da4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:13:54 -0400 Subject: [PATCH 05/26] Fix static variable usage under NVRTC --- .../boost/math/special_functions/expint.hpp | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/include/boost/math/special_functions/expint.hpp b/include/boost/math/special_functions/expint.hpp index 79a46c52f..09e97bd4f 100644 --- a/include/boost/math/special_functions/expint.hpp +++ b/include/boost/math/special_functions/expint.hpp @@ -25,7 +25,6 @@ #include #include #include -#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -376,7 +375,7 @@ struct expint_fraction }; template -inline T expint_as_fraction(unsigned n, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T expint_as_fraction(unsigned n, T z, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_INSTRUMENT_VARIABLE(z) @@ -553,7 +552,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Maximum Deviation Found: 2.852e-18 // Expected Error Term: 2.852e-18 // Max Error found at double precision = Poly: 2.636335e-16 Cheb: 4.187027e-16 - static const T P[10] = { + BOOST_MATH_STATIC const T P[10] = { BOOST_MATH_BIG_CONSTANT(T, 53, 2.98677224343598593013), BOOST_MATH_BIG_CONSTANT(T, 53, 0.356343618769377415068), BOOST_MATH_BIG_CONSTANT(T, 53, 0.780836076283730801839), @@ -565,7 +564,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, 0.798296365679269702435e-5), BOOST_MATH_BIG_CONSTANT(T, 53, 0.2777056254402008721e-6) }; - static const T Q[8] = { + BOOST_MATH_STATIC const T Q[8] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, -1.17090412365413911947), BOOST_MATH_BIG_CONSTANT(T, 53, 0.62215109846016746276), @@ -576,11 +575,11 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.138972589601781706598e-4) }; - static const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0); - static const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0); - static const T r1 = static_cast(c1 / c2); - static const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16); - static const T r = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392)); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r1 = static_cast(c1 / c2); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392)); T t = (z / 3) - 1; result = tools::evaluate_polynomial(P, t) / tools::evaluate_polynomial(Q, t); @@ -600,8 +599,8 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Maximum Deviation Found: 6.546e-17 // 
Expected Error Term: 6.546e-17 // Max Error found at double precision = Poly: 6.890169e-17 Cheb: 6.772128e-17 - static const T Y = 1.158985137939453125F; - static const T P[8] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.158985137939453125F; + BOOST_MATH_STATIC const T P[8] = { BOOST_MATH_BIG_CONSTANT(T, 53, 0.00139324086199402804173), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0349921221823888744966), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0264095520754134848538), @@ -611,7 +610,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.554086272024881826253e-4), BOOST_MATH_BIG_CONSTANT(T, 53, -0.396487648924804510056e-5) }; - static const T Q[8] = { + BOOST_MATH_STATIC const T Q[8] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 0.744625566823272107711), BOOST_MATH_BIG_CONSTANT(T, 53, 0.329061095011767059236), @@ -633,8 +632,8 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Expected Error Term: -1.842e-17 // Max Error found at double precision = Poly: 4.375868e-17 Cheb: 5.860967e-17 - static const T Y = 1.0869731903076171875F; - static const T P[9] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.0869731903076171875F; + BOOST_MATH_STATIC const T P[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, -0.00893891094356945667451), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0484607730127134045806), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0652810444222236895772), @@ -645,7 +644,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.000209750022660200888349), BOOST_MATH_BIG_CONSTANT(T, 53, -0.138652200349182596186e-4) }; - static const T Q[9] = { + BOOST_MATH_STATIC const T Q[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 1.97017214039061194971), BOOST_MATH_BIG_CONSTANT(T, 53, 1.86232465043073157508), @@ -669,8 +668,8 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: // Max Error found at double precision = Poly: 1.441088e-16 Cheb: 1.864792e-16 - static const T Y = 1.03937530517578125F; - static const T P[9] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.03937530517578125F; + BOOST_MATH_STATIC const T P[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, -0.00356165148914447597995), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0229930320357982333406), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0449814350482277917716), @@ -681,7 +680,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, -0.000192178045857733706044), BOOST_MATH_BIG_CONSTANT(T, 53, -0.113161784705911400295e-9) }; - static const T Q[9] = { + BOOST_MATH_STATIC const T Q[9] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 2.84354408840148561131), BOOST_MATH_BIG_CONSTANT(T, 53, 3.6599610090072393012), @@ -700,9 +699,9 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: else { // Max Error found at double precision = 3.381886e-17 - static const T exp40 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17)); - static const T Y= 1.013065338134765625F; - static const T P[6] = { + BOOST_MATH_STATIC_LOCAL_VARIABLE const T exp40 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17)); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y= 1.013065338134765625F; + BOOST_MATH_STATIC const T P[6] = { BOOST_MATH_BIG_CONSTANT(T, 53, 
-0.0130653381347656243849), BOOST_MATH_BIG_CONSTANT(T, 53, 0.19029710559486576682), BOOST_MATH_BIG_CONSTANT(T, 53, 94.7365094537197236011), @@ -710,7 +709,7 @@ BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math: BOOST_MATH_BIG_CONSTANT(T, 53, 18932.0850014925993025), BOOST_MATH_BIG_CONSTANT(T, 53, -38703.1431362056714134) }; - static const T Q[7] = { + BOOST_MATH_STATIC const T Q[7] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 61.9733592849439884145), BOOST_MATH_BIG_CONSTANT(T, 53, -2354.56211323420194283), From 59faf9128e35e09df52690103243b13647fa3398 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 16:14:04 -0400 Subject: [PATCH 06/26] Add NVRTC testing --- test/nvrtc_jamfile | 6 +- test/test_expint_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++++++ test/test_expint_nvrtc_float.cpp | 190 ++++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+), 2 deletions(-) create mode 100644 test/test_expint_nvrtc_double.cpp create mode 100644 test/test_expint_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index eb5da37c8..294559821 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -9,8 +9,8 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; -run test_heumann_lambda_nvrtc_double.cpp ; -run test_heumann_lambda_nvrtc_float.cpp ; +run test_expint_nvrtc_double.cpp ; +run test_expint_nvrtc_float.cpp ; # Quad run test_exp_sinh_quad_nvrtc_float.cpp ; @@ -335,6 +335,8 @@ run test_ellint_d_nvrtc_double.cpp ; run test_ellint_d_nvrtc_float.cpp ; run test_jacobi_zeta_nvrtc_double.cpp ; run test_jacobi_zeta_nvrtc_float.cpp ; +run test_heumann_lambda_nvrtc_double.cpp ; +run test_heumann_lambda_nvrtc_float.cpp ; run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_expint_nvrtc_double.cpp b/test/test_expint_nvrtc_double.cpp new file mode 100644 index 000000000..3ab45e6a1 --- /dev/null +++ b/test/test_expint_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::expint(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_expint_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + 
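+        // NB: h_in2 is filled but never read by this kernel: the device
+        // function's second pointer parameter is left unnamed in the kernel
+        // string above, presumably so one launcher signature can serve both
+        // one- and two-argument special functions.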
} + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::expint(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_expint_nvrtc_float.cpp b/test/test_expint_nvrtc_float.cpp new file mode 100644 index 000000000..bff58580e --- /dev/null +++ b/test/test_expint_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::expint(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_expint_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } 
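+    // Note on the verification loop further down: boost::math::epsilon_difference
+    // reports the difference between device and host results scaled by machine
+    // epsilon, so the 300-eps budget used below allows roughly
+    // 300 * 2^-23 ~ 3.6e-5 relative error for float.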
+ + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::expint(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 9c565e92c16fcf0adb7b81a487a7e96246171b68 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 10:12:45 -0400 Subject: [PATCH 07/26] Add configurable definition of complex --- include/boost/math/tools/complex.hpp | 26 ++++++++++++++++++++++++-- include/boost/math/tools/config.hpp | 1 + 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp index f93d2958d..ec5144011 100644 --- a/include/boost/math/tools/complex.hpp +++ b/include/boost/math/tools/complex.hpp @@ -14,11 +14,33 @@ #include #ifdef BOOST_MATH_ENABLE_CUDA + #include -#endif +#include + +namespace boost { +namespace math { + +template +using complex = cuda::std::complex; + +} // namespace math +} // namespace boost + +#else -#ifndef BOOST_MATH_HAS_NVRTC #include +#include + +namespace boost { +namespace math { + +template +using complex = std::complex; + +} // namespace math +} // namespace boost + #endif namespace boost { diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 99bc4ee27..b3a8c2cf8 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -678,6 +678,7 @@ namespace boost{ namespace math{ #include #include #include +#include # define BOOST_MATH_CUDA_ENABLED __host__ __device__ # define BOOST_MATH_HAS_GPU_SUPPORT From bf43d6ea25103830c0090fb4df88c3429ee45140 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 10:19:10 -0400 Subject: [PATCH 08/26] Add function aliases --- include/boost/math/tools/complex.hpp | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git 
a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp index ec5144011..88db8e8a7 100644 --- a/include/boost/math/tools/complex.hpp +++ b/include/boost/math/tools/complex.hpp @@ -24,6 +24,36 @@ namespace math { template using complex = cuda::std::complex; +using cuda::std::real; +using cuda::std::imag; +using cuda::std::abs; +using cuda::std::arg; +using cuda::std::norm; +using cuda::std::conj; +using cuda::std::proj; +using cuda::std::polar; + +using cuda::std::exp; +using cuda::std::log; +using cuda::std::log10; + +using cuda::std::pow; +using cuda::std::sqrt; + +using cuda::std::sin; +using cuda::std::cos; +using cuda::std::tan; +using cuda::std::asin; +using cuda::std::acos; +using cuda::std::atan; + +using cuda::std::sinh; +using cuda::std::cosh; +using cuda::std::tanh; +using cuda::std::asinh; +using cuda::std::acosh; +using cuda::std::atanh; + } // namespace math } // namespace boost @@ -38,6 +68,36 @@ namespace math { template using complex = std::complex; +using std::real; +using std::imag; +using std::abs; +using std::arg; +using std::norm; +using std::conj; +using std::proj; +using std::polar; + +using std::exp; +using std::log; +using std::log10; + +using std::pow; +using std::sqrt; + +using std::sin; +using std::cos; +using std::tan; +using std::asin; +using std::acos; +using std::atan; + +using std::sinh; +using std::cosh; +using std::tanh; +using std::asinh; +using std::acosh; +using std::atanh; + } // namespace math } // namespace boost From 571958701bd445f0807f3e5c3bfdb3a584dc8015 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 10:53:17 -0400 Subject: [PATCH 09/26] Add GPU support to gegenbauer polynomials --- include/boost/math/special_functions/gegenbauer.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/gegenbauer.hpp b/include/boost/math/special_functions/gegenbauer.hpp index b7033cd14..41fcdbd34 100644 --- a/include/boost/math/special_functions/gegenbauer.hpp +++ b/include/boost/math/special_functions/gegenbauer.hpp @@ -1,4 +1,5 @@ // (C) Copyright Nick Thompson 2019. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -6,16 +7,20 @@
 #ifndef BOOST_MATH_SPECIAL_GEGENBAUER_HPP
 #define BOOST_MATH_SPECIAL_GEGENBAUER_HPP
 
-#include <limits>
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/type_traits.hpp>
+
+#ifndef BOOST_MATH_NO_EXCEPTIONS
 #include <stdexcept>
-#include <type_traits>
+#endif
 
 namespace boost { namespace math {
 
 template<typename Real>
 Real gegenbauer(unsigned n, Real lambda, Real x)
 {
-    static_assert(!std::is_integral<Real>::value, "Gegenbauer polynomials required floating point arguments.");
+    static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
     if (lambda <= -1/Real(2)) {
 #ifndef BOOST_MATH_NO_EXCEPTIONS
         throw std::domain_error("lambda > -1/2 is required.");
@@ -41,7 +46,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
     Real yk = y1;
     Real k = 2;
-    Real k_max = n*(1+std::numeric_limits<Real>::epsilon());
+    Real k_max = n*(1+boost::math::numeric_limits<Real>::epsilon());
     Real gamma = 2*(lambda - 1);
     while(k < k_max)
     {

From 8de822321a390aaec1c74582758da8e56ac189e1 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 10:53:27 -0400
Subject: [PATCH 10/26] Add SYCL testing of gegenbauer

---
 test/sycl_jamfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 23adafdbb..2f617be85 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -75,6 +75,8 @@ run test_expint.cpp ;
 
 run test_expm1_simple.cpp ;
 
+run gegenbauer_test.cpp ;
+
 run test_log1p_simple.cpp ;
 
 run test_digamma_simple.cpp ;

From 608ba2fe7b9346d1d1ab9750aa5a1e5a07847611 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 11:16:40 -0400
Subject: [PATCH 11/26] Add NVCC testing of gegenbauer

---
 .../math/special_functions/gegenbauer.hpp     |   8 +-
 test/cuda_jamfile                             |   3 +
 test/test_gegenbauer_double.cu                | 125 ++++++++++++++++++
 test/test_gegenbauer_float.cu                 | 124 +++++++++++++++++
 4 files changed, 256 insertions(+), 4 deletions(-)
 create mode 100644 test/test_gegenbauer_double.cu
 create mode 100644 test/test_gegenbauer_float.cu

diff --git a/include/boost/math/special_functions/gegenbauer.hpp b/include/boost/math/special_functions/gegenbauer.hpp
index 41fcdbd34..70324cf65 100644
--- a/include/boost/math/special_functions/gegenbauer.hpp
+++ b/include/boost/math/special_functions/gegenbauer.hpp
@@ -18,14 +18,14 @@ namespace boost { namespace math {
 
 template<typename Real>
-Real gegenbauer(unsigned n, Real lambda, Real x)
+BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x)
 {
     static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
     if (lambda <= -1/Real(2)) {
 #ifndef BOOST_MATH_NO_EXCEPTIONS
         throw std::domain_error("lambda > -1/2 is required.");
 #else
-        return std::numeric_limits<Real>::quiet_NaN();
+        return boost::math::numeric_limits<Real>::quiet_NaN();
 #endif
     }
     // The only reason to do this is because of some instability that could be present for x < 0 that is not present for x > 0.
@@ -60,7 +60,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x) template -Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k) +BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k) { if (k > n) { return Real(0); @@ -75,7 +75,7 @@ Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k) } template -Real gegenbauer_prime(unsigned n, Real lambda, Real x) { +BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x) { return gegenbauer_derivative(n, lambda, x, 1); } diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 9c583821a..6d6eea454 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; +run test_gegenbauer_double.cu ; +run test_gegenbauer_float.cu ; + # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; diff --git a/test/test_gegenbauer_double.cu b/test/test_gegenbauer_double.cu new file mode 100644 index 000000000..21278d7a8 --- /dev/null +++ b/test/test_gegenbauer_double.cu @@ -0,0 +1,125 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Gegenbauer prime uses all methods internally so it's the easy choice + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]); + // Most elements are under 50 but extremely small numbers very more greatly + if (eps > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_gegenbauer_float.cu b/test/test_gegenbauer_float.cu new file mode 100644 index 000000000..b7affaecd --- /dev/null +++ b/test/test_gegenbauer_float.cu @@ -0,0 +1,124 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Gegenbauer prime uses all methods internally so it's the easy choice + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.resize(numElements);   // resize, not reserve: operator[] below needs live elements
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
+            if (eps > 1000)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i]
+                          << "\n  Host: " << results[i]
+                          << "\n  Eps: " << eps << std::endl;
+                ++failure_counter;
+                if (failure_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
From dbd194ed88f119f47149704be852f897958ad693 Mon Sep 17 00:00:00 2001
From: Matt Borland 
Date: Tue, 17 Sep 2024 11:22:45 -0400
Subject: [PATCH 12/26] Add NVRTC testing of gegenbauer

---
 test/cuda_jamfile                     |   6 +-
 test/nvrtc_jamfile                    |   7 +-
 test/test_gegenbauer_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++
 test/test_gegenbauer_nvrtc_float.cpp  | 190 ++++++++++++++++++++++++++
 4 files changed, 388 insertions(+), 5 deletions(-)
 create mode 100644 test/test_gegenbauer_nvrtc_double.cpp
 create mode 100644 test/test_gegenbauer_nvrtc_float.cpp

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 6d6eea454..6abcf00b8 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -9,9 +9,6 @@ project : requirements
 [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
 ;
 
-run test_gegenbauer_double.cu ;
-run test_gegenbauer_float.cu ;
-
 # Quad
 run test_exp_sinh_quad_float.cu ;
 run test_exp_sinh_quad_double.cu ;
@@ -349,6 +346,9 @@ run test_expint_float.cu ;
 run test_expm1_double.cu ;
 run test_expm1_float.cu ;
 
+run test_gegenbauer_double.cu ;
+run test_gegenbauer_float.cu ;
+
 run test_lgamma_double.cu ;
 run test_lgamma_float.cu ;
 run test_tgamma_double.cu ;
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 294559821..2cd61028e 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -9,8 +9,8 @@ project : requirements
 [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
 ;
 
-run test_expint_nvrtc_double.cpp ;
-run test_expint_nvrtc_float.cpp ;
+run test_gegenbauer_nvrtc_double.cpp ;
+run test_gegenbauer_nvrtc_float.cpp ;
 
 # Quad
 run test_exp_sinh_quad_nvrtc_float.cpp ;
@@ -338,6 +338,9 @@ run test_jacobi_zeta_nvrtc_float.cpp ;
 run test_heumann_lambda_nvrtc_double.cpp ;
 run test_heumann_lambda_nvrtc_float.cpp ;
 
+run test_expint_nvrtc_double.cpp ;
+run test_expint_nvrtc_float.cpp ;
+
 run test_expm1_nvrtc_double.cpp ;
 run test_expm1_nvrtc_float.cpp ;
 
diff --git a/test/test_gegenbauer_nvrtc_double.cpp b/test/test_gegenbauer_nvrtc_double.cpp
new file mode 100644
index 000000000..0c8416cb6
--- /dev/null
+++ b/test/test_gegenbauer_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gegenbauer(2, in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gegenbauer_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
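+        // in1 supplies lambda and in2 supplies x for gegenbauer(2, lambda, x);
+        // uniform draws from [0, 1) keep lambda in the classical range
+        // (lambda > -1/2) and x inside the orthogonality interval [-1, 1].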
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gegenbauer_nvrtc_float.cpp b/test/test_gegenbauer_nvrtc_float.cpp new file mode 100644 index 000000000..c0d348417 --- /dev/null +++ b/test/test_gegenbauer_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gegenbauer(2, in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gegenbauer_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
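+        // The fixed seed (42) above makes every run use the same inputs, so
+        // host and device results stay directly comparable across invocations.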
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 0f17f14d9789f39b635fe4c97b424d21fa369b30 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 11:47:49 -0400 Subject: [PATCH 13/26] Add GPU support for hankel --- .../boost/math/special_functions/bessel.hpp | 44 ------------- .../boost/math/special_functions/hankel.hpp | 62 +++++++++--------- .../boost/math/special_functions/math_fwd.hpp | 63 +++++++++++++++---- 3 files changed, 84 insertions(+), 85 deletions(-) diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp index 081473442..c32f251bc 100644 --- a/include/boost/math/special_functions/bessel.hpp +++ b/include/boost/math/special_functions/bessel.hpp @@ -47,50 +47,6 @@ namespace boost{ namespace math{ -// Since we cannot pull this in from math fwd we need a copy -#ifdef BOOST_MATH_HAS_NVRTC - -namespace detail{ - - typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. - typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. - typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. 
- - template - struct bessel_traits - { - using result_type = typename boost::math::conditional< - boost::math::is_integral::value, - typename tools::promote_args::type, - tools::promote_args_t - >::type; - - typedef typename policies::precision::type precision_type; - - using optimisation_tag = typename boost::math::conditional< - (precision_type::value <= 0 || precision_type::value > 64), - bessel_no_int_tag, - typename boost::math::conditional< - boost::math::is_integral::value, - bessel_int_tag, - bessel_maybe_int_tag - >::type - >::type; - - using optimisation_tag128 = typename boost::math::conditional< - (precision_type::value <= 0 || precision_type::value > 113), - bessel_no_int_tag, - typename boost::math::conditional< - boost::math::is_integral::value, - bessel_int_tag, - bessel_maybe_int_tag - >::type - >::type; - }; - } // detail - -#endif - namespace detail{ template diff --git a/include/boost/math/special_functions/hankel.hpp b/include/boost/math/special_functions/hankel.hpp index 51b8390d9..d9cff8f92 100644 --- a/include/boost/math/special_functions/hankel.hpp +++ b/include/boost/math/special_functions/hankel.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2012. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -7,26 +8,31 @@ #ifndef BOOST_MATH_HANKEL_HPP #define BOOST_MATH_HANKEL_HPP +#include +#include #include #include +#include +#include +#include namespace boost{ namespace math{ namespace detail{ template -std::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) +boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING - static const char* function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"; if(x < 0) { bool isint_v = floor(v) == v; T j, y; bessel_jy(v, -x, &j, &y, need_j | need_y, pol); - std::complex cx(x), cv(v); - std::complex j_result, y_result; + boost::math::complex cx(x), cv(v); + boost::math::complex j_result, y_result; if(isint_v) { int s = (iround(v) & 1) ? 
-1 : 1; @@ -37,12 +43,12 @@ std::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol { j_result = pow(cx, v) * pow(-cx, -v) * j; T p1 = pow(-x, v); - std::complex p2 = pow(cx, v); + boost::math::complex p2 = pow(cx, v); y_result = p1 * y / p2 + (p2 / p1 - p1 / p2) * j / tan(constants::pi() * v); } // multiply y_result by i: - y_result = std::complex(-sign * y_result.imag(), sign * y_result.real()); + y_result = boost::math::complex(-sign * y_result.imag(), sign * y_result.real()); return j_result + y_result; } @@ -51,25 +57,25 @@ std::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol if(v == 0) { // J is 1, Y is -INF - return std::complex(1, sign * -policies::raise_overflow_error(function, nullptr, pol)); + return boost::math::complex(1, sign * -policies::raise_overflow_error(function, nullptr, pol)); } else { // At least one of J and Y is complex infinity: - return std::complex(policies::raise_overflow_error(function, nullptr, pol), sign * policies::raise_overflow_error(function, nullptr, pol)); + return boost::math::complex(policies::raise_overflow_error(function, nullptr, pol), sign * policies::raise_overflow_error(function, nullptr, pol)); } } T j, y; bessel_jy(v, x, &j, &y, need_j | need_y, pol); - return std::complex(j, sign * y); + return boost::math::complex(j, sign * y); } template -std::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); +boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); template -inline std::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) +inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING // ADL of std names. 
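+   // iconv rounds v to the nearest int under the current policy; a genuinely
+   // integral order can then take the optimised integer-order path, while
+   // everything else falls back to the general bessel_no_int_tag evaluation.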
int ival = detail::iconv(v, pol); @@ -81,57 +87,57 @@ inline std::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const P } template -inline std::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) +inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING - if((std::abs(v) < 200) && (x > 0)) - return std::complex(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol)); + if((abs(v) < 200) && (x > 0)) + return boost::math::complex(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol)); return hankel_imp(static_cast(v), x, bessel_no_int_tag(), pol, sign); } template -inline std::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) +inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) { BOOST_MATH_STD_USING - return constants::root_half_pi() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(std::complex(x)); + return constants::root_half_pi() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex(x)); } } // namespace detail template -inline std::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) +inline boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; typedef typename detail::bessel_traits::optimisation_tag tag_type; typedef typename policies::evaluation::type value_type; - return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> cyl_hankel_1(T1 v, T2 x) +inline boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x) { return cyl_hankel_1(v, x, policies::policy<>()); } template -inline std::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) +inline boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; typedef typename detail::bessel_traits::optimisation_tag tag_type; typedef typename policies::evaluation::type value_type; - return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, Policy>(detail::hankel_imp(v, static_cast(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> cyl_hankel_2(T1 v, T2 x) +inline boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x) { return cyl_hankel_2(v, x, policies::policy<>()); } template -inline std::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) +inline boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -143,17 +149,17 @@ inline std::complex::result_type> policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - return policies::checked_narrowing_cast, Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, 
Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> sph_hankel_1(T1 v, T2 x) +inline boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x) { return sph_hankel_1(v, x, policies::policy<>()); } template -inline std::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) +inline boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -165,11 +171,11 @@ inline std::complex::result_type> policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - return policies::checked_narrowing_cast, Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); + return policies::checked_narrowing_cast, Policy>(detail::sph_hankel_imp(static_cast(v), static_cast(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)"); } template -inline std::complex >::result_type> sph_hankel_2(T1 v, T2 x) +inline boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x) { return sph_hankel_2(v, x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index fb3a2029f..e989f5b18 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -27,6 +27,7 @@ #include #include // for argument promotion. #include +#include #include #ifdef BOOST_MATH_HAS_NVRTC @@ -60,7 +61,43 @@ namespace detail{ >::type; }; - } // namespace detail + typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. + typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. + typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. 
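+
+   // The traits class below derives the promoted result type for a Bessel or
+   // Hankel call and selects one of the three tags above from the policy's
+   // working precision: the integer-order optimisation is only attempted when
+   // that precision is known and at most 64 bits (113 bits for the 128-bit
+   // variant). For example, an integer order evaluated in double precision
+   // (53 bits) selects bessel_int_tag, while an arbitrary-precision type
+   // selects bessel_no_int_tag. A sketch of the intent, assuming the usual
+   // template parameters (order type, argument type, policy):
+   //
+   //   static_assert(std::is_same<
+   //       bessel_traits::optimisation_tag,   // e.g. for an int order, double argument
+   //       bessel_int_tag>::value, "integer orders take the fast path");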
+ + template + struct bessel_traits + { + using result_type = typename boost::math::conditional< + boost::math::is_integral::value, + typename tools::promote_args::type, + tools::promote_args_t + >::type; + + typedef typename policies::precision::type precision_type; + + using optimisation_tag = typename boost::math::conditional< + (precision_type::value <= 0 || precision_type::value > 64), + bessel_no_int_tag, + typename boost::math::conditional< + boost::math::is_integral::value, + bessel_int_tag, + bessel_maybe_int_tag + >::type + >::type; + + using optimisation_tag128 = typename boost::math::conditional< + (precision_type::value <= 0 || precision_type::value > 113), + bessel_no_int_tag, + typename boost::math::conditional< + boost::math::is_integral::value, + bessel_int_tag, + bessel_maybe_int_tag + >::type + >::type; + }; + +} // namespace detail } // namespace math } // namespace boost @@ -818,28 +855,28 @@ namespace boost const Policy&); template - std::complex >::result_type> cyl_hankel_1(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x); template - std::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol); template - std::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol); template - std::complex >::result_type> cyl_hankel_2(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x); template - std::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol); template - std::complex >::result_type> sph_hankel_1(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x); template - std::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol); template - std::complex >::result_type> sph_hankel_2(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x); template BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_ai(T x, const Policy&); @@ -1679,19 +1716,19 @@ template \ inline boost::math::tools::promote_args_t owens_t(RT1 a, RT2 z){ return boost::math::owens_t(a, z, Policy()); }\ \ template \ - inline std::complex::result_type> cyl_hankel_1(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x)\ { return boost::math::cyl_hankel_1(v, x, Policy()); }\ \ template \ - inline std::complex::result_type> cyl_hankel_2(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x)\ { return boost::math::cyl_hankel_2(v, x, Policy()); }\ \ template \ - inline std::complex::result_type> sph_hankel_1(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x)\ { return boost::math::sph_hankel_1(v, x, Policy()); }\ \ template \ - inline std::complex::result_type> sph_hankel_2(T1 v, T2 x)\ + inline BOOST_MATH_GPU_ENABLED boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x)\ { return boost::math::sph_hankel_2(v, x, Policy()); }\ \ template \ From f561a7722ee8353c4cb88f609a30b41056db6629 Mon Sep 17 00:00:00 2001 From: Matt Borland 
Date: Tue, 17 Sep 2024 11:47:59 -0400 Subject: [PATCH 14/26] Add SYCL testing of hankel --- test/sycl_jamfile | 2 ++ test/test_hankel.cpp | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 2f617be85..991d34520 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -77,6 +77,8 @@ run test_expm1_simple.cpp ; run gegenbauer_test.cpp ; +run test_hankel.cpp ; + run test_log1p_simple.cpp ; run test_digamma_simple.cpp ; diff --git a/test/test_hankel.cpp b/test/test_hankel.cpp index f8bd173da..a93e90c4d 100644 --- a/test/test_hankel.cpp +++ b/test/test_hankel.cpp @@ -3,9 +3,13 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#include + #define BOOST_TEST_MAIN #include #include @@ -85,6 +89,7 @@ void test_hankel(T, const char* name) // // Instantiate a few instances to check our error handling code can cope with std::complex: // +#ifndef SYCL_LANGUAGE_VERSION typedef boost::math::policies::policy< boost::math::policies::overflow_error, boost::math::policies::denorm_error, @@ -120,7 +125,7 @@ typedef boost::math::policies::policy< boost::math::policies::indeterminate_result_error > pol3; template std::complex boost::math::cyl_hankel_1(double, double, const pol3&); - +#endif BOOST_AUTO_TEST_CASE( test_main ) { From 597d6a833fc75b2df729f1eff17a0396b001d14a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 12:22:16 -0400 Subject: [PATCH 15/26] Add NVCC testing of cyl_hankel_1 --- .../boost/math/special_functions/hankel.hpp | 26 ++-- test/cuda_jamfile | 3 + test/test_cyl_hankel_1_double.cu | 119 ++++++++++++++++++ test/test_cyl_hankel_1_float.cu | 119 ++++++++++++++++++ 4 files changed, 254 insertions(+), 13 deletions(-) create mode 100644 test/test_cyl_hankel_1_double.cu create mode 100644 test/test_cyl_hankel_1_float.cu diff --git a/include/boost/math/special_functions/hankel.hpp b/include/boost/math/special_functions/hankel.hpp index d9cff8f92..730c7afa0 100644 --- a/include/boost/math/special_functions/hankel.hpp +++ b/include/boost/math/special_functions/hankel.hpp @@ -21,7 +21,7 @@ namespace boost{ namespace math{ namespace detail{ template -boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)"; @@ -72,10 +72,10 @@ boost::math::complex hankel_imp(T v, T x, const bessel_no_int_tag&, const Pol } template -boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); +BOOST_MATH_GPU_ENABLED boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign); template -inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING // ADL of std names. 
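+   // BOOST_MATH_GPU_ENABLED marks this overload as callable from device code
+   // as well as the host; BOOST_MATH_STD_USING above is what lets the
+   // unqualified abs/floor calls in this file resolve to the device-side
+   // overloads under CUDA rather than being hard-wired to std::.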
int ival = detail::iconv(v, pol); @@ -87,7 +87,7 @@ inline boost::math::complex hankel_imp(T v, T x, const bessel_maybe_int_tag&, } template -inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign) { BOOST_MATH_STD_USING if((abs(v) < 200) && (x > 0)) @@ -96,7 +96,7 @@ inline boost::math::complex hankel_imp(int v, T x, const bessel_int_tag&, con } template -inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) +BOOST_MATH_GPU_ENABLED inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int sign) { BOOST_MATH_STD_USING return constants::root_half_pi() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex(x)); @@ -105,7 +105,7 @@ inline boost::math::complex sph_hankel_imp(T v, T x, const Policy& pol, int s } // namespace detail template -inline boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -115,13 +115,13 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> cyl_hankel_1(T1 v, T2 x) { return cyl_hankel_1(v, x, policies::policy<>()); } template -inline boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -131,13 +131,13 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> cyl_hankel_2(T1 v, T2 x) { return cyl_hankel_2(v, x, policies::policy<>()); } template -inline boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> sph_hankel_1(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -153,13 +153,13 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> sph_hankel_1(T1 v, T2 x) { return sph_hankel_1(v, x, policies::policy<>()); } template -inline boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) +BOOST_MATH_GPU_ENABLED inline boost::math::complex::result_type> sph_hankel_2(T1 v, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -175,7 +175,7 @@ inline boost::math::complex::resu } template -inline boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline boost::math::complex >::result_type> sph_hankel_2(T1 v, T2 x) { return sph_hankel_2(v, x, policies::policy<>()); } diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 6abcf00b8..01b2334b9 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction 
cxx14_variable_templates cxx14_constexpr ] ; +run test_cyl_hankel_1_double.cu ; +run test_cyl_hankel_1_float.cu ; + # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; diff --git a/test/test_cyl_hankel_1_double.cu b/test/test_cyl_hankel_1_double.cu new file mode 100644 index 000000000..b718521c6 --- /dev/null +++ b/test/test_cyl_hankel_1_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_hankel_1_float.cu b/test/test_cyl_hankel_1_float.cu new file mode 100644 index 000000000..794d6d763 --- /dev/null +++ b/test/test_cyl_hankel_1_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From cdb600c2f6c26d9a4b244e28f3651a5b3fc43761 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 12:32:37 -0400 Subject: [PATCH 16/26] Add comprehensive NVCC testing --- test/cuda_jamfile | 11 ++- test/test_cyl_hankel_2_double.cu | 119 +++++++++++++++++++++++++++++++ test/test_cyl_hankel_2_float.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_1_double.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_1_float.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_2_double.cu | 119 +++++++++++++++++++++++++++++++ test/test_sph_hankel_2_float.cu | 119 +++++++++++++++++++++++++++++++ 7 files changed, 722 insertions(+), 3 deletions(-) create mode 100644 test/test_cyl_hankel_2_double.cu create mode 100644 test/test_cyl_hankel_2_float.cu create mode 100644 test/test_sph_hankel_1_double.cu create mode 100644 test/test_sph_hankel_1_float.cu create mode 100644 test/test_sph_hankel_2_double.cu create mode 100644 test/test_sph_hankel_2_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 01b2334b9..32f87f44e 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,9 +9,6 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; -run test_cyl_hankel_1_double.cu ; -run test_cyl_hankel_1_float.cu ; - # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; @@ -308,6 +305,14 @@ run test_cyl_neumann_double.cu ; run test_cyl_neumann_float.cu ; run test_sph_neumann_double.cu ; run test_sph_neumann_float.cu ; +run test_cyl_hankel_1_double.cu ; +run test_cyl_hankel_1_float.cu ; +run test_cyl_hankel_2_double.cu ; +run test_cyl_hankel_2_float.cu ; +run test_sph_hankel_1_double.cu ; +run test_sph_hankel_1_float.cu ; +run test_sph_hankel_2_double.cu ; +run test_sph_hankel_2_float.cu ; run test_cbrt_double.cu ; run test_cbrt_float.cu ; diff --git a/test/test_cyl_hankel_2_double.cu b/test/test_cyl_hankel_2_double.cu new file mode 100644 index 000000000..8218b5347 --- /dev/null +++ b/test/test_cyl_hankel_2_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cyl_hankel_2_float.cu b/test/test_cyl_hankel_2_float.cu new file mode 100644 index 000000000..f00174b47 --- /dev/null +++ b/test/test_cyl_hankel_2_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]); + double t = w.elapsed(); + // check the results + int failure_counter = 0; + for(int i = 0; i < numElements; ++i) + { + const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real()); + if (eps > 10) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] + << "\n Host: " << results[i] + << "\n Eps: " << eps << std::endl; + ++failure_counter; + if (failure_counter > 100) + { + break; + } + } + } + + if (failure_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_hankel_1_double.cu b/test/test_sph_hankel_1_double.cu new file mode 100644 index 000000000..3d8473e22 --- /dev/null +++ b/test/test_sph_hankel_1_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_1_float.cu b/test/test_sph_hankel_1_float.cu
new file mode 100644
index 000000000..9cf17ffb2
--- /dev/null
+++ b/test/test_sph_hankel_1_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_2_double.cu b/test/test_sph_hankel_2_double.cu
new file mode 100644
index 000000000..6c7a032f6
--- /dev/null
+++ b/test/test_sph_hankel_2_double.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_sph_hankel_2_float.cu b/test/test_sph_hankel_2_float.cu
new file mode 100644
index 000000000..bcf41e797
--- /dev/null
+++ b/test/test_sph_hankel_2_float.cu
@@ -0,0 +1,119 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<boost::math::complex<float_type>> results;
+    results.resize(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
+    double t = w.elapsed();
+    // check the results
+    int failure_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
+        if (eps > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!\n"
+                      << "Device: " << output_vector[i]
+                      << "\n  Host: " << results[i]
+                      << "\n   Eps: " << eps << std::endl;
+            ++failure_counter;
+            if (failure_counter > 100)
+            {
+                break;
+            }
+        }
+    }
+
+    if (failure_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
From 17c2207fa1303ff9cd7dba97f0c1e342ccb68b6b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 12:52:43 -0400
Subject: [PATCH 17/26] Add NVRTC testing of cyl and sph hankel

---
 test/nvrtc_jamfile                      |  14 +-
 test/test_cyl_hankel_1_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_cyl_hankel_1_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 test/test_cyl_hankel_2_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_cyl_hankel_2_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_1_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_1_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_2_nvrtc_double.cpp | 199 ++++++++++++++++++++++++
 test/test_sph_hankel_2_nvrtc_float.cpp  | 199 ++++++++++++++++++++++++
 9 files changed, 1603 insertions(+), 3 deletions(-)
 create mode 100644 test/test_cyl_hankel_1_nvrtc_double.cpp
 create mode 100644 test/test_cyl_hankel_1_nvrtc_float.cpp
 create mode 100644 test/test_cyl_hankel_2_nvrtc_double.cpp
 create mode 100644 test/test_cyl_hankel_2_nvrtc_float.cpp
 create mode 100644 test/test_sph_hankel_1_nvrtc_double.cpp
 create mode 100644 test/test_sph_hankel_1_nvrtc_float.cpp
 create mode 100644 test/test_sph_hankel_2_nvrtc_double.cpp
 create mode 100644 test/test_sph_hankel_2_nvrtc_float.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 2cd61028e..734c87ad4 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -9,9 +9,6 @@ project : requirements
   [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
   ;
 
-run test_gegenbauer_nvrtc_double.cpp ;
-run test_gegenbauer_nvrtc_float.cpp ;
-
 # Quad
 run test_exp_sinh_quad_nvrtc_float.cpp ;
 run test_exp_sinh_quad_nvrtc_double.cpp ;
@@ -306,6 +303,14 @@ run test_cyl_neumann_nvrtc_double.cpp ;
 run test_cyl_neumann_nvrtc_float.cpp ;
 run test_sph_neumann_nvrtc_double.cpp ;
 run test_sph_neumann_nvrtc_float.cpp ;
+run test_cyl_hankel_1_nvrtc_double.cpp ;
+run test_cyl_hankel_1_nvrtc_float.cpp ;
+run test_cyl_hankel_2_nvrtc_double.cpp ;
+run test_cyl_hankel_2_nvrtc_float.cpp ;
+run test_sph_hankel_1_nvrtc_double.cpp ;
+run test_sph_hankel_1_nvrtc_float.cpp ;
+run test_sph_hankel_2_nvrtc_double.cpp ;
+run test_sph_hankel_2_nvrtc_float.cpp ;
 run test_cbrt_nvrtc_double.cpp ;
 run test_cbrt_nvrtc_float.cpp ;
 
@@ -356,6 +361,9 @@ run test_gamma_p_inv_nvrtc_float.cpp ;
 run test_tgamma_ratio_nvrtc_double.cpp ;
 run test_tgamma_ratio_nvrtc_float.cpp ;
 
+run test_gegenbauer_nvrtc_double.cpp ;
+run test_gegenbauer_nvrtc_float.cpp ;
+
 run test_log1p_nvrtc_double.cpp ;
 run test_log1p_nvrtc_float.cpp ;
diff --git a/test/test_cyl_hankel_1_nvrtc_double.cpp b/test/test_cyl_hankel_1_nvrtc_double.cpp
new file mode 100644
index 000000000..01f68ecb5
--- /dev/null
+++ b/test/test_cyl_hankel_1_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
+
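+        // NVRTC compilation is finished at this point: the kernel string
+        // above was compiled to PTX at runtime and loaded through the CUDA
+        // driver API. The remainder of the test stages the inputs, launches
+        // the kernel, and cross-checks every element against the serial
+        // host-side Boost.Math computation.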
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_1_nvrtc_float.cpp b/test/test_cyl_hankel_1_nvrtc_float.cpp
new file mode 100644
index 000000000..68fb928c2
--- /dev/null
+++ b/test/test_cyl_hankel_1_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_2_nvrtc_double.cpp b/test/test_cyl_hankel_2_nvrtc_double.cpp
new file mode 100644
index 000000000..188d93bbe
--- /dev/null
+++ b/test/test_cyl_hankel_2_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_hankel_2_nvrtc_float.cpp b/test/test_cyl_hankel_2_nvrtc_float.cpp
new file mode 100644
index 000000000..9349f3c0d
--- /dev/null
+++ b/test/test_cyl_hankel_2_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_1_nvrtc_double.cpp b/test/test_sph_hankel_1_nvrtc_double.cpp
new file mode 100644
index 000000000..1e6863c55
--- /dev/null
+++ b/test/test_sph_hankel_1_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_1_nvrtc_float.cpp b/test/test_sph_hankel_1_nvrtc_float.cpp
new file mode 100644
index 000000000..28346d284
--- /dev/null
+++ b/test/test_sph_hankel_1_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_2_nvrtc_double.cpp b/test/test_sph_hankel_2_nvrtc_double.cpp
new file mode 100644
index 000000000..17b8c7ff0
--- /dev/null
+++ b/test/test_sph_hankel_2_nvrtc_double.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_sph_hankel_2_nvrtc_float.cpp b/test/test_sph_hankel_2_nvrtc_float.cpp
new file mode 100644
index 000000000..038e215d9
--- /dev/null
+++ b/test/test_sph_hankel_2_nvrtc_float.cpp
@@ -0,0 +1,199 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <cstdlib>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/hankel.hpp>
+#include <boost/math/tools/complex.hpp>
+extern "C" __global__
+void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2;
+        float_type *d_in1, *d_in2;
+        boost::math::complex<float_type> *h_out, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new boost::math::complex<float_type>[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int fail_counter = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
+            if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
+                ++fail_counter;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        if (fail_counter > 0)
+        {
+            return 1;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 4f2fe1183e0874828151dfb1c5122bb683a4aebb Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 12:59:39 -0400 Subject: [PATCH 18/26] Update docs --- doc/sf/airy.qbk | 32 +++++++++++++-------------- doc/sf/expint.qbk | 16 +++++++------- doc/sf/gegenbauer.qbk | 6 ++--- doc/sf/hankel.qbk | 51 ++++++++++++++++++++++++++++++++++++------- 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/doc/sf/airy.qbk b/doc/sf/airy.qbk index 5ff4c7cb5..4756bee2d 100644 --- a/doc/sf/airy.qbk +++ b/doc/sf/airy.qbk @@ -18,10 +18,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` airy_ai(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x); template - ``__sf_result`` airy_ai(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x, const Policy&); }} // namespaces @@ -78,10 +78,10 @@ This function is implemented in terms of the Bessel functions using the relation namespace boost { namespace math { template - ``__sf_result`` airy_bi(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x); template - ``__sf_result`` airy_bi(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x, const Policy&); }} // namespaces @@ -132,10 +132,10 @@ This function is implemented in terms of the Bessel functions using the relation namespace boost { namespace math { template - ``__sf_result`` airy_ai_prime(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x); template - ``__sf_result`` airy_ai_prime(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x, const Policy&); }} // namespaces @@ -186,10 +186,10 @@ This function is implemented in terms of the Bessel functions using the relation namespace boost { namespace math { template - ``__sf_result`` airy_bi_prime(T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x); template - ``__sf_result`` airy_bi_prime(T x, const Policy&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x, const Policy&); }} // namespaces @@ -242,23 +242,23 @@ by providing an output iterator. The signature of the single value functions are: template - T airy_ai_zero( + BOOST_MATH_GPU_ENABLED T airy_ai_zero( int m); // 1-based index of zero. template - T airy_bi_zero( + BOOST_MATH_GPU_ENABLED T airy_bi_zero( int m); // 1-based index of zero. and for multiple zeros: template - OutputIterator airy_ai_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero( int start_index, // 1-based index of first zero. unsigned number_of_zeros, // How many zeros to generate. OutputIterator out_it); // Destination for zeros. template - OutputIterator airy_bi_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero( int start_index, // 1-based index of zero. unsigned number_of_zeros, // How many zeros to generate OutputIterator out_it); // Destination for zeros. @@ -266,25 +266,25 @@ and for multiple zeros: There are also versions which allow control of the __policy_section for error handling and precision. template - T airy_ai_zero( + BOOST_MATH_GPU_ENABLED T airy_ai_zero( int m, // 1-based index of zero. const Policy&); // Policy to use. template - T airy_bi_zero( + BOOST_MATH_GPU_ENABLED T airy_bi_zero( int m, // 1-based index of zero. const Policy&); // Policy to use. 
    template <class OutputIterator, class Policy>
-   OutputIterator airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
                         int start_index, // 1-based index of first zero.
                         unsigned number_of_zeros, // How many zeros to generate.
                         OutputIterator out_it, // Destination for zeros.
                         const Policy& pol); // Policy to use.
 
    template <class OutputIterator, class Policy>
-   OutputIterator airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
                         int start_index, // 1-based index of zero.
                         unsigned number_of_zeros, // How many zeros to generate.
                         OutputIterator out_it, // Destination for zeros.
diff --git a/doc/sf/expint.qbk b/doc/sf/expint.qbk
index 89554730d..f0abf090e 100644
--- a/doc/sf/expint.qbk
+++ b/doc/sf/expint.qbk
@@ -11,10 +11,10 @@
    namespace boost{ namespace math{
 
    template <class T>
-   ``__sf_result`` expint(unsigned n, T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
 
    }} // namespaces
 
@@ -26,10 +26,10 @@ the return type is `double` if T is an integer type, and T otherwise.
 
 [h4 Description]
 
    template <class T>
-   ``__sf_result`` expint(unsigned n, T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
 
 Returns the [@http://mathworld.wolfram.com/En-Function.html exponential integral En] of z:
 
@@ -100,10 +100,10 @@ is used.
 
    namespace boost{ namespace math{
 
    template <class T>
-   ``__sf_result`` expint(T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
 
    }} // namespaces
 
@@ -115,10 +115,10 @@ the return type is `double` if T is an integer type, and T otherwise.
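A minimal host-side usage sketch for the two `expint` overloads documented above (illustrative only, not part of this patch, and assuming nothing beyond the signatures shown):

    #include <boost/math/special_functions/expint.hpp>
    #include <iostream>

    int main()
    {
        // En(z): exponential integral of order n, here E2(0.5)
        double En = boost::math::expint(2, 0.5);
        // Ei(z): the one-argument exponential integral, here Ei(0.5)
        double Ei = boost::math::expint(0.5);
        std::cout << En << ' ' << Ei << '\n';
    }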
 [h4 Description]
 
    template <class T>
-   ``__sf_result`` expint(T z);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` expint(T z, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
 
 Returns the [@http://mathworld.wolfram.com/ExponentialIntegral.html exponential integral] of z:
 
diff --git a/doc/sf/gegenbauer.qbk b/doc/sf/gegenbauer.qbk
index a6afc53d8..69671917c 100644
--- a/doc/sf/gegenbauer.qbk
+++ b/doc/sf/gegenbauer.qbk
@@ -16,13 +16,13 @@
    namespace boost{ namespace math{
 
    template <typename Real>
-   Real gegenbauer(unsigned n, Real lambda, Real x);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x);
 
    template <typename Real>
-   Real gegenbauer_prime(unsigned n, Real lambda, Real x);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x);
 
    template <typename Real>
-   Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
+   BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
 
    }} // namespaces
 
diff --git a/doc/sf/hankel.qbk b/doc/sf/hankel.qbk
index 4d8a5eda1..05d65201b 100644
--- a/doc/sf/hankel.qbk
+++ b/doc/sf/hankel.qbk
@@ -3,18 +3,36 @@
 
 [h4 Synopsis]
 
+   #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
+
    template <class T1, class T2>
-   std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
 
    template <class T1, class T2>
-   std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #else // When using cuda we use namespace cuda::std:: instead of std::
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
+
+   template <class T1, class T2>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
+
+   template <class T1, class T2, class ``__Policy``>
+   BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
+
+   #endif
+
 
 [h4 Description]
 
@@ -77,18 +95,35 @@ routines for integer order are used.
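A minimal host-side sketch of the cyl_hankel functions above (illustrative only, not part of this patch; per the synopsis, on a CUDA device the result type is cuda::std::complex rather than std::complex). The components are printed individually, which also works for cuda::std::complex:

    #include <boost/math/special_functions/hankel.hpp>
    #include <complex>
    #include <iostream>

    int main()
    {
        // H1_v(x) = J_v(x) + i Y_v(x) and H2_v(x) = J_v(x) - i Y_v(x)
        std::complex<double> h1 = boost::math::cyl_hankel_1(0.5, 2.0);
        std::complex<double> h2 = boost::math::cyl_hankel_2(0.5, 2.0);
        std::cout << h1.real() << ", " << h1.imag() << '\n'
                  << h2.real() << ", " << h2.imag() << '\n';
    }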
[h4 Synopsis] + #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) + template - std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x); template - std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&); template - std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x); + + template + BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&); + #else // When using cuda we use namespace cuda::std:: instead of std:: + + template + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x); + template - std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&); + + template + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x); + template + BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&); + + #endif [h4 Description] From 79c8b49894597eed9a6cb8b1607950fdfccd7e7c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 13:12:37 -0400 Subject: [PATCH 19/26] Fix writing cuda::std::complex to stdout --- test/test_cyl_hankel_1_double.cu | 4 ++-- test/test_cyl_hankel_1_float.cu | 4 ++-- test/test_cyl_hankel_1_nvrtc_double.cpp | 4 ++-- test/test_cyl_hankel_1_nvrtc_float.cpp | 4 ++-- test/test_cyl_hankel_2_double.cu | 4 ++-- test/test_cyl_hankel_2_float.cu | 4 ++-- test/test_cyl_hankel_2_nvrtc_double.cpp | 4 ++-- test/test_cyl_hankel_2_nvrtc_float.cpp | 4 ++-- test/test_sph_hankel_1_double.cu | 4 ++-- test/test_sph_hankel_1_float.cu | 4 ++-- test/test_sph_hankel_1_nvrtc_double.cpp | 4 ++-- test/test_sph_hankel_1_nvrtc_float.cpp | 4 ++-- test/test_sph_hankel_2_double.cu | 4 ++-- test/test_sph_hankel_2_float.cu | 4 ++-- test/test_sph_hankel_2_nvrtc_double.cpp | 4 ++-- test/test_sph_hankel_2_nvrtc_float.cpp | 4 ++-- 16 files changed, 32 insertions(+), 32 deletions(-) diff --git a/test/test_cyl_hankel_1_double.cu b/test/test_cyl_hankel_1_double.cu index b718521c6..134946934 100644 --- a/test/test_cyl_hankel_1_double.cu +++ b/test/test_cyl_hankel_1_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_1_float.cu b/test/test_cyl_hankel_1_float.cu index 794d6d763..da78c375c 100644 --- a/test/test_cyl_hankel_1_float.cu +++ b/test/test_cyl_hankel_1_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_1_nvrtc_double.cpp b/test/test_cyl_hankel_1_nvrtc_double.cpp 
index 01f68ecb5..298436d06 100644 --- a/test/test_cyl_hankel_1_nvrtc_double.cpp +++ b/test/test_cyl_hankel_1_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_cyl_hankel_1_nvrtc_float.cpp b/test/test_cyl_hankel_1_nvrtc_float.cpp index 68fb928c2..d505c7bc4 100644 --- a/test/test_cyl_hankel_1_nvrtc_float.cpp +++ b/test/test_cyl_hankel_1_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_cyl_hankel_2_double.cu b/test/test_cyl_hankel_2_double.cu index 8218b5347..55b643173 100644 --- a/test/test_cyl_hankel_2_double.cu +++ b/test/test_cyl_hankel_2_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_2_float.cu b/test/test_cyl_hankel_2_float.cu index f00174b47..5766ebeb4 100644 --- a/test/test_cyl_hankel_2_float.cu +++ b/test/test_cyl_hankel_2_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_cyl_hankel_2_nvrtc_double.cpp b/test/test_cyl_hankel_2_nvrtc_double.cpp index 188d93bbe..f7589d201 100644 --- a/test/test_cyl_hankel_2_nvrtc_double.cpp +++ b/test/test_cyl_hankel_2_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_cyl_hankel_2_nvrtc_float.cpp b/test/test_cyl_hankel_2_nvrtc_float.cpp index 9349f3c0d..54216d39c 100644 --- a/test/test_cyl_hankel_2_nvrtc_float.cpp +++ b/test/test_cyl_hankel_2_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << 
"\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_1_double.cu b/test/test_sph_hankel_1_double.cu index 3d8473e22..ea9ec2306 100644 --- a/test/test_sph_hankel_1_double.cu +++ b/test/test_sph_hankel_1_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_1_float.cu b/test/test_sph_hankel_1_float.cu index 9cf17ffb2..4b01fe02a 100644 --- a/test/test_sph_hankel_1_float.cu +++ b/test/test_sph_hankel_1_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_1_nvrtc_double.cpp b/test/test_sph_hankel_1_nvrtc_double.cpp index 1e6863c55..3ff1da8b2 100644 --- a/test/test_sph_hankel_1_nvrtc_double.cpp +++ b/test/test_sph_hankel_1_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_1_nvrtc_float.cpp b/test/test_sph_hankel_1_nvrtc_float.cpp index 28346d284..0b0796653 100644 --- a/test/test_sph_hankel_1_nvrtc_float.cpp +++ b/test/test_sph_hankel_1_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_2_double.cu b/test/test_sph_hankel_2_double.cu index 6c7a032f6..6631f73a0 100644 --- a/test/test_sph_hankel_2_double.cu +++ b/test/test_sph_hankel_2_double.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_2_float.cu b/test/test_sph_hankel_2_float.cu index bcf41e797..1910aef04 100644 --- a/test/test_sph_hankel_2_float.cu +++ 
b/test/test_sph_hankel_2_float.cu @@ -96,8 +96,8 @@ int main(void) if (eps > 10) { std::cerr << "Result verification failed at element " << i << "!\n" - << "Device: " << output_vector[i] - << "\n Host: " << results[i] + << "Device: " << output_vector[i].real() << ", " << output_vector[i].imag() + << "\n Host: " << results[i].real() << ", " << results[i].imag() << "\n Eps: " << eps << std::endl; ++failure_counter; if (failure_counter > 100) diff --git a/test/test_sph_hankel_2_nvrtc_double.cpp b/test/test_sph_hankel_2_nvrtc_double.cpp index 17b8c7ff0..fa57fcbb1 100644 --- a/test/test_sph_hankel_2_nvrtc_double.cpp +++ b/test/test_sph_hankel_2_nvrtc_double.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) diff --git a/test/test_sph_hankel_2_nvrtc_float.cpp b/test/test_sph_hankel_2_nvrtc_float.cpp index 038e215d9..be6fd0d09 100644 --- a/test/test_sph_hankel_2_nvrtc_float.cpp +++ b/test/test_sph_hankel_2_nvrtc_float.cpp @@ -160,8 +160,8 @@ int main() if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300) { std::cout << "error at line: " << i - << "\nParallel: " << h_out[i] - << "\n Serial: " << res + << "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag() + << "\n Serial: " << res.real() << ", " << res.imag() << "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl; ++fail_counter; if (fail_counter > 100) From f10f18a27cca678e74913b1bd4d9c2f8b68171e1 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 13:59:19 -0400 Subject: [PATCH 20/26] Add GPU support to hermite --- include/boost/math/special_functions/hermite.hpp | 14 ++++++++------ include/boost/math/special_functions/math_fwd.hpp | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/boost/math/special_functions/hermite.hpp b/include/boost/math/special_functions/hermite.hpp index 81ccb2ac6..3d77fc03e 100644 --- a/include/boost/math/special_functions/hermite.hpp +++ b/include/boost/math/special_functions/hermite.hpp @@ -1,5 +1,6 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -11,8 +12,9 @@ #pragma once #endif -#include #include +#include +#include #include namespace boost{ @@ -20,7 +22,7 @@ namespace math{ // Recurrence relation for Hermite polynomials: template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1) { using promoted_type = tools::promote_args_t; @@ -31,7 +33,7 @@ namespace detail{ // Implement Hermite polynomials via recurrence: template -T hermite_imp(unsigned n, T x) +BOOST_MATH_GPU_ENABLED T hermite_imp(unsigned n, T x) { T p0 = 1; T p1 = 2 * x; @@ -43,7 +45,7 @@ T hermite_imp(unsigned n, T x) while(c < n) { - std::swap(p0, p1); + BOOST_MATH_GPU_SAFE_SWAP(p0, p1); p1 = static_cast(hermite_next(c, x, p0, p1)); ++c; } @@ -53,7 +55,7 @@ T hermite_imp(unsigned n, T x) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hermite(unsigned n, T x, const Policy&) { typedef typename tools::promote_args::type result_type; @@ -62,7 +64,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hermite(unsigned n, T x) { return boost::math::hermite(n, x, policies::policy<>()); diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index e989f5b18..91dde74cc 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -331,15 +331,15 @@ namespace boost laguerre(unsigned n, T1 m, T2 x); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hermite(unsigned n, T x); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hermite(unsigned n, T x, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1); template @@ -1393,7 +1393,7 @@ namespace boost laguerre(unsigned n, T1 m, T2 x) { return ::boost::math::laguerre(n, m, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ hermite(unsigned n, T x){ return ::boost::math::hermite(n, x, Policy()); }\ \ using boost::math::hermite_next;\ From acb686132023fd3c2639994d4fd23cdffc8b2440 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 13:59:27 -0400 Subject: [PATCH 21/26] Add SYCL testing of hermite --- test/sycl_jamfile | 2 ++ test/test_hermite.cpp | 5 ++++- test/test_hermite.hpp | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 991d34520..582eaea40 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -91,3 +91,5 @@ run test_gamma.cpp ; run test_igamma.cpp ; run test_igamma_inv.cpp ; run test_igamma_inva.cpp ; + +run test_hermite.cpp ; diff --git a/test/test_hermite.cpp b/test/test_hermite.cpp index d1127feec..c7dd8b8c4 100644 --- a/test/test_hermite.cpp +++ b/test/test_hermite.cpp @@ -5,8 +5,11 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include -#include"test_hermite.hpp" +#endif + +#include "test_hermite.hpp" // // DESCRIPTION: diff --git a/test/test_hermite.hpp b/test/test_hermite.hpp index 0b00677ee..8f7c55ff1 100644 --- a/test/test_hermite.hpp +++ b/test/test_hermite.hpp @@ -11,11 +11,17 @@ // Constants are too big for float case, but this doesn't matter for test. #endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include #include #include #include "functor.hpp" From 9f05a5288a5c3991514960847a10f26fdaf2b4f8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 14:11:31 -0400 Subject: [PATCH 22/26] Add CUDA testing of hermite --- test/cuda_jamfile | 3 + test/test_hermite_double.cu | 120 ++++++++++++++++++++++++++++++++++++ test/test_hermite_float.cu | 120 ++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+) create mode 100644 test/test_hermite_double.cu create mode 100644 test/test_hermite_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 32f87f44e..d72501559 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -357,6 +357,9 @@ run test_expm1_float.cu ; run test_gegenbauer_double.cu ; run test_gegenbauer_float.cu ; +run test_hermite_double.cu ; +run test_hermite_float.cu ; + run test_lgamma_double.cu ; run test_lgamma_float.cu ; run test_tgamma_double.cu ; diff --git a/test/test_hermite_double.cu b/test/test_hermite_double.cu new file mode 100644 index 000000000..a53766171 --- /dev/null +++ b/test/test_hermite_double.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hermite.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::hermite(1U, in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::hermite(1U, input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_hermite_float.cu b/test/test_hermite_float.cu
new file mode 100644
index 000000000..c48560bbe
--- /dev/null
+++ b/test/test_hermite_float.cu
@@ -0,0 +1,120 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/hermite.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::hermite(1U, in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!"
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::hermite(1U, input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 677e699cdc508a8ede4d197990db8a663fece48c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 17 Sep 2024 14:16:04 -0400 Subject: [PATCH 23/26] Add NVRTC testing of hermite --- test/nvrtc_jamfile | 3 + test/test_hermite_nvrtc_double.cpp | 190 +++++++++++++++++++++++++++++ test/test_hermite_nvrtc_float.cpp | 190 +++++++++++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 test/test_hermite_nvrtc_double.cpp create mode 100644 test/test_hermite_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 734c87ad4..e049a24d2 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -364,6 +364,9 @@ run test_tgamma_ratio_nvrtc_float.cpp ; run test_gegenbauer_nvrtc_double.cpp ; run test_gegenbauer_nvrtc_float.cpp ; +run test_hermite_nvrtc_double.cpp ; +run test_hermite_nvrtc_float.cpp ; + run test_log1p_nvrtc_double.cpp ; run test_log1p_nvrtc_float.cpp ; diff --git a/test/test_hermite_nvrtc_double.cpp b/test/test_hermite_nvrtc_double.cpp new file mode 100644 index 000000000..569d975cb --- /dev/null +++ b/test/test_hermite_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_hermite_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::hermite(1U, h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_hermite_nvrtc_float.cpp b/test/test_hermite_nvrtc_float.cpp new file mode 100644 index 000000000..e2e907c51 --- /dev/null +++ b/test/test_hermite_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::hermite(1U, in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_hermite_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::hermite(1U, h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From cecd0e2cd85fcc9444f160783f0a2b321d3894ad Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Sep 2024 14:16:41 -0400
Subject: [PATCH 24/26] Add markers to hermite docs

---
 doc/sf/hermite.qbk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/sf/hermite.qbk b/doc/sf/hermite.qbk
index c88aadc34..965aa8092 100644
--- a/doc/sf/hermite.qbk
+++ b/doc/sf/hermite.qbk
@@ -9,13 +9,13 @@
    namespace boost{ namespace math{
 
    template <class T>
-   ``__sf_result`` hermite(unsigned n, T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
 
    }} // namespaces
 
@@ -26,10 +26,10 @@ note that when there is a single template argument the result is the same type as that argument or `double` if the template argument is an integer type.
    template <class T>
-   ``__sf_result`` hermite(unsigned n, T x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
 
    template <class T, class ``__Policy``>
-   ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
 
 Returns the value of the Hermite Polynomial of order /n/ at point /x/:
 
@@ -43,7 +43,7 @@ Hermite Polynomials:
 
 [graph hermite]
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
 
 Implements the three term recurrence relation for the Hermite
 polynomials, this function can be used to create a sequence of

From 11529ca0e30e6a3955acc7d2deef5f9e9bdbbfa4 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 18 Sep 2024 09:59:26 -0400
Subject: [PATCH 25/26] Remove using statements for complex functions

---
 include/boost/math/tools/complex.hpp | 60 ----------
 1 file changed, 60 deletions(-)

diff --git a/include/boost/math/tools/complex.hpp b/include/boost/math/tools/complex.hpp
index 88db8e8a7..ec5144011 100644
--- a/include/boost/math/tools/complex.hpp
+++ b/include/boost/math/tools/complex.hpp
@@ -24,36 +24,6 @@ namespace math {
 
 template <typename T> using complex = cuda::std::complex<T>;
 
-using cuda::std::real;
-using cuda::std::imag;
-using cuda::std::abs;
-using cuda::std::arg;
-using cuda::std::norm;
-using cuda::std::conj;
-using cuda::std::proj;
-using cuda::std::polar;
-
-using cuda::std::exp;
-using cuda::std::log;
-using cuda::std::log10;
-
-using cuda::std::pow;
-using cuda::std::sqrt;
-
-using cuda::std::sin;
-using cuda::std::cos;
-using cuda::std::tan;
-using cuda::std::asin;
-using cuda::std::acos;
-using cuda::std::atan;
-
-using cuda::std::sinh;
-using cuda::std::cosh;
-using cuda::std::tanh;
-using cuda::std::asinh;
-using cuda::std::acosh;
-using cuda::std::atanh;
-
 } // namespace math
 } // namespace boost
 
@@ -68,36 +38,6 @@ namespace math {
 
 template <typename T> using complex = std::complex<T>;
 
-using std::real;
-using std::imag;
-using std::abs;
-using std::arg;
-using std::norm;
-using std::conj;
-using std::proj;
-using std::polar;
-
-using std::exp;
-using std::log;
-using std::log10;
-
-using std::pow;
-using std::sqrt;
-
-using std::sin;
-using std::cos;
-using std::tan;
-using std::asin;
-using std::acos;
-using std::atan;
-
-using std::sinh;
-using std::cosh;
-using std::tanh;
-using std::asinh;
-using std::acosh;
-using std::atanh;
-
 } // namespace math
 } // namespace boost
 

From c0046200fc92955654608dececc31757a351aabb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 18 Sep 2024 12:51:00 -0400
Subject: [PATCH 26/26] Fix overflow policy definition location on MS platforms

---
 test/test_hermite.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_hermite.cpp b/test/test_hermite.cpp
index c7dd8b8c4..60dafdb8f 100644
--- a/test/test_hermite.cpp
+++ b/test/test_hermite.cpp
@@ -9,6 +9,10 @@
 #include <pch_light.hpp>
 #endif
 
+#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#endif
+
 #include "test_hermite.hpp"
 
 //
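To close, a brief illustration of the three-term recurrence that `hermite_next` implements, mirroring the `hermite_imp` loop added in PATCH 20 (an editorial sketch, not part of the patch series; it assumes only `hermite` and `hermite_next` as declared in this series):

    #include <boost/math/special_functions/hermite.hpp>
    #include <iostream>

    int main()
    {
        double x  = 0.75;
        double p0 = boost::math::hermite(0, x);  // H_0(x) = 1
        double p1 = boost::math::hermite(1, x);  // H_1(x) = 2x
        // hermite_next(n, x, Hn, Hnm1) returns H_{n+1}(x) = 2x*H_n(x) - 2n*H_{n-1}(x)
        for (unsigned n = 1; n < 5; ++n)
        {
            double next = boost::math::hermite_next(n, x, p1, p0);
            p0 = p1;
            p1 = next;
        }
        std::cout << "H_5(0.75) = " << p1 << '\n';  // agrees with boost::math::hermite(5, 0.75)
    }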