diff --git a/libcudacxx/include/cuda/__floating_point/cast.h b/libcudacxx/include/cuda/__floating_point/cast.h
new file mode 100644
index 00000000000..dc325b9e035
--- /dev/null
+++ b/libcudacxx/include/cuda/__floating_point/cast.h
@@ -0,0 +1,486 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___FLOATING_POINT_CAST_H
+#define _CUDA___FLOATING_POINT_CAST_H
+
+// NOTE(review): every `#include <...>` target and several `template <...>` /
+// `static_cast<...>` argument lists in this hunk were lost during extraction
+// (angle-bracket contents stripped). The targets below are a best-effort
+// reconstruction — verify against the original patch.
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_STD_VER >= 2017
+
+# include <cuda/__floating_point/config.h>
+# include <cuda/__floating_point/fp.h>
+# include <cuda/std/__concepts/concept_macros.h>
+# include <cuda/std/__type_traits/is_constant_evaluated.h>
+# include <cuda/std/__type_traits/is_integral.h>
+# include <cuda/std/__type_traits/is_same.h>
+# include <cuda/std/cstdint>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// Dispatcher for conversions between the extended floating point types,
+// the native types and integral types. Device paths use native PTX `cvt`
+// where an instruction exists; everything else falls through to the
+// (currently stubbed) generic software path.
+struct __fp_cast
+{
+  // Software fallback. Currently a stub: asserts and returns a
+  // zero-initialized destination; the intended algorithm is sketched in the
+  // commented-out code below.
+  template <class _Tp, class _Up>
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __cast_generic(const _Up& __src)
+  {
+    _CCCL_ASSERT(false, "Unsupported floating point cast");
+    // todo: implement generic cast
+    // - fp -> fp
+    // - fp -> integral
+    // - integral -> fp
+
+    _Tp __dst{};
+
+    // Copy sign
+    // if constexpr (_Tp::__is_signed && _Up::__is_signed)
+    // {
+    //   __dst.__set_sign(__src.__get_sign());
+    // }
+    // else if constexpr (!_Tp::__is_signed && _Up::__is_signed)
+    // {
+    //   if (__src.__get_sign())
+    //   {
+    //     return _Tp::__nan();
+    //   }
+    // }
+
+    // using _Sp =
+    //   _CUDA_VSTD::make_signed_t<_CUDA_VSTD::common_type_t<typename _Tp::__storage_type, typename _Up::__storage_type>>;
+
+    // // Convert exponent
+    // constexpr _Sp __src_exp_bias = _Up::__exp_val_mask() / 2;
+    // constexpr _Sp __dst_exp_bias = _Tp::__exp_val_mask() / 2;
+
+    // _Sp __dst_exp = static_cast<_Sp>(__src.__get_exp()) - __src_exp_bias + __dst_exp_bias;
+
+    // if (__dst_exp >= static_cast<_Sp>(_Tp::__exp_val_mask()))
+    // {
+    //   return _Tp::__inf();
+    // }
+
+    // __dst.__set_exp(static_cast<typename _Tp::__storage_type>(__dst_exp));
+
+    // // Convert mantissa (todo: implement rounding)
+    // constexpr ptrdiff_t __mant_diff =
+    //   static_cast<ptrdiff_t>(_Tp::__mant_nbits) - static_cast<ptrdiff_t>(_Up::__mant_nbits);
+
+    // _Sp __dst_mant{};
+
+    // if constexpr (__mant_diff < 0)
+    // {
+    //   __dst_mant = static_cast<_Sp>(__src.__get_mant()) >> (-__mant_diff);
+    // }
+    // else
+    // {
+    //   __dst_mant = static_cast<_Sp>(__src.__get_mant()) << __mant_diff;
+    // }
+
+    // __dst.__set_mant(static_cast<typename _Tp::__storage_type>(__dst_mant));
+
+    return __dst;
+  }
+
+  // fp16 -> {fp32, fp64, bf16, integral} via native PTX cvt.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __fp16_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    fp16 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp32>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.f32.f16 %0, %1;" : "=r"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp64>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.f64.f16 %0, %1;" : "=l"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.bf16.f16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_signed_v<_Tp>)
+    {
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        // PTX has no 8-bit register class: convert through a 16-bit temporary.
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (int16_t __ret; asm("cvt.rni.s8.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.s16.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.s32.f16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.s64.f16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_unsigned_v<_Tp>)
+    {
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (uint16_t __ret; asm("cvt.rni.u8.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.u16.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.u32.f16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.u64.f16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // fp32 -> {fp16, bf16} via native PTX cvt.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __fp32_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    fp32 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.rn.f16.f32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__fp_val.__storage_)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.f32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__fp_val.__storage_)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // fp64 -> {fp16, bf16} via native PTX cvt.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __fp64_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    fp64 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.rn.f16.f64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__fp_val.__storage_)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.f64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__fp_val.__storage_)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // bf16 -> {fp16, fp32, fp64, integral}; all bf16 cvt forms require SM90+
+  // and PTX ISA 7.8.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __bf16_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    bf16 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+# if __cccl_ptx_isa >= 780
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.f16.bf16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp32>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.f32.bf16 %0, %1;" : "=r"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp64>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.f64.bf16 %0, %1;" : "=l"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_signed_v<_Tp>)
+    {
+      // NOTE(review): the original hunk omitted the integer-rounding modifier
+      // (`.rni`) on these float-to-int conversions; PTX requires one (the
+      // fp16 paths above use `.rni`), so it is added here.
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (int16_t __ret; asm("cvt.rni.s8.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.s32.bf16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_unsigned_v<_Tp>)
+    {
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (uint16_t __ret; asm("cvt.rni.u8.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.u32.bf16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // int8 -> {fp16, bf16}. 8-bit sources are widened to 16-bit registers for
+  // the "h" asm constraint (PTX has no 8-bit register class).
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 1))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(
+        NV_IS_DEVICE,
+        (asm("cvt.rn.f16.s8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<int16_t>(__val))); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      // NOTE(review): original passed the raw 8-bit value to an "h" (16-bit)
+      // constraint; widened explicitly to match the fp16 path above.
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<int16_t>(__val)));
+                    return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // int16 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 2))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // int32 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 4))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // int64 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 8))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint8 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 1))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(
+        NV_IS_DEVICE,
+        (asm("cvt.rn.f16.u8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<uint16_t>(__val))); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      // NOTE(review): original used an "r" (32-bit) constraint on the 8-bit
+      // value; normalized to the widened "h" pattern used by the fp16 path.
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<uint16_t>(__val)));
+                    return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint16 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 2))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint32 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 4))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint64 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 8))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+public:
+  // Entry point. At runtime on device, dispatch to the native PTX path;
+  // otherwise (host, or constant evaluation) fall back to the generic path.
+  template <class _Tp, class _Up>
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __cast(const _Up& __src) noexcept
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (return __cast_impl_device<_Tp>(__src);))
+    }
+
+    return __cast_generic<_Tp>(__src);
+  }
+};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CCCL_STD_VER >= 2017
+
+#endif // _CUDA___FLOATING_POINT_CAST_H
diff --git a/libcudacxx/include/cuda/__floating_point/config.h b/libcudacxx/include/cuda/__floating_point/config.h
new file mode 100644
index 00000000000..440526dd476
--- /dev/null
+++ b/libcudacxx/include/cuda/__floating_point/config.h
@@ -0,0 +1,526 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___FLOATING_POINT_CONFIG_H
+#define _CUDA___FLOATING_POINT_CONFIG_H
+
+// NOTE(review): include targets below were stripped during extraction and
+// are reconstructed best-effort — verify against the original patch.
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_STD_VER >= 2017
+
+# include <cuda/std/__bit/integral.h>
+# include <cuda/std/__concepts/concept_macros.h>
+# include <cuda/std/__type_traits/is_same.h>
+# include <cuda/std/__type_traits/remove_cv.h>
+# include <cuda/std/cfloat>
+# include <cuda/std/cstddef>
+
+// Silence the warning about the use of long double in device code
+_CCCL_NV_DIAG_SUPPRESS(20208)
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// _FpConfig is a type that provides the configuration for the __fp type
+// - static constexpr variables:
+// - __exp_nbits: number of bits in the exponent
+// - __mant_nbits: number of bits in the mantissa
+// - __is_signed: whether the floating point type is signed
+// - __has_inf: whether the floating point type has infinity
+// - __has_nan: whether the floating point type has quiet NaN
+// - __has_nans: whether the floating point type has signaling NaN
+// - __has_denorm: whether the
+// floating point type has denormalized values
+// - __native_type: the native type that the floating point type is based on
+// - __is_iec559: whether the floating point type is based on IEC 559
+// - type aliases:
+// - __host native_type: the host native type that the floating point type is based on (if no native type, use
+// __fp_no_native_type_tag)
+// - __device native_type: the device native type that the floating point type is based on (if no native type, use
+// __fp_no_native_type_tag)
+// - static member functions:
+// - __min(): returns the minimum value for the floating point type
+// - __max(): returns the maximum value for the floating point type
+// - __inf(): returns the infinity value for the floating point type (if __has_inf is true)
+// - __nan(): returns the quiet NaN value for the floating point type (if __has_quiet_nan is true)
+// - __nans(): returns the signaling NaN value for the floating point type (if __has_signaling_nan is true)
+// - __is_inf(): returns whether the given value is infinity (if __has_inf is true)
+// - __is_nan(): returns whether the given value is NaN (if __has_nan or __has_nans is true)
+
+// Tag: the format has no native host/device representation.
+struct __fp_no_native_type_tag
+{};
+
+// Tag: the queried type maps to no known floating point configuration.
+struct __fp_invalid_config
+{};
+
+// Tag for constructing an __fp from its native type.
+struct __fp_from_native_t
+{};
+
+_CCCL_INLINE_VAR constexpr __fp_from_native_t __fp_from_native{};
+
+// Tag for constructing an __fp from its raw storage bits.
+struct __fp_from_storage_t
+{};
+
+_CCCL_INLINE_VAR constexpr __fp_from_storage_t __fp_from_storage{};
+
+// Common base for IEC 559 (IEEE 754)-style formats: sign bit, inf, quiet and
+// signaling NaNs, denormals. Value helpers are expressed through the __fp
+// bit-field accessors (__set_exp/__set_mant/...).
+struct __fp_iec559_config_base
+{
+  static constexpr bool __is_signed  = true;
+  static constexpr bool __has_inf    = true;
+  static constexpr bool __has_nan    = true;
+  static constexpr bool __has_nans   = true;
+  static constexpr bool __has_denorm = true;
+  static constexpr bool __is_iec559  = true;
+
+  // Smallest positive normal value: exponent 1, mantissa 0.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    _Tp __ret{};
+    __ret.__set_exp(typename _Tp::__storage_type{1});
+    return __ret;
+  }
+  // Largest finite value: exponent all-ones-minus-one (~1 masked to the
+  // exponent field), mantissa left all ones by __set_exp? -- NOTE(review):
+  // confirm __set_exp masks its argument; the original template argument of
+  // this static_cast was stripped and is reconstructed here.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    _Tp __ret{};
+    __ret.__set_exp(static_cast<typename _Tp::__storage_type>(~typename _Tp::__storage_type{1}));
+    return __ret;
+  }
+  // Positive infinity: exponent all ones, mantissa zero.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __inf()
+  {
+    _Tp __ret{};
+    __ret.__set_exp(_Tp::__exp_val_mask());
+    return __ret;
+  }
+  // Quiet NaN: infinity pattern with the top mantissa bit set.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    _Tp __ret = __inf<_Tp>();
+    __ret.__set_mant(typename _Tp::__storage_type{1} << (_Tp::__mant_nbits - 1));
+    return __ret;
+  }
+  // Signaling NaN: infinity pattern with the second-highest mantissa bit set.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nans()
+  {
+    _Tp __ret = __inf<_Tp>();
+    __ret.__set_mant(typename _Tp::__storage_type{1} << (_Tp::__mant_nbits - 2));
+    return __ret;
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_inf(const _Tp& __val)
+  {
+    return __val.__get_exp() == _Tp::__exp_val_mask() && __val.__get_mant() == typename _Tp::__storage_type{0};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return __val.__get_exp() == _Tp::__exp_val_mask() && __val.__get_mant() != typename _Tp::__storage_type{0};
+  }
+};
+
+// FP4 E2M1 (sign 1, exp 2, mant 1): no inf/NaN.
+struct __fp4_e2m1_config
+{
+  static constexpr size_t __exp_nbits  = 2;
+  static constexpr size_t __mant_nbits = 1;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = false;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // NOTE(review): original said 0x08 — in this 4-bit s1e2m1 layout that is
+    // the sign bit (negative zero), apparently copy-pasted from the e4m3
+    // config. Smallest positive normal is exp=01, mant=0 -> 0b0010.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x02}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0b0111: exponent and mantissa all ones (no inf/NaN encodings).
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x7}};
+  }
+};
+
+// FP6 E2M3 (sign 1, exp 2, mant 3): no inf/NaN.
+struct __fp6_e2m3_config
+{
+  static constexpr size_t __exp_nbits  = 2;
+  static constexpr size_t __mant_nbits = 3;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = false;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // 0b001000: exp=01, mant=000 — smallest positive normal.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x08}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0b011111: exponent and mantissa all ones.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x1f}};
+  }
+};
+
+// FP6 E3M2 (sign 1, exp 3, mant 2): no inf/NaN.
+struct __fp6_e3m2_config
+{
+  static constexpr size_t __exp_nbits  = 3;
+  static constexpr size_t __mant_nbits = 2;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = false;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // NOTE(review): original said 0x08, which in s1e3m2 is exp=010 (not the
+    // smallest normal); smallest positive normal is exp=001, mant=00 ->
+    // 0b000100. Looks copy-pasted from the e2m3/e4m3 configs.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x04}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0b011111: exponent and mantissa all ones.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x1f}};
+  }
+};
+
+// FP8 E4M3: no inf; single NaN encoding 0x7f / 0xff.
+struct __fp8_e4m3_config
+{
+  static constexpr size_t __exp_nbits  = 4;
+  static constexpr size_t __mant_nbits = 3;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = true;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // 0b00001000: exp=0001, mant=000 — smallest positive normal.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x08}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0x7e: largest finite (0x7f is NaN).
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x7e}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    return _Tp{__fp_from_storage, static_cast<typename _Tp::__storage_type>(0x7f)};
+  }
+  // NaN iff all non-sign bits are set.
+  // NOTE(review): cast.h accesses the storage member as `__storage_`; here it
+  // is `__storage` — confirm the member name in fp.h, one spelling is stale.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return (__val.__storage & static_cast<typename _Tp::__storage_type>(0x7f))
+        == static_cast<typename _Tp::__storage_type>(0x7f);
+  }
+};
+
+// FP8 E5M2: IEC 559-style layout (inf + NaNs), no native type.
+struct __fp8_e5m2_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 5;
+  static constexpr size_t __mant_nbits = 2;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+};
+
+// binary16. NOTE(review): this overrides the base's __is_iec559 with `false`
+// even though binary16 is an IEC 559 format — confirm this shadowing is
+// intentional and not a leftover.
+struct __fp16_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 5;
+  static constexpr size_t __mant_nbits = 10;
+  static constexpr bool __is_iec559    = false;
+
+# if __STDCPP_FLOAT16_T__ == 1
+  using __host_native_type = ::std::float16_t;
+# elif _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC, >=, 24, 9)
+  using __host_native_type = _Float16;
+# else
+  using __host_native_type = __fp_no_native_type_tag;
+# endif
+# if _CCCL_CUDA_COMPILER(CLANG, >=, 19) || _CCCL_CUDA_COMPILER(NVHPC, >=, 24, 9)
+  using __device_native_type = _Float16;
+# else
+  using __device_native_type = __fp_no_native_type_tag;
+# endif
+};
+
+// binary32 (float).
+struct __fp32_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 8;
+  static constexpr size_t __mant_nbits = 23;
+
+  using __host_native_type   = float;
+  using __device_native_type = float;
+};
+
+// binary64 (double).
+struct __fp64_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 11;
+  static constexpr size_t __mant_nbits = 52;
+
+  using __host_native_type   = double;
+  using __device_native_type = double;
+};
+
+# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE)
+// Host `long double`; parameters derived from <cfloat>. Host-only.
+struct __fp_long_double_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = _CUDA_VSTD::bit_width(static_cast<size_t>(LDBL_MAX_EXP));
+  static constexpr size_t __mant_nbits = LDBL_MANT_DIG;
+
+  using __host_native_type   = long double;
+  using __device_native_type = __fp_no_native_type_tag;
+};
+# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE
+
+// binary128.
+struct __fp128_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 15;
+  static constexpr size_t __mant_nbits = 112;
+
+# if __STDCPP_FLOAT128_T__ == 1
+  using __host_native_type = ::std::float128_t;
+# elif (defined(__SIZEOF_FLOAT128__) || defined(__FLOAT128__)) && _CCCL_OS(LINUX) \
+   && (_CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)) && !defined(__CUDA_ARCH__) \
+   && !_CCCL_COMPILER(NVRTC)
+  using __host_native_type = __float128;
+# else
+  using __host_native_type = __fp_no_native_type_tag;
+# endif
+  using __device_native_type = __fp_no_native_type_tag;
+};
+
+// bfloat16.
+struct __bf16_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 8;
+  static constexpr size_t __mant_nbits = 7;
+
+# if __STDCPP_BFLOAT16_T__ == 1
+  using __host_native_type = ::std::bfloat16_t;
+# elif _CCCL_COMPILER(GCC, >=, 13) || (_CCCL_COMPILER(CLANG, >=, 15) && _CCCL_ARCH(X86_64)) \
+   || (_CCCL_COMPILER(CLANG, >=, 11) && _CCCL_ARCH(ARM64))
+  using __host_native_type = __bf16;
+# else
+  using __host_native_type = __fp_no_native_type_tag;
+# endif
+# if _CCCL_CUDA_COMPILER(CLANG, >=, 17)
+  using __device_native_type = __bf16;
+# else
+  using __device_native_type = __fp_no_native_type_tag;
+# endif
+};
+
+// FP8 UE4M3: unsigned, no inf, NaN = 0x7f (top bit unused).
+struct __fp8_ue4m3_config
+{
+  static constexpr size_t __exp_nbits  = 4;
+  static constexpr size_t __mant_nbits = 3;
+  static constexpr bool __is_signed    = false;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = true;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // exp=0001, mant=000 — smallest positive normal.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x08}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0x7e: largest finite (0x7f is NaN).
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x7e}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    return _Tp{__fp_from_storage, static_cast<typename _Tp::__storage_type>(0x7f)};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return (__val.__storage & static_cast<typename _Tp::__storage_type>(0x7f))
+        == static_cast<typename _Tp::__storage_type>(0x7f);
+  }
+};
+
+// FP8 UE8M0: unsigned exponent-only scale format; NaN = 0xff.
+struct __fp8_ue8m0_config
+{
+  static constexpr size_t __exp_nbits  = 8;
+  static constexpr size_t __mant_nbits = 0;
+  static constexpr bool __is_signed    = false;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = true;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = false;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x00}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0xff}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    return _Tp{__fp_from_storage, static_cast<typename _Tp::__storage_type>(0xff)};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return __val.__storage == static_cast<typename _Tp::__storage_type>(0xff);
+  }
+};
+
+// Maps a (possibly cv-qualified) type to its floating point configuration,
+// returning __fp_invalid_config{} when the type is not a known fp type.
+// NOTE(review): the `template` parameter lists in this hunk were stripped
+// during extraction and are reconstructed here.
+template <class _Tp>
+_LIBCUDACXX_HIDE_FROM_ABI constexpr auto __fp_make_config_from()
+{
+  using _Up = _CUDA_VSTD::remove_cv_t<_Tp>;
+
+  if constexpr (__is_cuda_extended_floating_point_v<_Up>)
+  {
+    return typename _Up::__config_type{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, float>)
+  {
+    return __fp32_config{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, double>)
+  {
+    return __fp64_config{};
+  }
+# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE)
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, long double>)
+  {
+    return __fp_long_double_config{};
+  }
+# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE
+  // When a config has no native type these compare against
+  // __fp_no_native_type_tag, which never matches a real argument type.
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, typename __fp16_config::__host_native_type>
+                     || _CUDA_VSTD::is_same_v<_Up, typename __fp16_config::__device_native_type>)
+  {
+    return __fp16_config{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, typename __fp128_config::__host_native_type>
+                     || _CUDA_VSTD::is_same_v<_Up, typename __fp128_config::__device_native_type>)
+  {
+    return __fp128_config{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, typename __bf16_config::__host_native_type>
+                     || _CUDA_VSTD::is_same_v<_Up, typename __bf16_config::__device_native_type>)
+  {
+    return __bf16_config{};
+  }
+# if __STDCPP_FLOAT16_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float16_t>)
+  {
+    return __fp16_config{};
+  }
+# endif // __STDCPP_FLOAT16_T__
+# if __STDCPP_FLOAT32_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float32_t>)
+  {
+    return __fp32_config{};
+  }
+# endif // __STDCPP_FLOAT32_T__
+# if __STDCPP_FLOAT64_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float64_t>)
+  {
+    return __fp64_config{};
+  }
+# endif // __STDCPP_FLOAT64_T__
+# if __STDCPP_FLOAT128_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float128_t>)
+  {
+    return __fp128_config{};
+  }
+# endif // __STDCPP_FLOAT128_T__
+# if __STDCPP_BFLOAT16_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::bfloat16_t>)
+  {
+    return __bf16_config{};
+  }
+# endif // __STDCPP_BFLOAT16_T__
+  else
+  {
+    return __fp_invalid_config{};
+  }
+}
+
+template <class _Tp>
+using __fp_make_config_from_t = decltype(::cuda::__fp_make_config_from<_Tp>());
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+_CCCL_NV_DIAG_DEFAULT(20208)
+
+#endif // _CCCL_STD_VER >= 2017
+
+#endif // _CUDA___FLOATING_POINT_CONFIG_H
diff --git a/libcudacxx/include/cuda/__floating_point/conv_rank_order.h b/libcudacxx/include/cuda/__floating_point/conv_rank_order.h
new file mode 100644
index 00000000000..66418dd9b6b
--- /dev/null
+++ b/libcudacxx/include/cuda/__floating_point/conv_rank_order.h
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_CONV_RANK_ORDER_H +#define _CUDA___FLOATING_POINT_CONV_RANK_ORDER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +# include +# include +# include + +enum class __fp_conv_rank_order +{ + __unordered, + __greater, + __equal, + __less, +}; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp_conv_rank_order __fp_make_conv_rank_order() +{ + using _FromConfig = __fp_make_config_from_t<_From>; + using _ToConfig = __fp_make_config_from_t<_To>; + + if constexpr (!_CUDA_VSTD::is_same_v<_FromConfig, __fp_invalid_config> + && !_CUDA_VSTD::is_same_v<_ToConfig, __fp_invalid_config>) + { + if constexpr (_ToConfig::__is_signed == _FromConfig::__is_signed) + { + if constexpr (_ToConfig::__exp_nbits == _FromConfig::__exp_nbits + && _ToConfig::__mant_nbits == _FromConfig::__mant_nbits) + { +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // If the representations are the same, long double always has the higher subrank + if constexpr (_CUDA_VSTD::is_same_v<_ToConfig, __fp_long_double_config> + && !_CUDA_VSTD::is_same_v<_FromConfig, __fp_long_double_config>) + { + return __fp_conv_rank_order::__greater; + } + else if constexpr (!_CUDA_VSTD::is_same_v<_ToConfig, __fp_long_double_config> + && _CUDA_VSTD::is_same_v<_FromConfig, __fp_long_double_config>) + { + return __fp_conv_rank_order::__less; + } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + return __fp_conv_rank_order::__equal; + } + else if constexpr (_ToConfig::__exp_nbits >= _FromConfig::__exp_nbits + && _ToConfig::__mant_nbits >= _FromConfig::__mant_nbits) + { + return __fp_conv_rank_order::__greater; + } + 
else if constexpr (_ToConfig::__exp_nbits <= _FromConfig::__exp_nbits + && _ToConfig::__mant_nbits <= _FromConfig::__mant_nbits) + { + return __fp_conv_rank_order::__less; + } + } + } + + return __fp_conv_rank_order::__unordered; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_CONV_RANK_ORDER_H diff --git a/libcudacxx/include/cuda/__floating_point/fp.h b/libcudacxx/include/cuda/__floating_point/fp.h new file mode 100644 index 00000000000..e872365290e --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/fp.h @@ -0,0 +1,530 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_FP_H +#define _CUDA___FLOATING_POINT_FP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +// Silence the warning about the use of long double in device code +_CCCL_NV_DIAG_SUPPRESS(20208) + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// Signed FP types +using fp4_e2m1 = __fp<__fp4_e2m1_config>; +using fp6_e2m3 = __fp<__fp6_e2m3_config>; +using fp6_e3m2 = __fp<__fp6_e3m2_config>; +using fp8_e4m3 = __fp<__fp8_e4m3_config>; +using fp8_e5m2 = __fp<__fp8_e5m2_config>; +using fp16 = __fp<__fp16_config>; +using bf16 = __fp<__bf16_config>; +using fp32 = __fp<__fp32_config>; +using fp64 = __fp<__fp64_config>; +// # if !defined(_LIBCUDACXX_HAS_NO_INT128) +// using fp128 = __fp<__fp128_config>; +// # endif // !_LIBCUDACXX_HAS_NO_INT128 + +// Unsigned FP types +using fp8_ue4m3 = __fp<__fp8_ue4m3_config>; +using fp8_ue8m0 = __fp<__fp8_ue8m0_config>; + +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(__fp_is_floating_point_v<_Tp>) +__fp(_Tp) -> __fp<__fp_make_config_from_t<_Tp>>; + +template +__fp(__fp_from_native_t, _Tp) -> __fp<__fp_make_config_from_t<_Tp>>; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __fp_cast_is_implicit() +{ + constexpr auto __rank_order = ::cuda::__fp_make_conv_rank_order<_To, _From>(); + + return __rank_order == __fp_conv_rank_order::__equal || __rank_order == __fp_conv_rank_order::__greater; +} + +template +class __fp +{ +public: + template + friend class __fp; + friend _Config; + 
friend struct __fp_cast; + friend struct __fp_ops; + + using __config_type = _Config; + + static constexpr size_t __exp_nbits = __config_type::__exp_nbits; + static constexpr size_t __mant_nbits = __config_type::__mant_nbits; + static constexpr bool __is_signed = __config_type::__is_signed; + static constexpr size_t __nbits = __exp_nbits + __mant_nbits + __is_signed; + static constexpr bool __has_inf = __config_type::__has_inf; + static constexpr bool __has_nan = __config_type::__has_nan; + static constexpr bool __has_nans = __config_type::__has_nans; + static constexpr bool __has_denorm = __config_type::__has_denorm; + + using __storage_type = __fp_storage_t<__nbits>; + using __host_native_type = typename __config_type::__host_native_type; + using __device_native_type = typename __config_type::__device_native_type; + + static constexpr bool __has_host_native_type = !_CUDA_VSTD::is_same_v<__host_native_type, __fp_no_native_type_tag>; + static constexpr bool __has_device_native_type = + !_CUDA_VSTD::is_same_v<__device_native_type, __fp_no_native_type_tag>; + + static_assert(!__has_host_native_type || sizeof(__storage_type) == sizeof(__host_native_type)); + static_assert(!__has_device_native_type || sizeof(__storage_type) == sizeof(__device_native_type)); + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __min() + { + return __config_type::template __min<__fp>(); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __max() + { + return __config_type::template __max<__fp>(); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __lowest() + { + __fp __ret{}; + if constexpr (__is_signed) + { + __ret = __max(); + __ret.__set_sign(true); + } + return __ret; + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __epsilon() + { + // TODO: implement epsilon + return __fp{}; + } + _CCCL_TEMPLATE(bool _HasInf = __has_inf) + _CCCL_REQUIRES(_HasInf) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr 
__fp __inf() + { + return __config_type::template __inf<__fp>(); + } + _CCCL_TEMPLATE(bool _HasNan = __has_nan) + _CCCL_REQUIRES(_HasNan) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __nan() + { + return __config_type::template __nan<__fp>(); + } + _CCCL_TEMPLATE(bool _HasNans = __has_nans) + _CCCL_REQUIRES(_HasNans) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __nans() + { + return __config_type::template __nans<__fp>(); + } + _CCCL_TEMPLATE(bool _HasDenorm = __has_denorm) + _CCCL_REQUIRES(_HasDenorm) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __denorm_min() + { + return __fp{__fp_from_storage, __storage_type{1}}; + } + + __fp() = default; + + __fp(const __fp&) = default; + +# if defined(_CCCL_NO_CONDITIONAL_EXPLICIT) + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND __fp_cast_is_implicit<__fp, _Tp>()) + _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND(!__fp_cast_is_implicit<__fp, _Tp>())) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} +# else // ^^^ _CCCL_NO_CONDITIONAL_EXPLICIT ^^^ / vvv !_CCCL_NO_CONDITIONAL_EXPLICIT vvv + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI explicit(!__fp_cast_is_implicit<__fp, _Tp>()) constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} +# endif // ^^^ !_CCCL_NO_CONDITIONAL_EXPLICIT ^^^ + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Tp>, __config_type>) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(__fp_from_native_t, 
const _Tp& __v) noexcept + : __fp{__fp_from_storage, _CUDA_VSTD::bit_cast<__storage_type>(__v)} + {} + + template + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(__fp_from_storage_t, const _Tp& __v) noexcept + : __storage_{__v} + { + static_assert(_CUDA_VSTD::is_same_v<_Tp, __storage_type>); + } + +# if defined(_CCCL_NO_CONDITIONAL_EXPLICIT) + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND __fp_cast_is_implicit<_Tp, __fp>()) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND(!__fp_cast_is_implicit<_Tp, __fp>())) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } +# else // ^^^ _CCCL_NO_CONDITIONAL_EXPLICIT ^^^ / vvv !_CCCL_NO_CONDITIONAL_EXPLICIT vvv + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI explicit(!__fp_cast_is_implicit<_Tp, __fp>()) constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } +# endif // ^^^ !_CCCL_NO_CONDITIONAL_EXPLICIT ^^^ + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __has_native_type() noexcept + { + NV_IF_ELSE_TARGET(NV_IS_HOST, (return __has_host_native_type;), (return __has_device_native_type;)) + } + + // private: + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __construct_from(const _Tp& __v) noexcept + { +# if defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert(!_CUDA_VSTD::is_same_v<_Tp, long double>, "long double is not supported"); +# endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // todo: improve the implementation + NV_IF_ELSE_TARGET(NV_IS_HOST, (return 
__construct_from_host<_Tp>(__v);), (return __construct_from_device<_Tp>(__v);)) + } +# if !_CCCL_COMPILER(NVRTC) + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_HOST static constexpr __fp __construct_from_host(const _Tp& __v) noexcept + { + if constexpr (__has_host_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return __fp{__fp_from_native, __v}; + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_host_native_type) + { + return __fp{__fp_from_native, + static_cast<__host_native_type>(_CUDA_VSTD::bit_cast(__v))}; + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return __fp{__fp_from_native, static_cast<__host_native_type>(__v)}; + } + } + return _FpCast::template __cast<__fp>(__v); + } +# endif // !_CCCL_COMPILER(NVRTC) + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static constexpr __fp __construct_from_device(const _Tp& __v) noexcept + { + if constexpr (__has_device_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return __fp{__fp_from_native, __v}; + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_device_native_type) + { + return __fp{ + __fp_from_native, + static_cast<__device_native_type>(_CUDA_VSTD::bit_cast(__v))}; + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return __fp{__fp_from_native, static_cast<__device_native_type>(__v)}; + } + } + return _FpCast::template __cast<__fp>(__v); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __cast_to() const noexcept + { + using _Up = _CUDA_VSTD::remove_cv_t<_Tp>; + +# if defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert(!_CUDA_VSTD::is_same_v<_Up, long 
double>, "long double is not supported"); +# endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // todo: improve the implementation + NV_IF_ELSE_TARGET(NV_IS_HOST, (return __cast_to_host<_Up>();), (return __cast_to_device<_Up>();)) + } + +# if !_CCCL_COMPILER(NVRTC) + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_HOST constexpr _Tp __cast_to_host() const noexcept + { + if constexpr (__has_host_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return _CUDA_VSTD::bit_cast<_Tp>(__storage_); + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_host_native_type) + { + return _CUDA_VSTD::bit_cast<_Tp>(static_cast(__host_native())); + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return static_cast<_Tp>(__host_native()); + } + } + return _FpCast::template __cast<_Tp>(*this); + } +# endif // !_CCCL_COMPILER(NVRTC) + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE constexpr _Tp __cast_to_device() const noexcept + { + if constexpr (__has_device_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return _CUDA_VSTD::bit_cast<_Tp>(__storage_); + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_device_native_type) + { + return _CUDA_VSTD::bit_cast<_Tp>(static_cast(__device_native())); + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return static_cast<_Tp>(__device_native()); + } + } + return _FpCast::template __cast<_Tp>(*this); + } + +# if !_CCCL_COMPILER(NVRTC) + _CCCL_TEMPLATE(bool _HasNativeType = __has_host_native_type) + _CCCL_REQUIRES(_HasNativeType) + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_HOST constexpr __host_native_type __host_native() const noexcept + 
{ + return _CUDA_VSTD::bit_cast<__host_native_type>(__storage_); + } +# endif // !_CCCL_COMPILER(NVRTC) + + _CCCL_TEMPLATE(bool _HasNativeType = __has_device_native_type) + _CCCL_REQUIRES(_HasNativeType) + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE constexpr __device_native_type __device_native() const noexcept + { + return _CUDA_VSTD::bit_cast<__device_native_type>(__storage_); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __mask() noexcept + { + return __sign_mask() | __exp_mask() | __mant_mask(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr size_t __sign_shift() noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return (sizeof(__storage_type) * CHAR_BIT) - static_cast(__is_signed); +# else + // return CHAR_BIT - static_cast(__is_signed); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __sign_mask() noexcept + { + return (__is_signed) ? __storage_type(1) << __sign_shift() : 0; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __exp_val_mask() noexcept + { + return ((__storage_type(1) << __exp_nbits) - 1); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr size_t __exp_shift() noexcept + { + return __sign_shift() - __exp_nbits; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __exp_mask() noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>(__exp_val_mask() << __exp_shift()); +# else + // return __exp_val_mask() << (sizeof(__storage_type) * CHAR_BIT - (__exp_nbits + __is_signed)); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __mant_val_mask() noexcept + { + return static_cast<__storage_type>((__storage_type(1) << __mant_nbits) - 1); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr size_t __mant_shift() noexcept + { + return __exp_shift() - __mant_nbits; + } + + 
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __mant_mask() noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>(__mant_val_mask() << __mant_shift()); +# else + // return __mant_val_mask() << (sizeof(__storage_type) * CHAR_BIT - (__mant_nbits + __exp_nbits + __is_signed)); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __get_sign() const noexcept + { + return static_cast(__storage_ & __sign_mask()); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __storage_type __get_exp() const noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>((__storage_ & __exp_mask()) >> __exp_shift()); +# else + // return (__storage_ & __exp_mask()) >> (sizeof(__storage_type) * CHAR_BIT - (__exp_nbits + __is_signed)); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __storage_type __get_mant() const noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>(static_cast<__storage_type>(__storage_ & __mant_mask()) >> __mant_shift()); +# else + // return (__storage_ & __mant_mask()) >> (sizeof(__storage_type) * CHAR_BIT - (__mant_nbits + __exp_nbits + + // __is_signed)); +# endif + } + + _CCCL_TEMPLATE(bool _IsSigned = __is_signed) + _CCCL_REQUIRES(_IsSigned) + _LIBCUDACXX_HIDE_FROM_ABI constexpr void __set_sign(bool __sign) noexcept + { + __storage_ &= ~__sign_mask(); + __storage_ |= static_cast<__storage_type>(__sign) << __sign_shift(); + } + + _LIBCUDACXX_HIDE_FROM_ABI constexpr void __set_exp(__storage_type __exp) noexcept + { + __storage_ &= ~__exp_mask(); +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + __storage_ |= (__exp & __exp_val_mask()) << __exp_shift(); +# else + // __storage_ |= (__exp & ((__storage_type(1) << __exp_nbits) - 1)) << (sizeof(__storage_type) * CHAR_BIT - + // (__exp_nbits + __is_signed)); +# endif + } + + _LIBCUDACXX_HIDE_FROM_ABI constexpr void __set_mant(__storage_type __mant) 
noexcept + { + __storage_ &= ~__mant_mask(); +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + __storage_ |= (__mant & __mant_val_mask()) << __mant_shift(); +# else + // __storage_ |= (__mant & ((__storage_type(1) << __mant_nbits) - 1)) << (sizeof(__storage_type) * CHAR_BIT - + // (__mant_nbits + __exp_nbits + __is_signed)); +# endif + } + + _CCCL_TEMPLATE(bool _HasInf = __has_inf) + _CCCL_REQUIRES(_HasInf) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_inf() const noexcept + { + return __config_type::template __is_inf<__fp>(*this); + } + + _CCCL_TEMPLATE(bool _HasNan = __has_nan) + _CCCL_REQUIRES(_HasNan) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_nan() const noexcept + { + return __config_type::template __is_nan<__fp>(*this); + } + + __storage_type __storage_; +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +_CCCL_NV_DIAG_DEFAULT(20208) + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_FP_H diff --git a/libcudacxx/include/cuda/__floating_point/literals.h b/libcudacxx/include/cuda/__floating_point/literals.h new file mode 100644 index 00000000000..19482995148 --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/literals.h @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_LITERALS_H +#define _CUDA___FLOATING_POINT_LITERALS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +// Silence the warning about the use of long double in device code +_CCCL_NV_DIAG_SUPPRESS(20208) + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +namespace fp_literals +{ + +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp4_e2m1 operator""_fp4_e2m1(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp4_e2m1{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp4_e2m1{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp6_e2m3 operator""_fp6_e2m3(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp6_e2m3{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp6_e2m3{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp6_e3m2 operator""_fp6_e3m2(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp6_e3m2{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp6_e3m2{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp16 operator""_fp16(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp16{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return 
fp16{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_bf16(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return bf16{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return bf16{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_fp32(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp32{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp32{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_fp64(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp64{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp64{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +// # if !defined(_LIBCUDACXX_HAS_NO_INT128) +// _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_fp128(long double __val) noexcept +// { +// NV_IF_ELSE_TARGET(NV_IS_HOST, (return fp128{__val};), (return fp128{static_cast(__val)};)) +// } +// # endif // !_LIBCUDACXX_HAS_NO_INT128 + +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp8_ue4m3 operator""_fp8_ue4m3(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp8_ue4m3{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp8_ue4m3{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp8_ue8m0 operator""_fp8_ue8m0(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp8_ue8m0{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv 
_LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp8_ue8m0{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} + +} // namespace fp_literals + +_LIBCUDACXX_END_NAMESPACE_CUDA + +_CCCL_NV_DIAG_DEFAULT(20208) + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_LITERALS_H diff --git a/libcudacxx/include/cuda/__floating_point/operators.h b/libcudacxx/include/cuda/__floating_point/operators.h new file mode 100644 index 00000000000..c423bb0787e --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/operators.h @@ -0,0 +1,1250 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_OPERATORS_H +#define _CUDA___FLOATING_POINT_OPERATORS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +# if _CCCL_HAS_INCLUDE() +# include +# endif + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto __fp_make_common_type() +{ + constexpr auto __rank_order = __fp_make_conv_rank_order<_Lhs, _Rhs>(); + + if constexpr (__rank_order == __fp_conv_rank_order::__equal) + { + // Extended floating point types have higher subrank, prefer cuda extended types over std extended types + // Fixme: potentially will not work correctly for long double + // auto val = 
1.0f64 + 1.0l; // val will be of type long double, is this right? + if constexpr (__is_standard_floating_point_v<_Lhs> || __is_std_extended_floating_point_v<_Lhs>) + { + return _Rhs{}; + } + else + { + return _Lhs{}; + } + } + else if constexpr (__rank_order == __fp_conv_rank_order::__greater) + { + return _Lhs{}; + } + else if constexpr (__rank_order == __fp_conv_rank_order::__less) + { + return _Rhs{}; + } + else + { + static_assert(__always_false(), "Cannot make a common fp type from the given types"); + _CCCL_UNREACHABLE(); + } +} + +template +using __fp_common_type_t = decltype(__fp_make_common_type<_Lhs, _Rhs>()); + +// Implementations of the arithmetic operations. Usually an operation is in several parts: +// 1. __op() - the entry point for the operation which tries to implement the operation using the host & device native +// types if available, otherwise calls the __op_impl() function. +// +// 2. __op_impl() - the non native implementation of the operation for the given type. This function dispatches to the +// target specific implementation and falls back to the constexpr implementation. +// +// 3. __op_impl_TARGET() - the target specific implementation. If available, implements the operation via asm. +// +// 4. __op_impl_constexpr() - the constexpr implementation of the operation. Slow. +// +// The arguments may be heterogenous. In that case the implementation is chosen in the following order: +// 1. try to use the host & device native types +// 2. try to use host & device native instructions for mixed arithmetic +// 3. 
cast the arguments to the common type and use the implementation for homogeneous arguments +struct __fp_ops +{ + /********************************************************************************************************************/ + // Negation + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static _Tp __neg_impl_constexpr(const _Tp& __src) noexcept + { + auto __ret{__src}; + __ret.__set_sign(!__ret.__get_sign()); + return __ret; + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __neg_impl_device(const _Tp& __src) noexcept + { + [[maybe_unused]] _Tp __ret; + +# if __cccl_ptx_isa >= 600 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (asm("neg.f16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__src.__storage_)); return __ret;)) + } +# endif // __cccl_ptx_isa >= 600 +# if __cccl_ptx_isa >= 700 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_80, + (asm("neg.bf16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__src.__storage_)); return __ret;)) + } +# endif // ^^^ __cccl_ptx_isa < 700 ^^^ + + return __neg_impl_constexpr(__src); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __neg_impl(const _Tp& __src) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __neg_impl_device(__src);)) + } + + return __neg_impl_constexpr(__src); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __neg(const _Tp& __src) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Tp::__has_host_native_type) { return __fp{__fp_from_native, -__src.__host_native()}; } else { + return __neg_impl(__src); + }), + ( + if constexpr (_Tp::__has_device_native_type) { return __fp{__fp_from_native, 
-__src.__device_native()}; } else { + return __neg_impl(__src); + })) + } + + /********************************************************************************************************************/ + // Addition + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __add_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) + static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Addition is not supported for the given type"); + return {}; + } + + // if (__lhs.__is_nan() && __rhs.__is_nan()) + // { + // return _Tp::__nan(); + // } + + // if (__lhs.__is_inf() && __rhs.__is_inf()) + // { + // return (__lhs.__get_sign() == __rhs.__get_sign()) ? 
__lhs : -_Tp::__nan(); + // } + + // auto __lhs_sign = __lhs.__get_sign(); + // auto __lhs_exp = __lhs.__get_exp(); + // auto __lhs_mant = __lhs.__get_mant(); + // auto __rhs_sign = __rhs.__get_sign(); + // auto __rhs_exp = __rhs.__get_exp(); + // auto __rhs_mant = __rhs.__get_mant(); + + // if (__lhs_exp > __rhs_exp) + // { + // __rhs_mant >>= (__lhs_exp - __rhs_exp); + // __rhs_exp = __lhs_exp; + // } + // else if (__rhs_exp > __lhs_exp) + // { + // __lhs_mant >>= (__rhs_exp - __lhs_exp); + // __lhs_exp = __rhs_exp; + // } + + // using _Sp = _CUDA_VSTD::make_signed_t; + + // bool __res_sign{}; + // _Sp __res_exp = __lhs_exp; + // _Sp __res_mant{}; + + // if (__lhs_sign == __rhs_sign) + // { + // __res_mant = __lhs_mant + __rhs_mant; + // __res_sign = __lhs_sign; + // } + // else if (__lhs_mant >= __rhs_mant) + // { + // __res_mant = __lhs_mant - __rhs_mant; + // __res_sign = __lhs_sign; + // } + // else + // { + // __res_mant = __rhs_mant - __lhs_mant; + // __res_sign = __rhs_sign; + // } + + // while (__res_mant > static_cast<_Sp>(_Tp::__mant_val_mask())) + // { + // __res_mant >>= 1; + // __res_exp++; + // } + + // while (__res_mant < (_Sp{1} << (_Tp::__mant_nbits - 1)) && __res_mant != 0) + // { + // __res_mant <<= 1; + // __res_exp--; + // } + + // todo: implement denormalized numbers + + // _Tp __ret{}; + + // if (__res_exp > static_cast<_Sp>(_Tp::__exp_val_mask()) / 2) + // { + // __ret = _Tp::__inf(); + // __ret.__set_sign(__res_sign); + // } + // else if (__res_exp < -static_cast<_Sp>(_Tp::__exp_val_mask()) / 2 + 1) + // { + // __ret = _Tp{}; + // } + // else if (__res_mant == 0) + // { + // __ret.__set_sign(__res_sign); + // __ret.__set_exp(__res_exp); + // __ret.__set_mant(__res_mant); + // } + + // return __ret; + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> + __add_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + [[maybe_unused]] __fp_common_type_t<_Lhs, _Rhs> __ret; + + if 
constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("add.f32.f16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 860 +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("add.f32.bf16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 860 + return __add(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { +# if __cccl_ptx_isa >= 420 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_53, + (asm("add.f16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 420 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, bf16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (asm("add.bf16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 780 + return __add_impl_constexpr(__lhs, __rhs); + } + _CCCL_UNREACHABLE(); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __add_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __add_impl_device(__lhs, __rhs);)) + } + + return __add_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD 
_LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __add(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() + __rhs.__host_native()}; + } else { return __add_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() + __rhs.__device_native()}; + } else { return __add_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Subtraction + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __sub_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) - static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Subtraction is not supported for the given type"); + return {}; + } + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> + __sub_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + [[maybe_unused]] __fp_common_type_t<_Lhs, _Rhs> __ret; + + if constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("sub.f32.f16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // 
__cccl_ptx_isa >= 860 +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("sub.f32.bf16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 860 + return __sub(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { +# if __cccl_ptx_isa >= 420 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_53, + (asm("sub.f16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 420 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, bf16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (asm("sub.bf16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 780 + return __sub_impl_constexpr(__lhs, __rhs); + } + _CCCL_UNREACHABLE(); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __sub_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __sub_impl_device(__lhs, __rhs);)) + } + + return __sub_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __sub(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() - __rhs.__host_native()}; + } else { return __sub_impl(__lhs, __rhs); }), + 
( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() - __rhs.__device_native()}; + } else { return __sub_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Multiplication + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __mul_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) * static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Multiplication is not supported for the given type"); + return {}; + } + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> + __mul_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + [[maybe_unused]] __fp_common_type_t<_Lhs, _Rhs> __ret; + + if constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { + return __mul(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { +# if __cccl_ptx_isa >= 420 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_53, + (asm("mul.f16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 420 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, bf16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (asm("mul.bf16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), 
"h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 780 + return __mul_impl_constexpr(__lhs, __rhs); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __mul_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __mul_impl_device(__lhs, __rhs);)) + } + + return __mul_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __mul(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() * __rhs.__host_native()}; + } else { return __mul_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() * __rhs.__device_native()}; + } else { return __mul_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Division + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __div_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) / static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Division is not supported for the given type"); + return {}; + } + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> 
+ __div_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { + return __div(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { + return __div_impl_constexpr(__lhs, __rhs); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __div_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __div_impl_device(__lhs, __rhs);)) + } + + return __div_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __div(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() / __rhs.__host_native()}; + } else { return __div_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() / __rhs.__device_native()}; + } else { return __div_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Equality + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __eq_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) == (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __eq_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if 
__cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.eq.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.eq.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __eq_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __eq_impl(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + using _CommonFp = __fp_common_type_t<_Tp, _Tp>; + + const auto __clhs = static_cast<_CommonFp>(__lhs); + const auto __crhs = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __eq_impl_device(__clhs, __crhs);)) + } + + return __eq_impl_constexpr(__clhs, __crhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __eq(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __lhs.__host_native() == __rhs.__host_native(); + } else { return __eq_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __lhs.__device_native() == __rhs.__device_native(); + } else { return __eq_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Inequality + 
/********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __neq_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + return !__eq_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __neq_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ne.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ne.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __eq_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __neq(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() != __rhs.__host_native();), + (return __lhs.__device_native() != __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __neq_impl_device(__l, __r);)) + } + + return __neq_impl_constexpr(__l, __r); + } + } + + 
/********************************************************************************************************************/ + // Less than + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __lt_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) < (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __lt_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.lt.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.lt.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __lt_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __lt(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() < __rhs.__host_native();), + (return __lhs.__device_native() < __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if 
(!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __lt_impl_device(__l, __r);)) + } + + return __lt_impl_constexpr(__l, __r); + } + } + + /********************************************************************************************************************/ + // Less than or equal + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __le_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) <= (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __le_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.le.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.le.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __le_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __le(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() <= __rhs.__host_native();), + (return __lhs.__device_native() <= __rhs.__device_native();)) + } + else + { + using _CommonFp 
= __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __le_impl_device(__l, __r);)) + } + + return __le_impl_constexpr(__l, __r); + } + } + + /********************************************************************************************************************/ + // Greater than + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __gt_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) > (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __gt_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.gt.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.gt.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __gt_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __gt(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return 
__lhs.__host_native() > __rhs.__host_native();), + (return __lhs.__device_native() > __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __gt_impl_device(__l, __r);)) + } + + return __gt_impl_constexpr(__l, __r); + } + } + + /********************************************************************************************************************/ + // Greater than or equal + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __ge_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) >= (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __ge_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ge.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ge.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __ge_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __ge(const _Lhs& __lhs, const 
_Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() >= __rhs.__host_native();), + (return __lhs.__device_native() >= __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __ge_impl_device(__l, __r);)) + } + + return __ge_impl_constexpr(__l, __r); + } + } +}; /* NOTE(review): the constexpr comparison fallbacks above order values by raw masked bit pattern; for sign-magnitude encodings this reverses the order of negative values -- confirm against the PTX set.* paths. */ + +/**********************************************************************************************************************/ +// Unary operators +/**********************************************************************************************************************/ +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_Config> operator+(const __fp<_Config>& __src) noexcept +{ + return __src; +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_Config> operator-(const __fp<_Config>& __src) noexcept +{ + static_assert(_Config::__is_signed, "Unary minus is not allowed for unsigned floating point types"); + return __fp_ops::__neg(__src); +} + +/**********************************************************************************************************************/ +// Binary operators +/**********************************************************************************************************************/ +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator+(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__add(__lhs, __rhs)); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator-(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return 
static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__sub(__lhs, __rhs)); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator*(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__mul(__lhs, __rhs)); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator/(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__div(__lhs, __rhs)); +} /* The compound assignments below statically require a non-narrowing round-trip: the common type must be implicitly convertible back to the left operand's type. */ +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator+=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank is not allowed"); + return __lhs = __lhs + __rhs; +} +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator-=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank is not allowed"); + return __lhs = __lhs - __rhs; +} +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator*=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank is not allowed"); + return __lhs = __lhs * __rhs; +} +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator/=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank 
is not allowed"); + return __lhs = __lhs / __rhs; +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator==(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__eq(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator!=(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__neq(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator<(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__lt(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator<=(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__le(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator>(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__gt(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator>=(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__ge(__lhs, __rhs); +} + /* Stamps out the full mixed __fp/_TYPE operator set; _EXSPACE is the execution-space qualifier applied to each operator. */ +# define _LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(_TYPE, _EXSPACE) \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator+(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__add(__lhs, __fp{__rhs})); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator+(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__add(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator-(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__sub(__lhs, __fp{__rhs})); \ + } \ + template \ + 
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator-(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__sub(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator*(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__mul(__lhs, __fp{__rhs})); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator*(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__mul(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator/(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__div(__lhs, __fp{__rhs})); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator/(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__div(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_HIDE_FROM_ABI _EXSPACE constexpr __fp<_LhsConfig>& operator+=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs + __rhs; \ + } \ + template \ + _CCCL_HIDE_FROM_ABI _EXSPACE constexpr __fp<_LhsConfig>& operator-=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs - __rhs; \ + } \ + template \ + _CCCL_HIDE_FROM_ABI 
_EXSPACE constexpr __fp<_LhsConfig>& operator*=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs * __rhs; \ + } \ + template \ + _CCCL_HIDE_FROM_ABI _EXSPACE constexpr __fp<_LhsConfig>& operator/=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs / __rhs; \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator==( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__eq(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator==( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__eq(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator!=( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__neq(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator!=( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__neq(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__lt(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__lt(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<=( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept 
\ + { \ + return __fp_ops::__le(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<=( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__le(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__gt(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__gt(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>=( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__ge(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>=( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__ge(__fp{__lhs}, __rhs); \ + } + +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(float, _CCCL_HOST_DEVICE) +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(double, _CCCL_HOST_DEVICE) +# if !_CCCL_COMPILER(NVRTC) && (!_CCCL_HAS_CUDA_COMPILER || _CCCL_CUDA_COMPILER(NVHPC)) +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(long double, _CCCL_HOST) +# endif // !_CCCL_COMPILER(NVRTC) && (!_CCCL_HAS_CUDA_COMPILER || _CCCL_CUDA_COMPILER(NVHPC)) + +# if __STDCPP_FLOAT16_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float16_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT16_T__ == 1 +# if __STDCPP_FLOAT32_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float32_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT32_T__ == 1 +# if __STDCPP_FLOAT64_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float64_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT64_T__ == 1 +# if __STDCPP_FLOAT128_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float128_t, 
_CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT128_T__ == 1 +# if __STDCPP_BFLOAT16_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::bfloat16_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_BFLOAT16_T__ == 1 + +# undef _LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_OPERATORS_H diff --git a/libcudacxx/include/cuda/__floating_point/storage.h b/libcudacxx/include/cuda/__floating_point/storage.h new file mode 100644 index 00000000000..3804a99b38f --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/storage.h @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_STORAGE_H +#define _CUDA___FLOATING_POINT_STORAGE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto __fp_make_storage_type() +{ + if constexpr (_NBits <= CHAR_BIT) + { + return _CUDA_VSTD::uint8_t{}; + } + else if constexpr (_NBits <= 2 * CHAR_BIT) + { + return _CUDA_VSTD::uint16_t{}; + } + else if constexpr (_NBits <= 4 * CHAR_BIT) + { + return _CUDA_VSTD::uint32_t{}; + } + else if constexpr (_NBits <= 8 * CHAR_BIT) + { + return _CUDA_VSTD::uint64_t{}; + } +# if !defined(_LIBCUDACXX_HAS_NO_INT128) + else if constexpr (_NBits <= 16 * CHAR_BIT) + { + return ::__uint128_t{}; + } +# endif // !_LIBCUDACXX_HAS_NO_INT128 + else + { + static_assert(__always_false<_NBits>(), "Unsupported number of bits for floating point type"); + } +} + +template +using __fp_storage_t = decltype(__fp_make_storage_type<_NBits>()); + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_STORAGE_H diff --git a/libcudacxx/include/cuda/__floating_point/type_traits.h b/libcudacxx/include/cuda/__floating_point/type_traits.h new file mode 100644 index 00000000000..839a821a7ea --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/type_traits.h @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_TYPE_TRAITS_H +#define _CUDA___FLOATING_POINT_TYPE_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +# if _CCCL_HAS_INCLUDE() +# include +# endif + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __always_false() +{ + return false; +} + +template +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = false; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = true; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = true; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = true; + +template +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_v = + __is_standard_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>; + +template +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v = false; + +# if __STDCPP_FLOAT16_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float16_t> = true; +# endif // __STDCPP_FLOAT16_T__ == 1 + +# if __STDCPP_BFLOAT16_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::bfloat16_t> = true; +# endif // __STDCPP_BFLOAT16_T__ == 1 + +# if __STDCPP_FLOAT32_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float32_t> = true; +# endif // __STDCPP_FLOAT32_T__ == 1 
+ +# if __STDCPP_FLOAT64_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float64_t> = true; +# endif // __STDCPP_FLOAT64_T__ == 1 + +# if __STDCPP_FLOAT128_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float128_t> = true; +# endif // __STDCPP_FLOAT128_T__ == 1 + +template +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_v = + __is_std_extended_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>; + +template +_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_impl_v = false; + +template +_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_impl_v<__fp<_Config>> = true; + +template +_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_v = + __is_cuda_extended_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>; + +template +_CCCL_INLINE_VAR constexpr bool __fp_is_floating_point_v = + __is_standard_floating_point_v<_Tp> || __is_std_extended_floating_point_v<_Tp> + || __is_cuda_extended_floating_point_v<_Tp>; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_TYPE_TRAITS_H diff --git a/libcudacxx/include/cuda/__floating_point_ b/libcudacxx/include/cuda/__floating_point_ new file mode 100644 index 00000000000..0008560f24c --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point_ @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_FLOATING_POINT_ +#define _CUDA_FLOATING_POINT_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // _CUDA_FLOATING_POINT_ diff --git a/libcudacxx/include/cuda/__fwd/fp.h b/libcudacxx/include/cuda/__fwd/fp.h new file mode 100644 index 00000000000..dd5137f0552 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/fp.h @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_FP_H +#define _CUDA___FWD_FP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class __fp; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FWD_FP_H diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h index 3642d17de09..fe6c42f77a3 100644 --- a/libcudacxx/include/cuda/std/__bit/countl.h +++ b/libcudacxx/include/cuda/std/__bit/countl.h @@ -23,10 +23,10 @@ #include #include +#include #include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h index e7a2b609abe..9284b53478c 100644 --- a/libcudacxx/include/cuda/std/__bit/countr.h +++ b/libcudacxx/include/cuda/std/__bit/countr.h @@ -22,10 +22,10 @@ #endif // no system header #include +#include #include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index f0186ad9f5f..7da53f440b2 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -22,10 +22,10 @@ #endif // no system header #include +#include #include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/popcount.h b/libcudacxx/include/cuda/std/__bit/popcount.h index 18c8d97dd30..d0d8425165f 100644 --- a/libcudacxx/include/cuda/std/__bit/popcount.h +++ b/libcudacxx/include/cuda/std/__bit/popcount.h @@ -22,10 +22,10 @@ #endif // no system header #include +#include #include #include #include -#include 
_LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/rotate.h b/libcudacxx/include/cuda/std/__bit/rotate.h index bf2c2e5f61a..3ba6391d462 100644 --- a/libcudacxx/include/cuda/std/__bit/rotate.h +++ b/libcudacxx/include/cuda/std/__bit/rotate.h @@ -21,9 +21,9 @@ # pragma system_header #endif // no system header +#include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 5791c41a18f..e8251b8c78e 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -129,6 +129,10 @@ # define _CCCL_NO_VARIABLE_TEMPLATES #endif // _CCCL_STD_VER <= 2011 || __cpp_variable_templates < 201304L +#if !(__cpp_conditional_explicit >= 201806L) +# define _CCCL_NO_CONDITIONAL_EXPLICIT +#endif // !(__cpp_conditional_explicit >= 201806L) + /////////////////////////////////////////////////////////////////////////////// // Conditionally use certain language features depending on availability /////////////////////////////////////////////////////////////////////////////// diff --git a/libcudacxx/include/cuda/std/__limits/extended_floating_point.h b/libcudacxx/include/cuda/std/__limits/extended_floating_point.h new file mode 100644 index 00000000000..de694db68c7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__limits/extended_floating_point.h @@ -0,0 +1,125 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___LIMITS_EXTENDED_FLOATING_POINT_H +#define _LIBCUDACXX___LIMITS_EXTENDED_FLOATING_POINT_H + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class __numeric_limits_impl<::cuda::__fp<_FpConfig>, __numeric_limits_type::__floating_point> +{ +public: + using type = ::cuda::__fp<_FpConfig>; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = type::__is_signed; + static constexpr int digits = type::__mant_nbits; + static constexpr int digits10 = 0; // todo + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type::__min(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type::__max(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type::__lowest(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type::__epsilon(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type{0.5}; + } + + static constexpr int min_exponent = -(1 << (type::__exp_nbits - 1)) + 3; + static constexpr int min_exponent10 = 0; // todo + static constexpr int max_exponent = 1 << (type::__exp_nbits - 1); + static constexpr int max_exponent10 = 0; // todo + + static constexpr bool has_infinity = type::__has_inf; + static constexpr bool has_quiet_NaN = type::__has_nan; + static constexpr bool has_signaling_NaN = 
type::__has_nans; + static constexpr float_denorm_style has_denorm = (type::__has_denorm) ? denorm_present : denorm_absent; + static constexpr bool has_denorm_loss = false; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + if constexpr (has_infinity) + { + return type::__inf(); + } + else + { + return type{}; + } + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + if constexpr (has_quiet_NaN) + { + return type::__nan(); + } + else + { + return type{}; + } + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + if constexpr (has_signaling_NaN) + { + return type::__nans(); + } + else + { + return type{}; + } + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type::__denorm_min(); + } + + static constexpr bool is_iec559 = _FpConfig::__is_iec559; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___LIMITS_EXTENDED_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/std/__limits/limits.h b/libcudacxx/include/cuda/std/__limits/limits.h new file mode 100644 index 00000000000..587d1b8302e --- /dev/null +++ b/libcudacxx/include/cuda/std/__limits/limits.h @@ -0,0 +1,761 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___LIMITS_LIMITS_H +#define _LIBCUDACXX___LIMITS_LIMITS_H + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +enum float_round_style +{ + round_indeterminate = -1, + round_toward_zero = 0, + round_to_nearest = 1, + round_toward_infinity = 2, + round_toward_neg_infinity = 3 +}; + +enum float_denorm_style +{ + denorm_indeterminate = -1, + denorm_absent = 0, + denorm_present = 1 +}; + +enum class __numeric_limits_type +{ + __integral, + __bool, + __floating_point, + __other, +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __numeric_limits_type __make_numeric_limits_type() +{ +#if !defined(_CCCL_NO_IF_CONSTEXPR) + _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_same, _Tp, bool)) + { + return __numeric_limits_type::__bool; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_integral, _Tp)) + { + return __numeric_limits_type::__integral; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp)) + { + return __numeric_limits_type::__floating_point; + } + else + { + return __numeric_limits_type::__other; + } +#else // ^^^ !_CCCL_NO_IF_CONSTEXPR ^^^ // vvv _CCCL_NO_IF_CONSTEXPR vvv + return _CCCL_TRAIT(is_same, _Tp, bool) + ? __numeric_limits_type::__bool + : (_CCCL_TRAIT(is_integral, _Tp) + ? __numeric_limits_type::__integral + : (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp) + ? 
__numeric_limits_type::__floating_point + : __numeric_limits_type::__other)); +#endif // _CCCL_NO_IF_CONSTEXPR +} + +template ()> +class __numeric_limits_impl +{ +public: + using type = _Tp; + + static constexpr bool is_specialized = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(); + } + + static constexpr int digits = 0; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 0; + static constexpr bool is_signed = false; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = 0; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(); + } + + static constexpr int min_exponent = 0; + static constexpr int min_exponent10 = 0; + static constexpr int max_exponent = 0; + static constexpr int max_exponent10 = 0; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = false; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool 
tinyness_before = false; + static constexpr float_round_style round_style = round_toward_zero; +}; + +// MSVC warns about overflowing left shift +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_MSVC(4309) +template +struct __int_min +{ + static constexpr _Tp value = static_cast<_Tp>(_Tp(1) << __digits); +}; +_CCCL_DIAG_POP + +template +struct __int_min<_Tp, __digits, false> +{ + static constexpr _Tp value = _Tp(0); +}; + +template +class __numeric_limits_impl<_Tp, __numeric_limits_type::__integral> +{ +public: + using type = _Tp; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = type(-1) < type(0); + static constexpr int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); + static constexpr int digits10 = digits * 3 / 10; + static constexpr int max_digits10 = 0; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __int_min::value; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return is_signed ? type(type(~0) ^ min()) : type(~0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return min(); + } + + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr int radix = 2; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(0); + } + + static constexpr int min_exponent = 0; + static constexpr int min_exponent10 = 0; + static constexpr int max_exponent = 0; + static constexpr int max_exponent10 = 0; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI 
static constexpr type quiet_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(0); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = !is_signed; + +#if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || defined(__wasm__) + static constexpr bool traps = true; +#else + static constexpr bool traps = false; +#endif + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_toward_zero; +}; + +template <> +class __numeric_limits_impl +{ +public: + using type = bool; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = false; + static constexpr int digits = 1; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 0; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return false; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return true; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return min(); + } + + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr int radix = 2; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(0); + } + + static constexpr int min_exponent = 0; + static constexpr int min_exponent10 = 0; + static constexpr int max_exponent = 0; + static constexpr int max_exponent10 = 0; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool 
has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(0); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_toward_zero; +}; + +template <> +class __numeric_limits_impl +{ +public: + using type = float; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = __FLT_MANT_DIG__; + static constexpr int digits10 = __FLT_DIG__; + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __FLT_MIN__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return __FLT_MAX__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return -max(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return __FLT_EPSILON__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return 0.5F; + } + + static constexpr int min_exponent = __FLT_MIN_EXP__; + static constexpr int min_exponent10 = __FLT_MIN_10_EXP__; + static constexpr int max_exponent = __FLT_MAX_EXP__; + static constexpr int max_exponent10 = __FLT_MAX_10_EXP__; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + 
static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + +#if defined(_CCCL_BUILTIN_HUGE_VALF) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return _CCCL_BUILTIN_HUGE_VALF(); + } +#else // ^^^ _CCCL_BUILTIN_HUGE_VALF ^^^ // vvv !_CCCL_BUILTIN_HUGE_VALF vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept + { + return _CUDA_VSTD::bit_cast(0x7f800000); + } +#endif // !_CCCL_BUILTIN_HUGE_VALF +#if defined(_CCCL_BUILTIN_NANF) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return _CCCL_BUILTIN_NANF(""); + } +#else // ^^^ _CCCL_BUILTIN_NANF ^^^ // vvv !_CCCL_BUILTIN_NANF vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7fc00000); + } +#endif // !_CCCL_BUILTIN_NANF +#if defined(_CCCL_BUILTIN_NANSF) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return _CCCL_BUILTIN_NANSF(""); + } +#else // ^^^ _CCCL_BUILTIN_NANSF ^^^ // vvv !_CCCL_BUILTIN_NANSF vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7fa00000); + } +#endif // !_CCCL_BUILTIN_NANSF + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return __FLT_DENORM_MIN__; + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +template <> +class __numeric_limits_impl +{ +public: + using type = double; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = __DBL_MANT_DIG__; + static 
constexpr int digits10 = __DBL_DIG__; + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __DBL_MIN__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return __DBL_MAX__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return -max(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return __DBL_EPSILON__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return 0.5; + } + + static constexpr int min_exponent = __DBL_MIN_EXP__; + static constexpr int min_exponent10 = __DBL_MIN_10_EXP__; + static constexpr int max_exponent = __DBL_MAX_EXP__; + static constexpr int max_exponent10 = __DBL_MAX_10_EXP__; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + +#if defined(_CCCL_BUILTIN_HUGE_VAL) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return _CCCL_BUILTIN_HUGE_VAL(); + } +#else // ^^^ _CCCL_BUILTIN_HUGE_VAL ^^^ // vvv !_CCCL_BUILTIN_HUGE_VAL vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept + { + return _CUDA_VSTD::bit_cast(0x7ff0000000000000); + } +#endif // !_CCCL_BUILTIN_HUGE_VAL +#if defined(_CCCL_BUILTIN_NAN) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return _CCCL_BUILTIN_NAN(""); + } +#else // ^^^ _CCCL_BUILTIN_NAN ^^^ // vvv !_CCCL_BUILTIN_NAN vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7ff8000000000000); + } +#endif // 
!_CCCL_BUILTIN_NAN +#if defined(_CCCL_BUILTIN_NANS) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return _CCCL_BUILTIN_NANS(""); + } +#else // ^^^ _CCCL_BUILTIN_NANS ^^^ // vvv !_CCCL_BUILTIN_NANS vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7ff4000000000000); + } +#endif // !_CCCL_BUILTIN_NANS + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return __DBL_DENORM_MIN__; + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +template <> +class __numeric_limits_impl +{ +#ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE + +public: + using type = long double; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = __LDBL_MANT_DIG__; + static constexpr int digits10 = __LDBL_DIG__; + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __LDBL_MIN__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return __LDBL_MAX__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return -max(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return __LDBL_EPSILON__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return 0.5L; + } + + static constexpr int min_exponent = __LDBL_MIN_EXP__; + static constexpr int min_exponent10 = __LDBL_MIN_10_EXP__; + static constexpr int max_exponent = 
__LDBL_MAX_EXP__; + static constexpr int max_exponent10 = __LDBL_MAX_10_EXP__; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return _CCCL_BUILTIN_HUGE_VALL(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return _CCCL_BUILTIN_NANL(""); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return _CCCL_BUILTIN_NANSL(""); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return __LDBL_DENORM_MIN__; + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +}; + +#if defined(_LIBCUDACXX_HAS_NVFP16) +template <> +class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> +{ +public: + using type = __half; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__half_raw{0x0400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(__half_raw{0x7bffu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__half_raw{0xfbffu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static 
constexpr type epsilon() noexcept + { + return type(__half_raw{0x1400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__half_raw{0x3800u}); + } + + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__half_raw{0x7c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__half_raw{0x7e00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__half_raw{0x7d00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__half_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +template <> +class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> +{ +public: + using type = __nv_bfloat16; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__nv_bfloat16_raw{0x0080u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return 
type(__nv_bfloat16_raw{0x7f7fu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__nv_bfloat16_raw{0xff7fu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(__nv_bfloat16_raw{0x3c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__nv_bfloat16_raw{0x3f00u}); + } + + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__nv_bfloat16_raw{0x7f80u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fc0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fa0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__nv_bfloat16_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVBF16 + +template +class numeric_limits : public __numeric_limits_impl<_Tp> +{}; + +template +class numeric_limits : public numeric_limits<_Tp> +{}; + +template +class numeric_limits : public numeric_limits<_Tp> 
+{}; + +template +class numeric_limits : public numeric_limits<_Tp> +{}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___LIMITS_LIMITS_H diff --git a/libcudacxx/include/cuda/std/__stdfloat_ b/libcudacxx/include/cuda/std/__stdfloat_ new file mode 100644 index 00000000000..93b4c64ee8d --- /dev/null +++ b/libcudacxx/include/cuda/std/__stdfloat_ @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD_STDFLOAT +#define _CUDA_STD_STDFLOAT + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# if _CCCL_HAS_INCLUDE() +# include +# endif // _CCCL_HAS_INCLUDE() +# include + +_CCCL_PUSH_MACROS + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +# if LIBCUDACXX_FLOAT16_T == 1 +using float16_t = ::cuda::fp16; +# endif // LIBCUDACXX_FLOAT16_T == 1 +# if LIBCUDACXX_FLOAT32_T == 1 +using float32_t = ::cuda::fp32; +# endif // LIBCUDACXX_FLOAT32_T == 1 +# if LIBCUDACXX_FLOAT64_T == 1 +using float64_t = ::cuda::fp64; +# endif // LIBCUDACXX_FLOAT64_T == 1 +// # if LIBCUDACXX_FLOAT128_T == 1 +// using float128_t = ::cuda::fp128; +// # endif // LIBCUDACXX_FLOAT128_T == 1 +# if LIBCUDACXX_BFLOAT16_T == 1 +using bfloat16_t = ::cuda::bf16; +# endif // LIBCUDACXX_BFLOAT16_T == 1 + +_LIBCUDACXX_END_NAMESPACE_STD + +_CCCL_POP_MACROS + +#endif // _CCCL_STD_VER >= 2017 + 
+#endif // _CUDA_STD_STDFLOAT diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index ad529f2082b..4825c346c08 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -21,745 +21,10 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include - _CCCL_PUSH_MACROS -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -enum float_round_style -{ - round_indeterminate = -1, - round_toward_zero = 0, - round_to_nearest = 1, - round_toward_infinity = 2, - round_toward_neg_infinity = 3 -}; - -enum float_denorm_style -{ - denorm_indeterminate = -1, - denorm_absent = 0, - denorm_present = 1 -}; - -enum class __numeric_limits_type -{ - __integral, - __bool, - __floating_point, - __other, -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr __numeric_limits_type __make_numeric_limits_type() -{ -#if !defined(_CCCL_NO_IF_CONSTEXPR) - _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_same, _Tp, bool)) - { - return __numeric_limits_type::__bool; - } - else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_integral, _Tp)) - { - return __numeric_limits_type::__integral; - } - else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp)) - { - return __numeric_limits_type::__floating_point; - } - else - { - return __numeric_limits_type::__other; - } -#else // ^^^ !_CCCL_NO_IF_CONSTEXPR ^^^ // vvv _CCCL_NO_IF_CONSTEXPR vvv - return _CCCL_TRAIT(is_same, _Tp, bool) - ? __numeric_limits_type::__bool - : (_CCCL_TRAIT(is_integral, _Tp) - ? __numeric_limits_type::__integral - : (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp) - ? 
__numeric_limits_type::__floating_point - : __numeric_limits_type::__other)); -#endif // _CCCL_NO_IF_CONSTEXPR -} - -template ()> -class __numeric_limits_impl -{ -public: - using type = _Tp; - - static constexpr bool is_specialized = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return type(); - } - - static constexpr int digits = 0; - static constexpr int digits10 = 0; - static constexpr int max_digits10 = 0; - static constexpr bool is_signed = false; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = 0; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(); - } - - static constexpr int min_exponent = 0; - static constexpr int min_exponent10 = 0; - static constexpr int max_exponent = 0; - static constexpr int max_exponent10 = 0; - - static constexpr bool has_infinity = false; - static constexpr bool has_quiet_NaN = false; - static constexpr bool has_signaling_NaN = false; - static constexpr float_denorm_style has_denorm = denorm_absent; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(); - } - - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = false; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool 
tinyness_before = false; - static constexpr float_round_style round_style = round_toward_zero; -}; - -// MSVC warns about overflowing left shift -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_MSVC(4309) -template -struct __int_min -{ - static constexpr _Tp value = static_cast<_Tp>(_Tp(1) << __digits); -}; -_CCCL_DIAG_POP - -template -struct __int_min<_Tp, __digits, false> -{ - static constexpr _Tp value = _Tp(0); -}; - -template -class __numeric_limits_impl<_Tp, __numeric_limits_type::__integral> -{ -public: - using type = _Tp; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = type(-1) < type(0); - static constexpr int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); - static constexpr int digits10 = digits * 3 / 10; - static constexpr int max_digits10 = 0; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __int_min::value; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return is_signed ? type(type(~0) ^ min()) : type(~0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return min(); - } - - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr int radix = 2; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(0); - } - - static constexpr int min_exponent = 0; - static constexpr int min_exponent10 = 0; - static constexpr int max_exponent = 0; - static constexpr int max_exponent10 = 0; - - static constexpr bool has_infinity = false; - static constexpr bool has_quiet_NaN = false; - static constexpr bool has_signaling_NaN = false; - static constexpr float_denorm_style has_denorm = denorm_absent; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI 
static constexpr type quiet_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(0); - } - - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = !is_signed; - -#if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || defined(__wasm__) - static constexpr bool traps = true; -#else - static constexpr bool traps = false; -#endif - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_toward_zero; -}; - -template <> -class __numeric_limits_impl -{ -public: - using type = bool; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = false; - static constexpr int digits = 1; - static constexpr int digits10 = 0; - static constexpr int max_digits10 = 0; - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return false; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return true; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return min(); - } - - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr int radix = 2; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(0); - } - - static constexpr int min_exponent = 0; - static constexpr int min_exponent10 = 0; - static constexpr int max_exponent = 0; - static constexpr int max_exponent10 = 0; - - static constexpr bool has_infinity = false; - static constexpr bool has_quiet_NaN = false; - static constexpr bool has_signaling_NaN = false; - static constexpr float_denorm_style has_denorm = denorm_absent; - static constexpr bool 
has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(0); - } - - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_toward_zero; -}; - -template <> -class __numeric_limits_impl -{ -public: - using type = float; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = __FLT_MANT_DIG__; - static constexpr int digits10 = __FLT_DIG__; - static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __FLT_MIN__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return __FLT_MAX__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return -max(); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return __FLT_EPSILON__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return 0.5F; - } - - static constexpr int min_exponent = __FLT_MIN_EXP__; - static constexpr int min_exponent10 = __FLT_MIN_10_EXP__; - static constexpr int max_exponent = __FLT_MAX_EXP__; - static constexpr int max_exponent10 = __FLT_MAX_10_EXP__; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - 
static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - -#if defined(_CCCL_BUILTIN_HUGE_VALF) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return _CCCL_BUILTIN_HUGE_VALF(); - } -#else // ^^^ _CCCL_BUILTIN_HUGE_VALF ^^^ // vvv !_CCCL_BUILTIN_HUGE_VALF vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept - { - return _CUDA_VSTD::bit_cast(0x7f800000); - } -#endif // !_CCCL_BUILTIN_HUGE_VALF -#if defined(_CCCL_BUILTIN_NANF) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return _CCCL_BUILTIN_NANF(""); - } -#else // ^^^ _CCCL_BUILTIN_NANF ^^^ // vvv !_CCCL_BUILTIN_NANF vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept - { - return _CUDA_VSTD::bit_cast(0x7fc00000); - } -#endif // !_CCCL_BUILTIN_NANF -#if defined(_CCCL_BUILTIN_NANSF) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return _CCCL_BUILTIN_NANSF(""); - } -#else // ^^^ _CCCL_BUILTIN_NANSF ^^^ // vvv !_CCCL_BUILTIN_NANSF vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept - { - return _CUDA_VSTD::bit_cast(0x7fa00000); - } -#endif // !_CCCL_BUILTIN_NANSF - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return __FLT_DENORM_MIN__; - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; - -template <> -class __numeric_limits_impl -{ -public: - using type = double; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = __DBL_MANT_DIG__; - static 
constexpr int digits10 = __DBL_DIG__; - static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __DBL_MIN__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return __DBL_MAX__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return -max(); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return __DBL_EPSILON__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return 0.5; - } - - static constexpr int min_exponent = __DBL_MIN_EXP__; - static constexpr int min_exponent10 = __DBL_MIN_10_EXP__; - static constexpr int max_exponent = __DBL_MAX_EXP__; - static constexpr int max_exponent10 = __DBL_MAX_10_EXP__; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - -#if defined(_CCCL_BUILTIN_HUGE_VAL) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return _CCCL_BUILTIN_HUGE_VAL(); - } -#else // ^^^ _CCCL_BUILTIN_HUGE_VAL ^^^ // vvv !_CCCL_BUILTIN_HUGE_VAL vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept - { - return _CUDA_VSTD::bit_cast(0x7ff0000000000000); - } -#endif // !_CCCL_BUILTIN_HUGE_VAL -#if defined(_CCCL_BUILTIN_NAN) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return _CCCL_BUILTIN_NAN(""); - } -#else // ^^^ _CCCL_BUILTIN_NAN ^^^ // vvv !_CCCL_BUILTIN_NAN vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept - { - return std::bit_cast(0x7ff8000000000000); - } -#endif // 
!_CCCL_BUILTIN_NAN -#if defined(_CCCL_BUILTIN_NANS) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return _CCCL_BUILTIN_NANS(""); - } -#else // ^^^ _CCCL_BUILTIN_NANS ^^^ // vvv !_CCCL_BUILTIN_NANS vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept - { - return _CUDA_VSTD::bit_cast(0x7ff4000000000000); - } -#endif // !_CCCL_BUILTIN_NANS - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return __DBL_DENORM_MIN__; - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; - -template <> -class __numeric_limits_impl -{ -#ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - -public: - using type = long double; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = __LDBL_MANT_DIG__; - static constexpr int digits10 = __LDBL_DIG__; - static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __LDBL_MIN__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return __LDBL_MAX__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return -max(); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return __LDBL_EPSILON__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return 0.5L; - } - - static constexpr int min_exponent = __LDBL_MIN_EXP__; - static constexpr int min_exponent10 = __LDBL_MIN_10_EXP__; - static constexpr int max_exponent = 
__LDBL_MAX_EXP__; - static constexpr int max_exponent10 = __LDBL_MAX_10_EXP__; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return _CCCL_BUILTIN_HUGE_VALL(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return _CCCL_BUILTIN_NANL(""); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return _CCCL_BUILTIN_NANSL(""); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return __LDBL_DENORM_MIN__; - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE -}; - -#if defined(_LIBCUDACXX_HAS_NVFP16) -template <> -class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> -{ -public: - using type = __half; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return type(__half_raw{0x0400u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return type(__half_raw{0x7bffu}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return type(__half_raw{0xfbffu}); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static 
constexpr type epsilon() noexcept - { - return type(__half_raw{0x1400u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(__half_raw{0x3800u}); - } - - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(__half_raw{0x7c00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(__half_raw{0x7e00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(__half_raw{0x7d00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(__half_raw{0x0001u}); - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; -#endif // _LIBCUDACXX_HAS_NVFP16 - -#if defined(_LIBCUDACXX_HAS_NVBF16) -template <> -class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> -{ -public: - using type = __nv_bfloat16; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = 8; - static constexpr int digits10 = 2; - static constexpr int max_digits10 = 4; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return type(__nv_bfloat16_raw{0x0080u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return 
type(__nv_bfloat16_raw{0x7f7fu}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return type(__nv_bfloat16_raw{0xff7fu}); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(__nv_bfloat16_raw{0x3c00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(__nv_bfloat16_raw{0x3f00u}); - } - - static constexpr int min_exponent = -125; - static constexpr int min_exponent10 = -37; - static constexpr int max_exponent = 128; - static constexpr int max_exponent10 = 38; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(__nv_bfloat16_raw{0x7f80u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(__nv_bfloat16_raw{0x7fc0u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(__nv_bfloat16_raw{0x7fa0u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(__nv_bfloat16_raw{0x0001u}); - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; -#endif // _LIBCUDACXX_HAS_NVBF16 - -template -class numeric_limits : public __numeric_limits_impl<_Tp> -{}; - -template -class numeric_limits : public numeric_limits<_Tp> -{}; - -template -class numeric_limits : public numeric_limits<_Tp> 
-{}; - -template -class numeric_limits : public numeric_limits<_Tp> -{}; - -_LIBCUDACXX_END_NAMESPACE_STD +#include +#include _CCCL_POP_MACROS diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/cast.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/cast.pass.cpp new file mode 100644 index 00000000000..42549b53628 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/cast.pass.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test_type() +{ + assert(static_cast(T{1.0}) == cuda::fp16{1.0}); + assert(static_cast(T{1.0}) == cuda::fp32{1.0}); + assert(static_cast(T{1.0}) == cuda::fp64{1.0}); + // assert(static_cast(T{1.0}) == cuda::bf16{1.0}); + + assert(static_cast(T{1.0}) == 1.0f); + assert(static_cast(T{1.0}) == 1.0); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + assert(static_cast(T{1.0}) == 1.0L); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + + assert(static_cast(T{1.0}) == true); + assert(static_cast(T{1.0}) == 1); + + return true; +} + +__host__ __device__ constexpr bool test() +{ + test_type(); + test_type(); + test_type(); + // test_type(); + + return true; +} + +int main(int, char**) +{ + 
test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/construct.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/construct.pass.cpp new file mode 100644 index 00000000000..f1f47a58436 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/construct.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test_type() +{ + assert(T{cuda::fp16{1.0}} == T{1.0}); + assert(T{cuda::fp32{1.0}} == T{1.0}); + assert(T{cuda::fp64{1.0}} == T{1.0}); + // assert(T{cuda::bf16{1.0}} == T{1.0}); + assert(T{float{1.0}} == T{1.0}); + assert(T{double{1.0}} == T{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + assert(T{long double{1.0}} == T{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + assert(T{int8_t{1}} == T{1.0}); + assert(T{int16_t{1}} == T{1.0}); + assert(T{int32_t{1}} == T{1.0}); + assert(T{int64_t{1}} == T{1.0}); + assert(T{uint8_t{1}} == T{1.0}); + assert(T{uint16_t{1}} == T{1.0}); + assert(T{uint32_t{1}} == T{1.0}); + assert(T{uint64_t{1}} == T{1.0}); + + assert(T{bool{1}} == T{1.0}); + assert(T{char{1}} == T{1.0}); + + return true; +} + +__host__ __device__ constexpr bool test() +{ + test_type(); + test_type(); + test_type(); + // test_type(); + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/add.pass.cpp 
b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/add.pass.cpp new file mode 100644 index 00000000000..a912634ca48 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/add.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_add(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs + rhs)); + ASSERT_SAME_TYPE(decltype(lhs + rhs), Exp); + assert(lhs + rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + test_add(cuda::fp16{1.0}, cuda::fp16{1.0}, cuda::fp16{2.0}); + test_add(cuda::fp16{1.0}, cuda::fp32{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp16{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp16{1.0}, float{1.0}, float{2.0}); + test_add(cuda::fp16{1.0}, double{1.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_add(cuda::fp16{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_add(cuda::fp32{1.0}, cuda::fp16{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, cuda::fp32{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + // test_add(cuda::fp32{1.0}, cuda::bf16{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, float{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, double{1.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_add(cuda::fp32{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + 
test_add(cuda::fp64{1.0}, cuda::fp16{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, cuda::fp32{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + // test_add(cuda::fp64{1.0}, cuda::bf16{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, float{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, double{1.0}, cuda::fp64{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_add(cuda::fp64{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_add(cuda::bf16{1.0}, cuda::fp32{1.0}, cuda::fp32{2.0}); + // test_add(cuda::bf16{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + // test_add(cuda::bf16{1.0}, cuda::bf16{1.0}, cuda::bf16{2.0}); + // test_add(cuda::bf16{1.0}, float{1.0}, float{2.0}); + // test_add(cuda::bf16{1.0}, double{1.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_add(cuda::bf16{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/common_type.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/common_type.pass.cpp new file mode 100644 index 00000000000..7d89fba6342 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/common_type.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_common_type() +{ + ASSERT_SAME_TYPE(cuda::__fp_common_type_t, Exp); + ASSERT_SAME_TYPE(cuda::__fp_common_type_t, Exp); +} + +int main(int, char**) +{ + // cuda::fp16 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // cuda::fp32 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // cuda::fp64 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // cuda::bf16 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/div.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/div.pass.cpp new file mode 100644 index 00000000000..a783e8e1e6b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/div.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_div(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs / rhs)); + ASSERT_SAME_TYPE(decltype(lhs / rhs), Exp); + assert(lhs / rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + // test_div(cuda::fp16{2.0}, cuda::fp16{2.0}, cuda::fp16{1.0}); + test_div(cuda::fp16{2.0}, cuda::fp32{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp16{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp16{2.0}, float{2.0}, float{1.0}); + test_div(cuda::fp16{2.0}, double{2.0}, double{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_div(cuda::fp16{2.0}, long double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_div(cuda::fp32{2.0}, cuda::fp16{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, cuda::fp32{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + // test_div(cuda::fp32{2.0}, cuda::bf16{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, float{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, double{2.0}, double{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_div(cuda::fp32{2.0}, long double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_div(cuda::fp64{2.0}, cuda::fp16{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, cuda::fp32{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + // test_div(cuda::fp64{2.0}, cuda::bf16{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, float{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, double{2.0}, cuda::fp64{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_div(cuda::fp64{2.0}, long 
double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_div(cuda::bf16{2.0}, cuda::fp32{2.0}, cuda::fp32{1.0}); + // test_div(cuda::bf16{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + // test_div(cuda::bf16{2.0}, cuda::bf16{2.0}, cuda::bf16{1.0}); + // test_div(cuda::bf16{2.0}, float{2.0}, float{1.0}); + // test_div(cuda::bf16{2.0}, double{2.0}, double{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_div(cuda::bf16{2.0}, long double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/mul.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/mul.pass.cpp new file mode 100644 index 00000000000..7f3915115e7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/mul.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_mul(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs * rhs)); + ASSERT_SAME_TYPE(decltype(lhs * rhs), Exp); + assert(lhs * rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + test_mul(cuda::fp16{1.0}, cuda::fp16{2.0}, cuda::fp16{2.0}); + test_mul(cuda::fp16{1.0}, cuda::fp32{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp16{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp16{1.0}, float{2.0}, float{2.0}); + test_mul(cuda::fp16{1.0}, double{2.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_mul(cuda::fp16{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_mul(cuda::fp32{1.0}, cuda::fp16{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, cuda::fp32{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + // test_mul(cuda::fp32{1.0}, cuda::bf16{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, float{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, double{2.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_mul(cuda::fp32{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_mul(cuda::fp64{1.0}, cuda::fp16{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, cuda::fp32{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + // test_mul(cuda::fp64{1.0}, cuda::bf16{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, float{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, double{2.0}, cuda::fp64{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_mul(cuda::fp64{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_mul(cuda::bf16{1.0}, cuda::fp32{2.0}, cuda::fp32{2.0}); 
+ // test_mul(cuda::bf16{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + // test_mul(cuda::bf16{1.0}, cuda::bf16{2.0}, cuda::bf16{2.0}); + // test_mul(cuda::bf16{1.0}, float{2.0}, float{2.0}); + // test_mul(cuda::bf16{1.0}, double{2.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_mul(cuda::bf16{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/sub.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/sub.pass.cpp new file mode 100644 index 00000000000..2af00d7eb2f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/sub.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_sub(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs - rhs)); + ASSERT_SAME_TYPE(decltype(lhs - rhs), Exp); + assert(lhs - rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + test_sub(cuda::fp16{1.0}, cuda::fp16{1.0}, cuda::fp16{0.0}); + test_sub(cuda::fp16{1.0}, cuda::fp32{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp16{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp16{1.0}, float{1.0}, float{0.0}); + test_sub(cuda::fp16{1.0}, double{1.0}, double{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_sub(cuda::fp16{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_sub(cuda::fp32{1.0}, cuda::fp16{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, cuda::fp32{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + // test_sub(cuda::fp32{1.0}, cuda::bf16{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, float{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, double{1.0}, double{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_sub(cuda::fp32{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_sub(cuda::fp64{1.0}, cuda::fp16{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, cuda::fp32{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + // test_sub(cuda::fp64{1.0}, cuda::bf16{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, float{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, double{1.0}, cuda::fp64{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_sub(cuda::fp64{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_sub(cuda::bf16{1.0}, cuda::fp32{1.0}, cuda::fp32{0.0}); 
+ // test_sub(cuda::bf16{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + // test_sub(cuda::bf16{1.0}, cuda::bf16{1.0}, cuda::bf16{0.0}); + // test_sub(cuda::bf16{1.0}, float{1.0}, float{0.0}); + // test_sub(cuda::bf16{1.0}, double{1.0}, double{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_sub(cuda::bf16{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +}