diff --git a/libcudacxx/include/cuda/__floating_point/cast.h b/libcudacxx/include/cuda/__floating_point/cast.h
new file mode 100644
index 00000000000..dc325b9e035
--- /dev/null
+++ b/libcudacxx/include/cuda/__floating_point/cast.h
@@ -0,0 +1,486 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___FLOATING_POINT_CAST_H
+#define _CUDA___FLOATING_POINT_CAST_H
+
+// NOTE(review): every `#include <...>` target and several `template <...>` /
+// `static_cast<...>` argument lists in this hunk were lost during extraction
+// (angle-bracket contents stripped). The targets below are a best-effort
+// reconstruction — verify against the original patch.
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_STD_VER >= 2017
+
+# include <cuda/__floating_point/config.h>
+# include <cuda/__floating_point/fp.h>
+# include <cuda/std/__concepts/concept_macros.h>
+# include <cuda/std/__type_traits/is_constant_evaluated.h>
+# include <cuda/std/__type_traits/is_integral.h>
+# include <cuda/std/__type_traits/is_same.h>
+# include <cuda/std/cstdint>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// Dispatcher for conversions between the extended floating point types,
+// the native types and integral types. Device paths use native PTX `cvt`
+// where an instruction exists; everything else falls through to the
+// (currently stubbed) generic software path.
+struct __fp_cast
+{
+  // Software fallback. Currently a stub: asserts and returns a
+  // zero-initialized destination; the intended algorithm is sketched in the
+  // commented-out code below.
+  template <class _Tp, class _Up>
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __cast_generic(const _Up& __src)
+  {
+    _CCCL_ASSERT(false, "Unsupported floating point cast");
+    // todo: implement generic cast
+    // - fp -> fp
+    // - fp -> integral
+    // - integral -> fp
+
+    _Tp __dst{};
+
+    // Copy sign
+    // if constexpr (_Tp::__is_signed && _Up::__is_signed)
+    // {
+    //   __dst.__set_sign(__src.__get_sign());
+    // }
+    // else if constexpr (!_Tp::__is_signed && _Up::__is_signed)
+    // {
+    //   if (__src.__get_sign())
+    //   {
+    //     return _Tp::__nan();
+    //   }
+    // }
+
+    // using _Sp =
+    //   _CUDA_VSTD::make_signed_t<_CUDA_VSTD::common_type_t<typename _Tp::__storage_type, typename _Up::__storage_type>>;
+
+    // // Convert exponent
+    // constexpr _Sp __src_exp_bias = _Up::__exp_val_mask() / 2;
+    // constexpr _Sp __dst_exp_bias = _Tp::__exp_val_mask() / 2;
+
+    // _Sp __dst_exp = static_cast<_Sp>(__src.__get_exp()) - __src_exp_bias + __dst_exp_bias;
+
+    // if (__dst_exp >= static_cast<_Sp>(_Tp::__exp_val_mask()))
+    // {
+    //   return _Tp::__inf();
+    // }
+
+    // __dst.__set_exp(static_cast<typename _Tp::__storage_type>(__dst_exp));
+
+    // // Convert mantissa (todo: implement rounding)
+    // constexpr ptrdiff_t __mant_diff =
+    //   static_cast<ptrdiff_t>(_Tp::__mant_nbits) - static_cast<ptrdiff_t>(_Up::__mant_nbits);
+
+    // _Sp __dst_mant{};
+
+    // if constexpr (__mant_diff < 0)
+    // {
+    //   __dst_mant = static_cast<_Sp>(__src.__get_mant()) >> (-__mant_diff);
+    // }
+    // else
+    // {
+    //   __dst_mant = static_cast<_Sp>(__src.__get_mant()) << __mant_diff;
+    // }
+
+    // __dst.__set_mant(static_cast<typename _Tp::__storage_type>(__dst_mant));
+
+    return __dst;
+  }
+
+  // fp16 -> {fp32, fp64, bf16, integral} via native PTX cvt.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __fp16_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    fp16 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp32>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.f32.f16 %0, %1;" : "=r"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp64>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.f64.f16 %0, %1;" : "=l"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.bf16.f16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_signed_v<_Tp>)
+    {
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        // PTX has no 8-bit register class: convert through a 16-bit temporary.
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (int16_t __ret; asm("cvt.rni.s8.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.s16.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.s32.f16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.s64.f16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_unsigned_v<_Tp>)
+    {
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (uint16_t __ret; asm("cvt.rni.u8.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.u16.f16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.u32.f16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE,
+                     (asm("cvt.rni.u64.f16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // fp32 -> {fp16, bf16} via native PTX cvt.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __fp32_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    fp32 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.rn.f16.f32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__fp_val.__storage_)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.f32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__fp_val.__storage_)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // fp64 -> {fp16, bf16} via native PTX cvt.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __fp64_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    fp64 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (asm("cvt.rn.f16.f64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__fp_val.__storage_)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.f64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__fp_val.__storage_)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // bf16 -> {fp16, fp32, fp64, integral}; all bf16 cvt forms require SM90+
+  // and PTX ISA 7.8.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Up>, __bf16_config>)
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    bf16 __fp_val{__fp_from_native, __val};
+    _Tp __ret{};
+
+# if __cccl_ptx_isa >= 780
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.f16.bf16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp32>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.f32.bf16 %0, %1;" : "=r"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp64>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.f64.bf16 %0, %1;" : "=l"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
+    }
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_signed_v<_Tp>)
+    {
+      // NOTE(review): the original hunk omitted the integer-rounding modifier
+      // (`.rni`) on these float-to-int conversions; PTX requires one (the
+      // fp16 paths above use `.rni`), so it is added here.
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (int16_t __ret; asm("cvt.rni.s8.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.s32.bf16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+    else if constexpr (_CUDA_VSTD::is_integral_v<_Tp> && _CUDA_VSTD::is_unsigned_v<_Tp>)
+    {
+      if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (uint16_t __ret; asm("cvt.rni.u8.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_));
+                      return static_cast<_Tp>(__ret);))
+      }
+      else if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 4)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.u32.bf16 %0, %1;" : "=r"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+      else if constexpr (sizeof(_Tp) == 8)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(__ret) : "h"(__fp_val.__storage_)); return __ret;))
+      }
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__fp_val);
+  }
+
+  // int8 -> {fp16, bf16}. 8-bit sources are widened to 16-bit registers for
+  // the "h" asm constraint (PTX has no 8-bit register class).
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 1))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(
+        NV_IS_DEVICE,
+        (asm("cvt.rn.f16.s8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<int16_t>(__val))); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      // NOTE(review): original passed the raw 8-bit value to an "h" (16-bit)
+      // constraint; widened explicitly to match the fp16 path above.
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<int16_t>(__val)));
+                    return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // int16 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 2))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // int32 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 4))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // int64 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_signed_v<_Up> _CCCL_AND(sizeof(_Up) == 8))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint8 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 1))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(
+        NV_IS_DEVICE,
+        (asm("cvt.rn.f16.u8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<uint16_t>(__val))); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      // NOTE(review): original used an "r" (32-bit) constraint on the 8-bit
+      // value; normalized to the widened "h" pattern used by the fp16 path.
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u8 %0, %1;" : "=h"(__ret.__storage_) : "h"(static_cast<uint16_t>(__val)));
+                    return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint16 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 2))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint32 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 4))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__ret.__storage_) : "r"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+  // uint64 -> {fp16, bf16}.
+  _CCCL_TEMPLATE(class _Tp, class _Up)
+  _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Up> _CCCL_AND _CUDA_VSTD::is_unsigned_v<_Up> _CCCL_AND(sizeof(_Up) == 8))
+  _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __cast_impl_device(const _Up& __val) noexcept
+  {
+    _Tp __ret{};
+
+    if constexpr (_CUDA_VSTD::is_same_v<_Tp, fp16>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# if __cccl_ptx_isa >= 780
+    else if constexpr (_CUDA_VSTD::is_same_v<_Tp, bf16>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__ret.__storage_) : "l"(__val)); return __ret;))
+    }
+# endif // __cccl_ptx_isa >= 780
+
+    return __ret = __cast_generic<_Tp>(__val);
+  }
+
+public:
+  // Entry point. At runtime on device, dispatch to the native PTX path;
+  // otherwise (host, or constant evaluation) fall back to the generic path.
+  template <class _Tp, class _Up>
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __cast(const _Up& __src) noexcept
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (return __cast_impl_device<_Tp>(__src);))
+    }
+
+    return __cast_generic<_Tp>(__src);
+  }
+};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CCCL_STD_VER >= 2017
+
+#endif // _CUDA___FLOATING_POINT_CAST_H
diff --git a/libcudacxx/include/cuda/__floating_point/config.h b/libcudacxx/include/cuda/__floating_point/config.h
new file mode 100644
index 00000000000..440526dd476
--- /dev/null
+++ b/libcudacxx/include/cuda/__floating_point/config.h
@@ -0,0 +1,526 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___FLOATING_POINT_CONFIG_H
+#define _CUDA___FLOATING_POINT_CONFIG_H
+
+// NOTE(review): include targets below were stripped during extraction and
+// are reconstructed best-effort — verify against the original patch.
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_STD_VER >= 2017
+
+# include <cuda/std/__bit/integral.h>
+# include <cuda/std/__concepts/concept_macros.h>
+# include <cuda/std/__type_traits/is_same.h>
+# include <cuda/std/__type_traits/remove_cv.h>
+# include <cuda/std/cfloat>
+# include <cuda/std/cstddef>
+
+// Silence the warning about the use of long double in device code
+_CCCL_NV_DIAG_SUPPRESS(20208)
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// _FpConfig is a type that provides the configuration for the __fp type
+// - static constexpr variables:
+// - __exp_nbits: number of bits in the exponent
+// - __mant_nbits: number of bits in the mantissa
+// - __is_signed: whether the floating point type is signed
+// - __has_inf: whether the floating point type has infinity
+// - __has_nan: whether the floating point type has quiet NaN
+// - __has_nans: whether the floating point type has signaling NaN
+// - __has_denorm: whether the
+// floating point type has denormalized values
+// - __native_type: the native type that the floating point type is based on
+// - __is_iec559: whether the floating point type is based on IEC 559
+// - type aliases:
+// - __host native_type: the host native type that the floating point type is based on (if no native type, use
+// __fp_no_native_type_tag)
+// - __device native_type: the device native type that the floating point type is based on (if no native type, use
+// __fp_no_native_type_tag)
+// - static member functions:
+// - __min(): returns the minimum value for the floating point type
+// - __max(): returns the maximum value for the floating point type
+// - __inf(): returns the infinity value for the floating point type (if __has_inf is true)
+// - __nan(): returns the quiet NaN value for the floating point type (if __has_quiet_nan is true)
+// - __nans(): returns the signaling NaN value for the floating point type (if __has_signaling_nan is true)
+// - __is_inf(): returns whether the given value is infinity (if __has_inf is true)
+// - __is_nan(): returns whether the given value is NaN (if __has_nan or __has_nans is true)
+
+// Tag: the format has no native host/device representation.
+struct __fp_no_native_type_tag
+{};
+
+// Tag: the queried type maps to no known floating point configuration.
+struct __fp_invalid_config
+{};
+
+// Tag for constructing an __fp from its native type.
+struct __fp_from_native_t
+{};
+
+_CCCL_INLINE_VAR constexpr __fp_from_native_t __fp_from_native{};
+
+// Tag for constructing an __fp from its raw storage bits.
+struct __fp_from_storage_t
+{};
+
+_CCCL_INLINE_VAR constexpr __fp_from_storage_t __fp_from_storage{};
+
+// Common base for IEC 559 (IEEE 754)-style formats: sign bit, inf, quiet and
+// signaling NaNs, denormals. Value helpers are expressed through the __fp
+// bit-field accessors (__set_exp/__set_mant/...).
+struct __fp_iec559_config_base
+{
+  static constexpr bool __is_signed  = true;
+  static constexpr bool __has_inf    = true;
+  static constexpr bool __has_nan    = true;
+  static constexpr bool __has_nans   = true;
+  static constexpr bool __has_denorm = true;
+  static constexpr bool __is_iec559  = true;
+
+  // Smallest positive normal value: exponent 1, mantissa 0.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    _Tp __ret{};
+    __ret.__set_exp(typename _Tp::__storage_type{1});
+    return __ret;
+  }
+  // Largest finite value: exponent all-ones-minus-one (~1 masked to the
+  // exponent field), mantissa left all ones by __set_exp? -- NOTE(review):
+  // confirm __set_exp masks its argument; the original template argument of
+  // this static_cast was stripped and is reconstructed here.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    _Tp __ret{};
+    __ret.__set_exp(static_cast<typename _Tp::__storage_type>(~typename _Tp::__storage_type{1}));
+    return __ret;
+  }
+  // Positive infinity: exponent all ones, mantissa zero.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __inf()
+  {
+    _Tp __ret{};
+    __ret.__set_exp(_Tp::__exp_val_mask());
+    return __ret;
+  }
+  // Quiet NaN: infinity pattern with the top mantissa bit set.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    _Tp __ret = __inf<_Tp>();
+    __ret.__set_mant(typename _Tp::__storage_type{1} << (_Tp::__mant_nbits - 1));
+    return __ret;
+  }
+  // Signaling NaN: infinity pattern with the second-highest mantissa bit set.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nans()
+  {
+    _Tp __ret = __inf<_Tp>();
+    __ret.__set_mant(typename _Tp::__storage_type{1} << (_Tp::__mant_nbits - 2));
+    return __ret;
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_inf(const _Tp& __val)
+  {
+    return __val.__get_exp() == _Tp::__exp_val_mask() && __val.__get_mant() == typename _Tp::__storage_type{0};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return __val.__get_exp() == _Tp::__exp_val_mask() && __val.__get_mant() != typename _Tp::__storage_type{0};
+  }
+};
+
+// FP4 E2M1 (sign 1, exp 2, mant 1): no inf/NaN.
+struct __fp4_e2m1_config
+{
+  static constexpr size_t __exp_nbits  = 2;
+  static constexpr size_t __mant_nbits = 1;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = false;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // NOTE(review): original said 0x08 — in this 4-bit s1e2m1 layout that is
+    // the sign bit (negative zero), apparently copy-pasted from the e4m3
+    // config. Smallest positive normal is exp=01, mant=0 -> 0b0010.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x02}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0b0111: exponent and mantissa all ones (no inf/NaN encodings).
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x7}};
+  }
+};
+
+// FP6 E2M3 (sign 1, exp 2, mant 3): no inf/NaN.
+struct __fp6_e2m3_config
+{
+  static constexpr size_t __exp_nbits  = 2;
+  static constexpr size_t __mant_nbits = 3;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = false;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // 0b001000: exp=01, mant=000 — smallest positive normal.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x08}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0b011111: exponent and mantissa all ones.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x1f}};
+  }
+};
+
+// FP6 E3M2 (sign 1, exp 3, mant 2): no inf/NaN.
+struct __fp6_e3m2_config
+{
+  static constexpr size_t __exp_nbits  = 3;
+  static constexpr size_t __mant_nbits = 2;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = false;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // NOTE(review): original said 0x08, which in s1e3m2 is exp=010 (not the
+    // smallest normal); smallest positive normal is exp=001, mant=00 ->
+    // 0b000100. Looks copy-pasted from the e2m3/e4m3 configs.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x04}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0b011111: exponent and mantissa all ones.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x1f}};
+  }
+};
+
+// FP8 E4M3: no inf; single NaN encoding 0x7f / 0xff.
+struct __fp8_e4m3_config
+{
+  static constexpr size_t __exp_nbits  = 4;
+  static constexpr size_t __mant_nbits = 3;
+  static constexpr bool __is_signed    = true;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = true;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // 0b00001000: exp=0001, mant=000 — smallest positive normal.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x08}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0x7e: largest finite (0x7f is NaN).
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x7e}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    return _Tp{__fp_from_storage, static_cast<typename _Tp::__storage_type>(0x7f)};
+  }
+  // NaN iff all non-sign bits are set.
+  // NOTE(review): cast.h accesses the storage member as `__storage_`; here it
+  // is `__storage` — confirm the member name in fp.h, one spelling is stale.
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return (__val.__storage & static_cast<typename _Tp::__storage_type>(0x7f))
+        == static_cast<typename _Tp::__storage_type>(0x7f);
+  }
+};
+
+// FP8 E5M2: IEC 559-style layout (inf + NaNs), no native type.
+struct __fp8_e5m2_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 5;
+  static constexpr size_t __mant_nbits = 2;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+};
+
+// binary16. NOTE(review): this overrides the base's __is_iec559 with `false`
+// even though binary16 is an IEC 559 format — confirm this shadowing is
+// intentional and not a leftover.
+struct __fp16_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 5;
+  static constexpr size_t __mant_nbits = 10;
+  static constexpr bool __is_iec559    = false;
+
+# if __STDCPP_FLOAT16_T__ == 1
+  using __host_native_type = ::std::float16_t;
+# elif _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC, >=, 24, 9)
+  using __host_native_type = _Float16;
+# else
+  using __host_native_type = __fp_no_native_type_tag;
+# endif
+# if _CCCL_CUDA_COMPILER(CLANG, >=, 19) || _CCCL_CUDA_COMPILER(NVHPC, >=, 24, 9)
+  using __device_native_type = _Float16;
+# else
+  using __device_native_type = __fp_no_native_type_tag;
+# endif
+};
+
+// binary32 (float).
+struct __fp32_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 8;
+  static constexpr size_t __mant_nbits = 23;
+
+  using __host_native_type   = float;
+  using __device_native_type = float;
+};
+
+// binary64 (double).
+struct __fp64_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 11;
+  static constexpr size_t __mant_nbits = 52;
+
+  using __host_native_type   = double;
+  using __device_native_type = double;
+};
+
+# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE)
+// Host `long double`; parameters derived from <cfloat>. Host-only.
+struct __fp_long_double_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = _CUDA_VSTD::bit_width(static_cast<size_t>(LDBL_MAX_EXP));
+  static constexpr size_t __mant_nbits = LDBL_MANT_DIG;
+
+  using __host_native_type   = long double;
+  using __device_native_type = __fp_no_native_type_tag;
+};
+# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE
+
+// binary128.
+struct __fp128_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 15;
+  static constexpr size_t __mant_nbits = 112;
+
+# if __STDCPP_FLOAT128_T__ == 1
+  using __host_native_type = ::std::float128_t;
+# elif (defined(__SIZEOF_FLOAT128__) || defined(__FLOAT128__)) && _CCCL_OS(LINUX) \
+   && (_CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)) && !defined(__CUDA_ARCH__) \
+   && !_CCCL_COMPILER(NVRTC)
+  using __host_native_type = __float128;
+# else
+  using __host_native_type = __fp_no_native_type_tag;
+# endif
+  using __device_native_type = __fp_no_native_type_tag;
+};
+
+// bfloat16.
+struct __bf16_config : __fp_iec559_config_base
+{
+  static constexpr size_t __exp_nbits  = 8;
+  static constexpr size_t __mant_nbits = 7;
+
+# if __STDCPP_BFLOAT16_T__ == 1
+  using __host_native_type = ::std::bfloat16_t;
+# elif _CCCL_COMPILER(GCC, >=, 13) || (_CCCL_COMPILER(CLANG, >=, 15) && _CCCL_ARCH(X86_64)) \
+   || (_CCCL_COMPILER(CLANG, >=, 11) && _CCCL_ARCH(ARM64))
+  using __host_native_type = __bf16;
+# else
+  using __host_native_type = __fp_no_native_type_tag;
+# endif
+# if _CCCL_CUDA_COMPILER(CLANG, >=, 17)
+  using __device_native_type = __bf16;
+# else
+  using __device_native_type = __fp_no_native_type_tag;
+# endif
+};
+
+// FP8 UE4M3: unsigned, no inf, NaN = 0x7f (top bit unused).
+struct __fp8_ue4m3_config
+{
+  static constexpr size_t __exp_nbits  = 4;
+  static constexpr size_t __mant_nbits = 3;
+  static constexpr bool __is_signed    = false;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = true;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = true;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    // exp=0001, mant=000 — smallest positive normal.
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x08}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    // 0x7e: largest finite (0x7f is NaN).
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x7e}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    return _Tp{__fp_from_storage, static_cast<typename _Tp::__storage_type>(0x7f)};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return (__val.__storage & static_cast<typename _Tp::__storage_type>(0x7f))
+        == static_cast<typename _Tp::__storage_type>(0x7f);
+  }
+};
+
+// FP8 UE8M0: unsigned exponent-only scale format; NaN = 0xff.
+struct __fp8_ue8m0_config
+{
+  static constexpr size_t __exp_nbits  = 8;
+  static constexpr size_t __mant_nbits = 0;
+  static constexpr bool __is_signed    = false;
+  static constexpr bool __has_inf      = false;
+  static constexpr bool __has_nan      = true;
+  static constexpr bool __has_nans     = false;
+  static constexpr bool __has_denorm   = false;
+  static constexpr bool __is_iec559    = false;
+
+  using __host_native_type   = __fp_no_native_type_tag;
+  using __device_native_type = __fp_no_native_type_tag;
+
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __min()
+  {
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0x00}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __max()
+  {
+    return _Tp{__fp_from_storage, typename _Tp::__storage_type{0xff}};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __nan()
+  {
+    return _Tp{__fp_from_storage, static_cast<typename _Tp::__storage_type>(0xff)};
+  }
+  _CCCL_TEMPLATE(class _Tp)
+  _CCCL_REQUIRES(__is_cuda_extended_floating_point_v<_Tp>)
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __is_nan(const _Tp& __val)
+  {
+    return __val.__storage == static_cast<typename _Tp::__storage_type>(0xff);
+  }
+};
+
+// Maps a (possibly cv-qualified) type to its floating point configuration,
+// returning __fp_invalid_config{} when the type is not a known fp type.
+// NOTE(review): the `template` parameter lists in this hunk were stripped
+// during extraction and are reconstructed here.
+template <class _Tp>
+_LIBCUDACXX_HIDE_FROM_ABI constexpr auto __fp_make_config_from()
+{
+  using _Up = _CUDA_VSTD::remove_cv_t<_Tp>;
+
+  if constexpr (__is_cuda_extended_floating_point_v<_Up>)
+  {
+    return typename _Up::__config_type{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, float>)
+  {
+    return __fp32_config{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, double>)
+  {
+    return __fp64_config{};
+  }
+# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE)
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, long double>)
+  {
+    return __fp_long_double_config{};
+  }
+# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE
+  // When a config has no native type these compare against
+  // __fp_no_native_type_tag, which never matches a real argument type.
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, typename __fp16_config::__host_native_type>
+                     || _CUDA_VSTD::is_same_v<_Up, typename __fp16_config::__device_native_type>)
+  {
+    return __fp16_config{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, typename __fp128_config::__host_native_type>
+                     || _CUDA_VSTD::is_same_v<_Up, typename __fp128_config::__device_native_type>)
+  {
+    return __fp128_config{};
+  }
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, typename __bf16_config::__host_native_type>
+                     || _CUDA_VSTD::is_same_v<_Up, typename __bf16_config::__device_native_type>)
+  {
+    return __bf16_config{};
+  }
+# if __STDCPP_FLOAT16_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float16_t>)
+  {
+    return __fp16_config{};
+  }
+# endif // __STDCPP_FLOAT16_T__
+# if __STDCPP_FLOAT32_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float32_t>)
+  {
+    return __fp32_config{};
+  }
+# endif // __STDCPP_FLOAT32_T__
+# if __STDCPP_FLOAT64_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float64_t>)
+  {
+    return __fp64_config{};
+  }
+# endif // __STDCPP_FLOAT64_T__
+# if __STDCPP_FLOAT128_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::float128_t>)
+  {
+    return __fp128_config{};
+  }
+# endif // __STDCPP_FLOAT128_T__
+# if __STDCPP_BFLOAT16_T__ == 1
+  else if constexpr (_CUDA_VSTD::is_same_v<_Up, ::std::bfloat16_t>)
+  {
+    return __bf16_config{};
+  }
+# endif // __STDCPP_BFLOAT16_T__
+  else
+  {
+    return __fp_invalid_config{};
+  }
+}
+
+template <class _Tp>
+using __fp_make_config_from_t = decltype(::cuda::__fp_make_config_from<_Tp>());
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+_CCCL_NV_DIAG_DEFAULT(20208)
+
+#endif // _CCCL_STD_VER >= 2017
+
+#endif // _CUDA___FLOATING_POINT_CONFIG_H
diff --git a/libcudacxx/include/cuda/__floating_point/conv_rank_order.h b/libcudacxx/include/cuda/__floating_point/conv_rank_order.h
new file mode 100644
index 00000000000..66418dd9b6b
--- /dev/null
+++ b/libcudacxx/include/cuda/__floating_point/conv_rank_order.h
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_CONV_RANK_ORDER_H +#define _CUDA___FLOATING_POINT_CONV_RANK_ORDER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +# include +# include +# include + +enum class __fp_conv_rank_order +{ + __unordered, + __greater, + __equal, + __less, +}; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp_conv_rank_order __fp_make_conv_rank_order() +{ + using _FromConfig = __fp_make_config_from_t<_From>; + using _ToConfig = __fp_make_config_from_t<_To>; + + if constexpr (!_CUDA_VSTD::is_same_v<_FromConfig, __fp_invalid_config> + && !_CUDA_VSTD::is_same_v<_ToConfig, __fp_invalid_config>) + { + if constexpr (_ToConfig::__is_signed == _FromConfig::__is_signed) + { + if constexpr (_ToConfig::__exp_nbits == _FromConfig::__exp_nbits + && _ToConfig::__mant_nbits == _FromConfig::__mant_nbits) + { +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // If the representations are the same, long double always has the higher subrank + if constexpr (_CUDA_VSTD::is_same_v<_ToConfig, __fp_long_double_config> + && !_CUDA_VSTD::is_same_v<_FromConfig, __fp_long_double_config>) + { + return __fp_conv_rank_order::__greater; + } + else if constexpr (!_CUDA_VSTD::is_same_v<_ToConfig, __fp_long_double_config> + && _CUDA_VSTD::is_same_v<_FromConfig, __fp_long_double_config>) + { + return __fp_conv_rank_order::__less; + } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + return __fp_conv_rank_order::__equal; + } + else if constexpr (_ToConfig::__exp_nbits >= _FromConfig::__exp_nbits + && _ToConfig::__mant_nbits >= _FromConfig::__mant_nbits) + { + return __fp_conv_rank_order::__greater; + } + 
else if constexpr (_ToConfig::__exp_nbits <= _FromConfig::__exp_nbits + && _ToConfig::__mant_nbits <= _FromConfig::__mant_nbits) + { + return __fp_conv_rank_order::__less; + } + } + } + + return __fp_conv_rank_order::__unordered; +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_CONV_RANK_ORDER_H diff --git a/libcudacxx/include/cuda/__floating_point/fp.h b/libcudacxx/include/cuda/__floating_point/fp.h new file mode 100644 index 00000000000..e872365290e --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/fp.h @@ -0,0 +1,530 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_FP_H +#define _CUDA___FLOATING_POINT_FP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +// Silence the warning about the use of long double in device code +_CCCL_NV_DIAG_SUPPRESS(20208) + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +// Signed FP types +using fp4_e2m1 = __fp<__fp4_e2m1_config>; +using fp6_e2m3 = __fp<__fp6_e2m3_config>; +using fp6_e3m2 = __fp<__fp6_e3m2_config>; +using fp8_e4m3 = __fp<__fp8_e4m3_config>; +using fp8_e5m2 = __fp<__fp8_e5m2_config>; +using fp16 = __fp<__fp16_config>; +using bf16 = __fp<__bf16_config>; +using fp32 = __fp<__fp32_config>; +using fp64 = __fp<__fp64_config>; +// # if !defined(_LIBCUDACXX_HAS_NO_INT128) +// using fp128 = __fp<__fp128_config>; +// # endif // !_LIBCUDACXX_HAS_NO_INT128 + +// Unsigned FP types +using fp8_ue4m3 = __fp<__fp8_ue4m3_config>; +using fp8_ue8m0 = __fp<__fp8_ue8m0_config>; + +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(__fp_is_floating_point_v<_Tp>) +__fp(_Tp) -> __fp<__fp_make_config_from_t<_Tp>>; + +template +__fp(__fp_from_native_t, _Tp) -> __fp<__fp_make_config_from_t<_Tp>>; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __fp_cast_is_implicit() +{ + constexpr auto __rank_order = ::cuda::__fp_make_conv_rank_order<_To, _From>(); + + return __rank_order == __fp_conv_rank_order::__equal || __rank_order == __fp_conv_rank_order::__greater; +} + +template +class __fp +{ +public: + template + friend class __fp; + friend _Config; + 
friend struct __fp_cast; + friend struct __fp_ops; + + using __config_type = _Config; + + static constexpr size_t __exp_nbits = __config_type::__exp_nbits; + static constexpr size_t __mant_nbits = __config_type::__mant_nbits; + static constexpr bool __is_signed = __config_type::__is_signed; + static constexpr size_t __nbits = __exp_nbits + __mant_nbits + __is_signed; + static constexpr bool __has_inf = __config_type::__has_inf; + static constexpr bool __has_nan = __config_type::__has_nan; + static constexpr bool __has_nans = __config_type::__has_nans; + static constexpr bool __has_denorm = __config_type::__has_denorm; + + using __storage_type = __fp_storage_t<__nbits>; + using __host_native_type = typename __config_type::__host_native_type; + using __device_native_type = typename __config_type::__device_native_type; + + static constexpr bool __has_host_native_type = !_CUDA_VSTD::is_same_v<__host_native_type, __fp_no_native_type_tag>; + static constexpr bool __has_device_native_type = + !_CUDA_VSTD::is_same_v<__device_native_type, __fp_no_native_type_tag>; + + static_assert(!__has_host_native_type || sizeof(__storage_type) == sizeof(__host_native_type)); + static_assert(!__has_device_native_type || sizeof(__storage_type) == sizeof(__device_native_type)); + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __min() + { + return __config_type::template __min<__fp>(); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __max() + { + return __config_type::template __max<__fp>(); + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __lowest() + { + __fp __ret{}; + if constexpr (__is_signed) + { + __ret = __max(); + __ret.__set_sign(true); + } + return __ret; + } + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __epsilon() + { + // TODO: implement epsilon + return __fp{}; + } + _CCCL_TEMPLATE(bool _HasInf = __has_inf) + _CCCL_REQUIRES(_HasInf) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr 
__fp __inf() + { + return __config_type::template __inf<__fp>(); + } + _CCCL_TEMPLATE(bool _HasNan = __has_nan) + _CCCL_REQUIRES(_HasNan) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __nan() + { + return __config_type::template __nan<__fp>(); + } + _CCCL_TEMPLATE(bool _HasNans = __has_nans) + _CCCL_REQUIRES(_HasNans) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __nans() + { + return __config_type::template __nans<__fp>(); + } + _CCCL_TEMPLATE(bool _HasDenorm = __has_denorm) + _CCCL_REQUIRES(_HasDenorm) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __denorm_min() + { + return __fp{__fp_from_storage, __storage_type{1}}; + } + + __fp() = default; + + __fp(const __fp&) = default; + +# if defined(_CCCL_NO_CONDITIONAL_EXPLICIT) + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND __fp_cast_is_implicit<__fp, _Tp>()) + _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND(!__fp_cast_is_implicit<__fp, _Tp>())) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} +# else // ^^^ _CCCL_NO_CONDITIONAL_EXPLICIT ^^^ / vvv !_CCCL_NO_CONDITIONAL_EXPLICIT vvv + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI explicit(!__fp_cast_is_implicit<__fp, _Tp>()) constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} +# endif // ^^^ !_CCCL_NO_CONDITIONAL_EXPLICIT ^^^ + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp(const _Tp& __v) noexcept + : __fp{__construct_from(__v)} + {} + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(_CUDA_VSTD::is_same_v<__fp_make_config_from_t<_Tp>, __config_type>) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(__fp_from_native_t, 
const _Tp& __v) noexcept + : __fp{__fp_from_storage, _CUDA_VSTD::bit_cast<__storage_type>(__v)} + {} + + template + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(__fp_from_storage_t, const _Tp& __v) noexcept + : __storage_{__v} + { + static_assert(_CUDA_VSTD::is_same_v<_Tp, __storage_type>); + } + +# if defined(_CCCL_NO_CONDITIONAL_EXPLICIT) + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND __fp_cast_is_implicit<_Tp, __fp>()) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND(!__fp_cast_is_implicit<_Tp, __fp>())) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } +# else // ^^^ _CCCL_NO_CONDITIONAL_EXPLICIT ^^^ / vvv !_CCCL_NO_CONDITIONAL_EXPLICIT vvv + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(__fp_is_floating_point_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI explicit(!__fp_cast_is_implicit<_Tp, __fp>()) constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } +# endif // ^^^ !_CCCL_NO_CONDITIONAL_EXPLICIT ^^^ + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(_CUDA_VSTD::is_integral_v<_Tp>) + _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _Tp() const noexcept + { + return __cast_to<_Tp>(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __has_native_type() noexcept + { + NV_IF_ELSE_TARGET(NV_IS_HOST, (return __has_host_native_type;), (return __has_device_native_type;)) + } + + // private: + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp __construct_from(const _Tp& __v) noexcept + { +# if defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert(!_CUDA_VSTD::is_same_v<_Tp, long double>, "long double is not supported"); +# endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // todo: improve the implementation + NV_IF_ELSE_TARGET(NV_IS_HOST, (return 
__construct_from_host<_Tp>(__v);), (return __construct_from_device<_Tp>(__v);)) + } +# if !_CCCL_COMPILER(NVRTC) + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_HOST static constexpr __fp __construct_from_host(const _Tp& __v) noexcept + { + if constexpr (__has_host_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return __fp{__fp_from_native, __v}; + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_host_native_type) + { + return __fp{__fp_from_native, + static_cast<__host_native_type>(_CUDA_VSTD::bit_cast(__v))}; + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return __fp{__fp_from_native, static_cast<__host_native_type>(__v)}; + } + } + return _FpCast::template __cast<__fp>(__v); + } +# endif // !_CCCL_COMPILER(NVRTC) + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static constexpr __fp __construct_from_device(const _Tp& __v) noexcept + { + if constexpr (__has_device_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return __fp{__fp_from_native, __v}; + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_device_native_type) + { + return __fp{ + __fp_from_native, + static_cast<__device_native_type>(_CUDA_VSTD::bit_cast(__v))}; + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return __fp{__fp_from_native, static_cast<__device_native_type>(__v)}; + } + } + return _FpCast::template __cast<__fp>(__v); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __cast_to() const noexcept + { + using _Up = _CUDA_VSTD::remove_cv_t<_Tp>; + +# if defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert(!_CUDA_VSTD::is_same_v<_Up, long 
double>, "long double is not supported"); +# endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // todo: improve the implementation + NV_IF_ELSE_TARGET(NV_IS_HOST, (return __cast_to_host<_Up>();), (return __cast_to_device<_Up>();)) + } + +# if !_CCCL_COMPILER(NVRTC) + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_HOST constexpr _Tp __cast_to_host() const noexcept + { + if constexpr (__has_host_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return _CUDA_VSTD::bit_cast<_Tp>(__storage_); + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_host_native_type) + { + return _CUDA_VSTD::bit_cast<_Tp>(static_cast(__host_native())); + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return static_cast<_Tp>(__host_native()); + } + } + return _FpCast::template __cast<_Tp>(*this); + } +# endif // !_CCCL_COMPILER(NVRTC) + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE constexpr _Tp __cast_to_device() const noexcept + { + if constexpr (__has_device_native_type) + { + using _TpConfig = __fp_make_config_from_t<_Tp>; + + if constexpr (_CUDA_VSTD::is_same_v<_TpConfig, __config_type>) + { + return _CUDA_VSTD::bit_cast<_Tp>(__storage_); + } + else if constexpr (!_CUDA_VSTD::is_same_v<_TpConfig, __fp_invalid_config>) + { + using _TpFp = __fp<_TpConfig>; + if constexpr (_TpFp::__has_device_native_type) + { + return _CUDA_VSTD::bit_cast<_Tp>(static_cast(__device_native())); + } + } + else if constexpr (_CUDA_VSTD::is_integral_v<_Tp>) + { + return static_cast<_Tp>(__device_native()); + } + } + return _FpCast::template __cast<_Tp>(*this); + } + +# if !_CCCL_COMPILER(NVRTC) + _CCCL_TEMPLATE(bool _HasNativeType = __has_host_native_type) + _CCCL_REQUIRES(_HasNativeType) + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_HOST constexpr __host_native_type __host_native() const noexcept + 
{ + return _CUDA_VSTD::bit_cast<__host_native_type>(__storage_); + } +# endif // !_CCCL_COMPILER(NVRTC) + + _CCCL_TEMPLATE(bool _HasNativeType = __has_device_native_type) + _CCCL_REQUIRES(_HasNativeType) + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE constexpr __device_native_type __device_native() const noexcept + { + return _CUDA_VSTD::bit_cast<__device_native_type>(__storage_); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __mask() noexcept + { + return __sign_mask() | __exp_mask() | __mant_mask(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr size_t __sign_shift() noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return (sizeof(__storage_type) * CHAR_BIT) - static_cast(__is_signed); +# else + // return CHAR_BIT - static_cast(__is_signed); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __sign_mask() noexcept + { + return (__is_signed) ? __storage_type(1) << __sign_shift() : 0; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __exp_val_mask() noexcept + { + return ((__storage_type(1) << __exp_nbits) - 1); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr size_t __exp_shift() noexcept + { + return __sign_shift() - __exp_nbits; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __exp_mask() noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>(__exp_val_mask() << __exp_shift()); +# else + // return __exp_val_mask() << (sizeof(__storage_type) * CHAR_BIT - (__exp_nbits + __is_signed)); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __mant_val_mask() noexcept + { + return static_cast<__storage_type>((__storage_type(1) << __mant_nbits) - 1); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr size_t __mant_shift() noexcept + { + return __exp_shift() - __mant_nbits; + } + + 
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __storage_type __mant_mask() noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>(__mant_val_mask() << __mant_shift()); +# else + // return __mant_val_mask() << (sizeof(__storage_type) * CHAR_BIT - (__mant_nbits + __exp_nbits + __is_signed)); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __get_sign() const noexcept + { + return static_cast(__storage_ & __sign_mask()); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __storage_type __get_exp() const noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>((__storage_ & __exp_mask()) >> __exp_shift()); +# else + // return (__storage_ & __exp_mask()) >> (sizeof(__storage_type) * CHAR_BIT - (__exp_nbits + __is_signed)); +# endif + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __storage_type __get_mant() const noexcept + { +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + return static_cast<__storage_type>(static_cast<__storage_type>(__storage_ & __mant_mask()) >> __mant_shift()); +# else + // return (__storage_ & __mant_mask()) >> (sizeof(__storage_type) * CHAR_BIT - (__mant_nbits + __exp_nbits + + // __is_signed)); +# endif + } + + _CCCL_TEMPLATE(bool _IsSigned = __is_signed) + _CCCL_REQUIRES(_IsSigned) + _LIBCUDACXX_HIDE_FROM_ABI constexpr void __set_sign(bool __sign) noexcept + { + __storage_ &= ~__sign_mask(); + __storage_ |= static_cast<__storage_type>(__sign) << __sign_shift(); + } + + _LIBCUDACXX_HIDE_FROM_ABI constexpr void __set_exp(__storage_type __exp) noexcept + { + __storage_ &= ~__exp_mask(); +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + __storage_ |= (__exp & __exp_val_mask()) << __exp_shift(); +# else + // __storage_ |= (__exp & ((__storage_type(1) << __exp_nbits) - 1)) << (sizeof(__storage_type) * CHAR_BIT - + // (__exp_nbits + __is_signed)); +# endif + } + + _LIBCUDACXX_HIDE_FROM_ABI constexpr void __set_mant(__storage_type __mant) 
noexcept + { + __storage_ &= ~__mant_mask(); +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + __storage_ |= (__mant & __mant_val_mask()) << __mant_shift(); +# else + // __storage_ |= (__mant & ((__storage_type(1) << __mant_nbits) - 1)) << (sizeof(__storage_type) * CHAR_BIT - + // (__mant_nbits + __exp_nbits + __is_signed)); +# endif + } + + _CCCL_TEMPLATE(bool _HasInf = __has_inf) + _CCCL_REQUIRES(_HasInf) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_inf() const noexcept + { + return __config_type::template __is_inf<__fp>(*this); + } + + _CCCL_TEMPLATE(bool _HasNan = __has_nan) + _CCCL_REQUIRES(_HasNan) + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_nan() const noexcept + { + return __config_type::template __is_nan<__fp>(*this); + } + + __storage_type __storage_; +}; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +_CCCL_NV_DIAG_DEFAULT(20208) + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_FP_H diff --git a/libcudacxx/include/cuda/__floating_point/literals.h b/libcudacxx/include/cuda/__floating_point/literals.h new file mode 100644 index 00000000000..19482995148 --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/literals.h @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_LITERALS_H +#define _CUDA___FLOATING_POINT_LITERALS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +// Silence the warning about the use of long double in device code +_CCCL_NV_DIAG_SUPPRESS(20208) + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +namespace fp_literals +{ + +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp4_e2m1 operator""_fp4_e2m1(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp4_e2m1{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp4_e2m1{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp6_e2m3 operator""_fp6_e2m3(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp6_e2m3{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp6_e2m3{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp6_e3m2 operator""_fp6_e3m2(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp6_e3m2{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp6_e3m2{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp16 operator""_fp16(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp16{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return 
fp16{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_bf16(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return bf16{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return bf16{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_fp32(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp32{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp32{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_fp64(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp64{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp64{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +// # if !defined(_LIBCUDACXX_HAS_NO_INT128) +// _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator""_fp128(long double __val) noexcept +// { +// NV_IF_ELSE_TARGET(NV_IS_HOST, (return fp128{__val};), (return fp128{static_cast(__val)};)) +// } +// # endif // !_LIBCUDACXX_HAS_NO_INT128 + +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp8_ue4m3 operator""_fp8_ue4m3(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp8_ue4m3{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv _LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp8_ue4m3{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} +_LIBCUDACXX_HIDE_FROM_ABI constexpr fp8_ue8m0 operator""_fp8_ue8m0(long double __val) noexcept +{ +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + return fp8_ue8m0{__val}; +# else // ^^^ !_LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ / vvv 
_LIBCUDACXX_HAS_NO_LONG_DOUBLE vvv + return fp8_ue8m0{static_cast(__val)}; +# endif // ^^^ _LIBCUDACXX_HAS_NO_LONG_DOUBLE ^^^ +} + +} // namespace fp_literals + +_LIBCUDACXX_END_NAMESPACE_CUDA + +_CCCL_NV_DIAG_DEFAULT(20208) + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_LITERALS_H diff --git a/libcudacxx/include/cuda/__floating_point/operators.h b/libcudacxx/include/cuda/__floating_point/operators.h new file mode 100644 index 00000000000..c423bb0787e --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/operators.h @@ -0,0 +1,1250 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_OPERATORS_H +#define _CUDA___FLOATING_POINT_OPERATORS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +# if _CCCL_HAS_INCLUDE() +# include +# endif + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto __fp_make_common_type() +{ + constexpr auto __rank_order = __fp_make_conv_rank_order<_Lhs, _Rhs>(); + + if constexpr (__rank_order == __fp_conv_rank_order::__equal) + { + // Extended floating point types have higher subrank, prefer cuda extended types over std extended types + // Fixme: potentially will not work correctly for long double + // auto val = 
1.0f64 + 1.0l; // val will be of type long double, is this right? + if constexpr (__is_standard_floating_point_v<_Lhs> || __is_std_extended_floating_point_v<_Lhs>) + { + return _Rhs{}; + } + else + { + return _Lhs{}; + } + } + else if constexpr (__rank_order == __fp_conv_rank_order::__greater) + { + return _Lhs{}; + } + else if constexpr (__rank_order == __fp_conv_rank_order::__less) + { + return _Rhs{}; + } + else + { + static_assert(__always_false(), "Cannot make a common fp type from the given types"); + _CCCL_UNREACHABLE(); + } +} + +template +using __fp_common_type_t = decltype(__fp_make_common_type<_Lhs, _Rhs>()); + +// Implementations of the arithmetic operations. Usually an operation is in several parts: +// 1. __op() - the entry point for the operation which tries to implement the operation using the host & device native +// types if available, otherwise calls the __op_impl() function. +// +// 2. __op_impl() - the non native implementation of the operation for the given type. This function dispatches to the +// target specific implementation and falls back to the constexpr implementation. +// +// 3. __op_impl_TARGET() - the target specific implementation. If available, implements the operation via asm. +// +// 4. __op_impl_constexpr() - the constexpr implementation of the operation. Slow. +// +// The arguments may be heterogenous. In that case the implementation is chosen in the following order: +// 1. try to use the host & device native types +// 2. try to use host & device native instructions for mixed arithmetic +// 3. 
cast the arguments to the common type and use the implementation for homogeneous arguments +struct __fp_ops +{ + /********************************************************************************************************************/ + // Negation + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static _Tp __neg_impl_constexpr(const _Tp& __src) noexcept + { + auto __ret{__src}; + __ret.__set_sign(!__ret.__get_sign()); + return __ret; + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static _Tp __neg_impl_device(const _Tp& __src) noexcept + { + [[maybe_unused]] _Tp __ret; + +# if __cccl_ptx_isa >= 600 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (asm("neg.f16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__src.__storage_)); return __ret;)) + } +# endif // __cccl_ptx_isa >= 600 +# if __cccl_ptx_isa >= 700 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_80, + (asm("neg.bf16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__src.__storage_)); return __ret;)) + } +# endif // ^^^ __cccl_ptx_isa < 700 ^^^ + + return __neg_impl_constexpr(__src); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __neg_impl(const _Tp& __src) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __neg_impl_device(__src);)) + } + + return __neg_impl_constexpr(__src); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __neg(const _Tp& __src) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Tp::__has_host_native_type) { return __fp{__fp_from_native, -__src.__host_native()}; } else { + return __neg_impl(__src); + }), + ( + if constexpr (_Tp::__has_device_native_type) { return __fp{__fp_from_native, 
-__src.__device_native()}; } else { + return __neg_impl(__src); + })) + } + + /********************************************************************************************************************/ + // Addition + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __add_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) + static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Addition is not supported for the given type"); + return {}; + } + + // if (__lhs.__is_nan() && __rhs.__is_nan()) + // { + // return _Tp::__nan(); + // } + + // if (__lhs.__is_inf() && __rhs.__is_inf()) + // { + // return (__lhs.__get_sign() == __rhs.__get_sign()) ? 
__lhs : -_Tp::__nan(); + // } + + // auto __lhs_sign = __lhs.__get_sign(); + // auto __lhs_exp = __lhs.__get_exp(); + // auto __lhs_mant = __lhs.__get_mant(); + // auto __rhs_sign = __rhs.__get_sign(); + // auto __rhs_exp = __rhs.__get_exp(); + // auto __rhs_mant = __rhs.__get_mant(); + + // if (__lhs_exp > __rhs_exp) + // { + // __rhs_mant >>= (__lhs_exp - __rhs_exp); + // __rhs_exp = __lhs_exp; + // } + // else if (__rhs_exp > __lhs_exp) + // { + // __lhs_mant >>= (__rhs_exp - __lhs_exp); + // __lhs_exp = __rhs_exp; + // } + + // using _Sp = _CUDA_VSTD::make_signed_t; + + // bool __res_sign{}; + // _Sp __res_exp = __lhs_exp; + // _Sp __res_mant{}; + + // if (__lhs_sign == __rhs_sign) + // { + // __res_mant = __lhs_mant + __rhs_mant; + // __res_sign = __lhs_sign; + // } + // else if (__lhs_mant >= __rhs_mant) + // { + // __res_mant = __lhs_mant - __rhs_mant; + // __res_sign = __lhs_sign; + // } + // else + // { + // __res_mant = __rhs_mant - __lhs_mant; + // __res_sign = __rhs_sign; + // } + + // while (__res_mant > static_cast<_Sp>(_Tp::__mant_val_mask())) + // { + // __res_mant >>= 1; + // __res_exp++; + // } + + // while (__res_mant < (_Sp{1} << (_Tp::__mant_nbits - 1)) && __res_mant != 0) + // { + // __res_mant <<= 1; + // __res_exp--; + // } + + // todo: implement denormalized numbers + + // _Tp __ret{}; + + // if (__res_exp > static_cast<_Sp>(_Tp::__exp_val_mask()) / 2) + // { + // __ret = _Tp::__inf(); + // __ret.__set_sign(__res_sign); + // } + // else if (__res_exp < -static_cast<_Sp>(_Tp::__exp_val_mask()) / 2 + 1) + // { + // __ret = _Tp{}; + // } + // else if (__res_mant == 0) + // { + // __ret.__set_sign(__res_sign); + // __ret.__set_exp(__res_exp); + // __ret.__set_mant(__res_mant); + // } + + // return __ret; + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> + __add_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + [[maybe_unused]] __fp_common_type_t<_Lhs, _Rhs> __ret; + + if 
constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("add.f32.f16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 860 +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("add.f32.bf16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 860 + return __add(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { +# if __cccl_ptx_isa >= 420 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_53, + (asm("add.f16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 420 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, bf16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (asm("add.bf16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 780 + return __add_impl_constexpr(__lhs, __rhs); + } + _CCCL_UNREACHABLE(); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __add_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __add_impl_device(__lhs, __rhs);)) + } + + return __add_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD 
_LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __add(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() + __rhs.__host_native()}; + } else { return __add_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() + __rhs.__device_native()}; + } else { return __add_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Subtraction + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __sub_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) - static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Subtraction is not supported for the given type"); + return {}; + } + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> + __sub_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + [[maybe_unused]] __fp_common_type_t<_Lhs, _Rhs> __ret; + + if constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("sub.f32.f16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // 
__cccl_ptx_isa >= 860 +# if __cccl_ptx_isa >= 860 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp32)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_100, + (asm("sub.f32.bf16 %0, %1, %2;" : "=r"(__ret.__storage_) : "h"(__lhs.__storage_), "r"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 860 + return __sub(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { +# if __cccl_ptx_isa >= 420 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_53, + (asm("sub.f16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 420 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, bf16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (asm("sub.bf16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 780 + return __sub_impl_constexpr(__lhs, __rhs); + } + _CCCL_UNREACHABLE(); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __sub_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __sub_impl_device(__lhs, __rhs);)) + } + + return __sub_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __sub(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() - __rhs.__host_native()}; + } else { return __sub_impl(__lhs, __rhs); }), + 
( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() - __rhs.__device_native()}; + } else { return __sub_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Multiplication + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __mul_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) * static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Multiplication is not supported for the given type"); + return {}; + } + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> + __mul_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + [[maybe_unused]] __fp_common_type_t<_Lhs, _Rhs> __ret; + + if constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { + return __mul(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { +# if __cccl_ptx_isa >= 420 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, fp16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, fp16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_53, + (asm("mul.f16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 420 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Lhs, bf16) && _CUDA_VSTD::_CCCL_TRAIT(is_same, _Rhs, bf16)) + { + NV_IF_TARGET( + NV_PROVIDES_SM_90, + (asm("mul.bf16 %0, %1, %2;" : "=h"(__ret.__storage_) : "h"(__lhs.__storage_), 
"h"(__rhs.__storage_)); + return __ret;)) + } +# endif // __cccl_ptx_isa >= 780 + return __mul_impl_constexpr(__lhs, __rhs); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __mul_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __mul_impl_device(__lhs, __rhs);)) + } + + return __mul_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __mul(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() * __rhs.__host_native()}; + } else { return __mul_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() * __rhs.__device_native()}; + } else { return __mul_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Division + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static __fp_common_type_t<_Lhs, _Rhs> + __div_impl_constexpr(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>) + || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>) ) + { + return _Lhs{static_cast(__lhs) / static_cast(__rhs)}; + } + else + { + _CCCL_ASSERT(false, "Division is not supported for the given type"); + return {}; + } + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static __fp_common_type_t<_Lhs, _Rhs> 
+ __div_impl_device(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (!_CUDA_VSTD::is_same_v<_Lhs, _Rhs>) + { + return __div(__fp_common_type_t<_Lhs, _Rhs>{__lhs}, __fp_common_type_t<_Lhs, _Rhs>{__rhs}); + } + else + { + return __div_impl_constexpr(__lhs, __rhs); + } + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __div_impl(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __div_impl_device(__lhs, __rhs);)) + } + + return __div_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr __fp_common_type_t<_Lhs, _Rhs> + __div(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __fp{__fp_from_native, __lhs.__host_native() / __rhs.__host_native()}; + } else { return __div_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __fp{__fp_from_native, __lhs.__device_native() / __rhs.__device_native()}; + } else { return __div_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Equality + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __eq_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) == (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __eq_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if 
__cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.eq.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.eq.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __eq_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __eq_impl(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + using _CommonFp = __fp_common_type_t<_Tp, _Tp>; + + const auto __clhs = static_cast<_CommonFp>(__lhs); + const auto __crhs = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __eq_impl_device(__clhs, __crhs);)) + } + + return __eq_impl_constexpr(__clhs, __crhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __eq(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, + ( + if constexpr (_Lhs::__has_host_native_type && _Rhs::__has_host_native_type) { + return __lhs.__host_native() == __rhs.__host_native(); + } else { return __eq_impl(__lhs, __rhs); }), + ( + if constexpr (_Lhs::__has_device_native_type && _Rhs::__has_device_native_type) { + return __lhs.__device_native() == __rhs.__device_native(); + } else { return __eq_impl(__lhs, __rhs); })) + } + + /********************************************************************************************************************/ + // Inequality + 
/********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __neq_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + return !__eq_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __neq_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ne.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ne.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __eq_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __neq(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() != __rhs.__host_native();), + (return __lhs.__device_native() != __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __neq_impl_device(__l, __r);)) + } + + return __neq_impl_constexpr(__l, __r); + } + } + + 
/********************************************************************************************************************/ + // Less than + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __lt_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) < (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __lt_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.lt.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.lt.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __lt_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __lt(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() < __rhs.__host_native();), + (return __lhs.__device_native() < __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if 
(!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __lt_impl_device(__l, __r);)) + } + + return __lt_impl_constexpr(__l, __r); + } + } + + /********************************************************************************************************************/ + // Less than or equal + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __le_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) <= (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __le_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.le.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.le.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __le_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __le(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() <= __rhs.__host_native();), + (return __lhs.__device_native() <= __rhs.__device_native();)) + } + else + { + using _CommonFp 
= __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __le_impl_device(__l, __r);)) + } + + return __le_impl_constexpr(__l, __r); + } + } + + /********************************************************************************************************************/ + // Greater than + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __gt_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) > (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __gt_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.gt.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.gt.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __gt_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __gt(const _Lhs& __lhs, const _Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return 
__lhs.__host_native() > __rhs.__host_native();), + (return __lhs.__device_native() > __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __gt_impl_device(__l, __r);)) + } + + return __gt_impl_constexpr(__l, __r); + } + } + + /********************************************************************************************************************/ + // Greater than or equal + /********************************************************************************************************************/ + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr static bool + __ge_impl_constexpr(const _Tp& __lhs, const _Tp& __rhs) noexcept + { + if (__lhs.__is_nan() || __rhs.__is_nan()) + { + return false; + } + + return (__lhs.__storage_ & __lhs.__mask()) >= (__rhs.__storage_ & __rhs.__mask()); + } + + template + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE static bool + __ge_impl_device(const _Tp& __lhs, const _Tp& __rhs) noexcept + { +# if __cccl_ptx_isa >= 650 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_53, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ge.u16.f16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 650 +# if __cccl_ptx_isa >= 780 + if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, bf16)) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint16_t __ret; + asm("set.ge.u16.bf16 %0, %1, %2;" : "=h"(__ret) : "h"(__lhs.__storage_), "h"(__rhs.__storage_)); + return static_cast(__ret);)) + } +# endif // __cccl_ptx_isa >= 780 + + return __ge_impl_constexpr(__lhs, __rhs); + } + + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __ge(const _Lhs& __lhs, const 
_Rhs& __rhs) noexcept + { + if constexpr (_Lhs::__has_native_type() && _Rhs::__has_native_type()) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return __lhs.__host_native() >= __rhs.__host_native();), + (return __lhs.__device_native() >= __rhs.__device_native();)) + } + else + { + using _CommonFp = __fp_common_type_t<_Lhs, _Rhs>; + + const auto __l = static_cast<_CommonFp>(__lhs); + const auto __r = static_cast<_CommonFp>(__rhs); + + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __ge_impl_device(__l, __r);)) + } + + return __ge_impl_constexpr(__l, __r); + } + } +}; /* NOTE(review): the constexpr comparison fallbacks above order values by raw masked bit pattern; for sign-magnitude encodings this reverses the order of negative values -- confirm against the PTX set.* paths. */ + +/**********************************************************************************************************************/ +// Unary operators +/**********************************************************************************************************************/ +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_Config> operator+(const __fp<_Config>& __src) noexcept +{ + return __src; +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_Config> operator-(const __fp<_Config>& __src) noexcept +{ + static_assert(_Config::__is_signed, "Unary minus is not allowed for unsigned floating point types"); + return __fp_ops::__neg(__src); +} + +/**********************************************************************************************************************/ +// Binary operators +/**********************************************************************************************************************/ +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator+(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__add(__lhs, __rhs)); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator-(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return 
static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__sub(__lhs, __rhs)); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator*(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__mul(__lhs, __rhs)); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +operator/(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(__fp_ops::__div(__lhs, __rhs)); +} /* The compound assignments below statically require a non-narrowing round-trip: the common type must be implicitly convertible back to the left operand's type. */ +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator+=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank is not allowed"); + return __lhs = __lhs + __rhs; +} +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator-=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank is not allowed"); + return __lhs = __lhs - __rhs; +} +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator*=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank is not allowed"); + return __lhs = __lhs * __rhs; +} +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __fp<_LhsConfig>& +operator/=(__fp<_LhsConfig>& __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, __fp<_RhsConfig>>>(), + "Implicit narrow conversion from higher to lower rank 
is not allowed"); + return __lhs = __lhs / __rhs; +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator==(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__eq(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator!=(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__neq(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator<(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__lt(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator<=(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__le(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator>(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__gt(__lhs, __rhs); +} +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool +operator>=(__fp<_LhsConfig> __lhs, __fp<_RhsConfig> __rhs) noexcept +{ + return __fp_ops::__ge(__lhs, __rhs); +} + /* Stamps out the full mixed __fp/_TYPE operator set; _EXSPACE is the execution-space qualifier applied to each operator. */ +# define _LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(_TYPE, _EXSPACE) \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator+(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__add(__lhs, __fp{__rhs})); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator+(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__add(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator-(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__sub(__lhs, __fp{__rhs})); \ + } \ + template \ + 
_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator-(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__sub(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator*(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__mul(__lhs, __fp{__rhs})); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator*(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__mul(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator/(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(__fp_ops::__div(__lhs, __fp{__rhs})); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr auto operator/(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return static_cast<__fp_common_type_t<_TYPE, __fp<_RhsConfig>>>(__fp_ops::__div(__fp{__lhs}, __rhs)); \ + } \ + template \ + _CCCL_HIDE_FROM_ABI _EXSPACE constexpr __fp<_LhsConfig>& operator+=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs + __rhs; \ + } \ + template \ + _CCCL_HIDE_FROM_ABI _EXSPACE constexpr __fp<_LhsConfig>& operator-=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs - __rhs; \ + } \ + template \ + _CCCL_HIDE_FROM_ABI 
_EXSPACE constexpr __fp<_LhsConfig>& operator*=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs * __rhs; \ + } \ + template \ + _CCCL_HIDE_FROM_ABI _EXSPACE constexpr __fp<_LhsConfig>& operator/=(__fp<_LhsConfig>& __lhs, _TYPE __rhs) noexcept \ + { \ + static_assert(__fp_cast_is_implicit<__fp<_LhsConfig>, __fp_common_type_t<__fp<_LhsConfig>, _TYPE>>(), \ + "Implicit narrow conversion from higher to lower rank is not allowed"); \ + return __lhs = __lhs / __rhs; \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator==( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__eq(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator==( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__eq(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator!=( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__neq(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator!=( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__neq(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__lt(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__lt(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<=( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept 
\ + { \ + return __fp_ops::__le(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator<=( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__le(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>(__fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__gt(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>(_TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__gt(__fp{__lhs}, __rhs); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>=( \ + __fp<_LhsConfig> __lhs, _TYPE __rhs) noexcept \ + { \ + return __fp_ops::__ge(__lhs, __fp{__rhs}); \ + } \ + template \ + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _EXSPACE constexpr bool operator>=( \ + _TYPE __lhs, __fp<_RhsConfig> __rhs) noexcept \ + { \ + return __fp_ops::__ge(__fp{__lhs}, __rhs); \ + } + +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(float, _CCCL_HOST_DEVICE) +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(double, _CCCL_HOST_DEVICE) +# if !_CCCL_COMPILER(NVRTC) && (!_CCCL_HAS_CUDA_COMPILER || _CCCL_CUDA_COMPILER(NVHPC)) +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(long double, _CCCL_HOST) +# endif // !_CCCL_COMPILER(NVRTC) && (!_CCCL_HAS_CUDA_COMPILER || _CCCL_CUDA_COMPILER(NVHPC)) + +# if __STDCPP_FLOAT16_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float16_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT16_T__ == 1 +# if __STDCPP_FLOAT32_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float32_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT32_T__ == 1 +# if __STDCPP_FLOAT64_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float64_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT64_T__ == 1 +# if __STDCPP_FLOAT128_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::float128_t, 
_CCCL_HOST_DEVICE) +# endif // __STDCPP_FLOAT128_T__ == 1 +# if __STDCPP_BFLOAT16_T__ == 1 +_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(::std::bfloat16_t, _CCCL_HOST_DEVICE) +# endif // __STDCPP_BFLOAT16_T__ == 1 + +# undef _LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_OPERATORS_H diff --git a/libcudacxx/include/cuda/__floating_point/storage.h b/libcudacxx/include/cuda/__floating_point/storage.h new file mode 100644 index 00000000000..3804a99b38f --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/storage.h @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_STORAGE_H +#define _CUDA___FLOATING_POINT_STORAGE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr auto __fp_make_storage_type() +{ + if constexpr (_NBits <= CHAR_BIT) + { + return _CUDA_VSTD::uint8_t{}; + } + else if constexpr (_NBits <= 2 * CHAR_BIT) + { + return _CUDA_VSTD::uint16_t{}; + } + else if constexpr (_NBits <= 4 * CHAR_BIT) + { + return _CUDA_VSTD::uint32_t{}; + } + else if constexpr (_NBits <= 8 * CHAR_BIT) + { + return _CUDA_VSTD::uint64_t{}; + } +# if !defined(_LIBCUDACXX_HAS_NO_INT128) + else if constexpr (_NBits <= 16 * CHAR_BIT) + { + return ::__uint128_t{}; + } +# endif // !_LIBCUDACXX_HAS_NO_INT128 + else + { + static_assert(__always_false<_NBits>(), "Unsupported number of bits for floating point type"); + } +} + +template +using __fp_storage_t = decltype(__fp_make_storage_type<_NBits>()); + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_STORAGE_H diff --git a/libcudacxx/include/cuda/__floating_point/type_traits.h b/libcudacxx/include/cuda/__floating_point/type_traits.h new file mode 100644 index 00000000000..839a821a7ea --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point/type_traits.h @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FLOATING_POINT_TYPE_TRAITS_H +#define _CUDA___FLOATING_POINT_TYPE_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include + +# if _CCCL_HAS_INCLUDE() +# include +# endif + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __always_false() +{ + return false; +} + +template +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = false; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = true; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = true; + +template <> +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = true; + +template +_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_v = + __is_standard_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>; + +template +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v = false; + +# if __STDCPP_FLOAT16_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float16_t> = true; +# endif // __STDCPP_FLOAT16_T__ == 1 + +# if __STDCPP_BFLOAT16_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::bfloat16_t> = true; +# endif // __STDCPP_BFLOAT16_T__ == 1 + +# if __STDCPP_FLOAT32_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float32_t> = true; +# endif // __STDCPP_FLOAT32_T__ == 1 
+ +# if __STDCPP_FLOAT64_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float64_t> = true; +# endif // __STDCPP_FLOAT64_T__ == 1 + +# if __STDCPP_FLOAT128_T__ == 1 +template <> +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float128_t> = true; +# endif // __STDCPP_FLOAT128_T__ == 1 + +template +_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_v = + __is_std_extended_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>; + +template +_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_impl_v = false; + +template +_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_impl_v<__fp<_Config>> = true; + +template +_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_v = + __is_cuda_extended_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>; + +template +_CCCL_INLINE_VAR constexpr bool __fp_is_floating_point_v = + __is_standard_floating_point_v<_Tp> || __is_std_extended_floating_point_v<_Tp> + || __is_cuda_extended_floating_point_v<_Tp>; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 + +#endif // _CUDA___FLOATING_POINT_TYPE_TRAITS_H diff --git a/libcudacxx/include/cuda/__floating_point_ b/libcudacxx/include/cuda/__floating_point_ new file mode 100644 index 00000000000..0008560f24c --- /dev/null +++ b/libcudacxx/include/cuda/__floating_point_ @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_FLOATING_POINT_ +#define _CUDA_FLOATING_POINT_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // _CUDA_FLOATING_POINT_ diff --git a/libcudacxx/include/cuda/__fwd/fp.h b/libcudacxx/include/cuda/__fwd/fp.h new file mode 100644 index 00000000000..dd5137f0552 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/fp.h @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_FP_H +#define _CUDA___FWD_FP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +class __fp; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___FWD_FP_H diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h index 3642d17de09..fe6c42f77a3 100644 --- a/libcudacxx/include/cuda/std/__bit/countl.h +++ b/libcudacxx/include/cuda/std/__bit/countl.h @@ -23,10 +23,10 @@ #include #include +#include #include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h index e7a2b609abe..9284b53478c 100644 --- a/libcudacxx/include/cuda/std/__bit/countr.h +++ b/libcudacxx/include/cuda/std/__bit/countr.h @@ -22,10 +22,10 @@ #endif // no system header #include +#include #include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index f0186ad9f5f..7da53f440b2 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -22,10 +22,10 @@ #endif // no system header #include +#include #include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/popcount.h b/libcudacxx/include/cuda/std/__bit/popcount.h index 18c8d97dd30..d0d8425165f 100644 --- a/libcudacxx/include/cuda/std/__bit/popcount.h +++ b/libcudacxx/include/cuda/std/__bit/popcount.h @@ -22,10 +22,10 @@ #endif // no system header #include +#include #include #include #include -#include 
_LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/rotate.h b/libcudacxx/include/cuda/std/__bit/rotate.h index bf2c2e5f61a..3ba6391d462 100644 --- a/libcudacxx/include/cuda/std/__bit/rotate.h +++ b/libcudacxx/include/cuda/std/__bit/rotate.h @@ -21,9 +21,9 @@ # pragma system_header #endif // no system header +#include #include #include -#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 5791c41a18f..e8251b8c78e 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -129,6 +129,10 @@ # define _CCCL_NO_VARIABLE_TEMPLATES #endif // _CCCL_STD_VER <= 2011 || __cpp_variable_templates < 201304L +#if !(__cpp_conditional_explicit >= 201806L) +# define _CCCL_NO_CONDITIONAL_EXPLICIT +#endif // !(__cpp_conditional_explicit >= 201806L) + /////////////////////////////////////////////////////////////////////////////// // Conditionally use certain language features depending on availability /////////////////////////////////////////////////////////////////////////////// diff --git a/libcudacxx/include/cuda/std/__limits/extended_floating_point.h b/libcudacxx/include/cuda/std/__limits/extended_floating_point.h new file mode 100644 index 00000000000..de694db68c7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__limits/extended_floating_point.h @@ -0,0 +1,125 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___LIMITS_EXTENDED_FLOATING_POINT_H +#define _LIBCUDACXX___LIMITS_EXTENDED_FLOATING_POINT_H + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class __numeric_limits_impl<::cuda::__fp<_FpConfig>, __numeric_limits_type::__floating_point> +{ +public: + using type = ::cuda::__fp<_FpConfig>; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = type::__is_signed; + static constexpr int digits = type::__mant_nbits; + static constexpr int digits10 = 0; // todo + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type::__min(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type::__max(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type::__lowest(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type::__epsilon(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type{0.5}; + } + + static constexpr int min_exponent = -(1 << (type::__exp_nbits - 1)) + 3; + static constexpr int min_exponent10 = 0; // todo + static constexpr int max_exponent = 1 << (type::__exp_nbits - 1); + static constexpr int max_exponent10 = 0; // todo + + static constexpr bool has_infinity = type::__has_inf; + static constexpr bool has_quiet_NaN = type::__has_nan; + static constexpr bool has_signaling_NaN = 
type::__has_nans; + static constexpr float_denorm_style has_denorm = (type::__has_denorm) ? denorm_present : denorm_absent; + static constexpr bool has_denorm_loss = false; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + if constexpr (has_infinity) + { + return type::__inf(); + } + else + { + return type{}; + } + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + if constexpr (has_quiet_NaN) + { + return type::__nan(); + } + else + { + return type{}; + } + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + if constexpr (has_signaling_NaN) + { + return type::__nans(); + } + else + { + return type{}; + } + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type::__denorm_min(); + } + + static constexpr bool is_iec559 = _FpConfig::__is_iec559; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___LIMITS_EXTENDED_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/std/__limits/limits.h b/libcudacxx/include/cuda/std/__limits/limits.h new file mode 100644 index 00000000000..587d1b8302e --- /dev/null +++ b/libcudacxx/include/cuda/std/__limits/limits.h @@ -0,0 +1,761 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___LIMITS_LIMITS_H +#define _LIBCUDACXX___LIMITS_LIMITS_H + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +enum float_round_style +{ + round_indeterminate = -1, + round_toward_zero = 0, + round_to_nearest = 1, + round_toward_infinity = 2, + round_toward_neg_infinity = 3 +}; + +enum float_denorm_style +{ + denorm_indeterminate = -1, + denorm_absent = 0, + denorm_present = 1 +}; + +enum class __numeric_limits_type +{ + __integral, + __bool, + __floating_point, + __other, +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __numeric_limits_type __make_numeric_limits_type() +{ +#if !defined(_CCCL_NO_IF_CONSTEXPR) + _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_same, _Tp, bool)) + { + return __numeric_limits_type::__bool; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_integral, _Tp)) + { + return __numeric_limits_type::__integral; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp)) + { + return __numeric_limits_type::__floating_point; + } + else + { + return __numeric_limits_type::__other; + } +#else // ^^^ !_CCCL_NO_IF_CONSTEXPR ^^^ // vvv _CCCL_NO_IF_CONSTEXPR vvv + return _CCCL_TRAIT(is_same, _Tp, bool) + ? __numeric_limits_type::__bool + : (_CCCL_TRAIT(is_integral, _Tp) + ? __numeric_limits_type::__integral + : (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp) + ? 
__numeric_limits_type::__floating_point + : __numeric_limits_type::__other)); +#endif // _CCCL_NO_IF_CONSTEXPR +} + +template ()> +class __numeric_limits_impl +{ +public: + using type = _Tp; + + static constexpr bool is_specialized = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(); + } + + static constexpr int digits = 0; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 0; + static constexpr bool is_signed = false; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = 0; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(); + } + + static constexpr int min_exponent = 0; + static constexpr int min_exponent10 = 0; + static constexpr int max_exponent = 0; + static constexpr int max_exponent10 = 0; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = false; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool 
tinyness_before = false; + static constexpr float_round_style round_style = round_toward_zero; +}; + +// MSVC warns about overflowing left shift +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_MSVC(4309) +template +struct __int_min +{ + static constexpr _Tp value = static_cast<_Tp>(_Tp(1) << __digits); +}; +_CCCL_DIAG_POP + +template +struct __int_min<_Tp, __digits, false> +{ + static constexpr _Tp value = _Tp(0); +}; + +template +class __numeric_limits_impl<_Tp, __numeric_limits_type::__integral> +{ +public: + using type = _Tp; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = type(-1) < type(0); + static constexpr int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); + static constexpr int digits10 = digits * 3 / 10; + static constexpr int max_digits10 = 0; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __int_min::value; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return is_signed ? type(type(~0) ^ min()) : type(~0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return min(); + } + + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr int radix = 2; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(0); + } + + static constexpr int min_exponent = 0; + static constexpr int min_exponent10 = 0; + static constexpr int max_exponent = 0; + static constexpr int max_exponent10 = 0; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI 
static constexpr type quiet_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(0); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = !is_signed; + +#if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || defined(__wasm__) + static constexpr bool traps = true; +#else + static constexpr bool traps = false; +#endif + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_toward_zero; +}; + +template <> +class __numeric_limits_impl +{ +public: + using type = bool; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = false; + static constexpr int digits = 1; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 0; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return false; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return true; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return min(); + } + + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr int radix = 2; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(0); + } + + static constexpr int min_exponent = 0; + static constexpr int min_exponent10 = 0; + static constexpr int max_exponent = 0; + static constexpr int max_exponent10 = 0; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool 
has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(0); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(0); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_toward_zero; +}; + +template <> +class __numeric_limits_impl +{ +public: + using type = float; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = __FLT_MANT_DIG__; + static constexpr int digits10 = __FLT_DIG__; + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __FLT_MIN__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return __FLT_MAX__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return -max(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return __FLT_EPSILON__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return 0.5F; + } + + static constexpr int min_exponent = __FLT_MIN_EXP__; + static constexpr int min_exponent10 = __FLT_MIN_10_EXP__; + static constexpr int max_exponent = __FLT_MAX_EXP__; + static constexpr int max_exponent10 = __FLT_MAX_10_EXP__; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + 
static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + +#if defined(_CCCL_BUILTIN_HUGE_VALF) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return _CCCL_BUILTIN_HUGE_VALF(); + } +#else // ^^^ _CCCL_BUILTIN_HUGE_VALF ^^^ // vvv !_CCCL_BUILTIN_HUGE_VALF vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept + { + return _CUDA_VSTD::bit_cast(0x7f800000); + } +#endif // !_CCCL_BUILTIN_HUGE_VALF +#if defined(_CCCL_BUILTIN_NANF) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return _CCCL_BUILTIN_NANF(""); + } +#else // ^^^ _CCCL_BUILTIN_NANF ^^^ // vvv !_CCCL_BUILTIN_NANF vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7fc00000); + } +#endif // !_CCCL_BUILTIN_NANF +#if defined(_CCCL_BUILTIN_NANSF) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return _CCCL_BUILTIN_NANSF(""); + } +#else // ^^^ _CCCL_BUILTIN_NANSF ^^^ // vvv !_CCCL_BUILTIN_NANSF vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7fa00000); + } +#endif // !_CCCL_BUILTIN_NANSF + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return __FLT_DENORM_MIN__; + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +template <> +class __numeric_limits_impl +{ +public: + using type = double; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = __DBL_MANT_DIG__; + static 
constexpr int digits10 = __DBL_DIG__; + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __DBL_MIN__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return __DBL_MAX__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return -max(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return __DBL_EPSILON__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return 0.5; + } + + static constexpr int min_exponent = __DBL_MIN_EXP__; + static constexpr int min_exponent10 = __DBL_MIN_10_EXP__; + static constexpr int max_exponent = __DBL_MAX_EXP__; + static constexpr int max_exponent10 = __DBL_MAX_10_EXP__; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + +#if defined(_CCCL_BUILTIN_HUGE_VAL) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return _CCCL_BUILTIN_HUGE_VAL(); + } +#else // ^^^ _CCCL_BUILTIN_HUGE_VAL ^^^ // vvv !_CCCL_BUILTIN_HUGE_VAL vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept + { + return _CUDA_VSTD::bit_cast(0x7ff0000000000000); + } +#endif // !_CCCL_BUILTIN_HUGE_VAL +#if defined(_CCCL_BUILTIN_NAN) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return _CCCL_BUILTIN_NAN(""); + } +#else // ^^^ _CCCL_BUILTIN_NAN ^^^ // vvv !_CCCL_BUILTIN_NAN vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7ff8000000000000); + } +#endif // 
!_CCCL_BUILTIN_NAN +#if defined(_CCCL_BUILTIN_NANS) + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return _CCCL_BUILTIN_NANS(""); + } +#else // ^^^ _CCCL_BUILTIN_NANS ^^^ // vvv !_CCCL_BUILTIN_NANS vvv + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept + { + return _CUDA_VSTD::bit_cast(0x7ff4000000000000); + } +#endif // !_CCCL_BUILTIN_NANS + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return __DBL_DENORM_MIN__; + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +template <> +class __numeric_limits_impl +{ +#ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE + +public: + using type = long double; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = __LDBL_MANT_DIG__; + static constexpr int digits10 = __LDBL_DIG__; + static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return __LDBL_MIN__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return __LDBL_MAX__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return -max(); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return __LDBL_EPSILON__; + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return 0.5L; + } + + static constexpr int min_exponent = __LDBL_MIN_EXP__; + static constexpr int min_exponent10 = __LDBL_MIN_10_EXP__; + static constexpr int max_exponent = 
__LDBL_MAX_EXP__; + static constexpr int max_exponent10 = __LDBL_MAX_10_EXP__; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return _CCCL_BUILTIN_HUGE_VALL(); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return _CCCL_BUILTIN_NANL(""); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return _CCCL_BUILTIN_NANSL(""); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return __LDBL_DENORM_MIN__; + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +}; + +#if defined(_LIBCUDACXX_HAS_NVFP16) +template <> +class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> +{ +public: + using type = __half; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__half_raw{0x0400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(__half_raw{0x7bffu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__half_raw{0xfbffu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static 
constexpr type epsilon() noexcept + { + return type(__half_raw{0x1400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__half_raw{0x3800u}); + } + + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__half_raw{0x7c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__half_raw{0x7e00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__half_raw{0x7d00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__half_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +template <> +class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> +{ +public: + using type = __nv_bfloat16; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__nv_bfloat16_raw{0x0080u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return 
type(__nv_bfloat16_raw{0x7f7fu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__nv_bfloat16_raw{0xff7fu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(__nv_bfloat16_raw{0x3c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__nv_bfloat16_raw{0x3f00u}); + } + + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__nv_bfloat16_raw{0x7f80u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fc0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fa0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__nv_bfloat16_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVBF16 + +template +class numeric_limits : public __numeric_limits_impl<_Tp> +{}; + +template +class numeric_limits : public numeric_limits<_Tp> +{}; + +template +class numeric_limits : public numeric_limits<_Tp> 
+{}; + +template +class numeric_limits : public numeric_limits<_Tp> +{}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___LIMITS_LIMITS_H diff --git a/libcudacxx/include/cuda/std/__stdfloat_ b/libcudacxx/include/cuda/std/__stdfloat_ new file mode 100644 index 00000000000..93b4c64ee8d --- /dev/null +++ b/libcudacxx/include/cuda/std/__stdfloat_ @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD_STDFLOAT +#define _CUDA_STD_STDFLOAT + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# if _CCCL_HAS_INCLUDE() +# include +# endif // _CCCL_HAS_INCLUDE() +# include + +_CCCL_PUSH_MACROS + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +# if LIBCUDACXX_FLOAT16_T == 1 +using float16_t = ::cuda::fp16; +# endif // LIBCUDACXX_FLOAT16_T == 1 +# if LIBCUDACXX_FLOAT32_T == 1 +using float32_t = ::cuda::fp32; +# endif // LIBCUDACXX_FLOAT32_T == 1 +# if LIBCUDACXX_FLOAT64_T == 1 +using float64_t = ::cuda::fp64; +# endif // LIBCUDACXX_FLOAT64_T == 1 +// # if LIBCUDACXX_FLOAT128_T == 1 +// using float128_t = ::cuda::fp128; +// # endif // LIBCUDACXX_FLOAT128_T == 1 +# if LIBCUDACXX_BFLOAT16_T == 1 +using bfloat16_t = ::cuda::bf16; +# endif // LIBCUDACXX_BFLOAT16_T == 1 + +_LIBCUDACXX_END_NAMESPACE_STD + +_CCCL_POP_MACROS + +#endif // _CCCL_STD_VER >= 2017 + 
+#endif // _CUDA_STD_STDFLOAT diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index ad529f2082b..4825c346c08 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -21,745 +21,10 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include - _CCCL_PUSH_MACROS -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -enum float_round_style -{ - round_indeterminate = -1, - round_toward_zero = 0, - round_to_nearest = 1, - round_toward_infinity = 2, - round_toward_neg_infinity = 3 -}; - -enum float_denorm_style -{ - denorm_indeterminate = -1, - denorm_absent = 0, - denorm_present = 1 -}; - -enum class __numeric_limits_type -{ - __integral, - __bool, - __floating_point, - __other, -}; - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr __numeric_limits_type __make_numeric_limits_type() -{ -#if !defined(_CCCL_NO_IF_CONSTEXPR) - _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_same, _Tp, bool)) - { - return __numeric_limits_type::__bool; - } - else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_integral, _Tp)) - { - return __numeric_limits_type::__integral; - } - else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp)) - { - return __numeric_limits_type::__floating_point; - } - else - { - return __numeric_limits_type::__other; - } -#else // ^^^ !_CCCL_NO_IF_CONSTEXPR ^^^ // vvv _CCCL_NO_IF_CONSTEXPR vvv - return _CCCL_TRAIT(is_same, _Tp, bool) - ? __numeric_limits_type::__bool - : (_CCCL_TRAIT(is_integral, _Tp) - ? __numeric_limits_type::__integral - : (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp) - ? 
__numeric_limits_type::__floating_point - : __numeric_limits_type::__other)); -#endif // _CCCL_NO_IF_CONSTEXPR -} - -template ()> -class __numeric_limits_impl -{ -public: - using type = _Tp; - - static constexpr bool is_specialized = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return type(); - } - - static constexpr int digits = 0; - static constexpr int digits10 = 0; - static constexpr int max_digits10 = 0; - static constexpr bool is_signed = false; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = 0; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(); - } - - static constexpr int min_exponent = 0; - static constexpr int min_exponent10 = 0; - static constexpr int max_exponent = 0; - static constexpr int max_exponent10 = 0; - - static constexpr bool has_infinity = false; - static constexpr bool has_quiet_NaN = false; - static constexpr bool has_signaling_NaN = false; - static constexpr float_denorm_style has_denorm = denorm_absent; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(); - } - - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = false; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool 
tinyness_before = false; - static constexpr float_round_style round_style = round_toward_zero; -}; - -// MSVC warns about overflowing left shift -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_MSVC(4309) -template -struct __int_min -{ - static constexpr _Tp value = static_cast<_Tp>(_Tp(1) << __digits); -}; -_CCCL_DIAG_POP - -template -struct __int_min<_Tp, __digits, false> -{ - static constexpr _Tp value = _Tp(0); -}; - -template -class __numeric_limits_impl<_Tp, __numeric_limits_type::__integral> -{ -public: - using type = _Tp; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = type(-1) < type(0); - static constexpr int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); - static constexpr int digits10 = digits * 3 / 10; - static constexpr int max_digits10 = 0; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __int_min::value; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return is_signed ? type(type(~0) ^ min()) : type(~0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return min(); - } - - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr int radix = 2; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(0); - } - - static constexpr int min_exponent = 0; - static constexpr int min_exponent10 = 0; - static constexpr int max_exponent = 0; - static constexpr int max_exponent10 = 0; - - static constexpr bool has_infinity = false; - static constexpr bool has_quiet_NaN = false; - static constexpr bool has_signaling_NaN = false; - static constexpr float_denorm_style has_denorm = denorm_absent; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI 
static constexpr type quiet_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(0); - } - - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = !is_signed; - -#if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || defined(__wasm__) - static constexpr bool traps = true; -#else - static constexpr bool traps = false; -#endif - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_toward_zero; -}; - -template <> -class __numeric_limits_impl -{ -public: - using type = bool; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = false; - static constexpr int digits = 1; - static constexpr int digits10 = 0; - static constexpr int max_digits10 = 0; - - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return false; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return true; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return min(); - } - - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr int radix = 2; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(0); - } - - static constexpr int min_exponent = 0; - static constexpr int min_exponent10 = 0; - static constexpr int max_exponent = 0; - static constexpr int max_exponent10 = 0; - - static constexpr bool has_infinity = false; - static constexpr bool has_quiet_NaN = false; - static constexpr bool has_signaling_NaN = false; - static constexpr float_denorm_style has_denorm = denorm_absent; - static constexpr bool 
has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(0); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(0); - } - - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_toward_zero; -}; - -template <> -class __numeric_limits_impl -{ -public: - using type = float; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = __FLT_MANT_DIG__; - static constexpr int digits10 = __FLT_DIG__; - static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __FLT_MIN__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return __FLT_MAX__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return -max(); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return __FLT_EPSILON__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return 0.5F; - } - - static constexpr int min_exponent = __FLT_MIN_EXP__; - static constexpr int min_exponent10 = __FLT_MIN_10_EXP__; - static constexpr int max_exponent = __FLT_MAX_EXP__; - static constexpr int max_exponent10 = __FLT_MAX_10_EXP__; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - 
static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - -#if defined(_CCCL_BUILTIN_HUGE_VALF) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return _CCCL_BUILTIN_HUGE_VALF(); - } -#else // ^^^ _CCCL_BUILTIN_HUGE_VALF ^^^ // vvv !_CCCL_BUILTIN_HUGE_VALF vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept - { - return _CUDA_VSTD::bit_cast(0x7f800000); - } -#endif // !_CCCL_BUILTIN_HUGE_VALF -#if defined(_CCCL_BUILTIN_NANF) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return _CCCL_BUILTIN_NANF(""); - } -#else // ^^^ _CCCL_BUILTIN_NANF ^^^ // vvv !_CCCL_BUILTIN_NANF vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept - { - return _CUDA_VSTD::bit_cast(0x7fc00000); - } -#endif // !_CCCL_BUILTIN_NANF -#if defined(_CCCL_BUILTIN_NANSF) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return _CCCL_BUILTIN_NANSF(""); - } -#else // ^^^ _CCCL_BUILTIN_NANSF ^^^ // vvv !_CCCL_BUILTIN_NANSF vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept - { - return _CUDA_VSTD::bit_cast(0x7fa00000); - } -#endif // !_CCCL_BUILTIN_NANSF - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return __FLT_DENORM_MIN__; - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; - -template <> -class __numeric_limits_impl -{ -public: - using type = double; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = __DBL_MANT_DIG__; - static 
constexpr int digits10 = __DBL_DIG__; - static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __DBL_MIN__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return __DBL_MAX__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return -max(); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return __DBL_EPSILON__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return 0.5; - } - - static constexpr int min_exponent = __DBL_MIN_EXP__; - static constexpr int min_exponent10 = __DBL_MIN_10_EXP__; - static constexpr int max_exponent = __DBL_MAX_EXP__; - static constexpr int max_exponent10 = __DBL_MAX_10_EXP__; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - -#if defined(_CCCL_BUILTIN_HUGE_VAL) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return _CCCL_BUILTIN_HUGE_VAL(); - } -#else // ^^^ _CCCL_BUILTIN_HUGE_VAL ^^^ // vvv !_CCCL_BUILTIN_HUGE_VAL vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type infinity() noexcept - { - return _CUDA_VSTD::bit_cast(0x7ff0000000000000); - } -#endif // !_CCCL_BUILTIN_HUGE_VAL -#if defined(_CCCL_BUILTIN_NAN) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return _CCCL_BUILTIN_NAN(""); - } -#else // ^^^ _CCCL_BUILTIN_NAN ^^^ // vvv !_CCCL_BUILTIN_NAN vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type quiet_NaN() noexcept - { - return std::bit_cast(0x7ff8000000000000); - } -#endif // 
!_CCCL_BUILTIN_NAN -#if defined(_CCCL_BUILTIN_NANS) - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return _CCCL_BUILTIN_NANS(""); - } -#else // ^^^ _CCCL_BUILTIN_NANS ^^^ // vvv !_CCCL_BUILTIN_NANS vvv - _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_BIT_CAST type signaling_NaN() noexcept - { - return _CUDA_VSTD::bit_cast(0x7ff4000000000000); - } -#endif // !_CCCL_BUILTIN_NANS - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return __DBL_DENORM_MIN__; - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; - -template <> -class __numeric_limits_impl -{ -#ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - -public: - using type = long double; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = __LDBL_MANT_DIG__; - static constexpr int digits10 = __LDBL_DIG__; - static constexpr int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return __LDBL_MIN__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return __LDBL_MAX__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return -max(); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return __LDBL_EPSILON__; - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return 0.5L; - } - - static constexpr int min_exponent = __LDBL_MIN_EXP__; - static constexpr int min_exponent10 = __LDBL_MIN_10_EXP__; - static constexpr int max_exponent = 
__LDBL_MAX_EXP__; - static constexpr int max_exponent10 = __LDBL_MAX_10_EXP__; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return _CCCL_BUILTIN_HUGE_VALL(); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return _CCCL_BUILTIN_NANL(""); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return _CCCL_BUILTIN_NANSL(""); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return __LDBL_DENORM_MIN__; - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE -}; - -#if defined(_LIBCUDACXX_HAS_NVFP16) -template <> -class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> -{ -public: - using type = __half; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return type(__half_raw{0x0400u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return type(__half_raw{0x7bffu}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return type(__half_raw{0xfbffu}); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static 
constexpr type epsilon() noexcept - { - return type(__half_raw{0x1400u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(__half_raw{0x3800u}); - } - - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(__half_raw{0x7c00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(__half_raw{0x7e00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(__half_raw{0x7d00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(__half_raw{0x0001u}); - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; -#endif // _LIBCUDACXX_HAS_NVFP16 - -#if defined(_LIBCUDACXX_HAS_NVBF16) -template <> -class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> -{ -public: - using type = __nv_bfloat16; - - static constexpr bool is_specialized = true; - - static constexpr bool is_signed = true; - static constexpr int digits = 8; - static constexpr int digits10 = 2; - static constexpr int max_digits10 = 4; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept - { - return type(__nv_bfloat16_raw{0x0080u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept - { - return 
type(__nv_bfloat16_raw{0x7f7fu}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept - { - return type(__nv_bfloat16_raw{0xff7fu}); - } - - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept - { - return type(__nv_bfloat16_raw{0x3c00u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept - { - return type(__nv_bfloat16_raw{0x3f00u}); - } - - static constexpr int min_exponent = -125; - static constexpr int min_exponent10 = -37; - static constexpr int max_exponent = 128; - static constexpr int max_exponent10 = 38; - - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr float_denorm_style has_denorm = denorm_present; - static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept - { - return type(__nv_bfloat16_raw{0x7f80u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept - { - return type(__nv_bfloat16_raw{0x7fc0u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept - { - return type(__nv_bfloat16_raw{0x7fa0u}); - } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept - { - return type(__nv_bfloat16_raw{0x0001u}); - } - - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - - static constexpr bool traps = false; - static constexpr bool tinyness_before = false; - static constexpr float_round_style round_style = round_to_nearest; -}; -#endif // _LIBCUDACXX_HAS_NVBF16 - -template -class numeric_limits : public __numeric_limits_impl<_Tp> -{}; - -template -class numeric_limits : public numeric_limits<_Tp> -{}; - -template -class numeric_limits : public numeric_limits<_Tp> 
-{}; - -template -class numeric_limits : public numeric_limits<_Tp> -{}; - -_LIBCUDACXX_END_NAMESPACE_STD +#include +#include _CCCL_POP_MACROS diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/cast.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/cast.pass.cpp new file mode 100644 index 00000000000..42549b53628 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/cast.pass.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test_type() +{ + assert(static_cast(T{1.0}) == cuda::fp16{1.0}); + assert(static_cast(T{1.0}) == cuda::fp32{1.0}); + assert(static_cast(T{1.0}) == cuda::fp64{1.0}); + // assert(static_cast(T{1.0}) == cuda::bf16{1.0}); + + assert(static_cast(T{1.0}) == 1.0f); + assert(static_cast(T{1.0}) == 1.0); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + assert(static_cast(T{1.0}) == 1.0L); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + assert(static_cast(T{1.0}) == 1); + + assert(static_cast(T{1.0}) == true); + assert(static_cast(T{1.0}) == 1); + + return true; +} + +__host__ __device__ constexpr bool test() +{ + test_type(); + test_type(); + test_type(); + // test_type(); + + return true; +} + +int main(int, char**) +{ + 
test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/construct.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/construct.pass.cpp new file mode 100644 index 00000000000..f1f47a58436 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/construct.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test_type() +{ + assert(T{cuda::fp16{1.0}} == T{1.0}); + assert(T{cuda::fp32{1.0}} == T{1.0}); + assert(T{cuda::fp64{1.0}} == T{1.0}); + // assert(T{cuda::bf16{1.0}} == T{1.0}); + assert(T{float{1.0}} == T{1.0}); + assert(T{double{1.0}} == T{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + assert(T{long double{1.0}} == T{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + assert(T{int8_t{1}} == T{1.0}); + assert(T{int16_t{1}} == T{1.0}); + assert(T{int32_t{1}} == T{1.0}); + assert(T{int64_t{1}} == T{1.0}); + assert(T{uint8_t{1}} == T{1.0}); + assert(T{uint16_t{1}} == T{1.0}); + assert(T{uint32_t{1}} == T{1.0}); + assert(T{uint64_t{1}} == T{1.0}); + + assert(T{bool{1}} == T{1.0}); + assert(T{char{1}} == T{1.0}); + + return true; +} + +__host__ __device__ constexpr bool test() +{ + test_type(); + test_type(); + test_type(); + // test_type(); + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/add.pass.cpp 
b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/add.pass.cpp new file mode 100644 index 00000000000..a912634ca48 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/add.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_add(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs + rhs)); + ASSERT_SAME_TYPE(decltype(lhs + rhs), Exp); + assert(lhs + rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + test_add(cuda::fp16{1.0}, cuda::fp16{1.0}, cuda::fp16{2.0}); + test_add(cuda::fp16{1.0}, cuda::fp32{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp16{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp16{1.0}, float{1.0}, float{2.0}); + test_add(cuda::fp16{1.0}, double{1.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_add(cuda::fp16{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_add(cuda::fp32{1.0}, cuda::fp16{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, cuda::fp32{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + // test_add(cuda::fp32{1.0}, cuda::bf16{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, float{1.0}, cuda::fp32{2.0}); + test_add(cuda::fp32{1.0}, double{1.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_add(cuda::fp32{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + 
test_add(cuda::fp64{1.0}, cuda::fp16{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, cuda::fp32{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + // test_add(cuda::fp64{1.0}, cuda::bf16{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, float{1.0}, cuda::fp64{2.0}); + test_add(cuda::fp64{1.0}, double{1.0}, cuda::fp64{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_add(cuda::fp64{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_add(cuda::bf16{1.0}, cuda::fp32{1.0}, cuda::fp32{2.0}); + // test_add(cuda::bf16{1.0}, cuda::fp64{1.0}, cuda::fp64{2.0}); + // test_add(cuda::bf16{1.0}, cuda::bf16{1.0}, cuda::bf16{2.0}); + // test_add(cuda::bf16{1.0}, float{1.0}, float{2.0}); + // test_add(cuda::bf16{1.0}, double{1.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_add(cuda::bf16{1.0}, long double{1.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/common_type.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/common_type.pass.cpp new file mode 100644 index 00000000000..7d89fba6342 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/common_type.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_common_type() +{ + ASSERT_SAME_TYPE(cuda::__fp_common_type_t, Exp); + ASSERT_SAME_TYPE(cuda::__fp_common_type_t, Exp); +} + +int main(int, char**) +{ + // cuda::fp16 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // cuda::fp32 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // cuda::fp64 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // cuda::bf16 + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); + test_common_type(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_common_type(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/div.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/div.pass.cpp new file mode 100644 index 00000000000..a783e8e1e6b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/div.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_div(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs / rhs)); + ASSERT_SAME_TYPE(decltype(lhs / rhs), Exp); + assert(lhs / rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + // test_div(cuda::fp16{2.0}, cuda::fp16{2.0}, cuda::fp16{1.0}); + test_div(cuda::fp16{2.0}, cuda::fp32{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp16{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp16{2.0}, float{2.0}, float{1.0}); + test_div(cuda::fp16{2.0}, double{2.0}, double{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_div(cuda::fp16{2.0}, long double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_div(cuda::fp32{2.0}, cuda::fp16{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, cuda::fp32{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + // test_div(cuda::fp32{2.0}, cuda::bf16{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, float{2.0}, cuda::fp32{1.0}); + test_div(cuda::fp32{2.0}, double{2.0}, double{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_div(cuda::fp32{2.0}, long double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_div(cuda::fp64{2.0}, cuda::fp16{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, cuda::fp32{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + // test_div(cuda::fp64{2.0}, cuda::bf16{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, float{2.0}, cuda::fp64{1.0}); + test_div(cuda::fp64{2.0}, double{2.0}, cuda::fp64{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_div(cuda::fp64{2.0}, long 
double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_div(cuda::bf16{2.0}, cuda::fp32{2.0}, cuda::fp32{1.0}); + // test_div(cuda::bf16{2.0}, cuda::fp64{2.0}, cuda::fp64{1.0}); + // test_div(cuda::bf16{2.0}, cuda::bf16{2.0}, cuda::bf16{1.0}); + // test_div(cuda::bf16{2.0}, float{2.0}, float{1.0}); + // test_div(cuda::bf16{2.0}, double{2.0}, double{1.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_div(cuda::bf16{2.0}, long double{2.0}, long double{1.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/mul.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/mul.pass.cpp new file mode 100644 index 00000000000..7f3915115e7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/mul.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_mul(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs * rhs)); + ASSERT_SAME_TYPE(decltype(lhs * rhs), Exp); + assert(lhs * rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + test_mul(cuda::fp16{1.0}, cuda::fp16{2.0}, cuda::fp16{2.0}); + test_mul(cuda::fp16{1.0}, cuda::fp32{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp16{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp16{1.0}, float{2.0}, float{2.0}); + test_mul(cuda::fp16{1.0}, double{2.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_mul(cuda::fp16{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_mul(cuda::fp32{1.0}, cuda::fp16{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, cuda::fp32{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + // test_mul(cuda::fp32{1.0}, cuda::bf16{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, float{2.0}, cuda::fp32{2.0}); + test_mul(cuda::fp32{1.0}, double{2.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_mul(cuda::fp32{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_mul(cuda::fp64{1.0}, cuda::fp16{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, cuda::fp32{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + // test_mul(cuda::fp64{1.0}, cuda::bf16{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, float{2.0}, cuda::fp64{2.0}); + test_mul(cuda::fp64{1.0}, double{2.0}, cuda::fp64{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_mul(cuda::fp64{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_mul(cuda::bf16{1.0}, cuda::fp32{2.0}, cuda::fp32{2.0}); 
+ // test_mul(cuda::bf16{1.0}, cuda::fp64{2.0}, cuda::fp64{2.0}); + // test_mul(cuda::bf16{1.0}, cuda::bf16{2.0}, cuda::bf16{2.0}); + // test_mul(cuda::bf16{1.0}, float{2.0}, float{2.0}); + // test_mul(cuda::bf16{1.0}, double{2.0}, double{2.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_mul(cuda::bf16{1.0}, long double{2.0}, long double{2.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/floating_point/ops/sub.pass.cpp b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/sub.pass.cpp new file mode 100644 index 00000000000..2af00d7eb2f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/floating_point/ops/sub.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11, c++14 + +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr void test_sub(Lhs lhs, Rhs rhs, Exp expected) +{ + static_assert(noexcept(lhs - rhs)); + ASSERT_SAME_TYPE(decltype(lhs - rhs), Exp); + assert(lhs - rhs == expected); +} + +__host__ __device__ constexpr bool test() +{ + test_sub(cuda::fp16{1.0}, cuda::fp16{1.0}, cuda::fp16{0.0}); + test_sub(cuda::fp16{1.0}, cuda::fp32{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp16{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp16{1.0}, float{1.0}, float{0.0}); + test_sub(cuda::fp16{1.0}, double{1.0}, double{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_sub(cuda::fp16{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_sub(cuda::fp32{1.0}, cuda::fp16{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, cuda::fp32{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + // test_sub(cuda::fp32{1.0}, cuda::bf16{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, float{1.0}, cuda::fp32{0.0}); + test_sub(cuda::fp32{1.0}, double{1.0}, double{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_sub(cuda::fp32{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + test_sub(cuda::fp64{1.0}, cuda::fp16{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, cuda::fp32{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + // test_sub(cuda::fp64{1.0}, cuda::bf16{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, float{1.0}, cuda::fp64{0.0}); + test_sub(cuda::fp64{1.0}, double{1.0}, cuda::fp64{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_sub(cuda::fp64{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + // test_sub(cuda::bf16{1.0}, cuda::fp32{1.0}, cuda::fp32{0.0}); 
+ // test_sub(cuda::bf16{1.0}, cuda::fp64{1.0}, cuda::fp64{0.0}); + // test_sub(cuda::bf16{1.0}, cuda::bf16{1.0}, cuda::bf16{0.0}); + // test_sub(cuda::bf16{1.0}, float{1.0}, float{0.0}); + // test_sub(cuda::bf16{1.0}, double{1.0}, double{0.0}); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + // test_sub(cuda::bf16{1.0}, long double{1.0}, long double{0.0}); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + + return true; +} + +int main(int, char**) +{ + test(); + // static_assert(test()); + + return 0; +}