diff --git a/shared/libebm/compute/cpu_ebm/cpu_64.cpp b/shared/libebm/compute/cpu_ebm/cpu_64.cpp index 6fc564f4e..1934cdd1d 100644 --- a/shared/libebm/compute/cpu_ebm/cpu_64.cpp +++ b/shared/libebm/compute/cpu_ebm/cpu_64.cpp @@ -74,6 +74,8 @@ struct Cpu_64_Int final { inline Cpu_64_Int operator+(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data + other.m_data); } + inline Cpu_64_Int operator-(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data - other.m_data); } + inline Cpu_64_Int operator*(const T& other) const noexcept { return Cpu_64_Int(m_data * other); } inline Cpu_64_Int operator>>(int shift) const noexcept { return Cpu_64_Int(m_data >> shift); } @@ -82,13 +84,28 @@ struct Cpu_64_Int final { inline Cpu_64_Int operator&(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data & other.m_data); } + inline Cpu_64_Int operator|(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data | other.m_data); } + + friend inline Cpu_64_Int IfThenElse(const bool cmp, const Cpu_64_Int& trueVal, const Cpu_64_Int& falseVal) noexcept { + return cmp ? trueVal : falseVal; + } + private: TPack m_data; }; static_assert(std::is_standard_layout::value && std::is_trivially_copyable::value, "This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed"); +template +inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept; + struct Cpu_64_Float final { + template + friend Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept; + using T = double; using TPack = double; using TInt = Cpu_64_Int; @@ -106,6 +123,8 @@ struct Cpu_64_Float final { inline Cpu_64_Float(const double val) noexcept : m_data(static_cast(val)) {} inline Cpu_64_Float(const float val) noexcept : m_data(static_cast(val)) {} inline Cpu_64_Float(const int val) noexcept : m_data(static_cast(val)) {} + inline Cpu_64_Float(const int64_t val) noexcept : m_data(static_cast(val)) {} + explicit Cpu_64_Float(const Cpu_64_Int& val) : m_data(static_cast(val.m_data)) {} inline Cpu_64_Float operator+() const noexcept { return *this; } @@ -179,6 +198,10 @@ struct Cpu_64_Float final { return Cpu_64_Float(val) / other; } + friend inline bool operator<=(const Cpu_64_Float& left, const Cpu_64_Float& right) noexcept { + return left.m_data <= right.m_data; + } + inline static Cpu_64_Float Load(const T* const a) noexcept { return Cpu_64_Float(*a); } inline void Store(T* const a) const noexcept { *a = m_data; } @@ -207,6 +230,11 @@ struct Cpu_64_Float final { return cmp1.m_data < cmp2.m_data ? trueVal : falseVal; } + friend inline Cpu_64_Float IfThenElse( + const bool cmp, const Cpu_64_Float& trueVal, const Cpu_64_Float& falseVal) noexcept { + return cmp ? trueVal : falseVal; + } + friend inline Cpu_64_Float IfEqual(const Cpu_64_Float& cmp1, const Cpu_64_Float& cmp2, const Cpu_64_Float& trueVal, @@ -226,10 +254,24 @@ struct Cpu_64_Float final { return cmp1.m_data == cmp2.m_data ? trueVal : falseVal; } - friend inline Cpu_64_Float Abs(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::abs(val.m_data)); } + static inline bool ReinterpretInt(const bool val) noexcept { return val; } + + static inline Cpu_64_Int ReinterpretInt(const Cpu_64_Float& val) noexcept { + typename Cpu_64_Int::T mem; + memcpy(&mem, &val.m_data, sizeof(T)); + return Cpu_64_Int(mem); + } + + static inline Cpu_64_Float ReinterpretFloat(const Cpu_64_Int& val) noexcept { + T mem; + memcpy(&mem, &val.m_data, sizeof(T)); + return Cpu_64_Float(mem); + } friend inline Cpu_64_Float Round(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::round(val.m_data)); } + friend inline Cpu_64_Float Abs(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::abs(val.m_data)); } + friend inline Cpu_64_Float FastApproxReciprocal(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(T{1.0} / val.m_data); } @@ -250,8 +292,6 @@ struct Cpu_64_Float final { friend inline Cpu_64_Float Sqrt(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::sqrt(val.m_data)); } - friend inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::exp(val.m_data)); } - friend inline Cpu_64_Float Log(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::log(val.m_data)); } template(val); } template::value && std::is_trivially_copyable::value, "This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed"); +template +inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept { + return Exp64(val); +} + INTERNAL_IMPORT_EXPORT_BODY ErrorEbm ApplyUpdate_Cpu_64( const ObjectiveWrapper* const pObjectiveWrapper, ApplyUpdateBridge* const pData) { const Objective* const pObjective = static_cast(pObjectiveWrapper->m_pObjective); diff --git a/shared/libebm/compute/math.hpp b/shared/libebm/compute/math.hpp index 4a15cfc15..d073bd96b 100644 --- a/shared/libebm/compute/math.hpp +++ b/shared/libebm/compute/math.hpp @@ -22,10 +22,14 @@ template static INLINE_ALWAYS typename TFloat::TInt Exponent32( return ((TFloat::ReinterpretInt(val) << 1) >> 24) - typename TFloat::TInt(0x7F); } -template static INLINE_ALWAYS TFloat Power2(const TFloat val) { +template static INLINE_ALWAYS TFloat Power32(const TFloat val) { return TFloat::ReinterpretFloat(TFloat::ReinterpretInt(val + TFloat{8388608 + 127}) << 23); } +template static INLINE_ALWAYS TFloat Power64(const TFloat val) { + return TFloat::ReinterpretFloat(TFloat::ReinterpretInt(val + TFloat{4503599627370496 + 1023}) << 52); +} + template static INLINE_ALWAYS TFloat Polynomial(const TFloat x, const TFloat c0, @@ -60,6 +64,32 @@ static INLINE_ALWAYS TFloat Polynomial(const TFloat x, FusedMultiplyAdd(FusedMultiplyAdd(c3, x, c2), x2, FusedMultiplyAdd(c1, x, c0) + c8 * x8)); } +template +static INLINE_ALWAYS TFloat Polynomial(const TFloat x, + const TFloat c2, + const TFloat c3, + const TFloat c4, + const TFloat c5, + const TFloat c6, + const TFloat c7, + const TFloat c8, + const TFloat c9, + const TFloat c10, + const TFloat c11, + const TFloat c12, + const TFloat c13) { + TFloat x2 = x * x; + TFloat x4 = x2 * x2; + TFloat x8 = x4 * x4; + return FusedMultiplyAdd(FusedMultiplyAdd(FusedMultiplyAdd(c13, x, c12), + x4, + FusedMultiplyAdd(FusedMultiplyAdd(c11, x, c10), x2, FusedMultiplyAdd(c9, x, c8))), + x8, + FusedMultiplyAdd(FusedMultiplyAdd(FusedMultiplyAdd(c7, x, c6), x2, FusedMultiplyAdd(c5, x, c4)), + x4, + FusedMultiplyAdd(FusedMultiplyAdd(c3, x, c2), x2, x))); +} + template +static INLINE_ALWAYS TFloat Exp64(const TFloat val) { + // algorithm comes from: + // https://github.com/vectorclass/version2/blob/f4617df57e17efcd754f5bbe0ec87883e0ed9ce6/vectormath_exp.h#L327 + + // k_expUnderflow is set to a value that prevents us from returning a denormal number. + static constexpr float k_expUnderflow = -708.25f; // this is exactly representable in IEEE 754 + static constexpr float k_expOverflow = 708.25f; // this is exactly representable in IEEE 754 + + // TODO: make this negation more efficient + TFloat x = bNegateInput ? -val : val; + const TFloat rounded = Round(x * TFloat{1.44269504088896340736}); + x = FusedNegateMultiplyAdd(rounded, TFloat{0.693145751953125}, x); + x = FusedNegateMultiplyAdd(rounded, TFloat{1.42860682030941723212E-6}, x); + + TFloat ret = Polynomial(x, + TFloat{1} / TFloat{2}, + TFloat{1} / TFloat{6}, + TFloat{1} / TFloat{24}, + TFloat{1} / TFloat{120}, + TFloat{1} / TFloat{720}, + TFloat{1} / TFloat{5040}, + TFloat{1} / TFloat{40320}, + TFloat{1} / TFloat{362880}, + TFloat{1} / TFloat{3628800}, + TFloat{1} / TFloat{39916800}, + TFloat{1} / TFloat{479001600}, + TFloat{1} / TFloat{int64_t{6227020800}}); + + const TFloat rounded2 = Power64(rounded); + + ret = (ret + TFloat{1}) * rounded2; + + if(bOverflowPossible) { + if(bNegateInput) { + ret = IfLess(val, + static_cast(-k_expOverflow), + std::numeric_limits::infinity(), + ret); + } else { + ret = IfLess(static_cast(k_expOverflow), + val, + std::numeric_limits::infinity(), + ret); + } + } + if(bUnderflowPossible) { + if(bNegateInput) { + ret = IfLess(static_cast(-k_expUnderflow), val, 0.0f, ret); + } else { + ret = IfLess(val, static_cast(k_expUnderflow), 0.0f, ret); + } + } + if(bNaNPossible) { + ret = IfNaN(val, val, ret); + } + +#ifndef NDEBUG + TFloat::Execute( + [](int, typename TFloat::T orig, typename TFloat::T ret) { + EBM_ASSERT(IsApproxEqual(std::exp(orig), ret, typename TFloat::T{1e-12})); + }, + bNegateInput ? -val : val, + ret); +#endif // NDEBUG + + return ret; +} + } // namespace DEFINED_ZONE_NAME #endif // REGISTRATION_HPP