From 93a52af989f2fa1e3e255d58f597cf86ef2da33d Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Mon, 25 Jul 2022 20:26:18 +0000 Subject: [PATCH 1/4] On the way to adding avx512 specialisation templates --- src/3rd_party/avx512_mathfun.h | 540 +++++++++++++++++++++++++++++++++ src/common/types.h | 78 +++++ src/functional/operators.h | 9 + 3 files changed, 627 insertions(+) create mode 100644 src/3rd_party/avx512_mathfun.h diff --git a/src/3rd_party/avx512_mathfun.h b/src/3rd_party/avx512_mathfun.h new file mode 100644 index 000000000..f4d01d836 --- /dev/null +++ b/src/3rd_party/avx512_mathfun.h @@ -0,0 +1,540 @@ +/* + AVX512 implementation of sin, cos, sincos, exp and log + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + Copyright (C) 2017 Adrien Cassagne + MIT license + + Adapted from https://github.com/aff3ct/MIPP/blob/master/src/math/avx512_mathfun.h and https://github.com/aff3ct/MIPP/blob/master/src/math/avx512_mathfun.hxx +*/ +#pragma once +#ifdef __AVX512F__ + +#include + +typedef __m512 v16sf; // vector of 16 float (avx512) +typedef __m512i v16si; // vector of 16 int (avx512) + +/* yes I know, the top of this file is quite ugly */ +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN32_BEG __declspec(align(32)) +# define ALIGN32_END +#else /* gcc or icc */ +# define ALIGN32_BEG +# define ALIGN32_END __attribute__((aligned(32))) +#endif + +/* declare some AVX512 constants -- why can't I figure a better way to do that? */ +#define _PS512_CONST(Name, Val) \ + static const constexpr ALIGN32_BEG float _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST512(Name, Val) \ + static const constexpr ALIGN32_BEG int _pi32_512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS512_CONST_TYPE(Name, Type, Val) \ + static const constexpr ALIGN32_BEG Type _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + +_PS512_CONST(1 , 1.0f); +_PS512_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS512_CONST_TYPE(min_norm_pos, int, 0x00800000); +//_PS512_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS512_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS512_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS512_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST512(0, 0); +_PI32_CONST512(1, 1); +_PI32_CONST512(0xffffffff, (int)0xFFFFFFFF); +_PI32_CONST512(inv1, ~1); +_PI32_CONST512(2, 2); +_PI32_CONST512(4, 4); +_PI32_CONST512(0x7f, 0x7f); + +_PS512_CONST(cephes_SQRTHF, 0.707106781186547524f); +_PS512_CONST(cephes_log_p0, 7.0376836292E-2f); +_PS512_CONST(cephes_log_p1, - 1.1514610310E-1f); +_PS512_CONST(cephes_log_p2, 1.1676998740E-1f); +_PS512_CONST(cephes_log_p3, - 1.2420140846E-1f); +_PS512_CONST(cephes_log_p4, + 1.4249322787E-1f); +_PS512_CONST(cephes_log_p5, - 1.6668057665E-1f); +_PS512_CONST(cephes_log_p6, + 2.0000714765E-1f); +_PS512_CONST(cephes_log_p7, - 2.4999993993E-1f); +_PS512_CONST(cephes_log_p8, + 3.3333331174E-1f); +_PS512_CONST(cephes_log_q1, -2.12194440e-4f); +_PS512_CONST(cephes_log_q2, 0.693359375f); + +static inline v16si _wrap_mm512_slli_epi32(v16si x, int y) { return _mm512_slli_epi32(x,y); } +static inline v16si _wrap_mm512_srli_epi32(v16si x, int y) { return _mm512_srli_epi32(x,y); } +static inline v16si _wrap_mm512_sub_epi32 (v16si x, v16si y) { return _mm512_sub_epi32 (x,y); } +static inline v16si 
_wrap_mm512_add_epi32 (v16si x, v16si y) { return _mm512_add_epi32 (x,y); } + + +/* natural logarithm computed for 16 simultaneous float + return NaN for x <= 0 +*/ +static inline v16sf log512_ps(v16sf x) { + v16si imm0; + v16sf one = *(v16sf*)_ps512_1; + + //v16sf invalid_mask = _mm512_cmple_ps(x, _mm512_setzero_ps()); + __mmask16 invalid_mask2 = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_LE_OS); + v16sf invalid_mask = _mm512_mask_blend_ps(invalid_mask2, *(v16sf*)_pi32_512_0, *(v16sf*)_pi32_512_0xffffffff); + + x = _mm512_max_ps(x, *(v16sf*)_ps512_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _wrap_mm512_srli_epi32(_mm512_castps_si512(x), 23); + + /* keep only the fractional part */ +// x = _mm512_and_ps(x, *(v16sf*)_ps512_inv_mant_mask); + x = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x), _mm512_castps_si512(*(v16sf*)_ps512_inv_mant_mask))); +// x = _mm512_or_ps(x, *(v16sf*)_ps512_0p5); + x = _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(x), _mm512_castps_si512(*(v16sf*)_ps512_0p5))); + + // this is again another AVX2 instruction + imm0 = _wrap_mm512_sub_epi32(imm0, *(v16si*)_pi32_512_0x7f); + v16sf e = _mm512_cvtepi32_ps(imm0); + + e = _mm512_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //v16sf mask = _mm512_cmplt_ps(x, *(v16sf*)_ps512_cephes_SQRTHF); + __mmask16 mask2 = _mm512_cmp_ps_mask(x, *(v16sf*)_ps512_cephes_SQRTHF, _CMP_LT_OS); + v16sf mask = _mm512_mask_blend_ps(mask2, *(v16sf*)_pi32_512_0, *(v16sf*)_pi32_512_0xffffffff); + +// v16sf tmp = _mm512_and_ps(x, mask); + v16sf tmp = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x), _mm512_castps_si512(mask))); + x = _mm512_sub_ps(x, one); +// e = _mm512_sub_ps(e, _mm512_and_ps(one, mask)); + e = _mm512_sub_ps(e, _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(one), _mm512_castps_si512(mask)))); + x = _mm512_add_ps(x, tmp); + + v16sf z = _mm512_mul_ps(x,x); + + v16sf y = *(v16sf*)_ps512_cephes_log_p0; + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p1); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p2); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p3); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p4); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p5); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p6); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p7); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p8); + y = _mm512_mul_ps(y, x); + + y = _mm512_mul_ps(y, z); + + tmp = _mm512_mul_ps(e, *(v16sf*)_ps512_cephes_log_q1); + y = _mm512_add_ps(y, tmp); + + + tmp = _mm512_mul_ps(z, *(v16sf*)_ps512_0p5); + y = _mm512_sub_ps(y, tmp); + + tmp = _mm512_mul_ps(e, *(v16sf*)_ps512_cephes_log_q2); + x = _mm512_add_ps(x, y); + x = _mm512_add_ps(x, tmp); +// x = _mm512_or_ps(x, invalid_mask); // negative arg will be NAN + x = _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(x), _mm512_castps_si512(invalid_mask))); + return x; +} + +_PS512_CONST(exp_hi, 88.3762626647949f); +_PS512_CONST(exp_lo, -88.3762626647949f); + +_PS512_CONST(cephes_LOG2EF, 1.44269504088896341f); +_PS512_CONST(cephes_exp_C1, 0.693359375f); +_PS512_CONST(cephes_exp_C2, -2.12194440e-4f); + +_PS512_CONST(cephes_exp_p0, 1.9875691500E-4f); +_PS512_CONST(cephes_exp_p1, 
1.3981999507E-3f); +_PS512_CONST(cephes_exp_p2, 8.3334519073E-3f); +_PS512_CONST(cephes_exp_p3, 4.1665795894E-2f); +_PS512_CONST(cephes_exp_p4, 1.6666665459E-1f); +_PS512_CONST(cephes_exp_p5, 5.0000001201E-1f); + +static inline v16sf exp512_ps(v16sf x) { + v16sf tmp = _mm512_setzero_ps(), fx; + v16si imm0; + v16sf one = *(v16sf*)_ps512_1; + + x = _mm512_min_ps(x, *(v16sf*)_ps512_exp_hi); + x = _mm512_max_ps(x, *(v16sf*)_ps512_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm512_mul_ps(x, *(v16sf*)_ps512_cephes_LOG2EF); + fx = _mm512_add_ps(fx, *(v16sf*)_ps512_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm512_cvttps_epi32(fx); + //tmp = _mm512_cvtepi32_ps(imm0); + + tmp = _mm512_floor_ps(fx); + + /* if greater, substract 1 */ + //v16sf mask = _mm512_cmpgt_ps(tmp, fx); +// v16sf mask = _mm512_cmp_ps(tmp, fx, _CMP_GT_OS); + __mmask16 mask2 = _mm512_cmp_ps_mask(tmp, fx, _CMP_GT_OS); + v16sf mask = _mm512_mask_blend_ps(mask2, *(v16sf*)_pi32_512_0, *(v16sf*)_pi32_512_0xffffffff); +// mask = _mm512_and_ps(mask, one); + mask = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(mask), _mm512_castps_si512(one))); + fx = _mm512_sub_ps(tmp, mask); + + tmp = _mm512_mul_ps(fx, *(v16sf*)_ps512_cephes_exp_C1); + v16sf z = _mm512_mul_ps(fx, *(v16sf*)_ps512_cephes_exp_C2); + x = _mm512_sub_ps(x, tmp); + x = _mm512_sub_ps(x, z); + + z = _mm512_mul_ps(x,x); + + v16sf y = *(v16sf*)_ps512_cephes_exp_p0; + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p1); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p2); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p3); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p4); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p5); + y = _mm512_mul_ps(y, z); + y = _mm512_add_ps(y, x); + y = _mm512_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm512_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _wrap_mm512_add_epi32(imm0, *(v16si*)_pi32_512_0x7f); + imm0 = _wrap_mm512_slli_epi32(imm0, 23); + v16sf pow2n = _mm512_castsi512_ps(imm0); + y = _mm512_mul_ps(y, pow2n); + return y; +} + +_PS512_CONST(minus_cephes_DP1, -0.78515625f); +_PS512_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); +_PS512_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); +_PS512_CONST(sincof_p0, -1.9515295891E-4f); +_PS512_CONST(sincof_p1, 8.3321608736E-3f); +_PS512_CONST(sincof_p2, -1.6666654611E-1f); +_PS512_CONST(coscof_p0, 2.443315711809948E-005f); +_PS512_CONST(coscof_p1, -1.388731625493765E-003f); +_PS512_CONST(coscof_p2, 4.166664568298827E-002f); +_PS512_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI + + +/* evaluation of 16 sines at onces using AVX intrisics + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+*/ +static inline v16sf sin512_ps(v16sf x) { // any x + v16sf xmm1, xmm2 = _mm512_setzero_ps(), xmm3, sign_bit, y; + v16si imm0, imm2; + + sign_bit = x; + /* take the absolute value */ +// x = _mm512_and_ps(x, *(v16sf*)_ps512_inv_sign_mask); + x = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x), _mm512_castps_si512(*(v16sf*)_ps512_inv_sign_mask))); + /* extract the sign bit (upper one) */ +// sign_bit = _mm512_and_ps(sign_bit, *(v16sf*)_ps512_sign_mask); + sign_bit = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(sign_bit), _mm512_castps_si512(*(v16sf*)_ps512_sign_mask))); + + /* scale by 4/Pi */ + y = _mm512_mul_ps(x, *(v16sf*)_ps512_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + + /* store the integer part of y in mm0 */ + imm2 = _mm512_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _wrap_mm512_add_epi32(imm2, *(v16si*)_pi32_512_1); + imm2 = _mm512_and_si512(imm2, *(v16si*)_pi32_512_inv1); + y = _mm512_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm512_and_si512(imm2, *(v16si*)_pi32_512_4); + imm0 = _wrap_mm512_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 static inline Register set1_ps(float to); +template <> inline __m128 set1_ps<__m128>(float to) { + return _mm_set1_ps(to); +} +#ifdef __AVX__ +typedef __m256 float32x8; +template <> inline __m256 set1_ps<__m256>(float to) { + return _mm256_set1_ps(to); +} +#endif +#ifdef __AVX512F__ +typedef __m256 float32x16; +template <> inline __m512 set1_ps<__m512>(float to) { + return _mm512_set1_ps(to); +} +#endif + +template +struct float32v { +private: + Register f_; + +public: + float32vector() {} + float32vector(const Register& f) : f_(f) {} + float32vector(const float& f) : f_(set1_ps(f)) {} + + operator const Register&() const { return f_; } + operator Register&() { return f_; } + + float operator[] (size_t i) const { + return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __mXXX is an array of floats + } + + friend std::ostream& operator<<(std::ostream& out, Register f) { + size_t length = sizeof(Register)/sizeof(float); + float* a = (float*)&f; + out << "[" << a[0]; + for(size_t i = 1; i < length; i++) + out << " " << a[i]; + out << "]"; + return out; + } + +}; + +*/ struct float32x4 { private: __m128 f_; @@ -231,6 +280,35 @@ struct float32x8 { #endif #endif +#ifdef __AVX512F__ +struct float32x16 { +private: + __m512 f_; + +public: + float32x16() {} + float32x16(const __m512& f) : f_(f) {} + float32x16(const float& f) : f_(_mm512_set1_ps(f)) {} // __m256 _mm_set1_ps(float) copies value into all slots + + operator const __m512&() const { return f_; } + operator __m512&() { return f_; } + + float operator[] (size_t i) const { + return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats + } + + friend std::ostream& operator<<(std::ostream& out, float32x16 f16) { + float* a = (float*)&f16; + out << "[" << a[0]; + for(int i = 1; i < 16; i++) + out << " " << a[i]; + out << "]"; + return out; + } +}; +#endif + + #if COMPILE_FP16 // @TODO: check what intrinsics are actually available. 
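A minimal, self-contained sketch of the wrapper pattern that the float32x16 struct above follows: a thin struct around __m512 that broadcasts a scalar on construction and converts implicitly back to the raw register, so the AVX512 intrinsics (and the new *512_ps math functions) can be applied to it directly. The struct name float32x16_sketch and the main() driver are illustrative only and not part of the patch; building it is assumed to require an AVX512F-capable compiler and target (e.g. GCC/Clang with -mavx512f).

#include <immintrin.h>
#include <cstddef>
#include <cstdio>

// Illustrative stand-in for the float32x16 wrapper added to src/common/types.h above.
struct float32x16_sketch {
  __m512 f_;
  float32x16_sketch(const __m512& f) : f_(f) {}            // wrap an existing register
  float32x16_sketch(float f) : f_(_mm512_set1_ps(f)) {}    // broadcast the scalar into all 16 lanes
  operator const __m512&() const { return f_; }            // implicit conversion back to __m512
  float operator[](std::size_t i) const { return ((const float*)&f_)[i]; }
};

int main() {
  float32x16_sketch a(1.5f);                         // all 16 lanes hold 1.5f
  float32x16_sketch b(_mm512_set1_ps(2.0f));
  float32x16_sketch c(_mm512_add_ps(a, b));          // intrinsics accept the wrapper via the conversion operator
  std::printf("%f %f\n", c[0], c[15]);               // prints 3.500000 twice
  return 0;
}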
diff --git a/src/functional/operators.h b/src/functional/operators.h index a14f153f1..8389e8d33 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -463,6 +463,15 @@ struct Ops { } }; +} // end namespace functional +} // end namespace marian +#endif + +#ifdef __AVX512F__ +#include "3rd_party/avx512_mathfun.h" + +namespace marian { +namespace functional { } // end namespace functional } // end namespace marian #endif From e1bd54402d4cd28a9aebd7da5542e45907e0e24d Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 26 Jul 2022 16:10:39 +0000 Subject: [PATCH 2/4] First version of avx512intrinsics. Compiles but crashes --- src/common/types.h | 30 +++--- src/functional/operators.h | 133 +++++++++++++++++++++++++++ src/functional/tensor.h | 14 +++ src/tensors/cpu/element.h | 11 +++ src/tensors/cpu/tensor_operators.cpp | 25 ++++- 5 files changed, 199 insertions(+), 14 deletions(-) diff --git a/src/common/types.h b/src/common/types.h index ada061c73..1bcc96de1 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -171,21 +171,20 @@ struct intgemm8avx512vnni { int8_t x; }; #ifndef __CUDACC__ // vectorized types not available from .cu files // @TODO: check what intrinsics are actually available. -/* Templated implementation to get to to work later -typedef __m128 float32x4; +/* Currently doesn't work due to https://stackoverflow.com/questions/41676311/implication-of-gcc-warning-ignoring-attributes-on-template-argument-wignored +// I am not entirely sure wrapping it in a struct wouldn't at some point accidentally cause a misalignment +//Templated implementation to get to to work later template static inline Register set1_ps(float to); template <> inline __m128 set1_ps<__m128>(float to) { return _mm_set1_ps(to); } #ifdef __AVX__ -typedef __m256 float32x8; template <> inline __m256 set1_ps<__m256>(float to) { return _mm256_set1_ps(to); } #endif #ifdef __AVX512F__ -typedef __m256 float32x16; -template <> inline __m512 set1_ps<__m512>(float to) { +template <> inline __m512 set1_ps<__m5__attribute__ ((aligned (16)))12>(float to) { return _mm512_set1_ps(to); } #endif @@ -193,12 +192,12 @@ template <> inline __m512 set1_ps<__m512>(float to) { template struct float32v { private: - Register f_; + Register __attribute__ ((aligned (32))) f_; public: - float32vector() {} - float32vector(const Register& f) : f_(f) {} - float32vector(const float& f) : f_(set1_ps(f)) {} + float32v() {} + float32v(const Register& f) : f_(f) {} + float32v(const float& f) : f_(set1_ps(f)) {} operator const Register&() const { return f_; } operator Register&() { return f_; } @@ -208,7 +207,7 @@ struct float32v { } friend std::ostream& operator<<(std::ostream& out, Register f) { - size_t length = sizeof(Register)/sizeof(float); + const size_t constexpr length = sizeof(Register)/sizeof(float); float* a = (float*)&f; out << "[" << a[0]; for(size_t i = 1; i < length; i++) @@ -219,7 +218,15 @@ struct float32v { }; +using float32x4 = float32v<__m128>; +#ifdef __AVX__ +using float32x8 = float32v<__m256>; +#endif +#ifdef __AVX512F__ +using float32x16 = float32v<__m512>; +#endif */ + struct float32x4 { private: __m128 f_; @@ -278,7 +285,6 @@ struct float32x8 { struct float32x8 { }; #endif -#endif #ifdef __AVX512F__ struct float32x16 { @@ -307,7 +313,7 @@ struct float32x16 { } }; #endif - +#endif // #ifndef __CUDACC__ #if COMPILE_FP16 diff --git a/src/functional/operators.h b/src/functional/operators.h index 8389e8d33..7ec77baee 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ 
-472,6 +472,139 @@ struct Ops { namespace marian { namespace functional { + + +//******************************************************************************************* +// Specialization for float32x16 (=__m512, CPU AVX512 intrisics) +template <> +struct Ops { + typedef float Single; + + static inline float32x16 loop16(const std::function& f, const float32x16& x) { + float32x16 out; + for(int i = 0; i < 16; i++) + ((float*)&out)[i] = f(((const float*)&x)[i]); + return out; + } + + static inline float32x16 loop16(const std::function& f, const float32x16& x, const float32x16& y) { + float32x16 out; + for(int i = 0; i < 16; i++) + ((float*)&out)[i] = f(((const float*)&x)[i], ((const float*)&y)[i]); + return out; + } + + static inline float32x16 loop16(const std::function& f, const float32x16& x, const float32x16& y, const float32x16& z) { + float32x16 out; + for(int i = 0; i < 16; i++) + ((float*)&out)[i] = f(((const float*)&x)[i], ((const float*)&y)[i], ((const float*)&z)[i]); + return out; + } + + static inline float32x16 tanh(const float32x16& x) { // ( e^x - e^-x )/( e^x + e^-x ) + float32x16 e2x = exp(mul(2.f, x)); + return div(sub(e2x, 1.f), add(e2x, 1.f)); + } + + static inline float32x16 sin(const float32x16& x) { return sin512_ps(x); } + static inline float32x16 cos(const float32x16& x) { return cos512_ps(x); } + static inline float32x16 tan(const float32x16& x) { return div(sin(x), cos(x)); } // @TODO: use sincos256_ps + static inline float32x16 log(const float32x16& x) { return log512_ps(x); } + static inline float32x16 exp(const float32x16& x) { return exp512_ps(x); } + + // @TODO: get rid of loop16 with proper intrisics + static inline float32x16 abs(const float32x16& x) { return loop16(Ops::abs, x); } + static inline float32x16 sqr(const float32x16& x) { return _mm512_mul_ps(x, x); } + static inline float32x16 sqrt(const float32x16& x) { return _mm512_sqrt_ps(x); } + static inline float32x16 neg(const float32x16& x) { return sub(0.f, x); } + + // @TODO: get rid of loop16 with proper intrisics + static inline float32x16 sgn(const float32x16& x) { return loop16(Ops::sgn, x); } + + static inline float32x16 round(const float32x16& x) { return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEAREST_INT); } // Thank you, Intel: https://stackoverflow.com/questions/50854991/instrinsic-mm512-round-ps-is-missing-for-avx512 + static inline float32x16 floor(const float32x16& x) { return _mm512_floor_ps(x); } + static inline float32x16 ceil(const float32x16& x) { return _mm512_ceil_ps(x); } + + static inline float32x16 add(const float32x16& x, const float32x16& y) { return _mm512_add_ps(x, y); } + static inline float32x16 sub(const float32x16& x, const float32x16& y) { return _mm512_sub_ps(x, y); } + static inline float32x16 mul(const float32x16& x, const float32x16& y) { return _mm512_mul_ps(x, y); } + static inline float32x16 div(const float32x16& x, const float32x16& y) { return _mm512_div_ps(x, y); } + + static inline float32x16 max(const float32x16& x, const float32x16& y) { return _mm512_max_ps(x, y); } + static inline float32x16 min(const float32x16& x, const float32x16& y) { return _mm512_min_ps(x, y); } + static inline float32x16 pow(const float32x16& x, const float32x16& y) { return exp(mul(y, log(x))); } + + // @TODO: get rid of loop16 with proper intrisics + static inline float32x16 negate(float32x16& x) { return loop16(Ops::negate, x); } + + static inline float32x16 eq(const float32x16& x, const float32x16& y) { return loop16(Ops::eq, x, y); } + static inline float32x16 neq(const 
float32x16& x, const float32x16& y) { return loop16(Ops::neq, x, y); } + static inline float32x16 gt(const float32x16& x, const float32x16& y) { return loop16(Ops::gt, x, y); } + static inline float32x16 lt(const float32x16& x, const float32x16& y) { return loop16(Ops::lt, x, y); } + static inline float32x16 geq(const float32x16& x, const float32x16& y) { return loop16(Ops::geq, x, y); } + static inline float32x16 leq(const float32x16& x, const float32x16& y) { return loop16(Ops::leq, x, y); } + static inline float32x16 and_(const float32x16& x, const float32x16& y) { return loop16(Ops::and_, x, y); } // 'and' is used by gcc + static inline float32x16 or_(const float32x16& x, const float32x16& y) { return loop16(Ops::or_, x, y); } // 'or' is used by gcc + + + // Neural Networks specific functions + // @TODO: this is unsafe + static inline float32x16 sigmoid(const float32x16& x) { + float32x16 e = exp(x); + return div(e, add(1.f, e)); + } + + static inline float32x16 logaddexp(const float32x16& x, const float32x16& y) { return loop16(Ops::logaddexp, x, y); } + + static inline float32x16 clip(const float32x16& x, const float32x16& y) { return loop16(Ops::clip, x, y); } + static inline float32x16 bump(const float32x16& x, const float32x16& y) { return loop16(Ops::bump, x, y); } + + static inline float32x16 relu(const float32x16& x) { return max(0.f, x); } + + static inline float32x16 reluBack(const float32x16& x) { return loop16(Ops::reluBack, x); } + static inline float32x16 prelu(const float32x16& x, const float32x16& y) { return loop16(Ops::prelu, x, y); } + static inline float32x16 preluBack(const float32x16& x, const float32x16& y) { return loop16(Ops::preluBack, x, y); } + + static inline float32x16 if_then_else(const float32x16& x, const float32x16& y, const float32x16& z) { return loop16(Ops::if_then_else, x, y, z); } + + static inline Single sumReduce(const float32x16& x) { + // It's a sequence instruction so performance might be suboptimal, but this probably gives the compiler the best chance at optimising it: + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_add_ps&ig_expand=5926,5926,5703,5730,5660,5660&avx512techs=AVX512F + Single sum = _mm512_reduce_add_ps(x); + return sum; + /* + Single sum = 0; + for(int i = 0; i < 16; ++i) + sum = Ops::add(sum, x[i]); + return sum;*/ + } + + static inline Single maxReduce(const float32x16& x) { + // It's a sequence instruction so performance might be suboptimal, but this probably gives the compiler the best chance at optimising it: + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_max_ps&ig_expand=5926,5926,5703&avx512techs=AVX512F + Single max = _mm512_reduce_max_ps(x); + return max; + /* + Single maxs = x[0]; + for(int i = 1; i < 16; ++i) + maxs = Ops::max(maxs, x[i]); + return maxs;*/ + } + + static inline Single minReduce(const float32x16& x) { + // It's a sequence instruction so performance might be suboptimal, but this probably gives the compiler the best chance at optimising it: + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_min_ps&ig_expand=5926,5926,5703,5730&avx512techs=AVX512F + Single min = _mm512_reduce_min_ps(x); + return min; + /* + Single mins = x[0]; + for(int i = 1; i < 16; ++i) + mins = Ops::min(mins, x[i]); + return mins; + */ + } +}; + } // end namespace functional } // end namespace marian #endif diff --git a/src/functional/tensor.h b/src/functional/tensor.h index 
f5549c608..1b73d5825 100644 --- a/src/functional/tensor.h +++ b/src/functional/tensor.h @@ -44,7 +44,21 @@ inline marian::Shape adapt(const marian::Shape& shape) { return x8Shape; } #endif + +#ifdef __AVX512F__ +// as above, but for a stride of 16, since we are processing 16 floats at once +template <> +inline marian::Shape adapt(const marian::Shape& shape) { + ABORT_IF(shape[-1] % 16 != 0, + "Last dim ({}) is not a multiple of 16 while converting to Tensor", + shape[-1]); + + marian::Shape x16Shape = shape; + x16Shape.set(-1, shape[-1] / 16); + return x16Shape; +} #endif +#endif // __CUDACC__ #if COMPILE_FP16 // as above, but for a stride of 2, since we are processing 2 half floats at once. Works on GPU. diff --git a/src/tensors/cpu/element.h b/src/tensors/cpu/element.h index a0d111fd3..afdace652 100644 --- a/src/tensors/cpu/element.h +++ b/src/tensors/cpu/element.h @@ -83,10 +83,13 @@ template void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors) { #ifndef __CUDACC__ std::vector ts({out, tensors...}); + bool div16 = true; bool div8 = true; bool div4 = true; for(auto t : ts) { + if(t->shape()[-1] % 16 != 0) + div16 = false; if(t->shape()[-1] % 8 != 0) div8 = false; if(t->shape()[-1] % 4 != 0) { @@ -95,6 +98,14 @@ void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors } } + if(div16) { + // std::cerr << "16: " << functor.to_string() << std::endl; +#ifdef __AVX512F__ + element(functor, out, tensors...); + return; +#endif + } + if(div8) { // std::cerr << "8: " << functor.to_string() << std::endl; #ifdef __AVX__ diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1e1adc38b..c4ba6bc52 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -452,7 +452,12 @@ void Softmax(Tensor out, Tensor in) { void Softmax(Tensor out, Tensor in) { matchOrAbort(out->type()); matchOrAbort(in->type()); - +#ifdef __AVX512F__ + if(out->shape()[-1] % 16 == 0) { + Softmax(out, in); + return; + } +#endif #ifdef __AVX__ if(out->shape()[-1] % 8 == 0) { Softmax(out, in); @@ -509,6 +514,12 @@ void LogSoftmax(Tensor out, Tensor in) { matchOrAbort(out->type()); matchOrAbort(in->type()); +#ifdef __AVX512F__ + if(out->shape()[-1] % 16 == 0) { + LogSoftmax(out, in); + return; + } +#endif #ifdef __AVX__ if(out->shape()[-1] % 8 == 0) { LogSoftmax(out, in); @@ -1521,6 +1532,11 @@ void LSTMCellForwardTyped(Tensor out_, const std::vector& inputs) { void LSTMCellForward(Tensor out, std::vector inputs) { int cols = out->shape()[-1]; +#ifdef __AVX512F__ + if(cols % 16 == 0) + LSTMCellForwardTyped(out, inputs); + else +#endif #ifdef __AVX__ if(cols % 8 == 0) LSTMCellForwardTyped(out, inputs); @@ -1565,10 +1581,15 @@ void LSTMOutputForwardTyped(Tensor out_, const std::vector& inputs) { void LSTMOutputForward(Tensor out, std::vector inputs) { int cols = out->shape()[-1]; +#ifdef __AVX512F__ + if(cols % 16 == 0) + LSTMOutputForwardTyped(out, inputs); + else +#endif #ifdef __AVX__ if(cols % 8 == 0) LSTMOutputForwardTyped(out, inputs); - else + else #endif if(cols % 4 == 0) LSTMOutputForwardTyped(out, inputs); From d326cfc349d890882a1779701a6ac25c1c16a425 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Wed, 27 Jul 2022 14:52:14 +0000 Subject: [PATCH 3/4] Fixes and optimisations --- src/3rd_party/avx512_mathfun.h | 14 ++++++------- src/common/types.h | 6 +++++- src/functional/operators.h | 37 +++++++++++++++++----------------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git 
a/src/3rd_party/avx512_mathfun.h b/src/3rd_party/avx512_mathfun.h index f4d01d836..2355a8165 100644 --- a/src/3rd_party/avx512_mathfun.h +++ b/src/3rd_party/avx512_mathfun.h @@ -17,20 +17,20 @@ typedef __m512i v16si; // vector of 16 int (avx512) /* yes I know, the top of this file is quite ugly */ #ifdef _MSC_VER /* visual c++ */ -# define ALIGN32_BEG __declspec(align(32)) -# define ALIGN32_END +# define ALIGN64_BEG __declspec(align(64)) +# define ALIGN64_END #else /* gcc or icc */ -# define ALIGN32_BEG -# define ALIGN32_END __attribute__((aligned(32))) +# define ALIGN64_BEG +# define ALIGN64_END __attribute__((aligned(64))) #endif /* declare some AVX512 constants -- why can't I figure a better way to do that? */ #define _PS512_CONST(Name, Val) \ - static const constexpr ALIGN32_BEG float _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + static const constexpr ALIGN64_BEG float _ps512_##Name[16] ALIGN64_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } #define _PI32_CONST512(Name, Val) \ - static const constexpr ALIGN32_BEG int _pi32_512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + static const constexpr ALIGN64_BEG int _pi32_512_##Name[16] ALIGN64_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } #define _PS512_CONST_TYPE(Name, Type, Val) \ - static const constexpr ALIGN32_BEG Type _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + static const constexpr ALIGN64_BEG Type _ps512_##Name[16] ALIGN64_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } _PS512_CONST(1 , 1.0f); _PS512_CONST(0p5, 0.5f); diff --git a/src/common/types.h b/src/common/types.h index 1bcc96de1..131f3f1e7 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -197,10 +197,11 @@ struct float32v { public: float32v() {} float32v(const Register& f) : f_(f) {} - float32v(const float& f) : f_(set1_ps(f)) {} + float32v(const float& f) : f_(set1_ps(f)) {} // copies value into all slots operator const Register&() const { return f_; } operator Register&() { return f_; } + Register* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __mXXX is an array of floats @@ -238,6 +239,7 @@ struct float32x4 { operator const __m128&() const { return f_; } operator __m128&() { return f_; } + __m128* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats @@ -266,6 +268,7 @@ struct float32x8 { operator const __m256&() const { return f_; } operator __m256&() { return f_; } + __m256* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats @@ -298,6 +301,7 @@ struct float32x16 { operator const __m512&() const { return f_; } operator __m512&() { return f_; } + __m512* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. 
float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats diff --git a/src/functional/operators.h b/src/functional/operators.h index 7ec77baee..41f9ae5f0 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -253,7 +253,12 @@ struct Ops { static inline float32x4 sin(const float32x4& x) { return sin_ps(x); } static inline float32x4 cos(const float32x4& x) { return cos_ps(x); } - static inline float32x4 tan(const float32x4& x) { return div(sin(x), cos(x)); } + static inline float32x4 tan(const float32x4& x) { + float32x4 my_sin; + float32x4 my_cos; + sincos_ps(x, my_sin.get(), my_cos.get()); + return div(my_sin, my_cos); + } static inline float32x4 log(const float32x4& x) { return log_ps(x); } static inline float32x4 exp(const float32x4& x) { return exp_ps(x); } @@ -382,7 +387,12 @@ struct Ops { static inline float32x8 sin(const float32x8& x) { return sin256_ps(x); } static inline float32x8 cos(const float32x8& x) { return cos256_ps(x); } - static inline float32x8 tan(const float32x8& x) { return div(sin(x), cos(x)); } // @TODO: use sincos256_ps + static inline float32x8 tan(const float32x8& x) { + float32x8 my_sin; + float32x8 my_cos; + sincos256_ps(x, my_sin.get(), my_cos.get()); + return div(my_sin, my_cos); + } static inline float32x8 log(const float32x8& x) { return log256_ps(x); } static inline float32x8 exp(const float32x8& x) { return exp256_ps(x); } @@ -508,7 +518,12 @@ struct Ops { static inline float32x16 sin(const float32x16& x) { return sin512_ps(x); } static inline float32x16 cos(const float32x16& x) { return cos512_ps(x); } - static inline float32x16 tan(const float32x16& x) { return div(sin(x), cos(x)); } // @TODO: use sincos256_ps + static inline float32x16 tan(const float32x16& x) { + float32x16 my_sin; + float32x16 my_cos; + sincos512_ps(x, my_sin.get(), my_cos.get()); + return div(my_sin, my_cos); + } static inline float32x16 log(const float32x16& x) { return log512_ps(x); } static inline float32x16 exp(const float32x16& x) { return exp512_ps(x); } @@ -572,11 +587,6 @@ struct Ops { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_add_ps&ig_expand=5926,5926,5703,5730,5660,5660&avx512techs=AVX512F Single sum = _mm512_reduce_add_ps(x); return sum; - /* - Single sum = 0; - for(int i = 0; i < 16; ++i) - sum = Ops::add(sum, x[i]); - return sum;*/ } static inline Single maxReduce(const float32x16& x) { @@ -584,11 +594,6 @@ struct Ops { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_max_ps&ig_expand=5926,5926,5703&avx512techs=AVX512F Single max = _mm512_reduce_max_ps(x); return max; - /* - Single maxs = x[0]; - for(int i = 1; i < 16; ++i) - maxs = Ops::max(maxs, x[i]); - return maxs;*/ } static inline Single minReduce(const float32x16& x) { @@ -596,12 +601,6 @@ struct Ops { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_min_ps&ig_expand=5926,5926,5703,5730&avx512techs=AVX512F Single min = _mm512_reduce_min_ps(x); return min; - /* - Single mins = x[0]; - for(int i = 1; i < 16; ++i) - mins = Ops::min(mins, x[i]); - return mins; - */ } }; From 34189282e67972d7bf8ca0c88cb008a80a680b5b Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 28 Jul 2022 22:58:31 +0000 Subject: [PATCH 4/4] Changelog stuff --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
01aea0251..ce678114c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] - +- Added AVX512 types and a functional operators path. +- Small optimisations on the AVX and SSE instruction paths. ### Added ### Fixed
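The dispatch logic that the second patch threads through element.h and tensor_operators.cpp comes down to one rule: use the widest SIMD path whose lane count evenly divides the innermost tensor dimension — 16 lanes for AVX512, then 8 for AVX, then 4 for SSE, otherwise scalar code. The sketch below restates that rule in isolation; the names SimdPath and pickPath are illustrative and do not appear in the patch, and the real code additionally guards each branch with the corresponding #ifdef (__AVX512F__, __AVX__) and, in elementFloat(), checks the last dimension of every tensor involved, not just the output.

#include <cstddef>
#include <iostream>

// Illustrative restatement of the width-dispatch rule used by elementFloat(), Softmax(), LSTMCellForward(), etc.
enum class SimdPath { Scalar = 0, Sse4 = 4, Avx8 = 8, Avx512x16 = 16 };

SimdPath pickPath(std::size_t lastDim) {
  if (lastDim % 16 == 0) return SimdPath::Avx512x16; // float32x16: 16 floats per __m512
  if (lastDim % 8  == 0) return SimdPath::Avx8;      // float32x8:   8 floats per __m256
  if (lastDim % 4  == 0) return SimdPath::Sse4;      // float32x4:   4 floats per __m128
  return SimdPath::Scalar;                           // plain float fallback
}

int main() {
  const std::size_t dims[] = {512, 24, 12, 7};
  for (std::size_t d : dims)
    std::cout << d << " -> " << static_cast<int>(pickPath(d)) << " lanes\n";
  // 512 -> 16, 24 -> 8, 12 -> 4, 7 -> 0 (scalar)
  return 0;
}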