From 93a52af989f2fa1e3e255d58f597cf86ef2da33d Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Mon, 25 Jul 2022 20:26:18 +0000 Subject: [PATCH 1/4] On the way to adding avx512 specialisation templates --- src/3rd_party/avx512_mathfun.h | 540 +++++++++++++++++++++++++++++++++ src/common/types.h | 78 +++++ src/functional/operators.h | 9 + 3 files changed, 627 insertions(+) create mode 100644 src/3rd_party/avx512_mathfun.h diff --git a/src/3rd_party/avx512_mathfun.h b/src/3rd_party/avx512_mathfun.h new file mode 100644 index 000000000..f4d01d836 --- /dev/null +++ b/src/3rd_party/avx512_mathfun.h @@ -0,0 +1,540 @@ +/* + AVX512 implementation of sin, cos, sincos, exp and log + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + Copyright (C) 2017 Adrien Cassagne + MIT license + + Adapted from https://github.com/aff3ct/MIPP/blob/master/src/math/avx512_mathfun.h and https://github.com/aff3ct/MIPP/blob/master/src/math/avx512_mathfun.hxx +*/ +#pragma once +#ifdef __AVX512F__ + +#include + +typedef __m512 v16sf; // vector of 16 float (avx512) +typedef __m512i v16si; // vector of 16 int (avx512) + +/* yes I know, the top of this file is quite ugly */ +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN32_BEG __declspec(align(32)) +# define ALIGN32_END +#else /* gcc or icc */ +# define ALIGN32_BEG +# define ALIGN32_END __attribute__((aligned(32))) +#endif + +/* declare some AVX512 constants -- why can't I figure a better way to do that? */ +#define _PS512_CONST(Name, Val) \ + static const constexpr ALIGN32_BEG float _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST512(Name, Val) \ + static const constexpr ALIGN32_BEG int _pi32_512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS512_CONST_TYPE(Name, Type, Val) \ + static const constexpr ALIGN32_BEG Type _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + +_PS512_CONST(1 , 1.0f); +_PS512_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS512_CONST_TYPE(min_norm_pos, int, 0x00800000); +//_PS512_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS512_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS512_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS512_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST512(0, 0); +_PI32_CONST512(1, 1); +_PI32_CONST512(0xffffffff, (int)0xFFFFFFFF); +_PI32_CONST512(inv1, ~1); +_PI32_CONST512(2, 2); +_PI32_CONST512(4, 4); +_PI32_CONST512(0x7f, 0x7f); + +_PS512_CONST(cephes_SQRTHF, 0.707106781186547524f); +_PS512_CONST(cephes_log_p0, 7.0376836292E-2f); +_PS512_CONST(cephes_log_p1, - 1.1514610310E-1f); +_PS512_CONST(cephes_log_p2, 1.1676998740E-1f); +_PS512_CONST(cephes_log_p3, - 1.2420140846E-1f); +_PS512_CONST(cephes_log_p4, + 1.4249322787E-1f); +_PS512_CONST(cephes_log_p5, - 1.6668057665E-1f); +_PS512_CONST(cephes_log_p6, + 2.0000714765E-1f); +_PS512_CONST(cephes_log_p7, - 2.4999993993E-1f); +_PS512_CONST(cephes_log_p8, + 3.3333331174E-1f); +_PS512_CONST(cephes_log_q1, -2.12194440e-4f); +_PS512_CONST(cephes_log_q2, 0.693359375f); + +static inline v16si _wrap_mm512_slli_epi32(v16si x, int y) { return _mm512_slli_epi32(x,y); } +static inline v16si _wrap_mm512_srli_epi32(v16si x, int y) { return _mm512_srli_epi32(x,y); } +static inline v16si _wrap_mm512_sub_epi32 (v16si x, v16si y) { return _mm512_sub_epi32 (x,y); } +static inline v16si 
_wrap_mm512_add_epi32 (v16si x, v16si y) { return _mm512_add_epi32 (x,y); } + + +/* natural logarithm computed for 16 simultaneous float + return NaN for x <= 0 +*/ +static inline v16sf log512_ps(v16sf x) { + v16si imm0; + v16sf one = *(v16sf*)_ps512_1; + + //v16sf invalid_mask = _mm512_cmple_ps(x, _mm512_setzero_ps()); + __mmask16 invalid_mask2 = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_LE_OS); + v16sf invalid_mask = _mm512_mask_blend_ps(invalid_mask2, *(v16sf*)_pi32_512_0, *(v16sf*)_pi32_512_0xffffffff); + + x = _mm512_max_ps(x, *(v16sf*)_ps512_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _wrap_mm512_srli_epi32(_mm512_castps_si512(x), 23); + + /* keep only the fractional part */ +// x = _mm512_and_ps(x, *(v16sf*)_ps512_inv_mant_mask); + x = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x), _mm512_castps_si512(*(v16sf*)_ps512_inv_mant_mask))); +// x = _mm512_or_ps(x, *(v16sf*)_ps512_0p5); + x = _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(x), _mm512_castps_si512(*(v16sf*)_ps512_0p5))); + + // this is again another AVX2 instruction + imm0 = _wrap_mm512_sub_epi32(imm0, *(v16si*)_pi32_512_0x7f); + v16sf e = _mm512_cvtepi32_ps(imm0); + + e = _mm512_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //v16sf mask = _mm512_cmplt_ps(x, *(v16sf*)_ps512_cephes_SQRTHF); + __mmask16 mask2 = _mm512_cmp_ps_mask(x, *(v16sf*)_ps512_cephes_SQRTHF, _CMP_LT_OS); + v16sf mask = _mm512_mask_blend_ps(mask2, *(v16sf*)_pi32_512_0, *(v16sf*)_pi32_512_0xffffffff); + +// v16sf tmp = _mm512_and_ps(x, mask); + v16sf tmp = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x), _mm512_castps_si512(mask))); + x = _mm512_sub_ps(x, one); +// e = _mm512_sub_ps(e, _mm512_and_ps(one, mask)); + e = _mm512_sub_ps(e, _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(one), _mm512_castps_si512(mask)))); + x = _mm512_add_ps(x, tmp); + + v16sf z = _mm512_mul_ps(x,x); + + v16sf y = *(v16sf*)_ps512_cephes_log_p0; + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p1); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p2); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p3); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p4); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p5); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p6); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p7); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_log_p8); + y = _mm512_mul_ps(y, x); + + y = _mm512_mul_ps(y, z); + + tmp = _mm512_mul_ps(e, *(v16sf*)_ps512_cephes_log_q1); + y = _mm512_add_ps(y, tmp); + + + tmp = _mm512_mul_ps(z, *(v16sf*)_ps512_0p5); + y = _mm512_sub_ps(y, tmp); + + tmp = _mm512_mul_ps(e, *(v16sf*)_ps512_cephes_log_q2); + x = _mm512_add_ps(x, y); + x = _mm512_add_ps(x, tmp); +// x = _mm512_or_ps(x, invalid_mask); // negative arg will be NAN + x = _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(x), _mm512_castps_si512(invalid_mask))); + return x; +} + +_PS512_CONST(exp_hi, 88.3762626647949f); +_PS512_CONST(exp_lo, -88.3762626647949f); + +_PS512_CONST(cephes_LOG2EF, 1.44269504088896341f); +_PS512_CONST(cephes_exp_C1, 0.693359375f); +_PS512_CONST(cephes_exp_C2, -2.12194440e-4f); + +_PS512_CONST(cephes_exp_p0, 1.9875691500E-4f); +_PS512_CONST(cephes_exp_p1, 
1.3981999507E-3f); +_PS512_CONST(cephes_exp_p2, 8.3334519073E-3f); +_PS512_CONST(cephes_exp_p3, 4.1665795894E-2f); +_PS512_CONST(cephes_exp_p4, 1.6666665459E-1f); +_PS512_CONST(cephes_exp_p5, 5.0000001201E-1f); + +static inline v16sf exp512_ps(v16sf x) { + v16sf tmp = _mm512_setzero_ps(), fx; + v16si imm0; + v16sf one = *(v16sf*)_ps512_1; + + x = _mm512_min_ps(x, *(v16sf*)_ps512_exp_hi); + x = _mm512_max_ps(x, *(v16sf*)_ps512_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm512_mul_ps(x, *(v16sf*)_ps512_cephes_LOG2EF); + fx = _mm512_add_ps(fx, *(v16sf*)_ps512_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm512_cvttps_epi32(fx); + //tmp = _mm512_cvtepi32_ps(imm0); + + tmp = _mm512_floor_ps(fx); + + /* if greater, substract 1 */ + //v16sf mask = _mm512_cmpgt_ps(tmp, fx); +// v16sf mask = _mm512_cmp_ps(tmp, fx, _CMP_GT_OS); + __mmask16 mask2 = _mm512_cmp_ps_mask(tmp, fx, _CMP_GT_OS); + v16sf mask = _mm512_mask_blend_ps(mask2, *(v16sf*)_pi32_512_0, *(v16sf*)_pi32_512_0xffffffff); +// mask = _mm512_and_ps(mask, one); + mask = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(mask), _mm512_castps_si512(one))); + fx = _mm512_sub_ps(tmp, mask); + + tmp = _mm512_mul_ps(fx, *(v16sf*)_ps512_cephes_exp_C1); + v16sf z = _mm512_mul_ps(fx, *(v16sf*)_ps512_cephes_exp_C2); + x = _mm512_sub_ps(x, tmp); + x = _mm512_sub_ps(x, z); + + z = _mm512_mul_ps(x,x); + + v16sf y = *(v16sf*)_ps512_cephes_exp_p0; + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p1); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p2); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p3); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p4); + y = _mm512_mul_ps(y, x); + y = _mm512_add_ps(y, *(v16sf*)_ps512_cephes_exp_p5); + y = _mm512_mul_ps(y, z); + y = _mm512_add_ps(y, x); + y = _mm512_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm512_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _wrap_mm512_add_epi32(imm0, *(v16si*)_pi32_512_0x7f); + imm0 = _wrap_mm512_slli_epi32(imm0, 23); + v16sf pow2n = _mm512_castsi512_ps(imm0); + y = _mm512_mul_ps(y, pow2n); + return y; +} + +_PS512_CONST(minus_cephes_DP1, -0.78515625f); +_PS512_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); +_PS512_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); +_PS512_CONST(sincof_p0, -1.9515295891E-4f); +_PS512_CONST(sincof_p1, 8.3321608736E-3f); +_PS512_CONST(sincof_p2, -1.6666654611E-1f); +_PS512_CONST(coscof_p0, 2.443315711809948E-005f); +_PS512_CONST(coscof_p1, -1.388731625493765E-003f); +_PS512_CONST(coscof_p2, 4.166664568298827E-002f); +_PS512_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI + + +/* evaluation of 16 sines at onces using AVX intrisics + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+*/ +static inline v16sf sin512_ps(v16sf x) { // any x + v16sf xmm1, xmm2 = _mm512_setzero_ps(), xmm3, sign_bit, y; + v16si imm0, imm2; + + sign_bit = x; + /* take the absolute value */ +// x = _mm512_and_ps(x, *(v16sf*)_ps512_inv_sign_mask); + x = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x), _mm512_castps_si512(*(v16sf*)_ps512_inv_sign_mask))); + /* extract the sign bit (upper one) */ +// sign_bit = _mm512_and_ps(sign_bit, *(v16sf*)_ps512_sign_mask); + sign_bit = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(sign_bit), _mm512_castps_si512(*(v16sf*)_ps512_sign_mask))); + + /* scale by 4/Pi */ + y = _mm512_mul_ps(x, *(v16sf*)_ps512_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + + /* store the integer part of y in mm0 */ + imm2 = _mm512_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _wrap_mm512_add_epi32(imm2, *(v16si*)_pi32_512_1); + imm2 = _mm512_and_si512(imm2, *(v16si*)_pi32_512_inv1); + y = _mm512_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm512_and_si512(imm2, *(v16si*)_pi32_512_4); + imm0 = _wrap_mm512_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 static inline Register set1_ps(float to); +template <> inline __m128 set1_ps<__m128>(float to) { + return _mm_set1_ps(to); +} +#ifdef __AVX__ +typedef __m256 float32x8; +template <> inline __m256 set1_ps<__m256>(float to) { + return _mm256_set1_ps(to); +} +#endif +#ifdef __AVX512F__ +typedef __m256 float32x16; +template <> inline __m512 set1_ps<__m512>(float to) { + return _mm512_set1_ps(to); +} +#endif + +template +struct float32v { +private: + Register f_; + +public: + float32vector() {} + float32vector(const Register& f) : f_(f) {} + float32vector(const float& f) : f_(set1_ps(f)) {} + + operator const Register&() const { return f_; } + operator Register&() { return f_; } + + float operator[] (size_t i) const { + return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __mXXX is an array of floats + } + + friend std::ostream& operator<<(std::ostream& out, Register f) { + size_t length = sizeof(Register)/sizeof(float); + float* a = (float*)&f; + out << "[" << a[0]; + for(size_t i = 1; i < length; i++) + out << " " << a[i]; + out << "]"; + return out; + } + +}; + +*/ struct float32x4 { private: __m128 f_; @@ -231,6 +280,35 @@ struct float32x8 { #endif #endif +#ifdef __AVX512F__ +struct float32x16 { +private: + __m512 f_; + +public: + float32x16() {} + float32x16(const __m512& f) : f_(f) {} + float32x16(const float& f) : f_(_mm512_set1_ps(f)) {} // __m256 _mm_set1_ps(float) copies value into all slots + + operator const __m512&() const { return f_; } + operator __m512&() { return f_; } + + float operator[] (size_t i) const { + return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats + } + + friend std::ostream& operator<<(std::ostream& out, float32x16 f16) { + float* a = (float*)&f16; + out << "[" << a[0]; + for(int i = 1; i < 16; i++) + out << " " << a[i]; + out << "]"; + return out; + } +}; +#endif + + #if COMPILE_FP16 // @TODO: check what intrinsics are actually available. 
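A minimal, self-contained sketch of the wrapper pattern that the float32x16 struct above follows: a thin struct around __m512 that broadcasts a scalar on construction and converts implicitly back to the raw register, so the AVX512 intrinsics (and the new *512_ps math functions) can be applied to it directly. The struct name float32x16_sketch and the main() driver are illustrative only and not part of the patch; building it is assumed to require an AVX512F-capable compiler and target (e.g. GCC/Clang with -mavx512f).

#include <immintrin.h>
#include <cstddef>
#include <cstdio>

// Illustrative stand-in for the float32x16 wrapper added to src/common/types.h above.
struct float32x16_sketch {
  __m512 f_;
  float32x16_sketch(const __m512& f) : f_(f) {}            // wrap an existing register
  float32x16_sketch(float f) : f_(_mm512_set1_ps(f)) {}    // broadcast the scalar into all 16 lanes
  operator const __m512&() const { return f_; }            // implicit conversion back to __m512
  float operator[](std::size_t i) const { return ((const float*)&f_)[i]; }
};

int main() {
  float32x16_sketch a(1.5f);                         // all 16 lanes hold 1.5f
  float32x16_sketch b(_mm512_set1_ps(2.0f));
  float32x16_sketch c(_mm512_add_ps(a, b));          // intrinsics accept the wrapper via the conversion operator
  std::printf("%f %f\n", c[0], c[15]);               // prints 3.500000 twice
  return 0;
}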
diff --git a/src/functional/operators.h b/src/functional/operators.h index a14f153f1..8389e8d33 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -463,6 +463,15 @@ struct Ops { } }; +} // end namespace functional +} // end namespace marian +#endif + +#ifdef __AVX512F__ +#include "3rd_party/avx512_mathfun.h" + +namespace marian { +namespace functional { } // end namespace functional } // end namespace marian #endif From e1bd54402d4cd28a9aebd7da5542e45907e0e24d Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 26 Jul 2022 16:10:39 +0000 Subject: [PATCH 2/4] First version of avx512intrinsics. Compiles but crashes --- src/common/types.h | 30 +++--- src/functional/operators.h | 133 +++++++++++++++++++++++++++ src/functional/tensor.h | 14 +++ src/tensors/cpu/element.h | 11 +++ src/tensors/cpu/tensor_operators.cpp | 25 ++++- 5 files changed, 199 insertions(+), 14 deletions(-) diff --git a/src/common/types.h b/src/common/types.h index ada061c73..1bcc96de1 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -171,21 +171,20 @@ struct intgemm8avx512vnni { int8_t x; }; #ifndef __CUDACC__ // vectorized types not available from .cu files // @TODO: check what intrinsics are actually available. -/* Templated implementation to get to to work later -typedef __m128 float32x4; +/* Currently doesn't work due to https://stackoverflow.com/questions/41676311/implication-of-gcc-warning-ignoring-attributes-on-template-argument-wignored +// I am not entirely sure wrapping it in a struct wouldn't at some point accidentally cause a misalignment +//Templated implementation to get to to work later template static inline Register set1_ps(float to); template <> inline __m128 set1_ps<__m128>(float to) { return _mm_set1_ps(to); } #ifdef __AVX__ -typedef __m256 float32x8; template <> inline __m256 set1_ps<__m256>(float to) { return _mm256_set1_ps(to); } #endif #ifdef __AVX512F__ -typedef __m256 float32x16; -template <> inline __m512 set1_ps<__m512>(float to) { +template <> inline __m512 set1_ps<__m5__attribute__ ((aligned (16)))12>(float to) { return _mm512_set1_ps(to); } #endif @@ -193,12 +192,12 @@ template <> inline __m512 set1_ps<__m512>(float to) { template struct float32v { private: - Register f_; + Register __attribute__ ((aligned (32))) f_; public: - float32vector() {} - float32vector(const Register& f) : f_(f) {} - float32vector(const float& f) : f_(set1_ps(f)) {} + float32v() {} + float32v(const Register& f) : f_(f) {} + float32v(const float& f) : f_(set1_ps(f)) {} operator const Register&() const { return f_; } operator Register&() { return f_; } @@ -208,7 +207,7 @@ struct float32v { } friend std::ostream& operator<<(std::ostream& out, Register f) { - size_t length = sizeof(Register)/sizeof(float); + const size_t constexpr length = sizeof(Register)/sizeof(float); float* a = (float*)&f; out << "[" << a[0]; for(size_t i = 1; i < length; i++) @@ -219,7 +218,15 @@ struct float32v { }; +using float32x4 = float32v<__m128>; +#ifdef __AVX__ +using float32x8 = float32v<__m256>; +#endif +#ifdef __AVX512F__ +using float32x16 = float32v<__m512>; +#endif */ + struct float32x4 { private: __m128 f_; @@ -278,7 +285,6 @@ struct float32x8 { struct float32x8 { }; #endif -#endif #ifdef __AVX512F__ struct float32x16 { @@ -307,7 +313,7 @@ struct float32x16 { } }; #endif - +#endif // #ifndef __CUDACC__ #if COMPILE_FP16 diff --git a/src/functional/operators.h b/src/functional/operators.h index 8389e8d33..7ec77baee 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ 
-472,6 +472,139 @@ struct Ops { namespace marian { namespace functional { + + +//******************************************************************************************* +// Specialization for float32x16 (=__m512, CPU AVX512 intrisics) +template <> +struct Ops { + typedef float Single; + + static inline float32x16 loop16(const std::function& f, const float32x16& x) { + float32x16 out; + for(int i = 0; i < 16; i++) + ((float*)&out)[i] = f(((const float*)&x)[i]); + return out; + } + + static inline float32x16 loop16(const std::function& f, const float32x16& x, const float32x16& y) { + float32x16 out; + for(int i = 0; i < 16; i++) + ((float*)&out)[i] = f(((const float*)&x)[i], ((const float*)&y)[i]); + return out; + } + + static inline float32x16 loop16(const std::function& f, const float32x16& x, const float32x16& y, const float32x16& z) { + float32x16 out; + for(int i = 0; i < 16; i++) + ((float*)&out)[i] = f(((const float*)&x)[i], ((const float*)&y)[i], ((const float*)&z)[i]); + return out; + } + + static inline float32x16 tanh(const float32x16& x) { // ( e^x - e^-x )/( e^x + e^-x ) + float32x16 e2x = exp(mul(2.f, x)); + return div(sub(e2x, 1.f), add(e2x, 1.f)); + } + + static inline float32x16 sin(const float32x16& x) { return sin512_ps(x); } + static inline float32x16 cos(const float32x16& x) { return cos512_ps(x); } + static inline float32x16 tan(const float32x16& x) { return div(sin(x), cos(x)); } // @TODO: use sincos256_ps + static inline float32x16 log(const float32x16& x) { return log512_ps(x); } + static inline float32x16 exp(const float32x16& x) { return exp512_ps(x); } + + // @TODO: get rid of loop16 with proper intrisics + static inline float32x16 abs(const float32x16& x) { return loop16(Ops::abs, x); } + static inline float32x16 sqr(const float32x16& x) { return _mm512_mul_ps(x, x); } + static inline float32x16 sqrt(const float32x16& x) { return _mm512_sqrt_ps(x); } + static inline float32x16 neg(const float32x16& x) { return sub(0.f, x); } + + // @TODO: get rid of loop16 with proper intrisics + static inline float32x16 sgn(const float32x16& x) { return loop16(Ops::sgn, x); } + + static inline float32x16 round(const float32x16& x) { return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEAREST_INT); } // Thank you, Intel: https://stackoverflow.com/questions/50854991/instrinsic-mm512-round-ps-is-missing-for-avx512 + static inline float32x16 floor(const float32x16& x) { return _mm512_floor_ps(x); } + static inline float32x16 ceil(const float32x16& x) { return _mm512_ceil_ps(x); } + + static inline float32x16 add(const float32x16& x, const float32x16& y) { return _mm512_add_ps(x, y); } + static inline float32x16 sub(const float32x16& x, const float32x16& y) { return _mm512_sub_ps(x, y); } + static inline float32x16 mul(const float32x16& x, const float32x16& y) { return _mm512_mul_ps(x, y); } + static inline float32x16 div(const float32x16& x, const float32x16& y) { return _mm512_div_ps(x, y); } + + static inline float32x16 max(const float32x16& x, const float32x16& y) { return _mm512_max_ps(x, y); } + static inline float32x16 min(const float32x16& x, const float32x16& y) { return _mm512_min_ps(x, y); } + static inline float32x16 pow(const float32x16& x, const float32x16& y) { return exp(mul(y, log(x))); } + + // @TODO: get rid of loop16 with proper intrisics + static inline float32x16 negate(float32x16& x) { return loop16(Ops::negate, x); } + + static inline float32x16 eq(const float32x16& x, const float32x16& y) { return loop16(Ops::eq, x, y); } + static inline float32x16 neq(const 
float32x16& x, const float32x16& y) { return loop16(Ops::neq, x, y); } + static inline float32x16 gt(const float32x16& x, const float32x16& y) { return loop16(Ops::gt, x, y); } + static inline float32x16 lt(const float32x16& x, const float32x16& y) { return loop16(Ops::lt, x, y); } + static inline float32x16 geq(const float32x16& x, const float32x16& y) { return loop16(Ops::geq, x, y); } + static inline float32x16 leq(const float32x16& x, const float32x16& y) { return loop16(Ops::leq, x, y); } + static inline float32x16 and_(const float32x16& x, const float32x16& y) { return loop16(Ops::and_, x, y); } // 'and' is used by gcc + static inline float32x16 or_(const float32x16& x, const float32x16& y) { return loop16(Ops::or_, x, y); } // 'or' is used by gcc + + + // Neural Networks specific functions + // @TODO: this is unsafe + static inline float32x16 sigmoid(const float32x16& x) { + float32x16 e = exp(x); + return div(e, add(1.f, e)); + } + + static inline float32x16 logaddexp(const float32x16& x, const float32x16& y) { return loop16(Ops::logaddexp, x, y); } + + static inline float32x16 clip(const float32x16& x, const float32x16& y) { return loop16(Ops::clip, x, y); } + static inline float32x16 bump(const float32x16& x, const float32x16& y) { return loop16(Ops::bump, x, y); } + + static inline float32x16 relu(const float32x16& x) { return max(0.f, x); } + + static inline float32x16 reluBack(const float32x16& x) { return loop16(Ops::reluBack, x); } + static inline float32x16 prelu(const float32x16& x, const float32x16& y) { return loop16(Ops::prelu, x, y); } + static inline float32x16 preluBack(const float32x16& x, const float32x16& y) { return loop16(Ops::preluBack, x, y); } + + static inline float32x16 if_then_else(const float32x16& x, const float32x16& y, const float32x16& z) { return loop16(Ops::if_then_else, x, y, z); } + + static inline Single sumReduce(const float32x16& x) { + // It's a sequence instruction so performance might be suboptimal, but this probably gives the compiler the best chance at optimising it: + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_add_ps&ig_expand=5926,5926,5703,5730,5660,5660&avx512techs=AVX512F + Single sum = _mm512_reduce_add_ps(x); + return sum; + /* + Single sum = 0; + for(int i = 0; i < 16; ++i) + sum = Ops::add(sum, x[i]); + return sum;*/ + } + + static inline Single maxReduce(const float32x16& x) { + // It's a sequence instruction so performance might be suboptimal, but this probably gives the compiler the best chance at optimising it: + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_max_ps&ig_expand=5926,5926,5703&avx512techs=AVX512F + Single max = _mm512_reduce_max_ps(x); + return max; + /* + Single maxs = x[0]; + for(int i = 1; i < 16; ++i) + maxs = Ops::max(maxs, x[i]); + return maxs;*/ + } + + static inline Single minReduce(const float32x16& x) { + // It's a sequence instruction so performance might be suboptimal, but this probably gives the compiler the best chance at optimising it: + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_min_ps&ig_expand=5926,5926,5703,5730&avx512techs=AVX512F + Single min = _mm512_reduce_min_ps(x); + return min; + /* + Single mins = x[0]; + for(int i = 1; i < 16; ++i) + mins = Ops::min(mins, x[i]); + return mins; + */ + } +}; + } // end namespace functional } // end namespace marian #endif diff --git a/src/functional/tensor.h b/src/functional/tensor.h index 
f5549c608..1b73d5825 100644 --- a/src/functional/tensor.h +++ b/src/functional/tensor.h @@ -44,7 +44,21 @@ inline marian::Shape adapt(const marian::Shape& shape) { return x8Shape; } #endif + +#ifdef __AVX512F__ +// as above, but for a stride of 16, since we are processing 16 floats at once +template <> +inline marian::Shape adapt(const marian::Shape& shape) { + ABORT_IF(shape[-1] % 16 != 0, + "Last dim ({}) is not a multiple of 16 while converting to Tensor", + shape[-1]); + + marian::Shape x16Shape = shape; + x16Shape.set(-1, shape[-1] / 16); + return x16Shape; +} #endif +#endif // __CUDACC__ #if COMPILE_FP16 // as above, but for a stride of 2, since we are processing 2 half floats at once. Works on GPU. diff --git a/src/tensors/cpu/element.h b/src/tensors/cpu/element.h index a0d111fd3..afdace652 100644 --- a/src/tensors/cpu/element.h +++ b/src/tensors/cpu/element.h @@ -83,10 +83,13 @@ template void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors) { #ifndef __CUDACC__ std::vector ts({out, tensors...}); + bool div16 = true; bool div8 = true; bool div4 = true; for(auto t : ts) { + if(t->shape()[-1] % 16 != 0) + div16 = false; if(t->shape()[-1] % 8 != 0) div8 = false; if(t->shape()[-1] % 4 != 0) { @@ -95,6 +98,14 @@ void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors } } + if(div16) { + // std::cerr << "16: " << functor.to_string() << std::endl; +#ifdef __AVX512F__ + element(functor, out, tensors...); + return; +#endif + } + if(div8) { // std::cerr << "8: " << functor.to_string() << std::endl; #ifdef __AVX__ diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1e1adc38b..c4ba6bc52 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -452,7 +452,12 @@ void Softmax(Tensor out, Tensor in) { void Softmax(Tensor out, Tensor in) { matchOrAbort(out->type()); matchOrAbort(in->type()); - +#ifdef __AVX512F__ + if(out->shape()[-1] % 16 == 0) { + Softmax(out, in); + return; + } +#endif #ifdef __AVX__ if(out->shape()[-1] % 8 == 0) { Softmax(out, in); @@ -509,6 +514,12 @@ void LogSoftmax(Tensor out, Tensor in) { matchOrAbort(out->type()); matchOrAbort(in->type()); +#ifdef __AVX512F__ + if(out->shape()[-1] % 16 == 0) { + LogSoftmax(out, in); + return; + } +#endif #ifdef __AVX__ if(out->shape()[-1] % 8 == 0) { LogSoftmax(out, in); @@ -1521,6 +1532,11 @@ void LSTMCellForwardTyped(Tensor out_, const std::vector& inputs) { void LSTMCellForward(Tensor out, std::vector inputs) { int cols = out->shape()[-1]; +#ifdef __AVX512F__ + if(cols % 16 == 0) + LSTMCellForwardTyped(out, inputs); + else +#endif #ifdef __AVX__ if(cols % 8 == 0) LSTMCellForwardTyped(out, inputs); @@ -1565,10 +1581,15 @@ void LSTMOutputForwardTyped(Tensor out_, const std::vector& inputs) { void LSTMOutputForward(Tensor out, std::vector inputs) { int cols = out->shape()[-1]; +#ifdef __AVX512F__ + if(cols % 16 == 0) + LSTMOutputForwardTyped(out, inputs); + else +#endif #ifdef __AVX__ if(cols % 8 == 0) LSTMOutputForwardTyped(out, inputs); - else + else #endif if(cols % 4 == 0) LSTMOutputForwardTyped(out, inputs); From d326cfc349d890882a1779701a6ac25c1c16a425 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Wed, 27 Jul 2022 14:52:14 +0000 Subject: [PATCH 3/4] Fixes and optimisations --- src/3rd_party/avx512_mathfun.h | 14 ++++++------- src/common/types.h | 6 +++++- src/functional/operators.h | 37 +++++++++++++++++----------------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git 
a/src/3rd_party/avx512_mathfun.h b/src/3rd_party/avx512_mathfun.h index f4d01d836..2355a8165 100644 --- a/src/3rd_party/avx512_mathfun.h +++ b/src/3rd_party/avx512_mathfun.h @@ -17,20 +17,20 @@ typedef __m512i v16si; // vector of 16 int (avx512) /* yes I know, the top of this file is quite ugly */ #ifdef _MSC_VER /* visual c++ */ -# define ALIGN32_BEG __declspec(align(32)) -# define ALIGN32_END +# define ALIGN64_BEG __declspec(align(64)) +# define ALIGN64_END #else /* gcc or icc */ -# define ALIGN32_BEG -# define ALIGN32_END __attribute__((aligned(32))) +# define ALIGN64_BEG +# define ALIGN64_END __attribute__((aligned(64))) #endif /* declare some AVX512 constants -- why can't I figure a better way to do that? */ #define _PS512_CONST(Name, Val) \ - static const constexpr ALIGN32_BEG float _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + static const constexpr ALIGN64_BEG float _ps512_##Name[16] ALIGN64_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } #define _PI32_CONST512(Name, Val) \ - static const constexpr ALIGN32_BEG int _pi32_512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + static const constexpr ALIGN64_BEG int _pi32_512_##Name[16] ALIGN64_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } #define _PS512_CONST_TYPE(Name, Type, Val) \ - static const constexpr ALIGN32_BEG Type _ps512_##Name[16] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } + static const constexpr ALIGN64_BEG Type _ps512_##Name[16] ALIGN64_END = { Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val, Val } _PS512_CONST(1 , 1.0f); _PS512_CONST(0p5, 0.5f); diff --git a/src/common/types.h b/src/common/types.h index 1bcc96de1..131f3f1e7 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -197,10 +197,11 @@ struct float32v { public: float32v() {} float32v(const Register& f) : f_(f) {} - float32v(const float& f) : f_(set1_ps(f)) {} + float32v(const float& f) : f_(set1_ps(f)) {} // copies value into all slots operator const Register&() const { return f_; } operator Register&() { return f_; } + Register* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __mXXX is an array of floats @@ -238,6 +239,7 @@ struct float32x4 { operator const __m128&() const { return f_; } operator __m128&() { return f_; } + __m128* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats @@ -266,6 +268,7 @@ struct float32x8 { operator const __m256&() const { return f_; } operator __m256&() { return f_; } + __m256* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats @@ -298,6 +301,7 @@ struct float32x16 { operator const __m512&() const { return f_; } operator __m512&() { return f_; } + __m512* get() { return &f_; } // For when we need to pass a ptr, as opposed to value or reference. 
float operator[] (size_t i) const { return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats diff --git a/src/functional/operators.h b/src/functional/operators.h index 7ec77baee..41f9ae5f0 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -253,7 +253,12 @@ struct Ops { static inline float32x4 sin(const float32x4& x) { return sin_ps(x); } static inline float32x4 cos(const float32x4& x) { return cos_ps(x); } - static inline float32x4 tan(const float32x4& x) { return div(sin(x), cos(x)); } + static inline float32x4 tan(const float32x4& x) { + float32x4 my_sin; + float32x4 my_cos; + sincos_ps(x, my_sin.get(), my_cos.get()); + return div(my_sin, my_cos); + } static inline float32x4 log(const float32x4& x) { return log_ps(x); } static inline float32x4 exp(const float32x4& x) { return exp_ps(x); } @@ -382,7 +387,12 @@ struct Ops { static inline float32x8 sin(const float32x8& x) { return sin256_ps(x); } static inline float32x8 cos(const float32x8& x) { return cos256_ps(x); } - static inline float32x8 tan(const float32x8& x) { return div(sin(x), cos(x)); } // @TODO: use sincos256_ps + static inline float32x8 tan(const float32x8& x) { + float32x8 my_sin; + float32x8 my_cos; + sincos256_ps(x, my_sin.get(), my_cos.get()); + return div(my_sin, my_cos); + } static inline float32x8 log(const float32x8& x) { return log256_ps(x); } static inline float32x8 exp(const float32x8& x) { return exp256_ps(x); } @@ -508,7 +518,12 @@ struct Ops { static inline float32x16 sin(const float32x16& x) { return sin512_ps(x); } static inline float32x16 cos(const float32x16& x) { return cos512_ps(x); } - static inline float32x16 tan(const float32x16& x) { return div(sin(x), cos(x)); } // @TODO: use sincos256_ps + static inline float32x16 tan(const float32x16& x) { + float32x16 my_sin; + float32x16 my_cos; + sincos512_ps(x, my_sin.get(), my_cos.get()); + return div(my_sin, my_cos); + } static inline float32x16 log(const float32x16& x) { return log512_ps(x); } static inline float32x16 exp(const float32x16& x) { return exp512_ps(x); } @@ -572,11 +587,6 @@ struct Ops { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_add_ps&ig_expand=5926,5926,5703,5730,5660,5660&avx512techs=AVX512F Single sum = _mm512_reduce_add_ps(x); return sum; - /* - Single sum = 0; - for(int i = 0; i < 16; ++i) - sum = Ops::add(sum, x[i]); - return sum;*/ } static inline Single maxReduce(const float32x16& x) { @@ -584,11 +594,6 @@ struct Ops { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_max_ps&ig_expand=5926,5926,5703&avx512techs=AVX512F Single max = _mm512_reduce_max_ps(x); return max; - /* - Single maxs = x[0]; - for(int i = 1; i < 16; ++i) - maxs = Ops::max(maxs, x[i]); - return maxs;*/ } static inline Single minReduce(const float32x16& x) { @@ -596,12 +601,6 @@ struct Ops { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_reduce_min_ps&ig_expand=5926,5926,5703,5730&avx512techs=AVX512F Single min = _mm512_reduce_min_ps(x); return min; - /* - Single mins = x[0]; - for(int i = 1; i < 16; ++i) - mins = Ops::min(mins, x[i]); - return mins; - */ } }; From 34189282e67972d7bf8ca0c88cb008a80a680b5b Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 28 Jul 2022 22:58:31 +0000 Subject: [PATCH 4/4] Changelog stuff --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
01aea0251..ce678114c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] - +- Added AVX512 types and a functional operators path. +- Small optimisations on the AVX and SSE instruction paths. ### Added ### Fixed
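The dispatch logic that the second patch threads through element.h and tensor_operators.cpp comes down to one rule: use the widest SIMD path whose lane count evenly divides the innermost tensor dimension — 16 lanes for AVX512, then 8 for AVX, then 4 for SSE, otherwise scalar code. The sketch below restates that rule in isolation; the names SimdPath and pickPath are illustrative and do not appear in the patch, and the real code additionally guards each branch with the corresponding #ifdef (__AVX512F__, __AVX__) and, in elementFloat(), checks the last dimension of every tensor involved, not just the output.

#include <cstddef>
#include <iostream>

// Illustrative restatement of the width-dispatch rule used by elementFloat(), Softmax(), LSTMCellForward(), etc.
enum class SimdPath { Scalar = 0, Sse4 = 4, Avx8 = 8, Avx512x16 = 16 };

SimdPath pickPath(std::size_t lastDim) {
  if (lastDim % 16 == 0) return SimdPath::Avx512x16; // float32x16: 16 floats per __m512
  if (lastDim % 8  == 0) return SimdPath::Avx8;      // float32x8:   8 floats per __m256
  if (lastDim % 4  == 0) return SimdPath::Sse4;      // float32x4:   4 floats per __m128
  return SimdPath::Scalar;                           // plain float fallback
}

int main() {
  const std::size_t dims[] = {512, 24, 12, 7};
  for (std::size_t d : dims)
    std::cout << d << " -> " << static_cast<int>(pickPath(d)) << " lanes\n";
  // 512 -> 16, 24 -> 8, 12 -> 4, 7 -> 0 (scalar)
  return 0;
}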