From e6cd792fd2b2ff213ad5788e9eb4c86c2c1601af Mon Sep 17 00:00:00 2001
From: Henry de Valence <hdevalence@hdevalence.ca>
Date: Mon, 4 Feb 2019 18:08:13 -0800
Subject: [PATCH 1/6] Add _mm512_set1_epi64.

---
 crates/core_arch/src/simd.rs        | 4 ++++
 crates/core_arch/src/x86/avx512f.rs | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
index 5c8425623d..f6b2babf09 100644
--- a/crates/core_arch/src/simd.rs
+++ b/crates/core_arch/src/simd.rs
@@ -191,3 +191,7 @@ simd_ty!(i32x16[i32]:
          i32, i32, i32, i32, i32, i32, i32, i32
          | x0, x1, x2, x3, x4, x5, x6, x7,
          x8, x9, x10, x11, x12, x13, x14, x15);
+
+simd_ty!(i64x8[i64]:
+         i64, i64, i64, i64, i64, i64, i64, i64
+         | x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 94efadac74..8994f57724 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -87,6 +87,13 @@ pub unsafe fn _mm512_setr_epi32(
     mem::transmute(r)
 }
 
+/// Broadcast 64-bit integer `a` to all elements of `dst`.
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
+    mem::transmute(i64x8::splat(a))
+}
+
 #[cfg(test)]
 mod tests {
     use std;

From 3d4f047badb7cca64d521b68cbf60e0bc38c5d60 Mon Sep 17 00:00:00 2001
From: Henry de Valence <hdevalence@hdevalence.ca>
Date: Mon, 4 Feb 2019 18:09:14 -0800
Subject: [PATCH 2/6] Add avx512ifma skeleton.

---
 crates/core_arch/src/x86/avx512ifma.rs | 84 ++++++++++++++++++++++++++
 crates/core_arch/src/x86/mod.rs        |  3 +
 2 files changed, 87 insertions(+)
 create mode 100644 crates/core_arch/src/x86/avx512ifma.rs

diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs
new file mode 100644
index 0000000000..d2509d4a50
--- /dev/null
+++ b/crates/core_arch/src/x86/avx512ifma.rs
@@ -0,0 +1,84 @@
+use core_arch::x86::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488)
+#[inline]
+#[target_feature(enable = "avx512ifma")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    vpmadd52huq_512(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
+#[inline]
+#[target_feature(enable = "avx512ifma")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    vpmadd52luq_512(a, b, c)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
+    fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"]
+    fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"]
+    fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
+    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"]
+    fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
+    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"]
+    fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
+    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"]
+    fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
+}
+
+#[cfg(test)]
+mod tests {
+    use std;
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_madd52hi_epu64() {
+        let mut a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        a = _mm512_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm512_set1_epi64(11030549757952);
+
+        assert_eq_m512i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_madd52lo_epu64() {
+        let mut a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        a = _mm512_madd52lo_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm512_set1_epi64(100055558127628);
+
+        assert_eq_m512i(a, expected);
+    }
+}
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 5870c2cc18..694b11cfa2 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -560,3 +560,6 @@ pub unsafe fn ud2() -> ! {
 
 mod avx512f;
 pub use self::avx512f::*;
+
+mod avx512ifma;
+pub use self::avx512ifma::*;

From ffcee6007ad510030ac2cc4f8010643c901f5d8e Mon Sep 17 00:00:00 2001
From: Henry de Valence <hdevalence@hdevalence.ca>
Date: Mon, 4 Feb 2019 19:34:09 -0800
Subject: [PATCH 3/6] Add AVX512VL variants of IFMA instructions.

---
 crates/core_arch/src/x86/avx512ifma.rs | 112 +++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs
index d2509d4a50..55379cf6de 100644
--- a/crates/core_arch/src/x86/avx512ifma.rs
+++ b/crates/core_arch/src/x86/avx512ifma.rs
@@ -31,6 +31,62 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
     vpmadd52luq_512(a, b, c)
 }
 
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    vpmadd52huq_256(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    vpmadd52luq_256(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    vpmadd52huq_128(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    vpmadd52luq_128(a, b, c)
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
@@ -81,4 +137,60 @@ mod tests {
 
         assert_eq_m512i(a, expected);
     }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_madd52hi_epu64() {
+        let mut a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        a = _mm256_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm256_set1_epi64x(11030549757952);
+
+        assert_eq_m256i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_madd52lo_epu64() {
+        let mut a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        a = _mm256_madd52lo_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm256_set1_epi64x(100055558127628);
+
+        assert_eq_m256i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_madd52hi_epu64() {
+        let mut a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        a = _mm_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm_set1_epi64x(11030549757952);
+
+        assert_eq_m128i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_madd52lo_epu64() {
+        let mut a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        a = _mm_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm_set1_epi64x(11030549757952);
+
+        assert_eq_m128i(a, expected);
+    }
 }

From 0aee500f7bc518c1da7e14e8fd77c210818dfe1c Mon Sep 17 00:00:00 2001
From: Henry de Valence <hdevalence@hdevalence.ca>
Date: Tue, 5 Feb 2019 10:57:08 -0800
Subject: [PATCH 4/6] Fix incorrect assert_instr tests.

---
 crates/core_arch/src/x86/avx512ifma.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs
index 55379cf6de..9aacd9e02e 100644
--- a/crates/core_arch/src/x86/avx512ifma.rs
+++ b/crates/core_arch/src/x86/avx512ifma.rs
@@ -26,7 +26,7 @@ pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
 #[inline]
 #[target_feature(enable = "avx512ifma")]
-#[cfg_attr(test, assert_instr(vpmadd52huq))]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
 pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
     vpmadd52luq_512(a, b, c)
 }
@@ -54,7 +54,7 @@ pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
 #[inline]
 #[target_feature(enable = "avx512ifma,avx512vl")]
-#[cfg_attr(test, assert_instr(vpmadd52huq))]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
 pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
     vpmadd52luq_256(a, b, c)
 }
@@ -82,7 +82,7 @@ pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
 #[inline]
 #[target_feature(enable = "avx512ifma,avx512vl")]
-#[cfg_attr(test, assert_instr(vpmadd52huq))]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
 pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
     vpmadd52luq_128(a, b, c)
 }

From 36b10624c690237d14ec5c61a47a60e5bfb4b205 Mon Sep 17 00:00:00 2001
From: Henry de Valence <hdevalence@hdevalence.ca>
Date: Wed, 6 Feb 2019 20:08:48 -0800
Subject: [PATCH 5/6] Add a fixup pass for cpuid values when validating XML.

---
 crates/stdsimd-verify/tests/x86-intel.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs
index 4a81b8f16d..eb84c8d0d7 100644
--- a/crates/stdsimd-verify/tests/x86-intel.rs
+++ b/crates/stdsimd-verify/tests/x86-intel.rs
@@ -273,15 +273,25 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
             .flat_map(|c| c.to_lowercase())
             .collect::<String>();
 
+        // The XML file names IFMA as "avx512ifma52", while Rust calls
+        // it "avx512ifma".  Fix this mismatch by replacing the Intel
+        // name with the Rust name.
+        let fixup_cpuid = |cpuid: String| match cpuid.as_ref() {
+            "avx512ifma52" => String::from("avx512ifma"),
+            _ => cpuid,
+        };
+        let fixed_cpuid = fixup_cpuid(cpuid);
+
         let rust_feature = rust
             .target_feature
             .expect(&format!("no target feature listed for {}", rust.name));
-        if rust_feature.contains(&cpuid) {
+
+        if rust_feature.contains(&fixed_cpuid) {
             continue;
         }
         bail!(
             "intel cpuid `{}` not in `{}` for {}",
-            cpuid,
+            fixed_cpuid,
             rust_feature,
             rust.name
         )

From e354ca3cce74ffd4ca8950a08352347c54b4acae Mon Sep 17 00:00:00 2001
From: Henry de Valence <hdevalence@hdevalence.ca>
Date: Mon, 11 Feb 2019 08:41:21 -0800
Subject: [PATCH 6/6] Note that clang accepts _mm512_set1_epi64 on 32-bit.

Per https://github.com/rust-lang-nursery/stdsimd/pull/676#issuecomment-461384796 , LLVM is able to generate code for this intrinsic on `x86` targets.
---
 crates/stdsimd-verify/tests/x86-intel.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs
index eb84c8d0d7..29995454ea 100644
--- a/crates/stdsimd-verify/tests/x86-intel.rs
+++ b/crates/stdsimd-verify/tests/x86-intel.rs
@@ -369,7 +369,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
         // Apparently all of clang/msvc/gcc accept these intrinsics on
         // 32-bit, so let's do the same
         "_mm_set_epi64x" | "_mm_set1_epi64x" | "_mm256_set_epi64x" | "_mm256_setr_epi64x"
-        | "_mm256_set1_epi64x" => true,
+        | "_mm256_set1_epi64x" | "_mm512_set1_epi64" => true,
 
         // These return a 64-bit argument but they're assembled from other
         // 32-bit registers, so these work on 32-bit just fine. See #308 for