From e6cd792fd2b2ff213ad5788e9eb4c86c2c1601af Mon Sep 17 00:00:00 2001 From: Henry de Valence Date: Mon, 4 Feb 2019 18:08:13 -0800 Subject: [PATCH 1/6] Add _mm512_set1_epi64. --- crates/core_arch/src/simd.rs | 4 ++++ crates/core_arch/src/x86/avx512f.rs | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 5c8425623d..f6b2babf09 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -191,3 +191,7 @@ simd_ty!(i32x16[i32]: i32, i32, i32, i32, i32, i32, i32, i32 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); + +simd_ty!(i64x8[i64]: + i64, i64, i64, i64, i64, i64, i64, i64 + | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 94efadac74..8994f57724 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -87,6 +87,13 @@ pub unsafe fn _mm512_setr_epi32( mem::transmute(r) } +/// Broadcast 64-bit integer `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { + mem::transmute(i64x8::splat(a)) +} + #[cfg(test)] mod tests { use std; From 3d4f047badb7cca64d521b68cbf60e0bc38c5d60 Mon Sep 17 00:00:00 2001 From: Henry de Valence Date: Mon, 4 Feb 2019 18:09:14 -0800 Subject: [PATCH 2/6] Add avx512ifma skeleton. --- crates/core_arch/src/x86/avx512ifma.rs | 84 ++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 3 + 2 files changed, 87 insertions(+) create mode 100644 crates/core_arch/src/x86/avx512ifma.rs diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs new file mode 100644 index 0000000000..d2509d4a50 --- /dev/null +++ b/crates/core_arch/src/x86/avx512ifma.rs @@ -0,0 +1,84 @@ +use core_arch::x86::*; + +#[cfg(test)] +use stdsimd_test::assert_instr; + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + vpmadd52huq_512(a, b, c) +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52) +#[inline] +#[target_feature(enable = "avx512ifma")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + vpmadd52luq_512(a, b, c) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"] + fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"] + fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i; + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"] + fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"] + fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i; + #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"] + fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; + #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"] + fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i; +} + +#[cfg(test)] +mod tests { + use std; + use stdsimd_test::simd_test; + + use core_arch::x86::*; + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_madd52hi_epu64() { + let mut a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + a = _mm512_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm512_set1_epi64(11030549757952); + + assert_eq_m512i(a, expected); + } + + #[simd_test(enable = "avx512ifma")] + unsafe fn test_mm512_madd52lo_epu64() { + let mut a = _mm512_set1_epi64(10 << 40); + let b = _mm512_set1_epi64((11 << 40) + 4); + let c = _mm512_set1_epi64((12 << 40) + 3); + + a = _mm512_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm512_set1_epi64(100055558127628); + + assert_eq_m512i(a, expected); + } +} diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 5870c2cc18..694b11cfa2 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -560,3 +560,6 @@ pub unsafe fn ud2() -> ! { mod avx512f; pub use self::avx512f::*; + +mod avx512ifma; +pub use self::avx512ifma::*; From ffcee6007ad510030ac2cc4f8010643c901f5d8e Mon Sep 17 00:00:00 2001 From: Henry de Valence Date: Mon, 4 Feb 2019 19:34:09 -0800 Subject: [PATCH 3/6] Add AVX512VL variants of IFMA instructions. --- crates/core_arch/src/x86/avx512ifma.rs | 112 +++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs index d2509d4a50..55379cf6de 100644 --- a/crates/core_arch/src/x86/avx512ifma.rs +++ b/crates/core_arch/src/x86/avx512ifma.rs @@ -31,6 +31,62 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51 vpmadd52luq_512(a, b, c) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + vpmadd52huq_256(a, b, c) +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + vpmadd52luq_256(a, b, c) +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + vpmadd52huq_128(a, b, c) +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL) +#[inline] +#[target_feature(enable = "avx512ifma,avx512vl")] +#[cfg_attr(test, assert_instr(vpmadd52huq))] +pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + vpmadd52luq_128(a, b, c) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"] @@ -81,4 +137,60 @@ mod tests { assert_eq_m512i(a, expected); } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_madd52hi_epu64() { + let mut a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + a = _mm256_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(a, expected); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm256_madd52lo_epu64() { + let mut a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + a = _mm256_madd52lo_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(a, expected); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_madd52hi_epu64() { + let mut a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + a = _mm_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(a, expected); + } + + #[simd_test(enable = "avx512ifma,avx512vl")] + unsafe fn test_mm_madd52lo_epu64() { + let mut a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + a = _mm_madd52hi_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(a, expected); + } } From 0aee500f7bc518c1da7e14e8fd77c210818dfe1c Mon Sep 17 00:00:00 2001 From: Henry de Valence Date: Tue, 5 Feb 2019 10:57:08 -0800 Subject: [PATCH 4/6] Fix incorrect assert_instr tests. --- crates/core_arch/src/x86/avx512ifma.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs index 55379cf6de..9aacd9e02e 100644 --- a/crates/core_arch/src/x86/avx512ifma.rs +++ b/crates/core_arch/src/x86/avx512ifma.rs @@ -26,7 +26,7 @@ pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52) #[inline] #[target_feature(enable = "avx512ifma")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] +#[cfg_attr(test, assert_instr(vpmadd52luq))] pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { vpmadd52luq_512(a, b, c) } @@ -54,7 +54,7 @@ pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494) #[inline] #[target_feature(enable = "avx512ifma,avx512vl")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] +#[cfg_attr(test, assert_instr(vpmadd52luq))] pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { vpmadd52luq_256(a, b, c) } @@ -82,7 +82,7 @@ pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL) #[inline] #[target_feature(enable = "avx512ifma,avx512vl")] -#[cfg_attr(test, assert_instr(vpmadd52huq))] +#[cfg_attr(test, assert_instr(vpmadd52luq))] pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { vpmadd52luq_128(a, b, c) } From 36b10624c690237d14ec5c61a47a60e5bfb4b205 Mon Sep 17 00:00:00 2001 From: Henry de Valence Date: Wed, 6 Feb 2019 20:08:48 -0800 Subject: [PATCH 5/6] Add a fixup pass for cpuid values when validating XML. --- crates/stdsimd-verify/tests/x86-intel.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index 4a81b8f16d..eb84c8d0d7 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -273,15 +273,25 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { .flat_map(|c| c.to_lowercase()) .collect::(); + // The XML file names IFMA as "avx512ifma52", while Rust calls + // it "avx512ifma". Fix this mismatch by replacing the Intel + // name with the Rust name. + let fixup_cpuid = |cpuid: String| match cpuid.as_ref() { + "avx512ifma52" => String::from("avx512ifma"), + _ => cpuid, + }; + let fixed_cpuid = fixup_cpuid(cpuid); + let rust_feature = rust .target_feature .expect(&format!("no target feature listed for {}", rust.name)); - if rust_feature.contains(&cpuid) { + + if rust_feature.contains(&fixed_cpuid) { continue; } bail!( "intel cpuid `{}` not in `{}` for {}", - cpuid, + fixed_cpuid, rust_feature, rust.name ) From e354ca3cce74ffd4ca8950a08352347c54b4acae Mon Sep 17 00:00:00 2001 From: Henry de Valence Date: Mon, 11 Feb 2019 08:41:21 -0800 Subject: [PATCH 6/6] Note that clang accepts _mm512_set1_epi64 on 32-bit. Per https://github.com/rust-lang-nursery/stdsimd/pull/676#issuecomment-461384796 , LLVM is able to generate code for this intrinsic on `x86` targets. --- crates/stdsimd-verify/tests/x86-intel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index eb84c8d0d7..29995454ea 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -369,7 +369,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { // Apparently all of clang/msvc/gcc accept these intrinsics on // 32-bit, so let's do the same "_mm_set_epi64x" | "_mm_set1_epi64x" | "_mm256_set_epi64x" | "_mm256_setr_epi64x" - | "_mm256_set1_epi64x" => true, + | "_mm256_set1_epi64x" | "_mm512_set1_epi64" => true, // These return a 64-bit argument but they're assembled from other // 32-bit registers, so these work on 32-bit just fine. See #308 for