From 3bdf567f5276346ec4981694bff7f053df61f378 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Tue, 5 Nov 2019 14:41:09 +0100 Subject: [PATCH 1/3] feat(sha2): implement intrinsic based version --- Cargo.lock | 16 ++ sha2/Cargo.toml | 7 + sha2/src/lib.rs | 5 +- sha2/src/platform.rs | 78 ++++++++++ sha2/src/sha256.rs | 23 +-- sha2/src/sha256_intrinsics.rs | 266 ++++++++++++++++++++++++++++++++++ sha2/src/sha256_utils.rs | 1 + 7 files changed, 384 insertions(+), 12 deletions(-) create mode 100644 sha2/src/platform.rs create mode 100644 sha2/src/sha256_intrinsics.rs diff --git a/Cargo.lock b/Cargo.lock index 8bdfe0138..7bf6b9b6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,6 +130,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67c21572b4949434e4fc1e1978b99c5f77064153c59d998bf13ecd96fb5ecba7" +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin", +] + [[package]] name = "libc" version = "0.2.71" @@ -236,6 +245,7 @@ dependencies = [ "block-buffer", "digest", "hex-literal", + "lazy_static", "libc", "opaque-debug", "sha2-asm", @@ -271,6 +281,12 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "streebog" version = "0.9.0-pre" diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 7f7b79d8b..5db7d606f 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -21,6 +21,13 @@ opaque-debug = "0.2" sha2-asm = { version = "0.5", optional = true } libc = { version = "0.2.68", optional = true } +[dependencies.lazy_static] +version = "1.4.0" +default-features = false +# no_std feature is an anti-pattern. Why, lazy_static, why? +# See https://github.com/rust-lang-nursery/lazy-static.rs/issues/150 +features = ["spin_no_std"] + [dev-dependencies] digest = { version = "0.9", features = ["dev"] } hex-literal = "0.2" diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index c87c064b4..e9c9a3154 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -89,8 +89,11 @@ extern crate std; #[cfg(feature = "asm-aarch64")] mod aarch64; mod consts; +mod platform; mod sha256; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64", feature = "compress"))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod sha256_intrinsics; +#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] mod sha256_utils; mod sha512; #[cfg(any(not(feature = "asm"), target_arch = "aarch64", feature = "compress"))] diff --git a/sha2/src/platform.rs b/sha2/src/platform.rs new file mode 100644 index 000000000..90e285e5a --- /dev/null +++ b/sha2/src/platform.rs @@ -0,0 +1,78 @@ +#[allow(dead_code)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Platform { + Portable, + #[cfg(feature = "asm")] + Asm, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Sha, +} + +#[derive(Clone, Copy, Debug)] +pub struct Implementation(Platform); + +impl Implementation { + pub fn detect() -> Self { + // Try the different implementations in order of how fast/modern they are. 
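+        // Detection runs once per process: the result is cached behind the
+        // lazy_static `IMPL` in sha256.rs, so these checks stay off the
+        // per-block hashing path.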
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if let Some(sha_impl) = Self::sha_if_supported() { + return sha_impl; + } + } + #[cfg(feature = "asm")] + { + if let Some(asm_impl) = Self::asm_if_supported() { + return asm_impl; + } + } + + Self::portable() + } + + pub fn portable() -> Self { + Implementation(Platform::Portable) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[allow(unreachable_code)] + pub fn sha_if_supported() -> Option { + // Check whether sha support is assumed by the build. + #[cfg(target_feature = "sha")] + { + return Some(Implementation(Platform::Sha)); + } + // Otherwise dynamically check for support if we can. + #[cfg(feature = "std")] + { + if std::is_x86_feature_detected!("sha") { + return Some(Implementation(Platform::Sha)); + } + } + None + } + + #[cfg(feature = "asm")] + pub fn asm_if_supported() -> Option { + return Some(Implementation(Platform::Asm)); + } + + #[inline] + pub fn compress256(&self, state: &mut [u32; 8], block: &[u8; 64]) { + match self.0 { + Platform::Portable => { + use crate::sha256_utils; + sha256_utils::compress256(state, block); + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::Sha => { + use crate::sha256_intrinsics; + unsafe { sha256_intrinsics::compress256(state, block) }; + } + #[cfg(feature = "asm")] + Platform::Asm => { + sha2_asm::compress256(state, block); + } + } + } +} diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index c30671b86..5ad73b69f 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -1,6 +1,7 @@ //! SHA-256 use crate::consts::{H224, H256, STATE_LEN}; +use crate::platform::Implementation; use block_buffer::BlockBuffer; use digest::impl_write; use digest::{ @@ -9,15 +10,13 @@ use digest::{ }; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -#[cfg(not(feature = "asm"))] -use crate::sha256_utils::compress256; - -#[cfg(feature = "asm")] -use sha2_asm::compress256; - type BlockSize = U64; type Block = GenericArray; +lazy_static::lazy_static! { + static ref IMPL: Implementation = Implementation::detect(); +} + /// A structure that represents that state of a digest computation for the /// SHA-2 512 family of digest functions #[derive(Clone)] @@ -33,7 +32,7 @@ impl Engine256State { #[cfg(not(feature = "asm-aarch64"))] pub fn process_block(&mut self, block: &Block) { let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - compress256(&mut self.h, block); + IMPL.compress256(&mut self.h, block); } #[cfg(feature = "asm-aarch64")] @@ -71,14 +70,16 @@ impl Engine256 { fn update(&mut self, input: &[u8]) { // Assumes that input.len() can be converted to u64 without overflow self.len += (input.len() as u64) << 3; - let s = &mut self.state; - self.buffer.input_block(input, |b| s.process_block(b)); + let self_state = &mut self.state; + self.buffer + .input_block(input, |input| self_state.process_block(input)); } fn finish(&mut self) { - let s = &mut self.state; + let self_state = &mut self.state; let l = self.len; - self.buffer.len64_padding_be(l, |b| s.process_block(b)); + self.buffer + .len64_padding_be(l, |b| self_state.process_block(b)); } fn reset(&mut self, h: &[u32; STATE_LEN]) { diff --git a/sha2/src/sha256_intrinsics.rs b/sha2/src/sha256_intrinsics.rs new file mode 100644 index 000000000..1a27384bd --- /dev/null +++ b/sha2/src/sha256_intrinsics.rs @@ -0,0 +1,266 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +/// Process a block with the SHA-256 algorithm. 
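+/// Uses the x86/x86_64 SHA extension intrinsics (`_mm_sha256rnds2_epu32`,
+/// `_mm_sha256msg1_epu32`, `_mm_sha256msg2_epu32`), so callers must ensure the
+/// `sha` CPU feature is available before invoking this function.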
+/// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c +#[inline] +#[target_feature(enable = "sha")] +pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { + // TODO: Process multiple blocks + + let mut state0: __m128i; + let mut state1: __m128i; + + let mut msg: __m128i; + let mut tmp: __m128i; + + let mut msg0: __m128i; + let mut msg1: __m128i; + let mut msg2: __m128i; + let mut msg3: __m128i; + + let mut abef_save: __m128i; + let mut cdgh_save: __m128i; + + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x( + 0x0c0d_0e0f_0809_0a0bu64 as i64, + 0x0405_0607_0001_0203u64 as i64, + ); + + let mut block_offset = 0; + let mut length = 64; + + // Load initial values + tmp = _mm_loadu_si128(state.as_ptr().add(0) as *const __m128i); + state1 = _mm_loadu_si128(state.as_ptr().add(4) as *const __m128i); + + tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB + state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH + state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF + state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH + + while length >= 64 { + // Save current state + abef_save = state0; + cdgh_save = state1; + + // Rounds 0-3 + msg = _mm_loadu_si128(block.as_ptr().add(block_offset + 0) as *const __m128i); + msg0 = _mm_shuffle_epi8(msg, MASK); + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0xE9B5DBA5B5C0FBCFu64 as i64, 0x71374491428A2F98u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 4-7 + msg1 = _mm_loadu_si128(block.as_ptr().add(block_offset + 16) as *const __m128i); + msg1 = _mm_shuffle_epi8(msg1, MASK); + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0xAB1C5ED5923F82A4u64 as i64, 0x59F111F13956C25Bu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg0 = _mm_sha256msg1_epu32(msg0, msg1); + + // Rounds 8-11 + msg2 = _mm_loadu_si128(block.as_ptr().add(block_offset + 32) as *const __m128i); + msg2 = _mm_shuffle_epi8(msg2, MASK); + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0x550C7DC3243185BEu64 as i64, 0x12835B01D807AA98u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg1 = _mm_sha256msg1_epu32(msg1, msg2); + + // Rounds 12-15 + msg3 = _mm_loadu_si128(block.as_ptr().add(block_offset + 48) as *const __m128i); + msg3 = _mm_shuffle_epi8(msg3, MASK); + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0xC19BF1749BDC06A7u64 as i64, 0x80DEB1FE72BE5D74u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg3, msg2, 4); + msg0 = _mm_add_epi32(msg0, tmp); + msg0 = _mm_sha256msg2_epu32(msg0, msg3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg2 = _mm_sha256msg1_epu32(msg2, msg3); + + // Rounds 16-19 + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0x240CA1CC0FC19DC6u64 as i64, 0xEFBE4786E49B69C1u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg0, msg3, 4); + msg1 = _mm_add_epi32(msg1, tmp); + msg1 = _mm_sha256msg2_epu32(msg1, msg0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg3 = _mm_sha256msg1_epu32(msg3, msg0); + + // Rounds 20-23 + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0x76F988DA5CB0A9DCu64 as i64, 
0x4A7484AA2DE92C6Fu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg1, msg0, 4); + msg2 = _mm_add_epi32(msg2, tmp); + msg2 = _mm_sha256msg2_epu32(msg2, msg1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg0 = _mm_sha256msg1_epu32(msg0, msg1); + + // Rounds 24-27 + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0xBF597FC7B00327C8u64 as i64, 0xA831C66D983E5152u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg2, msg1, 4); + msg3 = _mm_add_epi32(msg3, tmp); + msg3 = _mm_sha256msg2_epu32(msg3, msg2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg1 = _mm_sha256msg1_epu32(msg1, msg2); + + // Rounds 28-31 + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0x1429296706CA6351u64 as i64, 0xD5A79147C6E00BF3u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg3, msg2, 4); + msg0 = _mm_add_epi32(msg0, tmp); + msg0 = _mm_sha256msg2_epu32(msg0, msg3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg2 = _mm_sha256msg1_epu32(msg2, msg3); + + // Rounds 32-35 + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0x53380D134D2C6DFCu64 as i64, 0x2E1B213827B70A85u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg0, msg3, 4); + msg1 = _mm_add_epi32(msg1, tmp); + msg1 = _mm_sha256msg2_epu32(msg1, msg0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg3 = _mm_sha256msg1_epu32(msg3, msg0); + + // Rounds 36-39 + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0x92722C8581C2C92Eu64 as i64, 0x766A0ABB650A7354u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg1, msg0, 4); + msg2 = _mm_add_epi32(msg2, tmp); + msg2 = _mm_sha256msg2_epu32(msg2, msg1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg0 = _mm_sha256msg1_epu32(msg0, msg1); + + // Rounds 40-43 + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0xC76C51A3C24B8B70u64 as i64, 0xA81A664BA2BFE8A1u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg2, msg1, 4); + msg3 = _mm_add_epi32(msg3, tmp); + msg3 = _mm_sha256msg2_epu32(msg3, msg2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg1 = _mm_sha256msg1_epu32(msg1, msg2); + + // Rounds 44-47 + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0x106AA070F40E3585u64 as i64, 0xD6990624D192E819u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg3, msg2, 4); + msg0 = _mm_add_epi32(msg0, tmp); + msg0 = _mm_sha256msg2_epu32(msg0, msg3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg2 = _mm_sha256msg1_epu32(msg2, msg3); + + // Rounds 48-51 + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0x34B0BCB52748774Cu64 as i64, 0x1E376C0819A4C116u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg0, msg3, 4); + msg1 = _mm_add_epi32(msg1, tmp); + msg1 = _mm_sha256msg2_epu32(msg1, msg0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg3 = _mm_sha256msg1_epu32(msg3, msg0); + + // Rounds 52-55 + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0x682E6FF35B9CCA4Fu64 as i64, 
0x4ED8AA4A391C0CB3u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg1, msg0, 4); + msg2 = _mm_add_epi32(msg2, tmp); + msg2 = _mm_sha256msg2_epu32(msg2, msg1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 56-59 + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0x8CC7020884C87814u64 as i64, 0x78A5636F748F82EEu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg2, msg1, 4); + msg3 = _mm_add_epi32(msg3, tmp); + msg3 = _mm_sha256msg2_epu32(msg3, msg2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 60-63 + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0xC67178F2BEF9A3F7u64 as i64, 0xA4506CEB90BEFFFAu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Combine state + state0 = _mm_add_epi32(state0, abef_save); + state1 = _mm_add_epi32(state1, cdgh_save); + + block_offset += 64; + length -= 64; + } + + tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA + state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG + state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA + state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF + + // Save state + _mm_storeu_si128(state.as_ptr().add(0) as *mut __m128i, state0); + _mm_storeu_si128(state.as_ptr().add(4) as *mut __m128i, state1); +} diff --git a/sha2/src/sha256_utils.rs b/sha2/src/sha256_utils.rs index 7d2ec9f63..f34777a91 100644 --- a/sha2/src/sha256_utils.rs +++ b/sha2/src/sha256_utils.rs @@ -309,6 +309,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { /// implemented by any CPU (at the time of this writing), and so they are /// emulated in this library until the instructions become more common, and gain /// support in LLVM (and GCC, etc.). +#[inline] pub fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { let mut block_u32 = [0u32; BLOCK_LEN]; for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { From 5206c2100c26b0e809df2215546605f013c6f79b Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Sat, 11 Jan 2020 12:58:17 +0100 Subject: [PATCH 2/3] fixup --- sha2/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index e9c9a3154..7fe072be9 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -93,7 +93,6 @@ mod platform; mod sha256; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod sha256_intrinsics; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] mod sha256_utils; mod sha512; #[cfg(any(not(feature = "asm"), target_arch = "aarch64", feature = "compress"))] From 2ea5e4f53acbc6b3f3ac76af53ed1cf14a32e25c Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Thu, 11 Jun 2020 13:57:01 +0200 Subject: [PATCH 3/3] pull in fixes from fork --- Cargo.lock | 42 +++++++++++++++++++++++++++++++++++ sha2/Cargo.toml | 3 +++ sha2/src/platform.rs | 37 +++++++++++++++++++----------- sha2/src/sha256.rs | 14 ------------ sha2/src/sha256_intrinsics.rs | 9 ++++---- 5 files changed, 74 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7bf6b9b6a..6b6d22119 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,11 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
+[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + [[package]] name = "blake2" version = "0.9.0" @@ -197,6 +203,17 @@ version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e0456befd48169b9f13ef0f0ad46d492cf9d2dbb918bcf38e01eed4ce3ec5e4" +[[package]] +name = "raw-cpuid" +version = "7.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4a349ca83373cfa5d6dbb66fd76e58b2cca08da71a5f6400de0a0a6a9bceeaf" +dependencies = [ + "bitflags", + "cc", + "rustc_version", +] + [[package]] name = "ripemd160" version = "0.9.0" @@ -217,6 +234,30 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "sha-1" version = "0.9.0" @@ -248,6 +289,7 @@ dependencies = [ "lazy_static", "libc", "opaque-debug", + "raw-cpuid", "sha2-asm", ] diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 5db7d606f..18ad7e860 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -21,6 +21,9 @@ opaque-debug = "0.2" sha2-asm = { version = "0.5", optional = true } libc = { version = "0.2.68", optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +raw-cpuid = "7.0.3" + [dependencies.lazy_static] version = "1.4.0" default-features = false diff --git a/sha2/src/platform.rs b/sha2/src/platform.rs index 90e285e5a..dc8af398f 100644 --- a/sha2/src/platform.rs +++ b/sha2/src/platform.rs @@ -20,7 +20,7 @@ impl Implementation { return sha_impl; } } - #[cfg(feature = "asm")] + #[cfg(any(feature = "asm", feature = "asm-aarch64"))] { if let Some(asm_impl) = Self::asm_if_supported() { return asm_impl; @@ -37,24 +37,35 @@ impl Implementation { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[allow(unreachable_code)] pub fn sha_if_supported() -> Option { - // Check whether sha support is assumed by the build. - #[cfg(target_feature = "sha")] - { + use raw_cpuid::CpuId; + + // Use raw_cpuid instead of is_x86_feature_detected, to ensure the check + // never happens at compile time. + let cpuid = CpuId::new(); + let is_runtime_ok = cpuid + .get_extended_feature_info() + .map(|info| info.has_sha()) + .unwrap_or_default(); + + // Make sure this computer actually supports it + if is_runtime_ok { return Some(Implementation(Platform::Sha)); } - // Otherwise dynamically check for support if we can. 
- #[cfg(feature = "std")] - { - if std::is_x86_feature_detected!("sha") { - return Some(Implementation(Platform::Sha)); - } - } + None } - #[cfg(feature = "asm")] + #[cfg(any(feature = "asm", feature = "asm-arch64"))] pub fn asm_if_supported() -> Option { - return Some(Implementation(Platform::Asm)); + #[cfg(feature = "asm-aarch64")] + let supported = ::aarch64::sha2_supported(); + #[cfg(not(feature = "asm-aarch64"))] + let supported = false; + + if supported { + return Some(Implementation(Platform::Asm)); + } + None } #[inline] diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index 5ad73b69f..18d1dfbef 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -29,24 +29,10 @@ impl Engine256State { Engine256State { h: *h } } - #[cfg(not(feature = "asm-aarch64"))] pub fn process_block(&mut self, block: &Block) { let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; IMPL.compress256(&mut self.h, block); } - - #[cfg(feature = "asm-aarch64")] - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha2") once - // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented - // to let us use it on no_std. - if ::aarch64::sha2_supported() { - compress256(&mut self.h, block); - } else { - ::sha256_utils::compress256(&mut self.h, block); - } - } } /// A structure that keeps track of the state of the Sha-256 operation and diff --git a/sha2/src/sha256_intrinsics.rs b/sha2/src/sha256_intrinsics.rs index 1a27384bd..1f1bd555c 100644 --- a/sha2/src/sha256_intrinsics.rs +++ b/sha2/src/sha256_intrinsics.rs @@ -1,3 +1,5 @@ +#![allow(clippy::cast_ptr_alignment)] // Safe to cast without alignment checks as the loads and stores do not require alignment. + #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -6,7 +8,6 @@ use core::arch::x86_64::*; /// Process a block with the SHA-256 algorithm. /// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c #[inline] -#[target_feature(enable = "sha")] pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { // TODO: Process multiple blocks @@ -48,7 +49,7 @@ pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { cdgh_save = state1; // Rounds 0-3 - msg = _mm_loadu_si128(block.as_ptr().add(block_offset + 0) as *const __m128i); + msg = _mm_loadu_si128(block.as_ptr().add(block_offset) as *const __m128i); msg0 = _mm_shuffle_epi8(msg, MASK); msg = _mm_add_epi32( msg0, @@ -261,6 +262,6 @@ pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF // Save state - _mm_storeu_si128(state.as_ptr().add(0) as *mut __m128i, state0); - _mm_storeu_si128(state.as_ptr().add(4) as *mut __m128i, state1); + _mm_storeu_si128(state.as_mut_ptr().add(0) as *mut __m128i, state0); + _mm_storeu_si128(state.as_mut_ptr().add(4) as *mut __m128i, state1); }
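
A minimal usage sketch of the result of this series, assuming the `Digest` trait re-export (`sha2::Digest`) and the `hex-literal` dev-dependency already declared in sha2/Cargo.toml: callers are unchanged, `Implementation::detect()` picks the SHA-intrinsics, asm, or portable backend once at runtime, and whichever backend is chosen must still produce the canonical digest for the standard "abc" test vector.

use hex_literal::hex;
use sha2::{Digest, Sha256};

fn main() {
    // Backend selection happens lazily inside the crate; the public API is unchanged.
    let digest = Sha256::digest(b"abc");
    assert_eq!(
        digest[..],
        hex!("ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad")[..]
    );
}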