From 3bdf567f5276346ec4981694bff7f053df61f378 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Tue, 5 Nov 2019 14:41:09 +0100 Subject: [PATCH 1/3] feat(sha2): implement intrinsic based version --- Cargo.lock | 16 ++ sha2/Cargo.toml | 7 + sha2/src/lib.rs | 5 +- sha2/src/platform.rs | 78 ++++++++++ sha2/src/sha256.rs | 23 +-- sha2/src/sha256_intrinsics.rs | 266 ++++++++++++++++++++++++++++++++++ sha2/src/sha256_utils.rs | 1 + 7 files changed, 384 insertions(+), 12 deletions(-) create mode 100644 sha2/src/platform.rs create mode 100644 sha2/src/sha256_intrinsics.rs diff --git a/Cargo.lock b/Cargo.lock index 8bdfe0138..7bf6b9b6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,6 +130,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67c21572b4949434e4fc1e1978b99c5f77064153c59d998bf13ecd96fb5ecba7" +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin", +] + [[package]] name = "libc" version = "0.2.71" @@ -236,6 +245,7 @@ dependencies = [ "block-buffer", "digest", "hex-literal", + "lazy_static", "libc", "opaque-debug", "sha2-asm", @@ -271,6 +281,12 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "streebog" version = "0.9.0-pre" diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 7f7b79d8b..5db7d606f 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -21,6 +21,13 @@ opaque-debug = "0.2" sha2-asm = { version = "0.5", optional = true } libc = { version = "0.2.68", optional = true } +[dependencies.lazy_static] +version = "1.4.0" +default-features = false +# no_std feature is an anti-pattern. Why, lazy_static, why? +# See https://github.com/rust-lang-nursery/lazy-static.rs/issues/150 +features = ["spin_no_std"] + [dev-dependencies] digest = { version = "0.9", features = ["dev"] } hex-literal = "0.2" diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index c87c064b4..e9c9a3154 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -89,8 +89,11 @@ extern crate std; #[cfg(feature = "asm-aarch64")] mod aarch64; mod consts; +mod platform; mod sha256; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64", feature = "compress"))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod sha256_intrinsics; +#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] mod sha256_utils; mod sha512; #[cfg(any(not(feature = "asm"), target_arch = "aarch64", feature = "compress"))] diff --git a/sha2/src/platform.rs b/sha2/src/platform.rs new file mode 100644 index 000000000..90e285e5a --- /dev/null +++ b/sha2/src/platform.rs @@ -0,0 +1,78 @@ +#[allow(dead_code)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Platform { + Portable, + #[cfg(feature = "asm")] + Asm, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Sha, +} + +#[derive(Clone, Copy, Debug)] +pub struct Implementation(Platform); + +impl Implementation { + pub fn detect() -> Self { + // Try the different implementations in order of how fast/modern they are. 
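+        // Detection runs once per process: the result is cached behind the
+        // lazy_static `IMPL` in sha256.rs, so these checks stay off the
+        // per-block hashing path.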
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if let Some(sha_impl) = Self::sha_if_supported() { + return sha_impl; + } + } + #[cfg(feature = "asm")] + { + if let Some(asm_impl) = Self::asm_if_supported() { + return asm_impl; + } + } + + Self::portable() + } + + pub fn portable() -> Self { + Implementation(Platform::Portable) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[allow(unreachable_code)] + pub fn sha_if_supported() -> Option { + // Check whether sha support is assumed by the build. + #[cfg(target_feature = "sha")] + { + return Some(Implementation(Platform::Sha)); + } + // Otherwise dynamically check for support if we can. + #[cfg(feature = "std")] + { + if std::is_x86_feature_detected!("sha") { + return Some(Implementation(Platform::Sha)); + } + } + None + } + + #[cfg(feature = "asm")] + pub fn asm_if_supported() -> Option { + return Some(Implementation(Platform::Asm)); + } + + #[inline] + pub fn compress256(&self, state: &mut [u32; 8], block: &[u8; 64]) { + match self.0 { + Platform::Portable => { + use crate::sha256_utils; + sha256_utils::compress256(state, block); + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::Sha => { + use crate::sha256_intrinsics; + unsafe { sha256_intrinsics::compress256(state, block) }; + } + #[cfg(feature = "asm")] + Platform::Asm => { + sha2_asm::compress256(state, block); + } + } + } +} diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index c30671b86..5ad73b69f 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -1,6 +1,7 @@ //! SHA-256 use crate::consts::{H224, H256, STATE_LEN}; +use crate::platform::Implementation; use block_buffer::BlockBuffer; use digest::impl_write; use digest::{ @@ -9,15 +10,13 @@ use digest::{ }; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -#[cfg(not(feature = "asm"))] -use crate::sha256_utils::compress256; - -#[cfg(feature = "asm")] -use sha2_asm::compress256; - type BlockSize = U64; type Block = GenericArray; +lazy_static::lazy_static! { + static ref IMPL: Implementation = Implementation::detect(); +} + /// A structure that represents that state of a digest computation for the /// SHA-2 512 family of digest functions #[derive(Clone)] @@ -33,7 +32,7 @@ impl Engine256State { #[cfg(not(feature = "asm-aarch64"))] pub fn process_block(&mut self, block: &Block) { let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - compress256(&mut self.h, block); + IMPL.compress256(&mut self.h, block); } #[cfg(feature = "asm-aarch64")] @@ -71,14 +70,16 @@ impl Engine256 { fn update(&mut self, input: &[u8]) { // Assumes that input.len() can be converted to u64 without overflow self.len += (input.len() as u64) << 3; - let s = &mut self.state; - self.buffer.input_block(input, |b| s.process_block(b)); + let self_state = &mut self.state; + self.buffer + .input_block(input, |input| self_state.process_block(input)); } fn finish(&mut self) { - let s = &mut self.state; + let self_state = &mut self.state; let l = self.len; - self.buffer.len64_padding_be(l, |b| s.process_block(b)); + self.buffer + .len64_padding_be(l, |b| self_state.process_block(b)); } fn reset(&mut self, h: &[u32; STATE_LEN]) { diff --git a/sha2/src/sha256_intrinsics.rs b/sha2/src/sha256_intrinsics.rs new file mode 100644 index 000000000..1a27384bd --- /dev/null +++ b/sha2/src/sha256_intrinsics.rs @@ -0,0 +1,266 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +/// Process a block with the SHA-256 algorithm. 
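+/// Uses the x86/x86_64 SHA extension intrinsics (`_mm_sha256rnds2_epu32`,
+/// `_mm_sha256msg1_epu32`, `_mm_sha256msg2_epu32`), so callers must ensure the
+/// `sha` CPU feature is available before invoking this function.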
+/// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c +#[inline] +#[target_feature(enable = "sha")] +pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { + // TODO: Process multiple blocks + + let mut state0: __m128i; + let mut state1: __m128i; + + let mut msg: __m128i; + let mut tmp: __m128i; + + let mut msg0: __m128i; + let mut msg1: __m128i; + let mut msg2: __m128i; + let mut msg3: __m128i; + + let mut abef_save: __m128i; + let mut cdgh_save: __m128i; + + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x( + 0x0c0d_0e0f_0809_0a0bu64 as i64, + 0x0405_0607_0001_0203u64 as i64, + ); + + let mut block_offset = 0; + let mut length = 64; + + // Load initial values + tmp = _mm_loadu_si128(state.as_ptr().add(0) as *const __m128i); + state1 = _mm_loadu_si128(state.as_ptr().add(4) as *const __m128i); + + tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB + state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH + state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF + state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH + + while length >= 64 { + // Save current state + abef_save = state0; + cdgh_save = state1; + + // Rounds 0-3 + msg = _mm_loadu_si128(block.as_ptr().add(block_offset + 0) as *const __m128i); + msg0 = _mm_shuffle_epi8(msg, MASK); + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0xE9B5DBA5B5C0FBCFu64 as i64, 0x71374491428A2F98u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 4-7 + msg1 = _mm_loadu_si128(block.as_ptr().add(block_offset + 16) as *const __m128i); + msg1 = _mm_shuffle_epi8(msg1, MASK); + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0xAB1C5ED5923F82A4u64 as i64, 0x59F111F13956C25Bu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg0 = _mm_sha256msg1_epu32(msg0, msg1); + + // Rounds 8-11 + msg2 = _mm_loadu_si128(block.as_ptr().add(block_offset + 32) as *const __m128i); + msg2 = _mm_shuffle_epi8(msg2, MASK); + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0x550C7DC3243185BEu64 as i64, 0x12835B01D807AA98u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg1 = _mm_sha256msg1_epu32(msg1, msg2); + + // Rounds 12-15 + msg3 = _mm_loadu_si128(block.as_ptr().add(block_offset + 48) as *const __m128i); + msg3 = _mm_shuffle_epi8(msg3, MASK); + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0xC19BF1749BDC06A7u64 as i64, 0x80DEB1FE72BE5D74u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg3, msg2, 4); + msg0 = _mm_add_epi32(msg0, tmp); + msg0 = _mm_sha256msg2_epu32(msg0, msg3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg2 = _mm_sha256msg1_epu32(msg2, msg3); + + // Rounds 16-19 + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0x240CA1CC0FC19DC6u64 as i64, 0xEFBE4786E49B69C1u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg0, msg3, 4); + msg1 = _mm_add_epi32(msg1, tmp); + msg1 = _mm_sha256msg2_epu32(msg1, msg0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg3 = _mm_sha256msg1_epu32(msg3, msg0); + + // Rounds 20-23 + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0x76F988DA5CB0A9DCu64 as i64, 
0x4A7484AA2DE92C6Fu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg1, msg0, 4); + msg2 = _mm_add_epi32(msg2, tmp); + msg2 = _mm_sha256msg2_epu32(msg2, msg1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg0 = _mm_sha256msg1_epu32(msg0, msg1); + + // Rounds 24-27 + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0xBF597FC7B00327C8u64 as i64, 0xA831C66D983E5152u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg2, msg1, 4); + msg3 = _mm_add_epi32(msg3, tmp); + msg3 = _mm_sha256msg2_epu32(msg3, msg2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg1 = _mm_sha256msg1_epu32(msg1, msg2); + + // Rounds 28-31 + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0x1429296706CA6351u64 as i64, 0xD5A79147C6E00BF3u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg3, msg2, 4); + msg0 = _mm_add_epi32(msg0, tmp); + msg0 = _mm_sha256msg2_epu32(msg0, msg3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg2 = _mm_sha256msg1_epu32(msg2, msg3); + + // Rounds 32-35 + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0x53380D134D2C6DFCu64 as i64, 0x2E1B213827B70A85u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg0, msg3, 4); + msg1 = _mm_add_epi32(msg1, tmp); + msg1 = _mm_sha256msg2_epu32(msg1, msg0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg3 = _mm_sha256msg1_epu32(msg3, msg0); + + // Rounds 36-39 + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0x92722C8581C2C92Eu64 as i64, 0x766A0ABB650A7354u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg1, msg0, 4); + msg2 = _mm_add_epi32(msg2, tmp); + msg2 = _mm_sha256msg2_epu32(msg2, msg1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg0 = _mm_sha256msg1_epu32(msg0, msg1); + + // Rounds 40-43 + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0xC76C51A3C24B8B70u64 as i64, 0xA81A664BA2BFE8A1u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg2, msg1, 4); + msg3 = _mm_add_epi32(msg3, tmp); + msg3 = _mm_sha256msg2_epu32(msg3, msg2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg1 = _mm_sha256msg1_epu32(msg1, msg2); + + // Rounds 44-47 + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0x106AA070F40E3585u64 as i64, 0xD6990624D192E819u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg3, msg2, 4); + msg0 = _mm_add_epi32(msg0, tmp); + msg0 = _mm_sha256msg2_epu32(msg0, msg3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg2 = _mm_sha256msg1_epu32(msg2, msg3); + + // Rounds 48-51 + msg = _mm_add_epi32( + msg0, + _mm_set_epi64x(0x34B0BCB52748774Cu64 as i64, 0x1E376C0819A4C116u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg0, msg3, 4); + msg1 = _mm_add_epi32(msg1, tmp); + msg1 = _mm_sha256msg2_epu32(msg1, msg0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msg3 = _mm_sha256msg1_epu32(msg3, msg0); + + // Rounds 52-55 + msg = _mm_add_epi32( + msg1, + _mm_set_epi64x(0x682E6FF35B9CCA4Fu64 as i64, 
0x4ED8AA4A391C0CB3u64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg1, msg0, 4); + msg2 = _mm_add_epi32(msg2, tmp); + msg2 = _mm_sha256msg2_epu32(msg2, msg1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 56-59 + msg = _mm_add_epi32( + msg2, + _mm_set_epi64x(0x8CC7020884C87814u64 as i64, 0x78A5636F748F82EEu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msg2, msg1, 4); + msg3 = _mm_add_epi32(msg3, tmp); + msg3 = _mm_sha256msg2_epu32(msg3, msg2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 60-63 + msg = _mm_add_epi32( + msg3, + _mm_set_epi64x(0xC67178F2BEF9A3F7u64 as i64, 0xA4506CEB90BEFFFAu64 as i64), + ); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Combine state + state0 = _mm_add_epi32(state0, abef_save); + state1 = _mm_add_epi32(state1, cdgh_save); + + block_offset += 64; + length -= 64; + } + + tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA + state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG + state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA + state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF + + // Save state + _mm_storeu_si128(state.as_ptr().add(0) as *mut __m128i, state0); + _mm_storeu_si128(state.as_ptr().add(4) as *mut __m128i, state1); +} diff --git a/sha2/src/sha256_utils.rs b/sha2/src/sha256_utils.rs index 7d2ec9f63..f34777a91 100644 --- a/sha2/src/sha256_utils.rs +++ b/sha2/src/sha256_utils.rs @@ -309,6 +309,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { /// implemented by any CPU (at the time of this writing), and so they are /// emulated in this library until the instructions become more common, and gain /// support in LLVM (and GCC, etc.). +#[inline] pub fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { let mut block_u32 = [0u32; BLOCK_LEN]; for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { From 5206c2100c26b0e809df2215546605f013c6f79b Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Sat, 11 Jan 2020 12:58:17 +0100 Subject: [PATCH 2/3] fixup --- sha2/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index e9c9a3154..7fe072be9 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -93,7 +93,6 @@ mod platform; mod sha256; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod sha256_intrinsics; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] mod sha256_utils; mod sha512; #[cfg(any(not(feature = "asm"), target_arch = "aarch64", feature = "compress"))] From 2ea5e4f53acbc6b3f3ac76af53ed1cf14a32e25c Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Thu, 11 Jun 2020 13:57:01 +0200 Subject: [PATCH 3/3] pull in fixes from fork --- Cargo.lock | 42 +++++++++++++++++++++++++++++++++++ sha2/Cargo.toml | 3 +++ sha2/src/platform.rs | 37 +++++++++++++++++++----------- sha2/src/sha256.rs | 14 ------------ sha2/src/sha256_intrinsics.rs | 9 ++++---- 5 files changed, 74 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7bf6b9b6a..6b6d22119 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,11 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
+[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + [[package]] name = "blake2" version = "0.9.0" @@ -197,6 +203,17 @@ version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e0456befd48169b9f13ef0f0ad46d492cf9d2dbb918bcf38e01eed4ce3ec5e4" +[[package]] +name = "raw-cpuid" +version = "7.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4a349ca83373cfa5d6dbb66fd76e58b2cca08da71a5f6400de0a0a6a9bceeaf" +dependencies = [ + "bitflags", + "cc", + "rustc_version", +] + [[package]] name = "ripemd160" version = "0.9.0" @@ -217,6 +234,30 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "sha-1" version = "0.9.0" @@ -248,6 +289,7 @@ dependencies = [ "lazy_static", "libc", "opaque-debug", + "raw-cpuid", "sha2-asm", ] diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 5db7d606f..18ad7e860 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -21,6 +21,9 @@ opaque-debug = "0.2" sha2-asm = { version = "0.5", optional = true } libc = { version = "0.2.68", optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +raw-cpuid = "7.0.3" + [dependencies.lazy_static] version = "1.4.0" default-features = false diff --git a/sha2/src/platform.rs b/sha2/src/platform.rs index 90e285e5a..dc8af398f 100644 --- a/sha2/src/platform.rs +++ b/sha2/src/platform.rs @@ -20,7 +20,7 @@ impl Implementation { return sha_impl; } } - #[cfg(feature = "asm")] + #[cfg(any(feature = "asm", feature = "asm-aarch64"))] { if let Some(asm_impl) = Self::asm_if_supported() { return asm_impl; @@ -37,24 +37,35 @@ impl Implementation { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[allow(unreachable_code)] pub fn sha_if_supported() -> Option { - // Check whether sha support is assumed by the build. - #[cfg(target_feature = "sha")] - { + use raw_cpuid::CpuId; + + // Use raw_cpuid instead of is_x86_feature_detected, to ensure the check + // never happens at compile time. + let cpuid = CpuId::new(); + let is_runtime_ok = cpuid + .get_extended_feature_info() + .map(|info| info.has_sha()) + .unwrap_or_default(); + + // Make sure this computer actually supports it + if is_runtime_ok { return Some(Implementation(Platform::Sha)); } - // Otherwise dynamically check for support if we can. 
- #[cfg(feature = "std")] - { - if std::is_x86_feature_detected!("sha") { - return Some(Implementation(Platform::Sha)); - } - } + None } - #[cfg(feature = "asm")] + #[cfg(any(feature = "asm", feature = "asm-arch64"))] pub fn asm_if_supported() -> Option { - return Some(Implementation(Platform::Asm)); + #[cfg(feature = "asm-aarch64")] + let supported = ::aarch64::sha2_supported(); + #[cfg(not(feature = "asm-aarch64"))] + let supported = false; + + if supported { + return Some(Implementation(Platform::Asm)); + } + None } #[inline] diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index 5ad73b69f..18d1dfbef 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -29,24 +29,10 @@ impl Engine256State { Engine256State { h: *h } } - #[cfg(not(feature = "asm-aarch64"))] pub fn process_block(&mut self, block: &Block) { let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; IMPL.compress256(&mut self.h, block); } - - #[cfg(feature = "asm-aarch64")] - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha2") once - // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented - // to let us use it on no_std. - if ::aarch64::sha2_supported() { - compress256(&mut self.h, block); - } else { - ::sha256_utils::compress256(&mut self.h, block); - } - } } /// A structure that keeps track of the state of the Sha-256 operation and diff --git a/sha2/src/sha256_intrinsics.rs b/sha2/src/sha256_intrinsics.rs index 1a27384bd..1f1bd555c 100644 --- a/sha2/src/sha256_intrinsics.rs +++ b/sha2/src/sha256_intrinsics.rs @@ -1,3 +1,5 @@ +#![allow(clippy::cast_ptr_alignment)] // Safe to cast without alignment checks as the loads and stores do not require alignment. + #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -6,7 +8,6 @@ use core::arch::x86_64::*; /// Process a block with the SHA-256 algorithm. /// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c #[inline] -#[target_feature(enable = "sha")] pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { // TODO: Process multiple blocks @@ -48,7 +49,7 @@ pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { cdgh_save = state1; // Rounds 0-3 - msg = _mm_loadu_si128(block.as_ptr().add(block_offset + 0) as *const __m128i); + msg = _mm_loadu_si128(block.as_ptr().add(block_offset) as *const __m128i); msg0 = _mm_shuffle_epi8(msg, MASK); msg = _mm_add_epi32( msg0, @@ -261,6 +262,6 @@ pub unsafe fn compress256(state: &mut [u32; 8], block: &[u8; 64]) { state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF // Save state - _mm_storeu_si128(state.as_ptr().add(0) as *mut __m128i, state0); - _mm_storeu_si128(state.as_ptr().add(4) as *mut __m128i, state1); + _mm_storeu_si128(state.as_mut_ptr().add(0) as *mut __m128i, state0); + _mm_storeu_si128(state.as_mut_ptr().add(4) as *mut __m128i, state1); }
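
A minimal usage sketch of the result of this series, assuming the `Digest` trait re-export (`sha2::Digest`) and the `hex-literal` dev-dependency already declared in sha2/Cargo.toml: callers are unchanged, `Implementation::detect()` picks the SHA-intrinsics, asm, or portable backend once at runtime, and whichever backend is chosen must still produce the canonical digest for the standard "abc" test vector.

use hex_literal::hex;
use sha2::{Digest, Sha256};

fn main() {
    // Backend selection happens lazily inside the crate; the public API is unchanged.
    let digest = Sha256::digest(b"abc");
    assert_eq!(
        digest[..],
        hex!("ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad")[..]
    );
}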