From 57ef977181ace3661a55da952d2f77ffe94d7d85 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sun, 29 Aug 2021 19:49:52 +0300
Subject: [PATCH 01/11] Add sha256-avx2 and sha256-ni

---
 sha2/Cargo.toml            |   7 +
 sha2/benches/lib.rs        |  31 --
 sha2/benches/sha2.rs       |  12 +
 sha2/build.rs              |  17 +-
 sha2/src/lib.rs            |  25 ++
 sha2/src/sha256_x64_avx2.S | 802 +++++++++++++++++++++++++++++++++++++
 sha2/src/sha256_x64_ni.S   | 367 +++++++++++++++++
 7 files changed, 1229 insertions(+), 32 deletions(-)
 delete mode 100644 sha2/benches/lib.rs
 create mode 100644 sha2/benches/sha2.rs
 create mode 100644 sha2/src/sha256_x64_avx2.S
 create mode 100644 sha2/src/sha256_x64_ni.S

diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml
index 4899a90..49a84ef 100644
--- a/sha2/Cargo.toml
+++ b/sha2/Cargo.toml
@@ -12,3 +12,10 @@ edition = "2018"
 
 [build-dependencies]
 cc = "1.0"
+
+[dev-dependencies]
+criterion = { version = "0.3.5", features = ["html_reports"] }
+
+[[bench]]
+name = "sha2"
+harness = false
\ No newline at end of file
diff --git a/sha2/benches/lib.rs b/sha2/benches/lib.rs
deleted file mode 100644
index 31d1ef6..0000000
--- a/sha2/benches/lib.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-#![no_std]
-#![feature(test)]
-
-extern crate test;
-
-use test::Bencher;
-
-#[bench]
-fn bench_compress256(b: &mut Bencher) {
-    let mut state = Default::default();
-    let data = [[0u8; 64]];
-
-    b.iter(|| {
-        sha2_asm::compress256(&mut state, &data);
-    });
-
-    b.bytes = data.len() as u64;
-}
-
-#[cfg(not(target_arch = "aarch64"))]
-#[bench]
-fn bench_compress512(b: &mut Bencher) {
-    let mut state = Default::default();
-    let data = [[0u8; 128]];
-
-    b.iter(|| {
-        sha2_asm::compress512(&mut state, &data);
-    });
-
-    b.bytes = data.len() as u64;
-}
diff --git a/sha2/benches/sha2.rs b/sha2/benches/sha2.rs
new file mode 100644
index 0000000..95af693
--- /dev/null
+++ b/sha2/benches/sha2.rs
@@ -0,0 +1,12 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    let mut state = Default::default();
+    let data = [[0u8; 64]];
+    c.bench_function("sha256", |b| {
+        b.iter(|| sha2_asm::compress256(black_box(&mut state), black_box(&data)))
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/sha2/build.rs b/sha2/build.rs
index 4fd331f..87b08fc 100644
--- a/sha2/build.rs
+++ b/sha2/build.rs
@@ -1,14 +1,29 @@
+use std::collections::HashSet;
+
 fn main() {
     use std::env;
 
     let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
     let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default();
+    let features: HashSet<String> = env::var("CARGO_CFG_TARGET_FEATURE")
+        .unwrap_or_default()
+        .split(',')
+        .map(|x| x.to_string())
+        .collect();
 
     let mut build256 = cc::Build::new();
     let (sha256_path, sha512_path) = if target_arch == "x86" {
         ("src/sha256_x86.S", "src/sha512_x86.S")
     } else if target_arch == "x86_64" {
-        ("src/sha256_x64.S", "src/sha512_x64.S")
+        // Prefer SHA-NI, since it is the fastest of the three implementations;
+        // the `aes` target feature is used here as the availability gate.
+        if features.contains("aes") {
+            ("src/sha256_x64_ni.S", "src/sha512_x64.S")
+        } else if features.contains("avx2") {
+            ("src/sha256_x64_avx2.S", "src/sha512_x64.S")
+        } else {
+            ("src/sha256_x64.S", "src/sha512_x64.S")
+        }
     } else if target_arch == "aarch64" && target_vendor == "apple" {
         build256.flag("-march=armv8-a+crypto");
         ("src/sha256_aarch64_apple.S", "")
diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs
index c01bd96..5dd9b58 100644
--- a/sha2/src/lib.rs
+++ b/sha2/src/lib.rs
@@ -14,10 +14,35 @@
 compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures");
 
 #[link(name = "sha256", kind = "static")]
+#[allow(dead_code)]
 extern "C" {
     fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]);
+    #[cfg(target_feature = "avx2")]
+    fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize);
+    #[cfg(target_feature = "aes")]
+    fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64);
 }
 
+#[cfg(not(target_feature = "aes"))]
+#[cfg(target_feature = "avx2")]
+#[inline]
+pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if !blocks.is_empty() {
+        unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) }
+    }
+}
+
+#[cfg(target_feature = "aes")]
+// no `not(avx2)` guard here: SHA-NI must win when both features are enabled, matching build.rs
+#[inline]
+pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if !blocks.is_empty() {
+        unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) }
+    }
+}
+
+#[cfg(not(target_feature = "aes"))]
+#[cfg(not(target_feature = "avx2"))]
 /// Safe wrapper around assembly implementation of SHA256 compression function
 #[inline]
 pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
diff --git a/sha2/src/sha256_x64_avx2.S b/sha2/src/sha256_x64_avx2.S
new file mode 100644
index 0000000..c8c6689
--- /dev/null
+++ b/sha2/src/sha256_x64_avx2.S
@@ -0,0 +1,802 @@
+# Copied and slightly modified
+# from the linux kernel arch/x86/crypto/sha256-avx2-asm.S
+# by Harald Hoyer
+
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford
+#     Kirk Yap
+#     Tim Chen
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+#  - Redistributions of source code must retain the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer.
+#
+#  - Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
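+#
+# ("2 blocks at a time" below means that the 256-bit ymm registers carry the
+# message schedule of two consecutive input blocks at once: loop1 interleaves
+# scheduling with the first block's rounds while stashing the K+W sums on the
+# stack, and loop3 later replays those stashed values for the second block.)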
+# +######################################################################## +# This code schedules 2 blocks at a time, with 4 lanes per block +######################################################################## + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +X0 = %ymm4 +X1 = %ymm5 +X2 = %ymm6 +X3 = %ymm7 + +# XMM versions of above +XWORD0 = %xmm4 +XWORD1 = %xmm5 +XWORD2 = %xmm6 +XWORD3 = %xmm7 + +XTMP0 = %ymm0 +XTMP1 = %ymm1 +XTMP2 = %ymm2 +XTMP3 = %ymm3 +XTMP4 = %ymm8 +XFER = %ymm9 +XTMP5 = %ymm11 + +SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %ymm13 + +X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg +c = %ecx +d = %r8d +e = %edx # clobbers NUM_BLKS +y3 = %esi # clobbers INP + +SRND = CTX # SRND is same register as CTX + +a = %eax +b = %ebx +f = %r9d +g = %r10d +h = %r11d +old_h = %r11d + +T1 = %r12d +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 +_RSP_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP = _INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +_RSP = _CTX + _CTX_SIZE +STACK_SIZE = _RSP + _RSP_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs + X_ = X0 + X0 = X1 + X1 = X2 + X2 = X3 + X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED disp +################################### RND N + 0 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + vpsrld $7, XTMP1, XTMP2 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + vpslld $(32-7), XTMP1, XTMP3 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 + + vpsrld $18, XTMP1, XTMP2 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 1 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 
1*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + vpslld $(32-18), XTMP1, XTMP1 + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + + vpxor XTMP1, XTMP3, XTMP3 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + + + ROTATE_ARGS + +################################### RND N + 2 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + offset = \disp + 2*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + rorx $11, e, y1 # y1 = e >> 11 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + and e, y2 # y2 = (f^g)&e # CH + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + vpxor XTMP3, XTMP2, XTMP2 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1,h # h = k + w + h + S0 # -- + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3,h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 3 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 3*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = 
k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP3, XTMP2, XTMP2 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $2, a, T1 # T1 = (a >> 2) # S0 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + rotate_Xs +.endm + +.macro DO_4ROUNDS disp +################################### RND N + 0 ########################### + + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 1 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*1 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 2 ############################## + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 
= (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*2 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 3 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*3 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + +.endm + +######################################################################## +## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) +## arg 1 : pointer to state +## arg 2 : pointer to input data +## arg 3 : Num blocks +######################################################################## +.text +#ifdef __APPLE__ +.globl _sha256_transform_rorx +_sha256_transform_rorx: +.align 5 +#else +.globl sha256_transform_rorx +sha256_transform_rorx: +.align 32 +#endif + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + mov %rsp, %rax + subq $STACK_SIZE, %rsp + and $-32, %rsp # align rsp to 32 byte boundary + mov %rax, _RSP(%rsp) + + + shl $6, NUM_BLKS # convert to bytes + jz done_hash + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block + mov NUM_BLKS, _INP_END(%rsp) + + cmp NUM_BLKS, INP + je only_one_block + + ## load initial digest + mov (CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + +loop0: + ## Load first 16 dwords from two blocks + VMOVDQ 0*32(INP),XTMP0 + VMOVDQ 1*32(INP),XTMP1 + VMOVDQ 2*32(INP),XTMP2 + VMOVDQ 3*32(INP),XTMP3 + + ## byte swap data + vpshufb BYTE_FLIP_MASK, 
XTMP0, XTMP0 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 + + ## transpose data into high/low halves + vperm2i128 $0x20, XTMP2, XTMP0, X0 + vperm2i128 $0x31, XTMP2, XTMP0, X1 + vperm2i128 $0x20, XTMP3, XTMP1, X2 + vperm2i128 $0x31, XTMP3, XTMP1, X3 + +last_block_enter: + add $64, INP + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + + lea K256(%rip), %rbp + +#ifdef __APPLE__ +.align 4 +#else +.align 16 +#endif +loop1: + vpaddd 0*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 0*32 + + vpaddd 1*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 1*32 + + vpaddd 2*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 2*32 + + vpaddd 3*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 3*32 + + add $4*32, SRND + cmp $3*4*32, SRND + jb loop1 + +loop2: + ## Do last 16 rounds with no scheduling + vpaddd 0*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + DO_4ROUNDS _XFER + 0*32 + + vpaddd 1*32(%rbp, SRND), X1, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + DO_4ROUNDS _XFER + 1*32 + add $2*32, SRND + + vmovdqa X2, X0 + vmovdqa X3, X1 + + cmp $4*4*32, SRND + jb loop2 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + ja done_hash + + #### Do second block using previously scheduled results + xor SRND, SRND +#ifdef __APPLE__ +.align 4 +#else +.align 16 +#endif +loop3: + DO_4ROUNDS _XFER + 0*32 + 16 + DO_4ROUNDS _XFER + 1*32 + 16 + add $2*32, SRND + cmp $4*4*32, SRND + jb loop3 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + add $64, INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + jb loop0 + ja done_hash + +do_last_block: + VMOVDQ 0*16(INP),XWORD0 + VMOVDQ 1*16(INP),XWORD1 + VMOVDQ 2*16(INP),XWORD2 + VMOVDQ 3*16(INP),XWORD3 + + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 + + jmp last_block_enter + +only_one_block: + + ## load initial digest + mov (4*0)(CTX),a + mov (4*1)(CTX),b + mov (4*2)(CTX),c + mov (4*3)(CTX),d + mov (4*4)(CTX),e + mov (4*5)(CTX),f + mov (4*6)(CTX),g + mov (4*7)(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + jmp do_last_block + +done_hash: + + mov _RSP(%rsp), %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + ret + +#ifdef __APPLE__ +.text +.align 6 +#else +.section .rodata +.align 64 +#endif +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +#ifdef __APPLE__ +.align 5 +#else +.align 32 +#endif +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +#ifdef __APPLE__ +.align 5 +#else +.align 32 +#endif +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +#ifdef __APPLE__ +.align 5 +#else +.align 32 +#endif +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF \ No newline at end of file diff --git a/sha2/src/sha256_x64_ni.S b/sha2/src/sha256_x64_ni.S new file mode 100644 index 0000000..afcdd3a --- /dev/null +++ b/sha2/src/sha256_x64_ni.S @@ -0,0 +1,367 @@ +# Copied and slightly modified +# from the linux kernel arch/x86/crypto/sha256_ni_asm.S +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define ENTRY(name) \ + .globl name; \ + .align 4,0x90; \ + name: + +#define ENDPROC(name) \ + .type name, @function; \ + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. 
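+ * (Each four-round group below adds four K256 constants to four schedule
+ * words with paddd and runs two sha256rnds2 instructions, two rounds each,
+ * while sha256msg1/sha256msg2 incrementally build the W values needed by
+ * later rounds.)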
+ * + * void sha256_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks); + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ + +.text +.align 32 +ENTRY(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(DIGEST_PTR), STATE0 + movdqu 1*16(DIGEST_PTR), STATE1 + + pshufd $0xB1, STATE0, STATE0 /* CDAB */ + pshufd $0x1B, STATE1, STATE1 /* EFGH */ + movdqa STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + movdqa MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + movdqa MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + movdqa MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + movdqa MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + movdqa MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + 
movdqa MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + movdqa MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + movdqa MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + movdqa MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + movdqa MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + movdqa MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + movdqa MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, STATE0, STATE0 /* FEBA */ + pshufd $0xB1, STATE1, STATE1 /* DCHG */ + movdqa STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movdqu STATE0, 0*16(DIGEST_PTR) + movdqu STATE1, 1*16(DIGEST_PTR) + +.Ldone_hash: + + ret +ENDPROC(sha256_ni_transform) + +ENTRY(sha256_ni_built) + xor %eax,%eax + inc %eax + ret +ENDPROC(sha256_ni_built) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 \ No newline at end of 
file From 354bc0c66b9358c0c63665bb8426581827b36f76 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 20:46:51 +0300 Subject: [PATCH 02/11] Implement fast SHA-512 with AVX2 instructions --- sha2/benches/sha2.rs | 5 + sha2/build.rs | 2 +- sha2/src/lib.rs | 13 + sha2/src/sha512_x64_avx2.S | 754 +++++++++++++++++++++++++++++++++++++ 4 files changed, 773 insertions(+), 1 deletion(-) create mode 100644 sha2/src/sha512_x64_avx2.S diff --git a/sha2/benches/sha2.rs b/sha2/benches/sha2.rs index 95af693..29401dc 100644 --- a/sha2/benches/sha2.rs +++ b/sha2/benches/sha2.rs @@ -6,6 +6,11 @@ pub fn criterion_benchmark(c: &mut Criterion) { c.bench_function("sha256", |b| { b.iter(|| sha2_asm::compress256(black_box(&mut state), black_box(&data))) }); + let mut state = Default::default(); + let data = [[0u8; 128]]; + c.bench_function("sha512", |b| { + b.iter(|| sha2_asm::compress512(black_box(&mut state), black_box(&data))) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/sha2/build.rs b/sha2/build.rs index 87b08fc..db3a56c 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -20,7 +20,7 @@ fn main() { if features.contains("aes") { ("src/sha256_x64_ni.S", "src/sha512_x64.S") } else if features.contains("avx2") { - ("src/sha256_x64_avx2.S", "src/sha512_x64.S") + ("src/sha256_x64_avx2.S", "src/sha512_x64_avx2.S") } else { ("src/sha256_x64.S", "src/sha512_x64.S") } diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 5dd9b58..bf4eac6 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -54,14 +54,27 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[cfg(not(target_arch = "aarch64"))] #[link(name = "sha512", kind = "static")] extern "C" { + #[cfg(not(target_feature = "avx2"))] fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); + #[cfg(target_feature = "avx2")] + fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA512 compression function /// /// This function is available only on x86 and x86-64 targets. +#[inline] #[cfg(not(target_arch = "aarch64"))] +#[cfg(target_feature = "avx2")] +pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + if !blocks.is_empty() { + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + } +} + #[inline] +#[cfg(not(target_arch = "aarch64"))] +#[cfg(not(target_feature = "avx2"))] pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { for block in blocks { unsafe { sha512_compress(state, block) } diff --git a/sha2/src/sha512_x64_avx2.S b/sha2/src/sha512_x64_avx2.S new file mode 100644 index 0000000..2dcd971 --- /dev/null +++ b/sha2/src/sha512_x64_avx2.S @@ -0,0 +1,754 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# David Cote +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#define ENTRY(name) \ + .globl name; \ + .align 4,0x90; \ + name: + +#define ENDPROC(name) \ + .type name, @function; \ + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 +# 2nd arg +INP = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rsi + +TBL = %rdi # clobbers CTX1 + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 # clobbers CTX2 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_size = frame_CTX + CTX_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, 
\YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 14) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 
= a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + 
w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g 
# CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. 
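+# (No message padding or length encoding happens here; the safe Rust wrapper
+# compress512 passes whole 128-byte blocks, and a zero block count exits
+# early through the jz done_hash below.)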
+# "blocks" is the message length in SHA512 blocks +######################################################################## +ENTRY(sha512_transform_rorx) + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + + shl $7, NUM_BLKS # convert to bytes + jz done_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +loop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +loop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne loop1 + + movq $2, frame_SRND(%rsp) +loop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne loop2 + + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne loop0 + +done_hash: + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + ret +ENDPROC(sha512_transform_rorx) + +######################################################################## +### Binary Data + + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
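+# (Mask byte i selects source byte 7-i within its qword, so vpshufb reverses
+# the byte order of each qword.)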
+PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF \ No newline at end of file From d80c79a1a21f7def8603e7731dedeb84e645bda4 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 21:04:26 +0300 Subject: [PATCH 03/11] Update build.rs --- sha2/Cargo.toml | 3 +++ sha2/build.rs | 18 ++++++++++++------ sha2/src/lib.rs | 47 ++++++++++++++++++++++++----------------------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 49a84ef..1a4760f 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -10,6 +10,9 @@ keywords = ["crypto", "sha2", "asm"] categories = ["cryptography", "no-std"] edition = "2018" +[dependencies] +cfg-if = "1.0.0" + [build-dependencies] cc = "1.0" diff --git a/sha2/build.rs b/sha2/build.rs index db3a56c..59f9cf1 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -16,14 +16,20 @@ fn main() { let (sha256_path, sha512_path) = if target_arch == "x86" { ("src/sha256_x86.S", "src/sha512_x86.S") } else if target_arch == "x86_64" { - // using aes-ni, cause it's fastest - if features.contains("aes") { - ("src/sha256_x64_ni.S", "src/sha512_x64.S") + let sha512 = if features.contains("avx2") { + "src/sha512_x64_avx2.S" + } else { + "src/sha512_x64.S" + }; + // Prioritizing aes-ni, cause it's fastest + let sha256 = if features.contains("aes") { + "src/sha256_x64_ni.S" } else if features.contains("avx2") { - ("src/sha256_x64_avx2.S", "src/sha512_x64_avx2.S") + "src/sha256_x64_avx2.S" } else { - ("src/sha256_x64.S", "src/sha512_x64.S") - } + "src/sha256_x64.S" + }; + (sha256, sha512) } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); ("src/sha256_aarch64_apple.S", "") diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index bf4eac6..765320f 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -23,31 +23,32 @@ extern "C" { fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); } -#[cfg(not(target_feature = "aes"))] -#[cfg(target_feature = "avx2")] -#[inline] -pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } +cfg_if::cfg_if! 
{ + if #[cfg(target_feature = "aes")] + { + #[inline] + pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + if !blocks.is_empty() { + unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } + } + } } -} - -#[cfg(target_feature = "aes")] -#[cfg(not(target_feature = "avx2"))] -#[inline] -pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } + else if #[cfg(target_feature = "avx2")] + { + #[inline] + pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + if !blocks.is_empty() { + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + } + } } -} - -#[cfg(not(target_feature = "aes"))] -#[cfg(not(target_feature = "avx2"))] -/// Safe wrapper around assembly implementation of SHA256 compression function -#[inline] -pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - for block in blocks { - unsafe { sha256_compress(state, block) } + else{ + #[inline] + pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + for block in blocks { + unsafe { sha256_compress(state, block) } + } + } } } From c7dd9e5131b129a2d4e9ce5969932fbb004f0119 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 21:12:30 +0300 Subject: [PATCH 04/11] Cleanup --- sha2/build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/sha2/build.rs b/sha2/build.rs index 59f9cf1..c61dd68 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -5,7 +5,6 @@ fn main() { let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); - // panic!("{}", env::var("CARGO_CFG_TARGET_ARCH").unwrap()); let features: HashSet = env::var("CARGO_CFG_TARGET_FEATURE") .unwrap_or_default() .split(',') From c705ae29a90098b45f782f5766242ef954f06871 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 22:24:54 +0300 Subject: [PATCH 05/11] Add runtime feature detection. 
Update build --- sha2/Cargo.toml | 2 +- sha2/build.rs | 41 +++++++++++------------------ sha2/src/lib.rs | 70 ++++++++++++++++++++++--------------------------- 3 files changed, 48 insertions(+), 65 deletions(-) diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 1a4760f..41f7d8b 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -11,7 +11,7 @@ categories = ["cryptography", "no-std"] edition = "2018" [dependencies] -cfg-if = "1.0.0" +cpufeatures = "0.2.1" [build-dependencies] cc = "1.0" diff --git a/sha2/build.rs b/sha2/build.rs index c61dd68..70f0635 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -1,40 +1,28 @@ -use std::collections::HashSet; - fn main() { use std::env; let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); - let features: HashSet = env::var("CARGO_CFG_TARGET_FEATURE") - .unwrap_or_default() - .split(',') - .map(|x| x.to_string()) - .collect(); let mut build256 = cc::Build::new(); let (sha256_path, sha512_path) = if target_arch == "x86" { - ("src/sha256_x86.S", "src/sha512_x86.S") + (["src/sha256_x86.S"].to_vec(), ["src/sha512_x86.S"].to_vec()) } else if target_arch == "x86_64" { - let sha512 = if features.contains("avx2") { - "src/sha512_x64_avx2.S" - } else { - "src/sha512_x64.S" - }; - // Prioritizing aes-ni, cause it's fastest - let sha256 = if features.contains("aes") { - "src/sha256_x64_ni.S" - } else if features.contains("avx2") { - "src/sha256_x64_avx2.S" - } else { - "src/sha256_x64.S" - }; + let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].to_vec(); + // Prioritizing sha-ni, cause it's fastest + let sha256 = [ + "src/sha256_x64_ni.S", + "src/sha256_x64_avx2.S", + "src/sha256_x64.S", + ] + .to_vec(); (sha256, sha512) } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); - ("src/sha256_aarch64_apple.S", "") + (["src/sha256_aarch64_apple.S"].to_vec(), [""].to_vec()) } else if target_arch == "aarch64" { build256.flag("-march=armv8-a+crypto"); - ("src/sha256_aarch64.S", "") + (["src/sha256_aarch64.S"].to_vec(), [""].to_vec()) } else { panic!("Unsupported target architecture"); }; @@ -42,8 +30,11 @@ fn main() { if target_arch != "aarch64" { cc::Build::new() .flag("-c") - .file(sha512_path) + .files(sha512_path) .compile("libsha512.a"); } - build256.flag("-c").file(sha256_path).compile("libsha256.a"); + build256 + .flag("-c") + .files(sha256_path) + .compile("libsha256.a"); } diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 765320f..1f385dc 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -13,41 +13,38 @@ #[cfg(not(any(target_arch = "x86_64", target_arch = "x86", target_arch = "aarch64")))] compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures"); +cpufeatures::new!(cpuid_avx2, "avx2"); +cpufeatures::new!(cpuid_sha, "sha"); + #[link(name = "sha256", kind = "static")] #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); - #[cfg(target_feature = "avx2")] + // setting usize is safe, cause we are on 64 bit platform fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize); - #[cfg(target_feature = "aes")] - fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); + fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: usize); } -cfg_if::cfg_if! 
{ - if #[cfg(target_feature = "aes")] - { - #[inline] - pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } - } +/// Safe wrapper around assembly implementation of SHA256 compression function +/// +#[inline] +pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + let token_sha = cpuid_sha::init(); + if token_sha.get() { + if !blocks.is_empty() { + unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len()) } } + return; } - else if #[cfg(target_feature = "avx2")] - { - #[inline] - pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } - } + let token: cpuid_avx2::InitToken = cpuid_avx2::init(); + + if token.get() { + if !blocks.is_empty() { + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } } - } - else{ - #[inline] - pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - for block in blocks { - unsafe { sha256_compress(state, block) } - } + } else { + for block in blocks { + unsafe { sha256_compress(state, block) } } } } @@ -55,29 +52,24 @@ cfg_if::cfg_if! { #[cfg(not(target_arch = "aarch64"))] #[link(name = "sha512", kind = "static")] extern "C" { - #[cfg(not(target_feature = "avx2"))] fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); - #[cfg(target_feature = "avx2")] fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA512 compression function /// /// This function is available only on x86 and x86-64 targets. -#[inline] #[cfg(not(target_arch = "aarch64"))] -#[cfg(target_feature = "avx2")] -pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - if !blocks.is_empty() { - unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } - } -} - #[inline] -#[cfg(not(target_arch = "aarch64"))] -#[cfg(not(target_feature = "avx2"))] pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - for block in blocks { - unsafe { sha512_compress(state, block) } + let token: cpuid_avx2::InitToken = cpuid_avx2::init(); + if token.get() { + if !blocks.is_empty() { + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + } + } else { + for block in blocks { + unsafe { sha512_compress(state, block) } + } } } From e17dee4913feb88c4efffa5e063f3b3d27020766 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Mon, 30 Aug 2021 11:14:21 +0300 Subject: [PATCH 06/11] Refactor --- sha2/Cargo.toml | 7 ------- sha2/benches/lib.rs | 31 +++++++++++++++++++++++++++++++ sha2/benches/sha2.rs | 17 ----------------- sha2/build.rs | 10 +++++----- sha2/src/lib.rs | 13 ++++++------- 5 files changed, 42 insertions(+), 36 deletions(-) create mode 100644 sha2/benches/lib.rs delete mode 100644 sha2/benches/sha2.rs diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 41f7d8b..b00cb4f 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -15,10 +15,3 @@ cpufeatures = "0.2.1" [build-dependencies] cc = "1.0" - -[dev-dependencies] -criterion = {version= "0.3.5",features=["html_reports"] } - -[[bench]] -name = "sha2" -harness = false \ No newline at end of file diff --git a/sha2/benches/lib.rs b/sha2/benches/lib.rs new file mode 100644 index 0000000..31d1ef6 --- /dev/null +++ b/sha2/benches/lib.rs @@ -0,0 +1,31 @@ +#![no_std] +#![feature(test)] + +extern crate test; + +use test::Bencher; + +#[bench] +fn bench_compress256(b: 
&mut Bencher) { + let mut state = Default::default(); + let data = [[0u8; 64]]; + + b.iter(|| { + sha2_asm::compress256(&mut state, &data); + }); + + b.bytes = data.len() as u64; +} + +#[cfg(not(target_arch = "aarch64"))] +#[bench] +fn bench_compress512(b: &mut Bencher) { + let mut state = Default::default(); + let data = [[0u8; 128]]; + + b.iter(|| { + sha2_asm::compress512(&mut state, &data); + }); + + b.bytes = data.len() as u64; +} diff --git a/sha2/benches/sha2.rs b/sha2/benches/sha2.rs deleted file mode 100644 index 29401dc..0000000 --- a/sha2/benches/sha2.rs +++ /dev/null @@ -1,17 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -pub fn criterion_benchmark(c: &mut Criterion) { - let mut state = Default::default(); - let data = [[0u8; 64]]; - c.bench_function("sha256", |b| { - b.iter(|| sha2_asm::compress256(black_box(&mut state), black_box(&data))) - }); - let mut state = Default::default(); - let data = [[0u8; 128]]; - c.bench_function("sha512", |b| { - b.iter(|| sha2_asm::compress512(black_box(&mut state), black_box(&data))) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/sha2/build.rs b/sha2/build.rs index 70f0635..0c0e913 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -6,23 +6,23 @@ fn main() { let mut build256 = cc::Build::new(); let (sha256_path, sha512_path) = if target_arch == "x86" { - (["src/sha256_x86.S"].to_vec(), ["src/sha512_x86.S"].to_vec()) + (["src/sha256_x86.S"].iter(), ["src/sha512_x86.S"].iter()) } else if target_arch == "x86_64" { - let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].to_vec(); + let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(); // Prioritizing sha-ni, cause it's fastest let sha256 = [ "src/sha256_x64_ni.S", "src/sha256_x64_avx2.S", "src/sha256_x64.S", ] - .to_vec(); + .iter(); (sha256, sha512) } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64_apple.S"].to_vec(), [""].to_vec()) + (["src/sha256_aarch64_apple.S"].iter(), [""].iter()) } else if target_arch == "aarch64" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64.S"].to_vec(), [""].to_vec()) + (["src/sha256_aarch64.S"].iter(), [""].iter()) } else { panic!("Unsupported target architecture"); }; diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 1f385dc..038a4b8 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -20,9 +20,8 @@ cpufeatures::new!(cpuid_sha, "sha"); #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); - // setting usize is safe, cause we are on 64 bit platform - fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize); - fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: usize); + fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: u64); + fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); } /// Safe wrapper around assembly implementation of SHA256 compression function @@ -32,7 +31,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let token_sha = cpuid_sha::init(); if token_sha.get() { if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len()) } + unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } } return; } @@ -40,7 +39,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { if token.get() { if !blocks.is_empty() { - unsafe { 
sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } } } else { for block in blocks { @@ -53,7 +52,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[link(name = "sha512", kind = "static")] extern "C" { fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); - fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); + fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: u64); } /// Safe wrapper around assembly implementation of SHA512 compression function @@ -65,7 +64,7 @@ pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { let token: cpuid_avx2::InitToken = cpuid_avx2::init(); if token.get() { if !blocks.is_empty() { - unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } } } else { for block in blocks { From b685df3949cb36ab47a798f9db65714eaafda6f5 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Mon, 30 Aug 2021 11:27:14 +0300 Subject: [PATCH 07/11] Fix not linux os build --- sha2/build.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sha2/build.rs b/sha2/build.rs index 0c0e913..6d6a286 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -3,20 +3,24 @@ fn main() { let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); - + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let mut build256 = cc::Build::new(); let (sha256_path, sha512_path) = if target_arch == "x86" { (["src/sha256_x86.S"].iter(), ["src/sha512_x86.S"].iter()) } else if target_arch == "x86_64" { - let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(); - // Prioritizing sha-ni, cause it's fastest - let sha256 = [ - "src/sha256_x64_ni.S", - "src/sha256_x64_avx2.S", - "src/sha256_x64.S", - ] - .iter(); - (sha256, sha512) + if target_os == "linux" { + ( + [ + "src/sha256_x64_ni.S", + "src/sha256_x64_avx2.S", + "src/sha256_x64.S", + ] + .iter(), + ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(), + ) + } else { + (["src/sha256_x64.S"].iter(), ["src/sha512_x64.S"].iter()) + } } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); (["src/sha256_aarch64_apple.S"].iter(), [""].iter()) From 9ad48b8a461c58ee9fc2a823a3f36c439a7b37ce Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sat, 4 Sep 2021 17:17:50 +0300 Subject: [PATCH 08/11] Update --- Cargo.lock | 18 +- sha2/build.rs | 7 +- sha2/src/lib.rs | 9 - sha2/src/sha256_x64_ni.S | 367 --------------------------------------- 4 files changed, 18 insertions(+), 383 deletions(-) delete mode 100644 sha2/src/sha256_x64_ni.S diff --git a/Cargo.lock b/Cargo.lock index cee27d7..cd18aa2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,6 +9,21 @@ version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed67cbde08356238e75fc4656be4749481eeffb09e19f320a25237d5221c985d" +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" + [[package]] name = "md5-asm" version = "0.5.0" @@ -25,9 +40,10 @@ dependencies = [ [[package]] name = "sha2-asm" -version = "0.6.1" +version = "0.6.2" dependencies = [ "cc", + "cpufeatures", ] [[package]] diff --git a/sha2/build.rs b/sha2/build.rs index 6d6a286..643216f 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -10,12 +10,7 @@ fn main() { } else if target_arch == "x86_64" { if target_os == "linux" { ( - [ - "src/sha256_x64_ni.S", - "src/sha256_x64_avx2.S", - "src/sha256_x64.S", - ] - .iter(), + ["src/sha256_x64_avx2.S", "src/sha256_x64.S"].iter(), ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(), ) } else { diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 038a4b8..8fa3ccd 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -14,27 +14,18 @@ compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures"); cpufeatures::new!(cpuid_avx2, "avx2"); -cpufeatures::new!(cpuid_sha, "sha"); #[link(name = "sha256", kind = "static")] #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: u64); - fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); } /// Safe wrapper around assembly implementation of SHA256 compression function /// #[inline] pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - let token_sha = cpuid_sha::init(); - if token_sha.get() { - if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } - } - return; - } let token: cpuid_avx2::InitToken = cpuid_avx2::init(); if token.get() { diff --git a/sha2/src/sha256_x64_ni.S b/sha2/src/sha256_x64_ni.S deleted file mode 100644 index afcdd3a..0000000 --- a/sha2/src/sha256_x64_ni.S +++ /dev/null @@ -1,367 +0,0 @@ -# Copied and slightly modified -# from the linux kernel arch/x86/crypto/sha256_ni_asm.S -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley - * Tim Chen - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#define ENTRY(name) \ - .globl name; \ - .align 4,0x90; \ - name: - -#define ENDPROC(name) \ - .type name, @function; \ - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. 
- * - * void sha256_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks); - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ - -.text -.align 32 -ENTRY(sha256_ni_transform) - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 - - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ - movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - movdqa MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - movdqa MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - movdqa MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - movdqa MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - movdqa MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - 
movdqa MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - movdqa MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - movdqa MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - movdqa MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - movdqa MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 56-59 */ - movdqa MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 60-63 */ - movdqa MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ - movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ - - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) - -.Ldone_hash: - - ret -ENDPROC(sha256_ni_transform) - -ENTRY(sha256_ni_built) - xor %eax,%eax - inc %eax - ret -ENDPROC(sha256_ni_built) - -.data -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 \ No newline at end of 
file From efec464c638a4bdf94b8c1d5bd96a50da90a9fc3 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 5 Sep 2021 13:55:29 +0300 Subject: [PATCH 09/11] Refactor build script --- sha2/build.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sha2/build.rs b/sha2/build.rs index 643216f..dba680e 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -5,23 +5,23 @@ fn main() { let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let mut build256 = cc::Build::new(); - let (sha256_path, sha512_path) = if target_arch == "x86" { - (["src/sha256_x86.S"].iter(), ["src/sha512_x86.S"].iter()) + let (sha256_path, sha512_path): (&[&str], &[&str]) = if target_arch == "x86" { + (&["src/sha256_x86.S"], &["src/sha512_x86.S"]) } else if target_arch == "x86_64" { if target_os == "linux" { ( - ["src/sha256_x64_avx2.S", "src/sha256_x64.S"].iter(), - ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(), + &["src/sha256_x64_avx2.S", "src/sha256_x64.S"], + &["src/sha512_x64_avx2.S", "src/sha512_x64.S"], ) } else { - (["src/sha256_x64.S"].iter(), ["src/sha512_x64.S"].iter()) + (&["src/sha256_x64.S"], &["src/sha512_x64.S"]) } } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64_apple.S"].iter(), [""].iter()) + (&["src/sha256_aarch64_apple.S"], &[""]) } else if target_arch == "aarch64" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64.S"].iter(), [""].iter()) + (&["src/sha256_aarch64.S"], &[""]) } else { panic!("Unsupported target architecture"); }; From 99627cfed08499fc19adce427ab69ab182d8b307 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Mon, 6 Sep 2021 14:27:44 +0300 Subject: [PATCH 10/11] Small fix Fix --- sha2/Cargo.toml | 2 +- sha2/src/lib.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index b00cb4f..70ed1ec 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -11,7 +11,7 @@ categories = ["cryptography", "no-std"] edition = "2018" [dependencies] -cpufeatures = "0.2.1" +cpufeatures = "0.2" [build-dependencies] cc = "1.0" diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 8fa3ccd..730e6c8 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -43,7 +43,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[link(name = "sha512", kind = "static")] extern "C" { fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); - fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: u64); + fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA512 compression function @@ -55,7 +55,7 @@ pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { let token: cpuid_avx2::InitToken = cpuid_avx2::init(); if token.get() { if !blocks.is_empty() { - unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } } } else { for block in blocks { From 43813ab705cf3b10ec55438495ce15a81284c56b Mon Sep 17 00:00:00 2001 From: Vladimir Date: Wed, 8 Sep 2021 23:08:44 +0300 Subject: [PATCH 11/11] Fix issues --- sha2/src/lib.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 730e6c8..07485c9 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -13,24 +13,23 @@ 
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86", target_arch = "aarch64")))] compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures"); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] cpufeatures::new!(cpuid_avx2, "avx2"); #[link(name = "sha256", kind = "static")] #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); - fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: u64); + fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA256 compression function /// #[inline] pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - let token: cpuid_avx2::InitToken = cpuid_avx2::init(); - - if token.get() { + if cpuid_avx2::get() { if !blocks.is_empty() { - unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } } } else { for block in blocks { @@ -52,8 +51,7 @@ extern "C" { #[cfg(not(target_arch = "aarch64"))] #[inline] pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - let token: cpuid_avx2::InitToken = cpuid_avx2::init(); - if token.get() { + if cpuid_avx2::get() { if !blocks.is_empty() { unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } }
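
For reference, a minimal sketch of how a caller can exercise the runtime-dispatching compress256 wrapper this series converges on (the crate is consumed as sha2_asm, as in the benchmarks above). It compresses the single padded block of the empty message and checks the state words against the well-known SHA-256 empty-string digest; whether the AVX2 or the portable path runs is decided inside the wrapper by cpufeatures. This is a sketch of downstream usage, not part of the patches.

// Compress the one padded block of the empty message and compare the
// resulting state words with the digest of "" (e3b0c442...b855).
fn main() {
    // SHA-256 initial hash values H0..H7 (FIPS 180-4).
    let mut state: [u32; 8] = [
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    ];
    // Padding for the empty message: 0x80, then zeros, then a 64-bit
    // big-endian length of 0 (already all zeros).
    let mut block = [0u8; 64];
    block[0] = 0x80;
    // Dispatch (sha256_transform_rorx on AVX2 CPUs, sha256_compress
    // otherwise) happens inside the wrapper.
    sha2_asm::compress256(&mut state, &[block]);
    assert_eq!(
        state,
        [
            0xe3b0c442, 0x98fc1c14, 0x9afbf4c8, 0x996fb924,
            0x27ae41e4, 0x649b934c, 0xa495991b, 0x7852b855,
        ]
    );
}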