From 57ef977181ace3661a55da952d2f77ffe94d7d85 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sun, 29 Aug 2021 19:49:52 +0300
Subject: [PATCH 01/11] Add sha256-avx2 and sha256-ni

---
 sha2/Cargo.toml            |   7 +
 sha2/benches/lib.rs        |  31 --
 sha2/benches/sha2.rs       |  12 +
 sha2/build.rs              |  17 +-
 sha2/src/lib.rs            |  25 ++
 sha2/src/sha256_x64_avx2.S | 802 +++++++++++++++++++++++++++++++++++++
 sha2/src/sha256_x64_ni.S   | 367 +++++++++++++++++
 7 files changed, 1229 insertions(+), 32 deletions(-)
 delete mode 100644 sha2/benches/lib.rs
 create mode 100644 sha2/benches/sha2.rs
 create mode 100644 sha2/src/sha256_x64_avx2.S
 create mode 100644 sha2/src/sha256_x64_ni.S

diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml
index 4899a90..49a84ef 100644
--- a/sha2/Cargo.toml
+++ b/sha2/Cargo.toml
@@ -12,3 +12,10 @@ edition = "2018"
 
 [build-dependencies]
 cc = "1.0"
+
+[dev-dependencies]
+criterion = { version = "0.3.5", features = ["html_reports"] }
+
+[[bench]]
+name = "sha2"
+harness = false
\ No newline at end of file
diff --git a/sha2/benches/lib.rs b/sha2/benches/lib.rs
deleted file mode 100644
index 31d1ef6..0000000
--- a/sha2/benches/lib.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-#![no_std]
-#![feature(test)]
-
-extern crate test;
-
-use test::Bencher;
-
-#[bench]
-fn bench_compress256(b: &mut Bencher) {
-    let mut state = Default::default();
-    let data = [[0u8; 64]];
-
-    b.iter(|| {
-        sha2_asm::compress256(&mut state, &data);
-    });
-
-    b.bytes = data.len() as u64;
-}
-
-#[cfg(not(target_arch = "aarch64"))]
-#[bench]
-fn bench_compress512(b: &mut Bencher) {
-    let mut state = Default::default();
-    let data = [[0u8; 128]];
-
-    b.iter(|| {
-        sha2_asm::compress512(&mut state, &data);
-    });
-
-    b.bytes = data.len() as u64;
-}
diff --git a/sha2/benches/sha2.rs b/sha2/benches/sha2.rs
new file mode 100644
index 0000000..95af693
--- /dev/null
+++ b/sha2/benches/sha2.rs
@@ -0,0 +1,12 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    let mut state = Default::default();
+    let data = [[0u8; 64]];
+    c.bench_function("sha256", |b| {
+        b.iter(|| sha2_asm::compress256(black_box(&mut state), black_box(&data)))
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/sha2/build.rs b/sha2/build.rs
index 4fd331f..87b08fc 100644
--- a/sha2/build.rs
+++ b/sha2/build.rs
@@ -1,14 +1,29 @@
+use std::collections::HashSet;
+
 fn main() {
     use std::env;
 
     let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
     let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default();
+    let features: HashSet<String> = env::var("CARGO_CFG_TARGET_FEATURE")
+        .unwrap_or_default()
+        .split(',')
+        .map(|x| x.to_string())
+        .collect();
 
     let mut build256 = cc::Build::new();
     let (sha256_path, sha512_path) = if target_arch == "x86" {
         ("src/sha256_x86.S", "src/sha512_x86.S")
     } else if target_arch == "x86_64" {
-        ("src/sha256_x64.S", "src/sha512_x64.S")
+        // Prefer SHA-NI, since it is the fastest of the three implementations;
+        // the `aes` target feature is used here as the availability gate.
+        if features.contains("aes") {
+            ("src/sha256_x64_ni.S", "src/sha512_x64.S")
+        } else if features.contains("avx2") {
+            ("src/sha256_x64_avx2.S", "src/sha512_x64.S")
+        } else {
+            ("src/sha256_x64.S", "src/sha512_x64.S")
+        }
     } else if target_arch == "aarch64" && target_vendor == "apple" {
         build256.flag("-march=armv8-a+crypto");
         ("src/sha256_aarch64_apple.S", "")
diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs
index c01bd96..5dd9b58 100644
--- a/sha2/src/lib.rs
+++ b/sha2/src/lib.rs
@@ -14,10 +14,35 @@
 compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures");
 
 #[link(name = "sha256", kind = "static")]
+#[allow(dead_code)]
 extern "C" {
     fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]);
+    #[cfg(target_feature = "avx2")]
+    fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize);
+    #[cfg(target_feature = "aes")]
+    fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64);
 }
 
+#[cfg(not(target_feature = "aes"))]
+#[cfg(target_feature = "avx2")]
+#[inline]
+pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if !blocks.is_empty() {
+        unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) }
+    }
+}
+
+#[cfg(target_feature = "aes")]
+// no `not(avx2)` guard here: SHA-NI must win when both features are enabled, matching build.rs
+#[inline]
+pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if !blocks.is_empty() {
+        unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) }
+    }
+}
+
+#[cfg(not(target_feature = "aes"))]
+#[cfg(not(target_feature = "avx2"))]
 /// Safe wrapper around assembly implementation of SHA256 compression function
 #[inline]
 pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
diff --git a/sha2/src/sha256_x64_avx2.S b/sha2/src/sha256_x64_avx2.S
new file mode 100644
index 0000000..c8c6689
--- /dev/null
+++ b/sha2/src/sha256_x64_avx2.S
@@ -0,0 +1,802 @@
+# Copied and slightly modified
+# from the linux kernel arch/x86/crypto/sha256-avx2-asm.S
+# by Harald Hoyer
+
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford
+#     Kirk Yap
+#     Tim Chen
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+#  - Redistributions of source code must retain the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer.
+#
+#  - Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
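+#
+# ("2 blocks at a time" below means that the 256-bit ymm registers carry the
+# message schedule of two consecutive input blocks at once: loop1 interleaves
+# scheduling with the first block's rounds while stashing the K+W sums on the
+# stack, and loop3 later replays those stashed values for the second block.)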
+# +######################################################################## +# This code schedules 2 blocks at a time, with 4 lanes per block +######################################################################## + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +X0 = %ymm4 +X1 = %ymm5 +X2 = %ymm6 +X3 = %ymm7 + +# XMM versions of above +XWORD0 = %xmm4 +XWORD1 = %xmm5 +XWORD2 = %xmm6 +XWORD3 = %xmm7 + +XTMP0 = %ymm0 +XTMP1 = %ymm1 +XTMP2 = %ymm2 +XTMP3 = %ymm3 +XTMP4 = %ymm8 +XFER = %ymm9 +XTMP5 = %ymm11 + +SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %ymm13 + +X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg +c = %ecx +d = %r8d +e = %edx # clobbers NUM_BLKS +y3 = %esi # clobbers INP + +SRND = CTX # SRND is same register as CTX + +a = %eax +b = %ebx +f = %r9d +g = %r10d +h = %r11d +old_h = %r11d + +T1 = %r12d +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 +_RSP_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP = _INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +_RSP = _CTX + _CTX_SIZE +STACK_SIZE = _RSP + _RSP_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs + X_ = X0 + X0 = X1 + X1 = X2 + X2 = X3 + X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED disp +################################### RND N + 0 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + vpsrld $7, XTMP1, XTMP2 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + vpslld $(32-7), XTMP1, XTMP3 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 + + vpsrld $18, XTMP1, XTMP2 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 1 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 
1*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + vpslld $(32-18), XTMP1, XTMP1 + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + + vpxor XTMP1, XTMP3, XTMP3 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + + + ROTATE_ARGS + +################################### RND N + 2 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + offset = \disp + 2*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + rorx $11, e, y1 # y1 = e >> 11 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + and e, y2 # y2 = (f^g)&e # CH + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + vpxor XTMP3, XTMP2, XTMP2 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1,h # h = k + w + h + S0 # -- + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3,h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 3 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 3*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = 
k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP3, XTMP2, XTMP2 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $2, a, T1 # T1 = (a >> 2) # S0 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + rotate_Xs +.endm + +.macro DO_4ROUNDS disp +################################### RND N + 0 ########################### + + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 1 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*1 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 2 ############################## + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 
= (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*2 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 3 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*3 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + +.endm + +######################################################################## +## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) +## arg 1 : pointer to state +## arg 2 : pointer to input data +## arg 3 : Num blocks +######################################################################## +.text +#ifdef __APPLE__ +.globl _sha256_transform_rorx +_sha256_transform_rorx: +.align 5 +#else +.globl sha256_transform_rorx +sha256_transform_rorx: +.align 32 +#endif + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + mov %rsp, %rax + subq $STACK_SIZE, %rsp + and $-32, %rsp # align rsp to 32 byte boundary + mov %rax, _RSP(%rsp) + + + shl $6, NUM_BLKS # convert to bytes + jz done_hash + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block + mov NUM_BLKS, _INP_END(%rsp) + + cmp NUM_BLKS, INP + je only_one_block + + ## load initial digest + mov (CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + +loop0: + ## Load first 16 dwords from two blocks + VMOVDQ 0*32(INP),XTMP0 + VMOVDQ 1*32(INP),XTMP1 + VMOVDQ 2*32(INP),XTMP2 + VMOVDQ 3*32(INP),XTMP3 + + ## byte swap data + vpshufb BYTE_FLIP_MASK, 
XTMP0, XTMP0 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 + + ## transpose data into high/low halves + vperm2i128 $0x20, XTMP2, XTMP0, X0 + vperm2i128 $0x31, XTMP2, XTMP0, X1 + vperm2i128 $0x20, XTMP3, XTMP1, X2 + vperm2i128 $0x31, XTMP3, XTMP1, X3 + +last_block_enter: + add $64, INP + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + + lea K256(%rip), %rbp + +#ifdef __APPLE__ +.align 4 +#else +.align 16 +#endif +loop1: + vpaddd 0*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 0*32 + + vpaddd 1*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 1*32 + + vpaddd 2*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 2*32 + + vpaddd 3*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 3*32 + + add $4*32, SRND + cmp $3*4*32, SRND + jb loop1 + +loop2: + ## Do last 16 rounds with no scheduling + vpaddd 0*32(%rbp, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + DO_4ROUNDS _XFER + 0*32 + + vpaddd 1*32(%rbp, SRND), X1, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + DO_4ROUNDS _XFER + 1*32 + add $2*32, SRND + + vmovdqa X2, X0 + vmovdqa X3, X1 + + cmp $4*4*32, SRND + jb loop2 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + ja done_hash + + #### Do second block using previously scheduled results + xor SRND, SRND +#ifdef __APPLE__ +.align 4 +#else +.align 16 +#endif +loop3: + DO_4ROUNDS _XFER + 0*32 + 16 + DO_4ROUNDS _XFER + 1*32 + 16 + add $2*32, SRND + cmp $4*4*32, SRND + jb loop3 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + add $64, INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + jb loop0 + ja done_hash + +do_last_block: + VMOVDQ 0*16(INP),XWORD0 + VMOVDQ 1*16(INP),XWORD1 + VMOVDQ 2*16(INP),XWORD2 + VMOVDQ 3*16(INP),XWORD3 + + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 + + jmp last_block_enter + +only_one_block: + + ## load initial digest + mov (4*0)(CTX),a + mov (4*1)(CTX),b + mov (4*2)(CTX),c + mov (4*3)(CTX),d + mov (4*4)(CTX),e + mov (4*5)(CTX),f + mov (4*6)(CTX),g + mov (4*7)(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + jmp do_last_block + +done_hash: + + mov _RSP(%rsp), %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + ret + +#ifdef __APPLE__ +.text +.align 6 +#else +.section .rodata +.align 64 +#endif +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +#ifdef __APPLE__ +.align 5 +#else +.align 32 +#endif +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +#ifdef __APPLE__ +.align 5 +#else +.align 32 +#endif +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +#ifdef __APPLE__ +.align 5 +#else +.align 32 +#endif +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF \ No newline at end of file diff --git a/sha2/src/sha256_x64_ni.S b/sha2/src/sha256_x64_ni.S new file mode 100644 index 0000000..afcdd3a --- /dev/null +++ b/sha2/src/sha256_x64_ni.S @@ -0,0 +1,367 @@ +# Copied and slightly modified +# from the linux kernel arch/x86/crypto/sha256_ni_asm.S +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define ENTRY(name) \ + .globl name; \ + .align 4,0x90; \ + name: + +#define ENDPROC(name) \ + .type name, @function; \ + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. 
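+ * (Each four-round group below adds four K256 constants to four schedule
+ * words with paddd and runs two sha256rnds2 instructions, two rounds each,
+ * while sha256msg1/sha256msg2 incrementally build the W values needed by
+ * later rounds.)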
+ * + * void sha256_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks); + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ + +.text +.align 32 +ENTRY(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(DIGEST_PTR), STATE0 + movdqu 1*16(DIGEST_PTR), STATE1 + + pshufd $0xB1, STATE0, STATE0 /* CDAB */ + pshufd $0x1B, STATE1, STATE1 /* EFGH */ + movdqa STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + movdqa MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + movdqa MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + movdqa MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + movdqa MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + movdqa MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + 
movdqa MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + movdqa MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + movdqa MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + movdqa MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + movdqa MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + movdqa MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + movdqa MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, STATE0, STATE0 /* FEBA */ + pshufd $0xB1, STATE1, STATE1 /* DCHG */ + movdqa STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movdqu STATE0, 0*16(DIGEST_PTR) + movdqu STATE1, 1*16(DIGEST_PTR) + +.Ldone_hash: + + ret +ENDPROC(sha256_ni_transform) + +ENTRY(sha256_ni_built) + xor %eax,%eax + inc %eax + ret +ENDPROC(sha256_ni_built) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 \ No newline at end of 
file From 354bc0c66b9358c0c63665bb8426581827b36f76 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 20:46:51 +0300 Subject: [PATCH 02/11] Implement fast SHA-512 with AVX2 instructions --- sha2/benches/sha2.rs | 5 + sha2/build.rs | 2 +- sha2/src/lib.rs | 13 + sha2/src/sha512_x64_avx2.S | 754 +++++++++++++++++++++++++++++++++++++ 4 files changed, 773 insertions(+), 1 deletion(-) create mode 100644 sha2/src/sha512_x64_avx2.S diff --git a/sha2/benches/sha2.rs b/sha2/benches/sha2.rs index 95af693..29401dc 100644 --- a/sha2/benches/sha2.rs +++ b/sha2/benches/sha2.rs @@ -6,6 +6,11 @@ pub fn criterion_benchmark(c: &mut Criterion) { c.bench_function("sha256", |b| { b.iter(|| sha2_asm::compress256(black_box(&mut state), black_box(&data))) }); + let mut state = Default::default(); + let data = [[0u8; 128]]; + c.bench_function("sha512", |b| { + b.iter(|| sha2_asm::compress512(black_box(&mut state), black_box(&data))) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/sha2/build.rs b/sha2/build.rs index 87b08fc..db3a56c 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -20,7 +20,7 @@ fn main() { if features.contains("aes") { ("src/sha256_x64_ni.S", "src/sha512_x64.S") } else if features.contains("avx2") { - ("src/sha256_x64_avx2.S", "src/sha512_x64.S") + ("src/sha256_x64_avx2.S", "src/sha512_x64_avx2.S") } else { ("src/sha256_x64.S", "src/sha512_x64.S") } diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 5dd9b58..bf4eac6 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -54,14 +54,27 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[cfg(not(target_arch = "aarch64"))] #[link(name = "sha512", kind = "static")] extern "C" { + #[cfg(not(target_feature = "avx2"))] fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); + #[cfg(target_feature = "avx2")] + fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA512 compression function /// /// This function is available only on x86 and x86-64 targets. +#[inline] #[cfg(not(target_arch = "aarch64"))] +#[cfg(target_feature = "avx2")] +pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + if !blocks.is_empty() { + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + } +} + #[inline] +#[cfg(not(target_arch = "aarch64"))] +#[cfg(not(target_feature = "avx2"))] pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { for block in blocks { unsafe { sha512_compress(state, block) } diff --git a/sha2/src/sha512_x64_avx2.S b/sha2/src/sha512_x64_avx2.S new file mode 100644 index 0000000..2dcd971 --- /dev/null +++ b/sha2/src/sha512_x64_avx2.S @@ -0,0 +1,754 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford +# Kirk Yap +# David Cote +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#define ENTRY(name) \ + .globl name; \ + .align 4,0x90; \ + name: + +#define ENDPROC(name) \ + .type name, @function; \ + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 +# 2nd arg +INP = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rsi + +TBL = %rdi # clobbers CTX1 + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 # clobbers CTX2 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_size = frame_CTX + CTX_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, 
\YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 14) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 
= a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + 
w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g 
# CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. 
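+# (No message padding or length encoding happens here; the safe Rust wrapper
+# compress512 passes whole 128-byte blocks, and a zero block count exits
+# early through the jz done_hash below.)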
+# "blocks" is the message length in SHA512 blocks +######################################################################## +ENTRY(sha512_transform_rorx) + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + + shl $7, NUM_BLKS # convert to bytes + jz done_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +loop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +loop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne loop1 + + movq $2, frame_SRND(%rsp) +loop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne loop2 + + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne loop0 + +done_hash: + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + ret +ENDPROC(sha512_transform_rorx) + +######################################################################## +### Binary Data + + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
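+# (Mask byte i selects source byte 7-i within its qword, so vpshufb reverses
+# the byte order of each qword.)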
+PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF \ No newline at end of file From d80c79a1a21f7def8603e7731dedeb84e645bda4 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 21:04:26 +0300 Subject: [PATCH 03/11] Update build.rs --- sha2/Cargo.toml | 3 +++ sha2/build.rs | 18 ++++++++++++------ sha2/src/lib.rs | 47 ++++++++++++++++++++++++----------------------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 49a84ef..1a4760f 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -10,6 +10,9 @@ keywords = ["crypto", "sha2", "asm"] categories = ["cryptography", "no-std"] edition = "2018" +[dependencies] +cfg-if = "1.0.0" + [build-dependencies] cc = "1.0" diff --git a/sha2/build.rs b/sha2/build.rs index db3a56c..59f9cf1 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -16,14 +16,20 @@ fn main() { let (sha256_path, sha512_path) = if target_arch == "x86" { ("src/sha256_x86.S", "src/sha512_x86.S") } else if target_arch == "x86_64" { - // using aes-ni, cause it's fastest - if features.contains("aes") { - ("src/sha256_x64_ni.S", "src/sha512_x64.S") + let sha512 = if features.contains("avx2") { + "src/sha512_x64_avx2.S" + } else { + "src/sha512_x64.S" + }; + // Prioritizing aes-ni, cause it's fastest + let sha256 = if features.contains("aes") { + "src/sha256_x64_ni.S" } else if features.contains("avx2") { - ("src/sha256_x64_avx2.S", "src/sha512_x64_avx2.S") + "src/sha256_x64_avx2.S" } else { - ("src/sha256_x64.S", "src/sha512_x64.S") - } + "src/sha256_x64.S" + }; + (sha256, sha512) } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); ("src/sha256_aarch64_apple.S", "") diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index bf4eac6..765320f 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -23,31 +23,32 @@ extern "C" { fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); } -#[cfg(not(target_feature = "aes"))] -#[cfg(target_feature = "avx2")] -#[inline] -pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } +cfg_if::cfg_if! 
{ + if #[cfg(target_feature = "aes")] + { + #[inline] + pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + if !blocks.is_empty() { + unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } + } + } } -} - -#[cfg(target_feature = "aes")] -#[cfg(not(target_feature = "avx2"))] -#[inline] -pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } + else if #[cfg(target_feature = "avx2")] + { + #[inline] + pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + if !blocks.is_empty() { + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + } + } } -} - -#[cfg(not(target_feature = "aes"))] -#[cfg(not(target_feature = "avx2"))] -/// Safe wrapper around assembly implementation of SHA256 compression function -#[inline] -pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - for block in blocks { - unsafe { sha256_compress(state, block) } + else{ + #[inline] + pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + for block in blocks { + unsafe { sha256_compress(state, block) } + } + } } } From c7dd9e5131b129a2d4e9ce5969932fbb004f0119 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 21:12:30 +0300 Subject: [PATCH 04/11] Cleanup --- sha2/build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/sha2/build.rs b/sha2/build.rs index 59f9cf1..c61dd68 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -5,7 +5,6 @@ fn main() { let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); - // panic!("{}", env::var("CARGO_CFG_TARGET_ARCH").unwrap()); let features: HashSet = env::var("CARGO_CFG_TARGET_FEATURE") .unwrap_or_default() .split(',') From c705ae29a90098b45f782f5766242ef954f06871 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 29 Aug 2021 22:24:54 +0300 Subject: [PATCH 05/11] Add runtime feature detection. 
Update build --- sha2/Cargo.toml | 2 +- sha2/build.rs | 41 +++++++++++------------------ sha2/src/lib.rs | 70 ++++++++++++++++++++++--------------------------- 3 files changed, 48 insertions(+), 65 deletions(-) diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 1a4760f..41f7d8b 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -11,7 +11,7 @@ categories = ["cryptography", "no-std"] edition = "2018" [dependencies] -cfg-if = "1.0.0" +cpufeatures = "0.2.1" [build-dependencies] cc = "1.0" diff --git a/sha2/build.rs b/sha2/build.rs index c61dd68..70f0635 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -1,40 +1,28 @@ -use std::collections::HashSet; - fn main() { use std::env; let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); - let features: HashSet = env::var("CARGO_CFG_TARGET_FEATURE") - .unwrap_or_default() - .split(',') - .map(|x| x.to_string()) - .collect(); let mut build256 = cc::Build::new(); let (sha256_path, sha512_path) = if target_arch == "x86" { - ("src/sha256_x86.S", "src/sha512_x86.S") + (["src/sha256_x86.S"].to_vec(), ["src/sha512_x86.S"].to_vec()) } else if target_arch == "x86_64" { - let sha512 = if features.contains("avx2") { - "src/sha512_x64_avx2.S" - } else { - "src/sha512_x64.S" - }; - // Prioritizing aes-ni, cause it's fastest - let sha256 = if features.contains("aes") { - "src/sha256_x64_ni.S" - } else if features.contains("avx2") { - "src/sha256_x64_avx2.S" - } else { - "src/sha256_x64.S" - }; + let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].to_vec(); + // Prioritizing sha-ni, cause it's fastest + let sha256 = [ + "src/sha256_x64_ni.S", + "src/sha256_x64_avx2.S", + "src/sha256_x64.S", + ] + .to_vec(); (sha256, sha512) } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); - ("src/sha256_aarch64_apple.S", "") + (["src/sha256_aarch64_apple.S"].to_vec(), [""].to_vec()) } else if target_arch == "aarch64" { build256.flag("-march=armv8-a+crypto"); - ("src/sha256_aarch64.S", "") + (["src/sha256_aarch64.S"].to_vec(), [""].to_vec()) } else { panic!("Unsupported target architecture"); }; @@ -42,8 +30,11 @@ fn main() { if target_arch != "aarch64" { cc::Build::new() .flag("-c") - .file(sha512_path) + .files(sha512_path) .compile("libsha512.a"); } - build256.flag("-c").file(sha256_path).compile("libsha256.a"); + build256 + .flag("-c") + .files(sha256_path) + .compile("libsha256.a"); } diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 765320f..1f385dc 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -13,41 +13,38 @@ #[cfg(not(any(target_arch = "x86_64", target_arch = "x86", target_arch = "aarch64")))] compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures"); +cpufeatures::new!(cpuid_avx2, "avx2"); +cpufeatures::new!(cpuid_sha, "sha"); + #[link(name = "sha256", kind = "static")] #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); - #[cfg(target_feature = "avx2")] + // setting usize is safe, cause we are on 64 bit platform fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize); - #[cfg(target_feature = "aes")] - fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); + fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: usize); } -cfg_if::cfg_if! 
{ - if #[cfg(target_feature = "aes")] - { - #[inline] - pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } - } +/// Safe wrapper around assembly implementation of SHA256 compression function +/// +#[inline] +pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + let token_sha = cpuid_sha::init(); + if token_sha.get() { + if !blocks.is_empty() { + unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len()) } } + return; } - else if #[cfg(target_feature = "avx2")] - { - #[inline] - pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - if !blocks.is_empty() { - unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } - } + let token: cpuid_avx2::InitToken = cpuid_avx2::init(); + + if token.get() { + if !blocks.is_empty() { + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } } - } - else{ - #[inline] - pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - for block in blocks { - unsafe { sha256_compress(state, block) } - } + } else { + for block in blocks { + unsafe { sha256_compress(state, block) } } } } @@ -55,29 +52,24 @@ cfg_if::cfg_if! { #[cfg(not(target_arch = "aarch64"))] #[link(name = "sha512", kind = "static")] extern "C" { - #[cfg(not(target_feature = "avx2"))] fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); - #[cfg(target_feature = "avx2")] fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA512 compression function /// /// This function is available only on x86 and x86-64 targets. -#[inline] #[cfg(not(target_arch = "aarch64"))] -#[cfg(target_feature = "avx2")] -pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - if !blocks.is_empty() { - unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } - } -} - #[inline] -#[cfg(not(target_arch = "aarch64"))] -#[cfg(not(target_feature = "avx2"))] pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - for block in blocks { - unsafe { sha512_compress(state, block) } + let token: cpuid_avx2::InitToken = cpuid_avx2::init(); + if token.get() { + if !blocks.is_empty() { + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + } + } else { + for block in blocks { + unsafe { sha512_compress(state, block) } + } } } From e17dee4913feb88c4efffa5e063f3b3d27020766 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Mon, 30 Aug 2021 11:14:21 +0300 Subject: [PATCH 06/11] Refactor --- sha2/Cargo.toml | 7 ------- sha2/benches/lib.rs | 31 +++++++++++++++++++++++++++++++ sha2/benches/sha2.rs | 17 ----------------- sha2/build.rs | 10 +++++----- sha2/src/lib.rs | 13 ++++++------- 5 files changed, 42 insertions(+), 36 deletions(-) create mode 100644 sha2/benches/lib.rs delete mode 100644 sha2/benches/sha2.rs diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 41f7d8b..b00cb4f 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -15,10 +15,3 @@ cpufeatures = "0.2.1" [build-dependencies] cc = "1.0" - -[dev-dependencies] -criterion = {version= "0.3.5",features=["html_reports"] } - -[[bench]] -name = "sha2" -harness = false \ No newline at end of file diff --git a/sha2/benches/lib.rs b/sha2/benches/lib.rs new file mode 100644 index 0000000..31d1ef6 --- /dev/null +++ b/sha2/benches/lib.rs @@ -0,0 +1,31 @@ +#![no_std] +#![feature(test)] + +extern crate test; + +use test::Bencher; + +#[bench] +fn bench_compress256(b: 
&mut Bencher) { + let mut state = Default::default(); + let data = [[0u8; 64]]; + + b.iter(|| { + sha2_asm::compress256(&mut state, &data); + }); + + b.bytes = data.len() as u64; +} + +#[cfg(not(target_arch = "aarch64"))] +#[bench] +fn bench_compress512(b: &mut Bencher) { + let mut state = Default::default(); + let data = [[0u8; 128]]; + + b.iter(|| { + sha2_asm::compress512(&mut state, &data); + }); + + b.bytes = data.len() as u64; +} diff --git a/sha2/benches/sha2.rs b/sha2/benches/sha2.rs deleted file mode 100644 index 29401dc..0000000 --- a/sha2/benches/sha2.rs +++ /dev/null @@ -1,17 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -pub fn criterion_benchmark(c: &mut Criterion) { - let mut state = Default::default(); - let data = [[0u8; 64]]; - c.bench_function("sha256", |b| { - b.iter(|| sha2_asm::compress256(black_box(&mut state), black_box(&data))) - }); - let mut state = Default::default(); - let data = [[0u8; 128]]; - c.bench_function("sha512", |b| { - b.iter(|| sha2_asm::compress512(black_box(&mut state), black_box(&data))) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/sha2/build.rs b/sha2/build.rs index 70f0635..0c0e913 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -6,23 +6,23 @@ fn main() { let mut build256 = cc::Build::new(); let (sha256_path, sha512_path) = if target_arch == "x86" { - (["src/sha256_x86.S"].to_vec(), ["src/sha512_x86.S"].to_vec()) + (["src/sha256_x86.S"].iter(), ["src/sha512_x86.S"].iter()) } else if target_arch == "x86_64" { - let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].to_vec(); + let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(); // Prioritizing sha-ni, cause it's fastest let sha256 = [ "src/sha256_x64_ni.S", "src/sha256_x64_avx2.S", "src/sha256_x64.S", ] - .to_vec(); + .iter(); (sha256, sha512) } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64_apple.S"].to_vec(), [""].to_vec()) + (["src/sha256_aarch64_apple.S"].iter(), [""].iter()) } else if target_arch == "aarch64" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64.S"].to_vec(), [""].to_vec()) + (["src/sha256_aarch64.S"].iter(), [""].iter()) } else { panic!("Unsupported target architecture"); }; diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 1f385dc..038a4b8 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -20,9 +20,8 @@ cpufeatures::new!(cpuid_sha, "sha"); #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); - // setting usize is safe, cause we are on 64 bit platform - fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize); - fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: usize); + fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: u64); + fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); } /// Safe wrapper around assembly implementation of SHA256 compression function @@ -32,7 +31,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let token_sha = cpuid_sha::init(); if token_sha.get() { if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len()) } + unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } } return; } @@ -40,7 +39,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { if token.get() { if !blocks.is_empty() { - unsafe { 
sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } } } else { for block in blocks { @@ -53,7 +52,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[link(name = "sha512", kind = "static")] extern "C" { fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); - fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); + fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: u64); } /// Safe wrapper around assembly implementation of SHA512 compression function @@ -65,7 +64,7 @@ pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { let token: cpuid_avx2::InitToken = cpuid_avx2::init(); if token.get() { if !blocks.is_empty() { - unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } } } else { for block in blocks { From b685df3949cb36ab47a798f9db65714eaafda6f5 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Mon, 30 Aug 2021 11:27:14 +0300 Subject: [PATCH 07/11] Fix not linux os build --- sha2/build.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sha2/build.rs b/sha2/build.rs index 0c0e913..6d6a286 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -3,20 +3,24 @@ fn main() { let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); - + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let mut build256 = cc::Build::new(); let (sha256_path, sha512_path) = if target_arch == "x86" { (["src/sha256_x86.S"].iter(), ["src/sha512_x86.S"].iter()) } else if target_arch == "x86_64" { - let sha512 = ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(); - // Prioritizing sha-ni, cause it's fastest - let sha256 = [ - "src/sha256_x64_ni.S", - "src/sha256_x64_avx2.S", - "src/sha256_x64.S", - ] - .iter(); - (sha256, sha512) + if target_os == "linux" { + ( + [ + "src/sha256_x64_ni.S", + "src/sha256_x64_avx2.S", + "src/sha256_x64.S", + ] + .iter(), + ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(), + ) + } else { + (["src/sha256_x64.S"].iter(), ["src/sha512_x64.S"].iter()) + } } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); (["src/sha256_aarch64_apple.S"].iter(), [""].iter()) From 9ad48b8a461c58ee9fc2a823a3f36c439a7b37ce Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sat, 4 Sep 2021 17:17:50 +0300 Subject: [PATCH 08/11] Update --- Cargo.lock | 18 +- sha2/build.rs | 7 +- sha2/src/lib.rs | 9 - sha2/src/sha256_x64_ni.S | 367 --------------------------------------- 4 files changed, 18 insertions(+), 383 deletions(-) delete mode 100644 sha2/src/sha256_x64_ni.S diff --git a/Cargo.lock b/Cargo.lock index cee27d7..cd18aa2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,6 +9,21 @@ version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed67cbde08356238e75fc4656be4749481eeffb09e19f320a25237d5221c985d" +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" + [[package]] name = "md5-asm" version = "0.5.0" @@ -25,9 +40,10 @@ dependencies = [ [[package]] name = "sha2-asm" -version = "0.6.1" +version = "0.6.2" dependencies = [ "cc", + "cpufeatures", ] [[package]] diff --git a/sha2/build.rs b/sha2/build.rs index 6d6a286..643216f 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -10,12 +10,7 @@ fn main() { } else if target_arch == "x86_64" { if target_os == "linux" { ( - [ - "src/sha256_x64_ni.S", - "src/sha256_x64_avx2.S", - "src/sha256_x64.S", - ] - .iter(), + ["src/sha256_x64_avx2.S", "src/sha256_x64.S"].iter(), ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(), ) } else { diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 038a4b8..8fa3ccd 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -14,27 +14,18 @@ compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures"); cpufeatures::new!(cpuid_avx2, "avx2"); -cpufeatures::new!(cpuid_sha, "sha"); #[link(name = "sha256", kind = "static")] #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: u64); - fn sha256_ni_transform(digest: &mut [u32; 8], data: *const [u8; 64], nblk: u64); } /// Safe wrapper around assembly implementation of SHA256 compression function /// #[inline] pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - let token_sha = cpuid_sha::init(); - if token_sha.get() { - if !blocks.is_empty() { - unsafe { sha256_ni_transform(state, blocks.as_ptr(), blocks.len() as u64) } - } - return; - } let token: cpuid_avx2::InitToken = cpuid_avx2::init(); if token.get() { diff --git a/sha2/src/sha256_x64_ni.S b/sha2/src/sha256_x64_ni.S deleted file mode 100644 index afcdd3a..0000000 --- a/sha2/src/sha256_x64_ni.S +++ /dev/null @@ -1,367 +0,0 @@ -# Copied and slightly modified -# from the linux kernel arch/x86/crypto/sha256_ni_asm.S -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley - * Tim Chen - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#define ENTRY(name) \ - .globl name; \ - .align 4,0x90; \ - name: - -#define ENDPROC(name) \ - .type name, @function; \ - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. 
- * - * void sha256_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks); - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ - -.text -.align 32 -ENTRY(sha256_ni_transform) - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 - - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ - movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - movdqa MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - movdqa MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - movdqa MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - movdqa MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - movdqa MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - 
movdqa MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - movdqa MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - movdqa MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - movdqa MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - movdqa MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 56-59 */ - movdqa MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 60-63 */ - movdqa MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ - movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ - - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) - -.Ldone_hash: - - ret -ENDPROC(sha256_ni_transform) - -ENTRY(sha256_ni_built) - xor %eax,%eax - inc %eax - ret -ENDPROC(sha256_ni_built) - -.data -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 \ No newline at end of 
file From efec464c638a4bdf94b8c1d5bd96a50da90a9fc3 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sun, 5 Sep 2021 13:55:29 +0300 Subject: [PATCH 09/11] Refactor build script --- sha2/build.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sha2/build.rs b/sha2/build.rs index 643216f..dba680e 100644 --- a/sha2/build.rs +++ b/sha2/build.rs @@ -5,23 +5,23 @@ fn main() { let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let mut build256 = cc::Build::new(); - let (sha256_path, sha512_path) = if target_arch == "x86" { - (["src/sha256_x86.S"].iter(), ["src/sha512_x86.S"].iter()) + let (sha256_path, sha512_path): (&[&str], &[&str]) = if target_arch == "x86" { + (&["src/sha256_x86.S"], &["src/sha512_x86.S"]) } else if target_arch == "x86_64" { if target_os == "linux" { ( - ["src/sha256_x64_avx2.S", "src/sha256_x64.S"].iter(), - ["src/sha512_x64_avx2.S", "src/sha512_x64.S"].iter(), + &["src/sha256_x64_avx2.S", "src/sha256_x64.S"], + &["src/sha512_x64_avx2.S", "src/sha512_x64.S"], ) } else { - (["src/sha256_x64.S"].iter(), ["src/sha512_x64.S"].iter()) + (&["src/sha256_x64.S"], &["src/sha512_x64.S"]) } } else if target_arch == "aarch64" && target_vendor == "apple" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64_apple.S"].iter(), [""].iter()) + (&["src/sha256_aarch64_apple.S"], &[""]) } else if target_arch == "aarch64" { build256.flag("-march=armv8-a+crypto"); - (["src/sha256_aarch64.S"].iter(), [""].iter()) + (&["src/sha256_aarch64.S"], &[""]) } else { panic!("Unsupported target architecture"); }; From 99627cfed08499fc19adce427ab69ab182d8b307 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Mon, 6 Sep 2021 14:27:44 +0300 Subject: [PATCH 10/11] Small fix Fix --- sha2/Cargo.toml | 2 +- sha2/src/lib.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index b00cb4f..70ed1ec 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -11,7 +11,7 @@ categories = ["cryptography", "no-std"] edition = "2018" [dependencies] -cpufeatures = "0.2.1" +cpufeatures = "0.2" [build-dependencies] cc = "1.0" diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 8fa3ccd..730e6c8 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -43,7 +43,7 @@ pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[link(name = "sha512", kind = "static")] extern "C" { fn sha512_compress(state: &mut [u64; 8], block: &[u8; 128]); - fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: u64); + fn sha512_transform_rorx(state: &mut [u64; 8], block: *const [u8; 128], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA512 compression function @@ -55,7 +55,7 @@ pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { let token: cpuid_avx2::InitToken = cpuid_avx2::init(); if token.get() { if !blocks.is_empty() { - unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } + unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } } } else { for block in blocks { From 43813ab705cf3b10ec55438495ce15a81284c56b Mon Sep 17 00:00:00 2001 From: Vladimir Date: Wed, 8 Sep 2021 23:08:44 +0300 Subject: [PATCH 11/11] Fix issues --- sha2/src/lib.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 730e6c8..07485c9 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -13,24 +13,23 @@ 
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86", target_arch = "aarch64")))] compile_error!("crate can only be used on x86, x86-64 and aarch64 architectures"); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] cpufeatures::new!(cpuid_avx2, "avx2"); #[link(name = "sha256", kind = "static")] #[allow(dead_code)] extern "C" { fn sha256_compress(state: &mut [u32; 8], block: &[u8; 64]); - fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: u64); + fn sha256_transform_rorx(state: &mut [u32; 8], block: *const [u8; 64], num_blocks: usize); } /// Safe wrapper around assembly implementation of SHA256 compression function /// #[inline] pub fn compress256(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - let token: cpuid_avx2::InitToken = cpuid_avx2::init(); - - if token.get() { + if cpuid_avx2::get() { if !blocks.is_empty() { - unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len() as u64) } + unsafe { sha256_transform_rorx(state, blocks.as_ptr(), blocks.len()) } } } else { for block in blocks { @@ -52,8 +51,7 @@ extern "C" { #[cfg(not(target_arch = "aarch64"))] #[inline] pub fn compress512(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - let token: cpuid_avx2::InitToken = cpuid_avx2::init(); - if token.get() { + if cpuid_avx2::get() { if !blocks.is_empty() { unsafe { sha512_transform_rorx(state, blocks.as_ptr(), blocks.len()) } }
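
For reference, a minimal sketch of how a caller can exercise the runtime-dispatching compress256 wrapper this series converges on (the crate is consumed as sha2_asm, as in the benchmarks above). It compresses the single padded block of the empty message and checks the state words against the well-known SHA-256 empty-string digest; whether the AVX2 or the portable path runs is decided inside the wrapper by cpufeatures. This is a sketch of downstream usage, not part of the patches.

// Compress the one padded block of the empty message and compare the
// resulting state words with the digest of "" (e3b0c442...b855).
fn main() {
    // SHA-256 initial hash values H0..H7 (FIPS 180-4).
    let mut state: [u32; 8] = [
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    ];
    // Padding for the empty message: 0x80, then zeros, then a 64-bit
    // big-endian length of 0 (already all zeros).
    let mut block = [0u8; 64];
    block[0] = 0x80;
    // Dispatch (sha256_transform_rorx on AVX2 CPUs, sha256_compress
    // otherwise) happens inside the wrapper.
    sha2_asm::compress256(&mut state, &[block]);
    assert_eq!(
        state,
        [
            0xe3b0c442, 0x98fc1c14, 0x9afbf4c8, 0x996fb924,
            0x27ae41e4, 0x649b934c, 0xa495991b, 0x7852b855,
        ]
    );
}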