Skip to content

Commit 13cc6c8

Browse files
authored
sha1: port sha1-asm to arch intrinsics (#596)
1 parent 6be8466 commit 13cc6c8

File tree

5 files changed

+180
-9
lines changed

5 files changed

+180
-9
lines changed

sha1/src/compress.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ cfg_if::cfg_if! {
44
if #[cfg(feature = "force-soft")] {
55
mod soft;
66
use soft::compress as compress_inner;
7-
} else if #[cfg(target_arch = "loongarch64")] {
7+
} else if #[cfg(all(target_arch = "aarch64"))] {
8+
mod soft;
9+
mod aarch64;
10+
use aarch64::compress as compress_inner;
11+
} else if #[cfg(target_arch = "loongarch64")] {
812
mod loongarch64_asm;
913
use loongarch64_asm::compress as compress_inner;
1014
} else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {

sha1/src/compress/aarch64.rs

Lines changed: 169 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,183 @@
11
//! SHA-1 `aarch64` backend.
22
3+
use crate::K;
4+
35
// Per rustc target feature docs for `aarch64-unknown-linux-gnu` and
46
// `aarch64-apple-darwin` platforms, the `sha2` target feature enables
57
// SHA-1 as well:
68
//
79
// > Enable SHA1 and SHA256 support.
810
cpufeatures::new!(sha1_hwcap, "sha2");
911

12+
// note that `sha2` implicitly enables `neon`
13+
/// SHA-1 compression function implemented with the ARMv8 SHA-1 intrinsics
/// (`vsha1cq_u32` / `vsha1pq_u32` / `vsha1mq_u32` for the rounds,
/// `vsha1h_u32` for the rotated `e` value, and `vsha1su0q_u32` /
/// `vsha1su1q_u32` for the message-schedule extension).
///
/// Processes each 64-byte block in `blocks`, updating the five-word
/// `state` in place.
///
/// # Safety
///
/// The caller must ensure the CPU actually supports the `sha2` target
/// feature (the `compress` wrapper below checks this at runtime via
/// `sha1_hwcap` before calling in here).
#[target_feature(enable = "sha2")]
14+
unsafe fn compress_sha1_neon(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
15+
use core::arch::aarch64::*;
16+
17+
// Working state: words a..d live in one 128-bit vector; e is carried
// separately as a scalar (state[4]).
let mut abcd = vld1q_u32(state.as_ptr());
18+
let mut e0 = state[4];
19+
// Broadcast the four SHA-1 round constants into vector registers.
let [k0, k1, k2, k3] = K.map(|k| vdupq_n_u32(k));
20+
// Declared up front; each is assigned before first use in the round
// sequence below.
let (mut e1, mut tmp0, mut tmp1);
21+
22+
for block in blocks {
23+
// Snapshot the incoming state for the end-of-block feed-forward.
let abcd_cpy = abcd;
24+
let e0_cpy = e0;
25+
26+
// Load and reverse byte order
// (vrev32q_u8 swaps the bytes within each 32-bit lane).
let [mut msg0, mut msg1, mut msg2, mut msg3] = [0, 1, 2, 3].map(|i| {
27+
28+
let p = block.as_ptr().add(16 * i);
29+
vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)))
30+
});
31+
32+
tmp0 = vaddq_u32(msg0, k0);
33+
tmp1 = vaddq_u32(msg1, k0);
34+
35+
// Every 4-round step below follows the same pattern:
//   * vsha1h_u32 derives the next rotated `e` from lane 0 of `abcd`;
//   * vsha1cq/vsha1pq/vsha1mq_u32 performs four rounds (the "c" op is
//     used for rounds 0-19, "p" for 20-39 and 60-79, "m" for 40-59);
//   * tmp0/tmp1 pre-add the round constant for an upcoming step;
//   * vsha1su0q/vsha1su1q_u32 extend the message schedule for later
//     steps (no longer needed once past round 67).
// Rounds 0-3
36+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
37+
abcd = vsha1cq_u32(abcd, e0, tmp0);
38+
tmp0 = vaddq_u32(msg2, k0);
39+
msg0 = vsha1su0q_u32(msg0, msg1, msg2);
40+
41+
// Rounds 4-7
42+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
43+
abcd = vsha1cq_u32(abcd, e1, tmp1);
44+
tmp1 = vaddq_u32(msg3, k0);
45+
msg0 = vsha1su1q_u32(msg0, msg3);
46+
msg1 = vsha1su0q_u32(msg1, msg2, msg3);
47+
48+
// Rounds 8-11
49+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
50+
abcd = vsha1cq_u32(abcd, e0, tmp0);
51+
tmp0 = vaddq_u32(msg0, k0);
52+
msg1 = vsha1su1q_u32(msg1, msg0);
53+
msg2 = vsha1su0q_u32(msg2, msg3, msg0);
54+
55+
// Rounds 12-15
56+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
57+
abcd = vsha1cq_u32(abcd, e1, tmp1);
58+
tmp1 = vaddq_u32(msg1, k1);
59+
msg2 = vsha1su1q_u32(msg2, msg1);
60+
msg3 = vsha1su0q_u32(msg3, msg0, msg1);
61+
62+
// Rounds 16-19
63+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
64+
abcd = vsha1cq_u32(abcd, e0, tmp0);
65+
tmp0 = vaddq_u32(msg2, k1);
66+
msg3 = vsha1su1q_u32(msg3, msg2);
67+
msg0 = vsha1su0q_u32(msg0, msg1, msg2);
68+
69+
// Rounds 20-23
70+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
71+
abcd = vsha1pq_u32(abcd, e1, tmp1);
72+
tmp1 = vaddq_u32(msg3, k1);
73+
msg0 = vsha1su1q_u32(msg0, msg3);
74+
msg1 = vsha1su0q_u32(msg1, msg2, msg3);
75+
76+
// Rounds 24-27
77+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
78+
abcd = vsha1pq_u32(abcd, e0, tmp0);
79+
tmp0 = vaddq_u32(msg0, k1);
80+
msg1 = vsha1su1q_u32(msg1, msg0);
81+
msg2 = vsha1su0q_u32(msg2, msg3, msg0);
82+
83+
// Rounds 28-31
84+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
85+
abcd = vsha1pq_u32(abcd, e1, tmp1);
86+
tmp1 = vaddq_u32(msg1, k1);
87+
msg2 = vsha1su1q_u32(msg2, msg1);
88+
msg3 = vsha1su0q_u32(msg3, msg0, msg1);
89+
90+
// Rounds 32-35
91+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
92+
abcd = vsha1pq_u32(abcd, e0, tmp0);
93+
tmp0 = vaddq_u32(msg2, k2);
94+
msg3 = vsha1su1q_u32(msg3, msg2);
95+
msg0 = vsha1su0q_u32(msg0, msg1, msg2);
96+
97+
// Rounds 36-39
98+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
99+
abcd = vsha1pq_u32(abcd, e1, tmp1);
100+
tmp1 = vaddq_u32(msg3, k2);
101+
msg0 = vsha1su1q_u32(msg0, msg3);
102+
msg1 = vsha1su0q_u32(msg1, msg2, msg3);
103+
104+
// Rounds 40-43
105+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
106+
abcd = vsha1mq_u32(abcd, e0, tmp0);
107+
tmp0 = vaddq_u32(msg0, k2);
108+
msg1 = vsha1su1q_u32(msg1, msg0);
109+
msg2 = vsha1su0q_u32(msg2, msg3, msg0);
110+
111+
// Rounds 44-47
112+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
113+
abcd = vsha1mq_u32(abcd, e1, tmp1);
114+
tmp1 = vaddq_u32(msg1, k2);
115+
msg2 = vsha1su1q_u32(msg2, msg1);
116+
msg3 = vsha1su0q_u32(msg3, msg0, msg1);
117+
118+
// Rounds 48-51
119+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
120+
abcd = vsha1mq_u32(abcd, e0, tmp0);
121+
tmp0 = vaddq_u32(msg2, k2);
122+
msg3 = vsha1su1q_u32(msg3, msg2);
123+
msg0 = vsha1su0q_u32(msg0, msg1, msg2);
124+
125+
// Rounds 52-55
126+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
127+
abcd = vsha1mq_u32(abcd, e1, tmp1);
128+
tmp1 = vaddq_u32(msg3, k3);
129+
msg0 = vsha1su1q_u32(msg0, msg3);
130+
msg1 = vsha1su0q_u32(msg1, msg2, msg3);
131+
132+
// Rounds 56-59
133+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
134+
abcd = vsha1mq_u32(abcd, e0, tmp0);
135+
tmp0 = vaddq_u32(msg0, k3);
136+
msg1 = vsha1su1q_u32(msg1, msg0);
137+
msg2 = vsha1su0q_u32(msg2, msg3, msg0);
138+
139+
// Rounds 60-63
140+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
141+
abcd = vsha1pq_u32(abcd, e1, tmp1);
142+
tmp1 = vaddq_u32(msg1, k3);
143+
msg2 = vsha1su1q_u32(msg2, msg1);
144+
msg3 = vsha1su0q_u32(msg3, msg0, msg1);
145+
146+
// Rounds 64-67
147+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
148+
abcd = vsha1pq_u32(abcd, e0, tmp0);
149+
tmp0 = vaddq_u32(msg2, k3);
150+
msg3 = vsha1su1q_u32(msg3, msg2);
151+
152+
// Rounds 68-71
153+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
154+
abcd = vsha1pq_u32(abcd, e1, tmp1);
155+
tmp1 = vaddq_u32(msg3, k3);
156+
157+
// Rounds 72-75
158+
e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
159+
abcd = vsha1pq_u32(abcd, e0, tmp0);
160+
161+
// Rounds 76-79
162+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
163+
abcd = vsha1pq_u32(abcd, e1, tmp1);
164+
165+
// Update state
// Feed-forward: add this block's input state back in (wrapping mod 2^32;
// vaddq_u32 wraps per lane, e0 uses wrapping_add explicitly).
166+
abcd = vaddq_u32(abcd_cpy, abcd);
167+
e0 = e0.wrapping_add(e0_cpy);
168+
}
169+
170+
// Save state
171+
vst1q_u32(state.as_mut_ptr(), abcd);
172+
state[4] = e0;
173+
}
174+
10175
pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
11-
// TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
12-
// after stabilization
176+
// TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 after stabilization
13177
if sha1_hwcap::get() {
14-
sha1_asm::compress(state, blocks);
178+
unsafe {
179+
compress_sha1_neon(state, blocks);
180+
}
15181
} else {
16182
super::soft::compress(state, blocks);
17183
}

sha1/src/compress/loongarch64_asm.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
//! LoongArch64 assembly backend
22
3+
use crate::K;
34
use core::arch::asm;
45

5-
const K: [u32; 4] = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6];
6-
76
macro_rules! c {
87
($($l:expr)*) => {
98
concat!($($l ,)*)

sha1/src/compress/soft.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#![allow(clippy::many_single_char_names)]
22
use super::BLOCK_SIZE;
3-
4-
const K: [u32; 4] = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6];
3+
use crate::K;
54

65
#[inline(always)]
76
fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {

sha1/src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ pub use compress::compress;
3333

3434
const STATE_LEN: usize = 5;
3535
const BLOCK_SIZE: usize = <Sha1Core as BlockSizeUser>::BlockSize::USIZE;
36+
const H0: [u32; STATE_LEN] = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0];
37+
#[allow(dead_code)]
38+
const K: [u32; 4] = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6];
3639

3740
/// Core SHA-1 hasher state.
3841
#[derive(Clone)]
@@ -85,7 +88,7 @@ impl Default for Sha1Core {
8588
#[inline]
8689
fn default() -> Self {
8790
Self {
88-
h: [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0],
91+
h: H0,
8992
block_len: 0,
9093
}
9194
}

0 commit comments

Comments (0)