Commit a740275

sha1: port sha1-asm to arch::aarch64 intrinsics

1 parent 8998905

2 files changed: +171, -4 lines

sha1/src/compress.rs (+1, -1)
@@ -4,7 +4,7 @@ cfg_if::cfg_if! {
     if #[cfg(feature = "force-soft")] {
         mod soft;
         use soft::compress as compress_inner;
-    } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
+    } else if #[cfg(all(target_arch = "aarch64"))] {
         mod soft;
         mod aarch64;
         use aarch64::compress as compress_inner;
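
The practical change in this hunk is that the intrinsics backend is no longer gated on the `asm` cargo feature: any aarch64 build now compiles it. As a rough illustration only (not the actual macro expansion), the `cfg_if` block above behaves like the following hand-written `cfg` attributes:

#[cfg(feature = "force-soft")]
mod soft;
#[cfg(feature = "force-soft")]
use soft::compress as compress_inner;

#[cfg(all(not(feature = "force-soft"), target_arch = "aarch64"))]
mod soft;
#[cfg(all(not(feature = "force-soft"), target_arch = "aarch64"))]
mod aarch64;
#[cfg(all(not(feature = "force-soft"), target_arch = "aarch64"))]
use aarch64::compress as compress_inner;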

sha1/src/compress/aarch64.rs (+170, -3)
@@ -7,11 +7,178 @@
 // > Enable SHA1 and SHA256 support.
 cpufeatures::new!(sha1_hwcap, "sha2");

+const K: [u32; 4] = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6];
+
+// note that `sha2` implicitly enables `neon`
+#[target_feature(enable = "sha2")]
+unsafe fn compress_sha1_neon(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    use core::arch::aarch64::*;
+
+    let mut abcd = vld1q_u32(state.as_ptr());
+    let mut e0 = state[4];
+    let mut e1;
+    let (mut msg0, mut msg1, mut msg2, mut msg3);
+
+    for block in blocks {
+        let abcd_cpy = abcd;
+        let e0_cpy = e0;
+
+        // Load and reverse byte order
+        let bp = block.as_ptr();
+        msg0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(bp)));
+        msg1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(bp.add(16))));
+        msg2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(bp.add(32))));
+        msg3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(bp.add(48))));
+
+        let mut tmp0 = vaddq_u32(msg0, vdupq_n_u32(K[0]));
+        let mut tmp1 = vaddq_u32(msg1, vdupq_n_u32(K[0]));
+
+        // Rounds 0-3
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1cq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg2, vdupq_n_u32(K[0]));
+        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+        // Rounds 4-7
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1cq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg3, vdupq_n_u32(K[0]));
+        msg0 = vsha1su1q_u32(msg0, msg3);
+        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+        // Rounds 8-11
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1cq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg0, vdupq_n_u32(K[0]));
+        msg1 = vsha1su1q_u32(msg1, msg0);
+        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+        // Rounds 12-15
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1cq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg1, vdupq_n_u32(K[1]));
+        msg2 = vsha1su1q_u32(msg2, msg1);
+        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+        // Rounds 16-19
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1cq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg2, vdupq_n_u32(K[1]));
+        msg3 = vsha1su1q_u32(msg3, msg2);
+        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+        // Rounds 20-23
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg3, vdupq_n_u32(K[1]));
+        msg0 = vsha1su1q_u32(msg0, msg3);
+        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+        // Rounds 24-27
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg0, vdupq_n_u32(K[1]));
+        msg1 = vsha1su1q_u32(msg1, msg0);
+        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+        // Rounds 28-31
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg1, vdupq_n_u32(K[1]));
+        msg2 = vsha1su1q_u32(msg2, msg1);
+        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+        // Rounds 32-35
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg2, vdupq_n_u32(K[2]));
+        msg3 = vsha1su1q_u32(msg3, msg2);
+        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+        // Rounds 36-39
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg3, vdupq_n_u32(K[2]));
+        msg0 = vsha1su1q_u32(msg0, msg3);
+        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+        // Rounds 40-43
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1mq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg0, vdupq_n_u32(K[2]));
+        msg1 = vsha1su1q_u32(msg1, msg0);
+        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+        // Rounds 44-47
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1mq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg1, vdupq_n_u32(K[2]));
+        msg2 = vsha1su1q_u32(msg2, msg1);
+        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+        // Rounds 48-51
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1mq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg2, vdupq_n_u32(K[2]));
+        msg3 = vsha1su1q_u32(msg3, msg2);
+        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+        // Rounds 52-55
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1mq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg3, vdupq_n_u32(K[3]));
+        msg0 = vsha1su1q_u32(msg0, msg3);
+        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+        // Rounds 56-59
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1mq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg0, vdupq_n_u32(K[3]));
+        msg1 = vsha1su1q_u32(msg1, msg0);
+        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+        // Rounds 60-63
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg1, vdupq_n_u32(K[3]));
+        msg2 = vsha1su1q_u32(msg2, msg1);
+        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+        // Rounds 64-67
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e0, tmp0);
+        tmp0 = vaddq_u32(msg2, vdupq_n_u32(K[3]));
+        msg3 = vsha1su1q_u32(msg3, msg2);
+
+        // Rounds 68-71
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e1, tmp1);
+        tmp1 = vaddq_u32(msg3, vdupq_n_u32(K[3]));
+
+        // Rounds 72-75
+        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e0, tmp0);
+
+        // Rounds 76-79
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+        abcd = vsha1pq_u32(abcd, e1, tmp1);
+
+        // Update state
+        abcd = vaddq_u32(abcd_cpy, abcd);
+        e0 = e0.wrapping_add(e0_cpy);
+    }
+
+    // Save state
+    vst1q_u32(state.as_mut_ptr(), abcd);
+    state[4] = e0;
+}
+
 pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
-    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
-    // after stabilization
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 after stabilization
     if sha1_hwcap::get() {
-        sha1_asm::compress(state, blocks);
+        unsafe {
+            compress_sha1_neon(state, blocks);
+        }
     } else {
         super::soft::compress(state, blocks);
     }
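
A minimal sanity check one could add alongside this port (not part of the commit; it assumes the dispatching `compress` above is in scope): compressing the single padded block of the empty message from the standard SHA-1 initial state should reproduce the well-known empty-string digest.

#[cfg(test)]
mod tests {
    use super::compress;

    #[test]
    fn empty_message_single_block() {
        // SHA-1 initialization vector (FIPS 180-4).
        let mut state = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0];
        // Padding of the empty message: 0x80, then zeros; the 64-bit length field is 0.
        let mut block = [0u8; 64];
        block[0] = 0x80;

        compress(&mut state, &[block]);

        // SHA-1("") = da39a3ee5e6b4b0d3255bfef95601890afd80709
        assert_eq!(
            state,
            [0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709]
        );
    }
}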
